In [2]:
"""
A ZenML pipeline that:
1. Fetches cleaned Q-C-A data from MongoDB (with metadata).
2. Performs negative sampling to create:
   - An LTR dataset: (question, context, label, answer, [metadata])
   - An LLM dataset: (input_text, target_text, [metadata])

Optionally, stores or logs these results (CSV, MongoDB, W&B).
"""

import random
import pandas as pd
from typing import Tuple

from zenml.pipelines import pipeline
from zenml.steps import step
from datetime import datetime
from pymongo import MongoClient

import wandb

In [3]:
# Connect to the MongoDB server on localhost and select the
# `qa_master_processed` collection of the `medimaven_db` database.
client = MongoClient('mongodb://localhost:27017/')
db = client['medimaven_db']
collection = db['qa_master_processed']

# Fetch every document (empty filter); the projection drops Mongo's `_id`
# field. The cursor is materialized into a list before building the DataFrame.
data = list(collection.find({}, {"_id": 0}))
df = pd.DataFrame(data)
df

Unnamed: 0,Dataset,focus,synonyms,qtype,question,context,answer,speciality,tags,created_at,updated_at,context_length
0,MedQuad,"T-cell immunodeficiency, congenital alopecia, ...","alymphoid cystic thymic dysgenesis,congenital ...",information,"What is are T cell immunodeficiency, congenita...","alymphoid cystic thymic dysgenesis,congenital ...","T cell immunodeficiency, congenital alopecia, ...",,[],2025-03-03 03:41:54.991,2025-03-03 03:41:54.991,324
1,MedQuad,"T-cell immunodeficiency, congenital alopecia, ...","alymphoid cystic thymic dysgenesis,congenital ...",frequency,How many people are affected by T cell immunod...,"alymphoid cystic thymic dysgenesis,congenital ...","T cell immunodeficiency, congenital alopecia, ...",,[],2025-03-03 03:41:54.991,2025-03-03 03:41:54.991,344
2,MedQuad,"T-cell immunodeficiency, congenital alopecia, ...","alymphoid cystic thymic dysgenesis,congenital ...",genetic changes,What are the genetic changes related to T cell...,"alymphoid cystic thymic dysgenesis,congenital ...","T cell immunodeficiency, congenital alopecia, ...",,[],2025-03-03 03:41:54.991,2025-03-03 03:41:54.991,352
3,MedQuad,"T-cell immunodeficiency, congenital alopecia, ...","alymphoid cystic thymic dysgenesis,congenital ...",inheritance,"Is T cell immunodeficiency, congenital alopeci...","alymphoid cystic thymic dysgenesis,congenital ...",This condition is inherited in an autosomal re...,,[],2025-03-03 03:41:54.991,2025-03-03 03:41:54.991,325
4,MedQuad,"T-cell immunodeficiency, congenital alopecia, ...","alymphoid cystic thymic dysgenesis,congenital ...",treatment,What are the treatments for T cell immunodefic...,"alymphoid cystic thymic dysgenesis,congenital ...",These resources address the diagnosis or manag...,,[],2025-03-03 03:41:54.991,2025-03-03 03:41:54.991,340
...,...,...,...,...,...,...,...,...,...,...,...,...
52006,iCliniQ,,,,I am not gaining weight. Please guide me if I ...,"Hi doctor,\nI am 24 years old male. For the pa...","Hello,Welcome to icliniq.com.First of all, che...",Neurology,[],2025-03-03 03:41:55.009,2025-03-03 03:41:55.009,401
52007,iCliniQ,,,,Do I need treatment for premature ejaculation?,"Hello doctor,\nI am 37 years old, a gynecologi...","Hello,\nWelcome to icliniq.com.\nNo, it is not...",Cardiology,[],2025-03-03 03:41:55.009,2025-03-03 03:41:55.009,600
52008,iCliniQ,,,,I had unprotected sex. What are my chances of ...,"Hello doctor,\nI had unprotected sex with my b...","Hi,\nWelcome to icliniq.com.\nPlease do not be...",Obstetrics and Gynecology,[],2025-03-03 03:41:55.009,2025-03-03 03:41:55.009,653
52009,iCliniQ,,,,How many days after HIV do rashes appear?,"Hello doctor,\n15 days back, I met a girl and ...","Hello,\nWelcome to icliniq.com. A rash is a ve...",Dermatology,[],2025-03-03 03:41:55.009,2025-03-03 03:41:55.009,1221


In [15]:
# Keep only the rows whose `Dataset` column equals 'iCliniQ' and renumber the
# index from 0 so it no longer carries positions from the full frame.
# NOTE(review): "hq" presumably stands for "high quality" — confirm.
hq_df = df[df['Dataset'] == 'iCliniQ'].reset_index(drop=True)
hq_df

Unnamed: 0,Dataset,focus,synonyms,qtype,question,context,answer,speciality,tags,created_at,updated_at,context_length
0,iCliniQ,,,,What are effective therapies for metastatic br...,Metastatic breast cancer occurs when cancer sp...,"Hello,\nWelcome to icliniq.com.\nI can underst...",Medical oncology,[],2025-03-03 03:41:54.993,2025-03-03 03:41:54.993,699
1,iCliniQ,,,,How does HIV spread?,HIV spreads by certain body fluids from an inf...,"Hello,\nWelcome to icliniq.com.\nI read your q...",Dermatology,[],2025-03-03 03:41:54.993,2025-03-03 03:41:54.993,730
2,iCliniQ,,,,Can recurrent hoarseness without GERD indicate...,Recurrent hoarseness may result from vocal str...,"Hi,\nWelcome to icliniq.com.\nI have read your...",Otolaryngology (E.N.T),[],2025-03-03 03:41:54.993,2025-03-03 03:41:54.993,471
3,iCliniQ,,,,Is long term Pantocid IT use safe?,Long term use of Pantocid IT may cause nutrien...,"Hi,\nWelcome to icliniq.com.\nI have read your...",Medical Gastroenterology,[],2025-03-03 03:41:54.993,2025-03-03 03:41:54.993,718
4,iCliniQ,,,,Can type 2 diabetes resolve after delivery?,"After giving birth, a person with type 1 diabe...","Hi,\nWelcome to icliniq.com.\nI have gone thro...",Pulmonology (Asthma Doctors),[],2025-03-03 03:41:54.993,2025-03-03 03:41:54.993,709
...,...,...,...,...,...,...,...,...,...,...,...,...
48068,iCliniQ,,,,I am not gaining weight. Please guide me if I ...,"Hi doctor,\nI am 24 years old male. For the pa...","Hello,Welcome to icliniq.com.First of all, che...",Neurology,[],2025-03-03 03:41:55.009,2025-03-03 03:41:55.009,401
48069,iCliniQ,,,,Do I need treatment for premature ejaculation?,"Hello doctor,\nI am 37 years old, a gynecologi...","Hello,\nWelcome to icliniq.com.\nNo, it is not...",Cardiology,[],2025-03-03 03:41:55.009,2025-03-03 03:41:55.009,600
48070,iCliniQ,,,,I had unprotected sex. What are my chances of ...,"Hello doctor,\nI had unprotected sex with my b...","Hi,\nWelcome to icliniq.com.\nPlease do not be...",Obstetrics and Gynecology,[],2025-03-03 03:41:55.009,2025-03-03 03:41:55.009,653
48071,iCliniQ,,,,How many days after HIV do rashes appear?,"Hello doctor,\n15 days back, I met a girl and ...","Hello,\nWelcome to icliniq.com. A rash is a ve...",Dermatology,[],2025-03-03 03:41:55.009,2025-03-03 03:41:55.009,1221


In [16]:
# Gather all unique contexts once
# `all_contexts` spans every row of `df` and is the pool the negative-sampling
# cell draws from; `hq_contexts` covers only the iCliniQ subset.
all_contexts = df["context"].unique().tolist()
hq_contexts = hq_df['context'].unique().tolist()
# Accumulator for the learning-to-rank records (positives and negatives).
ltr_records = []

In [17]:
# Preview only the first few contexts; echoing the whole list would write
# every unique context string into the notebook output.
all_contexts[:5]


['alymphoid cystic thymic dysgenesis,congenital alopecia and nail dystrophy associated with severe functional T cell immunodeficiency,Pignata Guarino syndrome,winged helix deficiency T cell immunodeficiency, congenital alopecia, and nail dystrophy What is are T cell immunodeficiency, congenital alopecia, and nail dystrophy ?',
 'alymphoid cystic thymic dysgenesis,congenital alopecia and nail dystrophy associated with severe functional T cell immunodeficiency,Pignata Guarino syndrome,winged helix deficiency T cell immunodeficiency, congenital alopecia, and nail dystrophy How many people are affected by T cell immunodeficiency, congenital alopecia, and nail dystrophy ?',
 'alymphoid cystic thymic dysgenesis,congenital alopecia and nail dystrophy associated with severe functional T cell immunodeficiency,Pignata Guarino syndrome,winged helix deficiency T cell immunodeficiency, congenital alopecia, and nail dystrophy What are the genetic changes related to T cell immunodeficiency, congenita

In [18]:
# Preview only the first few iCliniQ contexts; echoing the whole list would
# write every unique context string into the notebook output.
hq_contexts[:5]

['Metastatic breast cancer occurs when cancer spreads. Treatments include immunotherapy, chemotherapy, and targeted therapy based on tumor factors. Hello doctor,\nMy mother was diagnosed with stage 3 breast cancer, which was completely removed. However, two years later, she experienced metastasis that spread to the brain. Now, the cancer has spread to the liver, and she has developed jaundice. Her bilirubin levels have increased from 7.5 to 10. Is there anything that can be done with medication? The common bile duct CBD is not dilated she also has ascites, for which a combination of Spironolactone and Torasemide will be administered. Can anything be done to manage the jaundice?\nKindly suggest.',
 'HIV spreads by certain body fluids from an infected individual. It is not transmitted by casual touch, hugging, sharing dishes, or insect bites. Hello doctor,Last night I went for dinner and at a table nearby a lady was using a toothpick. When she put it out of her mouth I saw that about hal

In [20]:
# Group the iCliniQ rows by question text so that all rows sharing one
# question are handled together. The bare expression below only shows the
# GroupBy object's repr, not its contents.
hq_grouped = hq_df.groupby('question')
hq_grouped

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x150d43b90>

In [22]:
# Peek at the first (question, rows) pair produced by iterating the GroupBy;
# the `break` stops after one group.
for question, group in hq_grouped:
    print('question', question)
    print('group', group)
    break

question 1 year old fell, hurt head. Complications?
group        Dataset focus synonyms qtype  \
15069  iCliniQ                        

                                         question  \
15069  1 year old fell, hurt head. Complications?   

                                                 context  \
15069  Frequent fall and hit on the head, if associat...   

                                                  answer  speciality tags  \
15069  Hello,\nWelcome to icliniq.com.\nFrequent fall...  Pediatrics   []   

                   created_at              updated_at  context_length  
15069 2025-03-03 03:41:54.998 2025-03-03 03:41:54.998             460  


In [28]:
# Reset the LTR accumulator before (re)building it. The name was previously
# misspelled `tr_records`, which left the list that the sampling loop appends
# to untouched.
ltr_records = []

In [None]:
# Build the learning-to-rank records: for every ground-truth (question,
# context) row emit one positive record (label 1) followed by up to
# `n_negatives` negative records (label 0) whose context is drawn at random
# from `all_contexts`.
n_negatives = 2

# Dedicated generator with a fixed seed so the sampled negatives are the same
# on every Restart & Run All, without touching the global `random` state.
sampling_rng = random.Random(42)

# Start from an empty list so re-running this cell does not append a second
# copy of every record.
ltr_records = []

for question, group in hq_grouped:
    # Every context paired with this question is a ground-truth positive and
    # must never be emitted as a negative for the same question.
    positive_contexts = set(group['context'])

    # Draw enough extra items that, after discarding any positives that were
    # drawn, at least `n_negatives` remain whenever the pool is large enough.
    # This avoids rebuilding a filtered copy of `all_contexts` for every row.
    draw_size = min(n_negatives + len(positive_contexts), len(all_contexts))

    # For each ground-truth pair
    for _, row in group.iterrows():
        answer = row['answer']

        # Every column other than the three core fields is carried along as
        # metadata on the positive and on each negative record.
        metadata = {
            col: value
            for col, value in row.to_dict().items()
            if col not in ("question", "context", "answer")
        }

        # Positive record
        ltr_records.append({
            "question": question,
            "context": row['context'],
            "label": 1,
            "answer": answer,
            **metadata,
        })

        # Negative records: distinct contexts, none of them a positive.
        drawn = sampling_rng.sample(all_contexts, draw_size)
        neg_contexts = [ctx for ctx in drawn if ctx not in positive_contexts][:n_negatives]
        for neg_c in neg_contexts:
            ltr_records.append({
                "question": question,
                "context": neg_c,
                "label": 0,
                "answer": answer,
                **metadata,
            })



    

In [31]:
# Number of LTR records accumulated so far (positives plus sampled negatives).
len(ltr_records)

144219

In [33]:
# Turn the accumulated record dicts into a DataFrame, one row per record.
ltr_df = pd.DataFrame(ltr_records)
ltr_df

Unnamed: 0,question,context,label,answer,Dataset,focus,synonyms,qtype,speciality,tags,created_at,updated_at,context_length
0,"1 year old fell, hurt head. Complications?","Frequent fall and hit on the head, if associat...",1,"Hello,\nWelcome to icliniq.com.\nFrequent fall...",iCliniQ,,,,Pediatrics,[],2025-03-03 03:41:54.998,2025-03-03 03:41:54.998,460
1,"1 year old fell, hurt head. Complications?",It is very rare to develop cancer from tonsill...,0,"Hello,\nWelcome to icliniq.com.\nFrequent fall...",iCliniQ,,,,Pediatrics,[],2025-03-03 03:41:54.998,2025-03-03 03:41:54.998,460
2,"1 year old fell, hurt head. Complications?",A blind spot is a spot where there is an obscu...,0,"Hello,\nWelcome to icliniq.com.\nFrequent fall...",iCliniQ,,,,Pediatrics,[],2025-03-03 03:41:54.998,2025-03-03 03:41:54.998,460
3,1.8 years old is not crawling yet. What can be...,Babies usually start crawling by 12 or 13 mont...,1,"Hello,\nWelcome to icliniq.com.\n1. Your daugh...",iCliniQ,,,,Pediatrics,[],2025-03-03 03:41:55.006,2025-03-03 03:41:55.006,330
4,1.8 years old is not crawling yet. What can be...,"Hello doctor,\nFor the past three years, my 2D...",0,"Hello,\nWelcome to icliniq.com.\n1. Your daugh...",iCliniQ,,,,Pediatrics,[],2025-03-03 03:41:55.006,2025-03-03 03:41:55.006,330
...,...,...,...,...,...,...,...,...,...,...,...,...,...
144214,what is the ideal way to treat herpes infectio...,A mouth ulcer can develop accidentally by biti...,0,"Hello,\nWelcome to icliniq.com.\nI read your q...",iCliniQ,,,,General Practitioner,[],2025-03-03 03:41:55.006,2025-03-03 03:41:55.006,1384
144215,what is the ideal way to treat herpes infectio...,Syrinx is a fluid filled cavity within the spi...,0,"Hello,\nWelcome to icliniq.com.\nI read your q...",iCliniQ,,,,General Practitioner,[],2025-03-03 03:41:55.006,2025-03-03 03:41:55.006,1384
144216,what would be the reason for getting tired fre...,"In most cases, fatigue can occur as a result o...",1,"Hello,\nWelcome to icliniq.com.\nI can underst...",iCliniQ,,,,Psychiatry,[],2025-03-03 03:41:55.005,2025-03-03 03:41:55.005,431
144217,what would be the reason for getting tired fre...,Emergency contraception like I pill can preven...,0,"Hello,\nWelcome to icliniq.com.\nI can underst...",iCliniQ,,,,Psychiatry,[],2025-03-03 03:41:55.005,2025-03-03 03:41:55.005,431


    ### --- Step B2: Build LLM Dataset (Q + ground-truth context -> answer) --- #
    ### We'll keep only positive Q–C pairs from the original df to generate (input_text, target_text).
    ### You can add metadata columns if you wish.

    ### We'll rename "answer" to "target_text" 
    ### and create "input_text" = question + special token + context

In [34]:
# Build one LLM record per row of `df`: the question and its ground-truth
# context are joined into the model input, and the answer becomes the target.
llm_records = []
for _, row in df.iterrows():
    record = {
        "input_text": f"{row['question']} [SEP] {row['context']}",
        "target_text": row["answer"],
    }
    # Every remaining column travels along unchanged as metadata.
    record.update(
        (name, value)
        for name, value in row.to_dict().items()
        if name not in ("question", "context", "answer")
    )
    llm_records.append(record)

In [35]:
# Number of LLM records built (one per row of `df`).
len(llm_records)

52011

In [36]:
# Turn the LLM record dicts into a DataFrame, one row per record.
llm_df = pd.DataFrame(llm_records)

llm_df


Unnamed: 0,input_text,target_text,Dataset,focus,synonyms,qtype,speciality,tags,created_at,updated_at,context_length
0,"What is are T cell immunodeficiency, congenita...","T cell immunodeficiency, congenital alopecia, ...",MedQuad,"T-cell immunodeficiency, congenital alopecia, ...","alymphoid cystic thymic dysgenesis,congenital ...",information,,[],2025-03-03 03:41:54.991,2025-03-03 03:41:54.991,324
1,How many people are affected by T cell immunod...,"T cell immunodeficiency, congenital alopecia, ...",MedQuad,"T-cell immunodeficiency, congenital alopecia, ...","alymphoid cystic thymic dysgenesis,congenital ...",frequency,,[],2025-03-03 03:41:54.991,2025-03-03 03:41:54.991,344
2,What are the genetic changes related to T cell...,"T cell immunodeficiency, congenital alopecia, ...",MedQuad,"T-cell immunodeficiency, congenital alopecia, ...","alymphoid cystic thymic dysgenesis,congenital ...",genetic changes,,[],2025-03-03 03:41:54.991,2025-03-03 03:41:54.991,352
3,"Is T cell immunodeficiency, congenital alopeci...",This condition is inherited in an autosomal re...,MedQuad,"T-cell immunodeficiency, congenital alopecia, ...","alymphoid cystic thymic dysgenesis,congenital ...",inheritance,,[],2025-03-03 03:41:54.991,2025-03-03 03:41:54.991,325
4,What are the treatments for T cell immunodefic...,These resources address the diagnosis or manag...,MedQuad,"T-cell immunodeficiency, congenital alopecia, ...","alymphoid cystic thymic dysgenesis,congenital ...",treatment,,[],2025-03-03 03:41:54.991,2025-03-03 03:41:54.991,340
...,...,...,...,...,...,...,...,...,...,...,...
52006,I am not gaining weight. Please guide me if I ...,"Hello,Welcome to icliniq.com.First of all, che...",iCliniQ,,,,Neurology,[],2025-03-03 03:41:55.009,2025-03-03 03:41:55.009,401
52007,Do I need treatment for premature ejaculation?...,"Hello,\nWelcome to icliniq.com.\nNo, it is not...",iCliniQ,,,,Cardiology,[],2025-03-03 03:41:55.009,2025-03-03 03:41:55.009,600
52008,I had unprotected sex. What are my chances of ...,"Hi,\nWelcome to icliniq.com.\nPlease do not be...",iCliniQ,,,,Obstetrics and Gynecology,[],2025-03-03 03:41:55.009,2025-03-03 03:41:55.009,653
52009,How many days after HIV do rashes appear? [SEP...,"Hello,\nWelcome to icliniq.com. A rash is a ve...",iCliniQ,,,,Dermatology,[],2025-03-03 03:41:55.009,2025-03-03 03:41:55.009,1221


In [37]:
# ------------------ PIPELINE ENTRY POINT ------------------ #
def neg_sampling_pipeline(fetch_data_from_mongo, create_ltr_and_llm_datasets):
    """
    Chain the two supplied steps: load the Q-C-A data, then derive both datasets.

    Args:
        fetch_data_from_mongo: Zero-argument callable; whatever it returns is
            handed unchanged to the second step.
        create_ltr_and_llm_datasets: One-argument callable whose result must
            unpack into exactly two values (the LTR dataset, then the LLM
            dataset).

    Returns:
        A 2-tuple ``(ltr dataset, llm dataset)`` in that order. Nothing is
        persisted here; add further steps if the results should be stored.

    Note:
        As written this is a plain function; no ZenML decorator is applied.
    """
    ltr_dataset, llm_dataset = create_ltr_and_llm_datasets(fetch_data_from_mongo())
    return ltr_dataset, llm_dataset

In [38]:
import wandb

In [None]:
# Start a Weights & Biases run with default settings.
# NOTE(review): the recorded output shows this call prompting for an API key,
# so credentials do not appear to be preconfigured in this environment.
wandb.init()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:[34m[1mwandb[0m: Paste an API key from your profile and hit enter:[34m[1mwandb[0m: Paste an API key from your profile and hit enter: