# Minimally Clean Each Dataset and Combine

In [2]:
import pandas as pd
from pathlib import Path
import re

def clean_text(text):
    """
    Normalise a raw text value for the combined Q&A dataset.

    Steps, in order:
      1. Missing values (``None`` or a float NaN) become an empty string
         instead of the literal text "None" / "nan".
      2. HTML tags (``<...>``, including tags that span several lines) are removed.
      3. Every character other than ASCII letters, digits, ``. , ! ?`` and
         whitespace is replaced by a space (so accented / non-Latin letters
         are dropped as well).
      4. Runs of two or more whitespace characters collapse to one space;
         a single newline or tab is left as-is.
      5. Leading and trailing whitespace is stripped.

    Args:
        text: The value to clean. Non-string values are converted with ``str``.
    Returns:
        str: The cleaned text.
    """
    # NaN is the only float that is not equal to itself; this also covers numpy floats.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    text = str(text)
    # `[^>]*` (unlike `.*?`) also matches newlines, so multi-line tags are removed too.
    text = re.sub(r"<[^>]*>", "", text)
    # Replace (rather than delete) disallowed characters so adjacent words stay separated.
    text = re.sub(r"[^a-zA-Z0-9.,!?\s]", " ", text)
    # Collapse runs of 2+ whitespace characters into a single space.
    text = re.sub(r"\s{2,}", " ", text)
    return text.strip()

# if __name__ == '__main__':
#     df_medquad = pd.read_csv("data/processed/medquad.csv")
#     df_icliniq = pd.read_csv("data/processed/icliniq.csv")

#     # Merge
#     df_combined = pd.concat([df_medquad, df_icliniq], ignore_index=True)

    

#     # Clean
#     df_combined["question"] = df_combined["question"].apply(clean_text)
#     df_combined["answer"]   = df_combined["answer"].apply(clean_text)

#     # Remove duplicates
#     df_combined.drop_duplicates(subset=["question","answer"], inplace=True)

#     df_combined.to_csv("data/processed/qa_master.csv", index=False)

In [3]:
# Load both pre-processed source datasets (paths are relative to the notebook folder).
df_medquad, df_icliniq = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/processed/medquad.csv",
        "../data/processed/icliniq.csv",
    )
)

## 1. MedQuad

In [4]:
# Columns that are not referenced anywhere else in this notebook.
unused_columns = ['url', 'document_id', 'semantic_group']
df_medquad = df_medquad.drop(columns=unused_columns)
df_medquad.head()

Unnamed: 0,source,focus,synonyms,pid,qtype,question,answer
0,GHR,keratoderma with woolly hair,KWWH,1,information,What is (are) keratoderma with woolly hair ?,Keratoderma with woolly hair is a group of rel...
1,GHR,keratoderma with woolly hair,KWWH,2,frequency,How many people are affected by keratoderma wi...,Keratoderma with woolly hair is rare; its prev...
2,GHR,keratoderma with woolly hair,KWWH,3,genetic changes,What are the genetic changes related to kerato...,"Mutations in the JUP, DSP, DSC2, and KANK2 gen..."
3,GHR,keratoderma with woolly hair,KWWH,4,inheritance,Is keratoderma with woolly hair inherited ?,Most cases of keratoderma with woolly hair hav...
4,GHR,keratoderma with woolly hair,KWWH,5,treatment,What are the treatments for keratoderma with w...,These resources address the diagnosis or manag...


In [5]:
# Per-column count of missing values in the MedQuad frame.
missing_per_column = df_medquad.isna().sum().reset_index()
print(missing_per_column)

      index     0
0    source     0
1     focus    14
2  synonyms  5518
3       pid     0
4     qtype     0
5  question     0
6    answer     5


In [6]:
# Drop rows (not columns) whose answer is missing; a Q&A pair without an answer is unusable.
df_medquad.dropna(subset=['answer'], inplace=True)


In [7]:
# Build the free-text context as "synonyms focus question" (missing parts contribute
# an empty string), tag the source dataset, and keep only the columns used later.
df_medquad = df_medquad.assign(
    context=lambda frame: (
        frame['synonyms'].fillna('') + ' ' + frame['focus'].fillna('') + ' ' + frame['question'].fillna('')
    ),
    Dataset='MedQuad',
)[['Dataset', 'focus', 'synonyms', 'qtype', 'question', 'context', 'answer']]

df_medquad.head()




Unnamed: 0,Dataset,focus,synonyms,qtype,question,context,answer
0,MedQuad,keratoderma with woolly hair,KWWH,information,What is (are) keratoderma with woolly hair ?,KWWH keratoderma with woolly hair What is (are...,Keratoderma with woolly hair is a group of rel...
1,MedQuad,keratoderma with woolly hair,KWWH,frequency,How many people are affected by keratoderma wi...,KWWH keratoderma with woolly hair How many peo...,Keratoderma with woolly hair is rare; its prev...
2,MedQuad,keratoderma with woolly hair,KWWH,genetic changes,What are the genetic changes related to kerato...,KWWH keratoderma with woolly hair What are the...,"Mutations in the JUP, DSP, DSC2, and KANK2 gen..."
3,MedQuad,keratoderma with woolly hair,KWWH,inheritance,Is keratoderma with woolly hair inherited ?,KWWH keratoderma with woolly hair Is keratoder...,Most cases of keratoderma with woolly hair hav...
4,MedQuad,keratoderma with woolly hair,KWWH,treatment,What are the treatments for keratoderma with w...,KWWH keratoderma with woolly hair What are the...,These resources address the diagnosis or manag...


In [8]:
# Distinct focus topics covered by MedQuad.
df_medquad['focus'].unique()

array(['keratoderma with woolly hair', 'Knobloch syndrome', 'coloboma',
       ..., 'High Blood Pressure and Kidney Disease',
       'What I need to know about Cirrhosis',
       '4 Steps to Manage Your Diabetes for Life'],
      shape=(5126,), dtype=object)

## 2. iCliniq

In [9]:
# First rows of the raw iCliniq frame, to see which columns are available.
df_icliniq.head()

Unnamed: 0,Speciality,Title,Abstract,Question,Answer
0,Medical oncology,What are effective therapies for metastatic br...,Metastatic breast cancer occurs when cancer sp...,"Hello doctor,\nMy mother was diagnosed with st...","Hello,\nWelcome to icliniq.com.\nI can underst..."
1,Dermatology,How does HIV spread?,HIV spreads by certain body fluids from an inf...,"Hello doctor,Last night I went for dinner and ...","Hello,\nWelcome to icliniq.com.\nI read your q..."
2,Otolaryngology (E.N.T),Can recurrent hoarseness without GERD indicate...,Recurrent hoarseness may result from vocal str...,"Hi doctor,I am a 59-year-old male and a nonsmo...","Hi,\nWelcome to icliniq.com.\nI have read your..."
3,Medical Gastroenterology,Is long-term Pantocid-IT use safe?,Long-term use of Pantocid-IT may cause nutrien...,"Hi doctor,\nI am a 35-year-old male. My height...","Hi,\nWelcome to icliniq.com.\nI have read your..."
4,Pulmonology (Asthma Doctors),Can type 2 diabetes resolve after delivery?,"After giving birth, a person with type 1 diabe...","Hi doctor,My sister delivered a baby one month...","Hi,\nWelcome to icliniq.com.\nI have gone thro..."


In [10]:
# Inspect the Abstract column; the trailing rows are NaN, so it cannot be used on its own.
df_icliniq['Abstract']

0        Metastatic breast cancer occurs when cancer sp...
1        HIV spreads by certain body fluids from an inf...
2        Recurrent hoarseness may result from vocal str...
3        Long-term use of Pantocid-IT may cause nutrien...
4        After giving birth, a person with type 1 diabe...
                               ...                        
48452                                                  NaN
48453                                                  NaN
48454                                                  NaN
48455                                                  NaN
48456                                                  NaN
Name: Abstract, Length: 48457, dtype: object

In [11]:
# Dtypes and non-null counts per column; shows which iCliniq columns contain missing values.
df_icliniq.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48457 entries, 0 to 48456
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Speciality  48343 non-null  object
 1   Title       48457 non-null  object
 2   Abstract    43343 non-null  object
 3   Question    48457 non-null  object
 4   Answer      48457 non-null  object
dtypes: object(5)
memory usage: 1.8+ MB


In [12]:
# Reshape iCliniq to the same schema as MedQuad:
#   * context  = abstract (may be missing) + the patient's full question text
#   * question = the short Title, answer = the doctor's Answer
# Every step is chained and returns a fresh frame: assigning a column on the
# result of `df[[...]]` triggers pandas' SettingWithCopyWarning, and an
# in-place rename on such a slice has the same ambiguity.
df_icliniq = (
    df_icliniq
    .assign(context=lambda frame: frame['Abstract'].fillna('') + ' ' + frame['Question'].fillna(''))
    [['Speciality', 'Title', 'context', 'Answer']]
    .rename(columns={
        "Speciality": "speciality",
        "Title": "question",
        "Answer": "answer"
    })
    .assign(Dataset='iCliniQ')
)

df_icliniq.head()


Unnamed: 0,speciality,question,context,answer,Dataset
0,Medical oncology,What are effective therapies for metastatic br...,Metastatic breast cancer occurs when cancer sp...,"Hello,\nWelcome to icliniq.com.\nI can underst...",iCliniQ
1,Dermatology,How does HIV spread?,HIV spreads by certain body fluids from an inf...,"Hello,\nWelcome to icliniq.com.\nI read your q...",iCliniQ
2,Otolaryngology (E.N.T),Can recurrent hoarseness without GERD indicate...,Recurrent hoarseness may result from vocal str...,"Hi,\nWelcome to icliniq.com.\nI have read your...",iCliniQ
3,Medical Gastroenterology,Is long-term Pantocid-IT use safe?,Long-term use of Pantocid-IT may cause nutrien...,"Hi,\nWelcome to icliniq.com.\nI have read your...",iCliniQ
4,Pulmonology (Asthma Doctors),Can type 2 diabetes resolve after delivery?,"After giving birth, a person with type 1 diabe...","Hi,\nWelcome to icliniq.com.\nI have gone thro...",iCliniQ


## Combine

In [13]:
# Stack the two datasets row-wise; columns present in only one of them become NaN in the other.
source_frames = (df_medquad, df_icliniq)
df_combined = pd.concat(source_frames, axis=0, ignore_index=True)
df_combined


Unnamed: 0,Dataset,focus,synonyms,qtype,question,context,answer,speciality
0,MedQuad,keratoderma with woolly hair,KWWH,information,What is (are) keratoderma with woolly hair ?,KWWH keratoderma with woolly hair What is (are...,Keratoderma with woolly hair is a group of rel...,
1,MedQuad,keratoderma with woolly hair,KWWH,frequency,How many people are affected by keratoderma wi...,KWWH keratoderma with woolly hair How many peo...,Keratoderma with woolly hair is rare; its prev...,
2,MedQuad,keratoderma with woolly hair,KWWH,genetic changes,What are the genetic changes related to kerato...,KWWH keratoderma with woolly hair What are the...,"Mutations in the JUP, DSP, DSC2, and KANK2 gen...",
3,MedQuad,keratoderma with woolly hair,KWWH,inheritance,Is keratoderma with woolly hair inherited ?,KWWH keratoderma with woolly hair Is keratoder...,Most cases of keratoderma with woolly hair hav...,
4,MedQuad,keratoderma with woolly hair,KWWH,treatment,What are the treatments for keratoderma with w...,KWWH keratoderma with woolly hair What are the...,These resources address the diagnosis or manag...,
...,...,...,...,...,...,...,...,...
64859,iCliniQ,,,,I am not gaining weight. Please guide me if I ...,"Hi doctor,\nI am 24 years old male. For the p...","Hello,Welcome to icliniq.com.First of all, che...",Neurology
64860,iCliniQ,,,,Do I need treatment for premature ejaculation?,"Hello doctor,\nI am 37 years old, a gynecolog...","Hello,\nWelcome to icliniq.com.\nNo, it is not...",Cardiology
64861,iCliniQ,,,,I had unprotected sex. What are my chances of ...,"Hello doctor,\nI had unprotected sex with my ...","Hi,\nWelcome to icliniq.com.\nPlease do not be...",Obstetrics and Gynecology
64862,iCliniQ,,,,How many days after HIV do rashes appear?,"Hello doctor,\n15 days back, I met a girl and...","Hello,\nWelcome to icliniq.com.\n\nA rash is a...",Dermatology


In [14]:
# Replace every missing value with an empty string so the text cleaning below never sees NaN.
df_combined = df_combined.fillna(value='')

In [15]:
# Preview only: show what clean_text does to the questions without storing the result.
df_combined["question"].map(clean_text)

0               What is are keratoderma with woolly hair ?
1        How many people are affected by keratoderma wi...
2        What are the genetic changes related to kerato...
3              Is keratoderma with woolly hair inherited ?
4        What are the treatments for keratoderma with w...
                               ...                        
64859    I am not gaining weight. Please guide me if I ...
64860       Do I need treatment for premature ejaculation?
64861    I had unprotected sex. What are my chances of ...
64862            How many days after HIV do rashes appear?
64863    What are the treatment options for Internal He...
Name: question, Length: 64864, dtype: object

In [16]:
# Confirm the frame itself is still uncleaned (the previous cell did not assign its result).
df_combined

Unnamed: 0,Dataset,focus,synonyms,qtype,question,context,answer,speciality
0,MedQuad,keratoderma with woolly hair,KWWH,information,What is (are) keratoderma with woolly hair ?,KWWH keratoderma with woolly hair What is (are...,Keratoderma with woolly hair is a group of rel...,
1,MedQuad,keratoderma with woolly hair,KWWH,frequency,How many people are affected by keratoderma wi...,KWWH keratoderma with woolly hair How many peo...,Keratoderma with woolly hair is rare; its prev...,
2,MedQuad,keratoderma with woolly hair,KWWH,genetic changes,What are the genetic changes related to kerato...,KWWH keratoderma with woolly hair What are the...,"Mutations in the JUP, DSP, DSC2, and KANK2 gen...",
3,MedQuad,keratoderma with woolly hair,KWWH,inheritance,Is keratoderma with woolly hair inherited ?,KWWH keratoderma with woolly hair Is keratoder...,Most cases of keratoderma with woolly hair hav...,
4,MedQuad,keratoderma with woolly hair,KWWH,treatment,What are the treatments for keratoderma with w...,KWWH keratoderma with woolly hair What are the...,These resources address the diagnosis or manag...,
...,...,...,...,...,...,...,...,...
64859,iCliniQ,,,,I am not gaining weight. Please guide me if I ...,"Hi doctor,\nI am 24 years old male. For the p...","Hello,Welcome to icliniq.com.First of all, che...",Neurology
64860,iCliniQ,,,,Do I need treatment for premature ejaculation?,"Hello doctor,\nI am 37 years old, a gynecolog...","Hello,\nWelcome to icliniq.com.\nNo, it is not...",Cardiology
64861,iCliniQ,,,,I had unprotected sex. What are my chances of ...,"Hello doctor,\nI had unprotected sex with my ...","Hi,\nWelcome to icliniq.com.\nPlease do not be...",Obstetrics and Gynecology
64862,iCliniQ,,,,How many days after HIV do rashes appear?,"Hello doctor,\n15 days back, I met a girl and...","Hello,\nWelcome to icliniq.com.\n\nA rash is a...",Dermatology


In [17]:
# ----- Clean Text Columns -----
# Run clean_text over each free-text column, overwriting the column with the cleaned values.
for text_column in ("question", "answer", "context"):
    df_combined[text_column] = df_combined[text_column].apply(clean_text)

In [18]:

# ----- Remove Duplicates -----
# Two rows are duplicates when both the cleaned question and the cleaned answer match;
# the first occurrence is kept and the frame is modified in place.
duplicate_keys = ["question", "answer"]
df_combined.drop_duplicates(subset=duplicate_keys, keep="first", inplace=True)

In [19]:
# df_combined.to_csv("../data/processed/qa_master.csv", index=False)

In [28]:
from pymongo import MongoClient
from datetime import datetime, UTC

# MongoDB connection: local server -> `medimaven_db` database -> `medical_qa` collection
client = MongoClient("mongodb://localhost:27017/")
db = client.get_database("medimaven_db")
medical_qa_collection = db.get_collection("medical_qa")

In [22]:
# Convert to list of dictionaries for MongoDB insertion
# One dict per row, keyed by column name.
records = df_combined.to_dict(orient="records")
# Show only a small sample: echoing the whole list would write every record
# (tens of thousands of dicts) into the notebook output.
records[:2]

[{'Dataset': 'MedQuad',
  'focus': 'keratoderma with woolly hair',
  'synonyms': 'KWWH',
  'qtype': 'information',
  'question': 'What is are keratoderma with woolly hair ?',
  'context': 'KWWH keratoderma with woolly hair What is are keratoderma with woolly hair ?',
  'answer': 'Keratoderma with woolly hair is a group of related conditions that affect the skin and hair and in many cases increase the risk of potentially life threatening heart problems. People with these conditions have hair that is unusually coarse, dry, fine, and tightly curled. In some cases, the hair is also sparse. The woolly hair texture typically affects only scalp hair and is present from birth. Starting early in life, affected individuals also develop palmoplantar keratoderma, a condition that causes skin on the palms of the hands and the soles of the feet to become thick, scaly, and calloused. Cardiomyopathy, which is a disease of the heart muscle, is a life threatening health problem that can develop in peopl

In [20]:
import airflow
# Environment sanity check: report the installed Airflow version.
airflow_version = airflow.__version__
print("Airflow is installed, version:", airflow_version)


Airflow is installed, version: 2.7.2


In [30]:
# Take the timestamp once so that created_at == updated_at on every freshly
# created record and the whole batch shares the same insertion time.
batch_timestamp = datetime.now(UTC)
for record in records:
    record["tags"] = []  # Optional: tags can be generated later; a fresh list per record
    record["created_at"] = batch_timestamp
    record["updated_at"] = batch_timestamp

In [32]:
# insert_many rejects an empty list, so only insert when there is something to write.
if records:
    insert_result = medical_qa_collection.insert_many(records)
    inserted_count = len(insert_result.inserted_ids)
    print(f"Inserted {inserted_count} medical Q&A records into MongoDB.")

Inserted 64798 medical Q&A records into MongoDB.


In [37]:
import os
import subprocess

def ensure_mongodb_running():
    """Check whether a local MongoDB answers a ping and try to start it if not.

    The server is pinged through the ``mongosh`` CLI. If the ping fails, or
    ``mongosh`` cannot be found on PATH, ``brew services start mongodb-community``
    is run. Success is reported only when that command exits with status 0;
    otherwise a failure message is printed instead of a false "running" notice.

    Returns:
        None. The outcome is reported via printed messages.
    """
    ping_command = ["mongosh", "--eval", "db.runCommand({ ping: 1 })"]
    try:
        subprocess.run(
            ping_command,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            check=True,
        )
    except (subprocess.CalledProcessError, FileNotFoundError):
        # CalledProcessError: the ping failed. FileNotFoundError: mongosh itself is
        # missing, so the server state cannot be confirmed. Either way, try to start it.
        pass
    else:
        print("✅ MongoDB is already running.")
        return

    print("⚠️ MongoDB is NOT running. Attempting to start it...")
    try:
        start_result = subprocess.run(["brew", "services", "start", "mongodb-community"])
        started = start_result.returncode == 0
    except FileNotFoundError:
        # Homebrew is not installed / not on PATH.
        started = False

    if started:
        print("✅ MongoDB is running")
    else:
        print("❌ Could not start MongoDB via Homebrew. Start it manually before continuing.")

# Run this check before any cell that connects to MongoDB (it belongs ahead of the insert step)
ensure_mongodb_running()

⚠️ MongoDB is NOT running. Attempting to start it...
==> Successfully started `mongodb-community` (label: homebrew.mxcl.mongodb-community)
✅ MongoDB is running
