# Explore and Sample MedQuAD

In [76]:
import pandas as pd

## Load & Clean Data

- Dataset: MedQuAD 
    - 47,457 layperson medical question-answer pairs (the Hugging Face copy loaded below contains 47,441 rows)

In [77]:
# Read the single-file MedQuAD train parquet via the hf:// filesystem path
medquad_url = "hf://datasets/lavita/MedQuAD/data/train-00000-of-00001-e36383d177026d53.parquet"
df = pd.read_parquet(medquad_url)
print('Shape:', df.shape)
df.head()

Shape: (47441, 13)


Unnamed: 0,document_id,document_source,document_url,category,umls_cui,umls_semantic_types,umls_semantic_group,synonyms,question_id,question_focus,question_type,question,answer
0,559,GHR,https://ghr.nlm.nih.gov/condition/keratoderma-...,,C0343073,T047,Disorders,KWWH,0000559-1,keratoderma with woolly hair,information,What is (are) keratoderma with woolly hair ?,Keratoderma with woolly hair is a group of rel...
1,559,GHR,https://ghr.nlm.nih.gov/condition/keratoderma-...,,C0343073,T047,Disorders,KWWH,0000559-2,keratoderma with woolly hair,frequency,How many people are affected by keratoderma wi...,Keratoderma with woolly hair is rare; its prev...
2,559,GHR,https://ghr.nlm.nih.gov/condition/keratoderma-...,,C0343073,T047,Disorders,KWWH,0000559-3,keratoderma with woolly hair,genetic changes,What are the genetic changes related to kerato...,"Mutations in the JUP, DSP, DSC2, and KANK2 gen..."
3,559,GHR,https://ghr.nlm.nih.gov/condition/keratoderma-...,,C0343073,T047,Disorders,KWWH,0000559-4,keratoderma with woolly hair,inheritance,Is keratoderma with woolly hair inherited ?,Most cases of keratoderma with woolly hair hav...
4,559,GHR,https://ghr.nlm.nih.gov/condition/keratoderma-...,,C0343073,T047,Disorders,KWWH,0000559-5,keratoderma with woolly hair,treatment,What are the treatments for keratoderma with w...,These resources address the diagnosis or manag...


### Drop Missing Answers

In [78]:
# Tally the missing values in every column
df.isnull().sum()

document_id                5
document_source            0
document_url               0
category               15431
umls_cui               16024
umls_semantic_types    16066
umls_semantic_group    16024
synonyms               22772
question_id                0
question_focus            14
question_type              0
question                   0
answer                 31034
dtype: int64

In [79]:
# Keep only the rows that actually have an answer
df = df.dropna(subset=['answer'])
print('Shape:', df.shape)

Shape: (16407, 13)


### Drop Duplicate QAs

In [80]:
# Rows sharing both question and answer text count as duplicates;
# the first occurrence of each pair is the one retained
qa_cols = ['question', 'answer']
n_duplicates = df.duplicated(subset=qa_cols).sum()
print('Duplicates:', n_duplicates)
df = df.drop_duplicates(subset=qa_cols)
print('New Shape:', df.shape)

Duplicates: 48
New Shape: (16359, 13)


### Clean Up Questions and Answers

In [81]:
def get_max_length(df, column):
    """Return the highest whitespace-delimited word count among the strings in df[column]."""
    words_per_row = df[column].str.split().str.len()
    return words_per_row.max()

# Longest question and longest answer, measured in words
max_question_length, max_answer_length = (
    get_max_length(df, text_col) for text_col in ('question', 'answer')
)
print('Max question length:', max_question_length)
print('Max answer length:', max_answer_length)

Max question length: 27
Max answer length: 4281


In [82]:
# Word-count ceilings applied to the rows kept below
MAX_QUESTION_WORDS = 20
MAX_ANSWER_WORDS = 60

# Answer clean-up, in two passes:
#   1. Remove excess whitespace: collapse each run of non-newline whitespace
#      to a single space (line breaks are left alone), then trim the ends.
#   2. Remove question from answer: drop everything from the start of the
#      answer through the first "?" on its first line, plus trailing whitespace.
# NOTE(review): pass 2 fires on any "?" in the first line, whether or not the
# preceding text is the echoed question - TODO confirm no answer loses content.
answer_clean = (
    df["answer"]
    .str.replace(r"[^\S\r\n]+", " ", regex=True)
    .str.strip()
    .str.replace(r"^.*?\?\s*", "", regex=True)
    .str.strip()
)
# Remove space before question mark
question_clean = df["question"].str.replace(r"\s+\?", "?", regex=True)
# Build a new frame instead of writing columns in place
df = df.assign(question=question_clean, answer=answer_clean)

# Keep questions that are not too long
df = df[df['question'].str.split().str.len() <= MAX_QUESTION_WORDS]
# Store the result under the question variable so it is not left stale
# (it was previously written to max_answer_length by mistake)
max_question_length = get_max_length(df, 'question')
print('New max question length:', max_question_length)
# Keep answers that are not too long
df = df[df['answer'].str.split().str.len() <= MAX_ANSWER_WORDS]
max_answer_length = get_max_length(df, 'answer')
print('New max answer length:', max_answer_length)
print('Shape:', df.shape)

New max question length: 20
New max answer length: 60
Shape: (3373, 13)


### Drop Small question_type Groups

In [83]:
# Row count for each question type, most common first
df.loc[:, 'question_type'].value_counts()

question_type
frequency          1019
treatment           788
inheritance         641
information         197
outlook             197
research            148
causes              128
exams and tests      79
symptoms             63
susceptibility       47
considerations       30
prevention           24
complications         7
genetic changes       5
Name: count, dtype: int64

In [84]:
# Question types to discard; names that do not occur in df are simply ignored by isin
small_groups = ['stages', 'complications', 'support groups', 'prevention', 'considerations', 'genetic changes']
is_small_group = df['question_type'].isin(small_groups)
df = df.loc[~is_small_group]
print('Shape:', df.shape)

Shape: (3307, 13)


## Sample and Save QAs

In [85]:
def balanced_group_sample(df, group_cols, n_per_group):
    # Group by specified cols
    grouped = df.groupby(group_cols)
    # Sample n rows (or all, if size is less than n) per group
    balanced_sample = grouped.apply(
        lambda x: x.sample(n=min(n_per_group, len(x)), random_state=42),
        include_groups=False
    ).reset_index()
    return balanced_sample

# Cap every question_type at 750 rows
sampled_df = balanced_group_sample(df, ["question_type"], 750)

# Report the sample's dimensions and its per-type row counts
print('Shape:', sampled_df.shape)
sampled_df.question_type.value_counts()


Shape: (3000, 14)


question_type
frequency          750
treatment          750
inheritance        641
information        197
outlook            197
research           148
causes             128
exams and tests     79
symptoms            63
susceptibility      47
Name: count, dtype: int64

In [86]:
# Save sampled data to CSV (all columns)
sampled_df.to_csv('../data/medquad_sampled.csv', index=False)
# Save QA only to jsonl. The two-column subset gets its own name so that
# `sampled_df` keeps every column: re-running this cell then rewrites the same
# full CSV instead of overwriting it with a question/answer-only frame.
qa_df = sampled_df[['question', 'answer']]
qa_df.to_json('../data/medquad_sampled.jsonl', orient='records', lines=True)
qa_df

Unnamed: 0,question,answer
0,"What causes Periodic fever, aphthous stomatiti...","The cause of PFAPA is unknown, although viral ..."
1,What causes Prinzmetal's variant angina?,Prinzmetal's variant angina is caused by coron...
2,What causes Craniopharyngioma?,Craniopharyngiomas are thought to arise from e...
3,What causes Lemierre syndrome?,"In about 90% of cases, Lemierre syndrome is ca..."
4,What causes Childhood Brain and Spinal Cord Tu...,The cause of most childhood brain and spinal c...
...,...,...
2995,What are the treatments for Yellow nail syndrome?,You can find further information on treatment ...
2996,What are the treatments for Mitochondrial comp...,Treatment options for complex II deficiency ma...
2997,What are the treatments for Hereditary Spastic...,"There are no specific treatments to prevent, s..."
2998,What are the treatments for Vohwinkel syndrome?,These resources address the diagnosis or manag...
