# **Data Processing**

https://huggingface.co/datasets/gretelai/symptom_to_diagnosis


## Download the datasets

In [1]:
import pandas as pd

splits = {'train': 'train.jsonl', 'test': 'test.jsonl'}
df1 = pd.read_json("hf://datasets/gretelai/symptom_to_diagnosis/" + splits["train"], lines=True)

splits = {'train': 'train.jsonl', 'test': 'test.jsonl'}
df2 = pd.read_json("hf://datasets/gretelai/symptom_to_diagnosis/" + splits["test"], lines=True)

## Data Exploration

In [2]:
# check the diseases
df = pd.concat([df1, df2], ignore_index = True)
df["output_text"].unique()

array(['cervical spondylosis', 'impetigo', 'urinary tract infection',
       'arthritis', 'dengue', 'common cold', 'drug reaction',
       'fungal infection', 'malaria', 'allergy', 'bronchial asthma',
       'varicose veins', 'migraine', 'hypertension',
       'gastroesophageal reflux disease', 'pneumonia', 'psoriasis',
       'diabetes', 'jaundice', 'chicken pox', 'typhoid',
       'peptic ulcer disease'], dtype=object)

In [3]:
# check the number of diseases in the dataset
len(df["output_text"].unique())

22

In [4]:
# a preview of the dataset
df.head()

Unnamed: 0,output_text,input_text
0,cervical spondylosis,I've been having a lot of pain in my neck and ...
1,impetigo,I have a rash on my face that is getting worse...
2,urinary tract infection,I have been urinating blood. I sometimes feel ...
3,arthritis,I have been having trouble with my muscles and...
4,dengue,I have been feeling really sick. My body hurts...


## Data Cleaning

In [5]:
# check for the missing values
print(df.isnull().values.any())

False


In [6]:
# Split input_text into individual sentences for easier manipulation and extraction of information.
split_sentence = df["input_text"].str.split(r"\. ", expand = True)
split_sentence.columns = [f'sentence_{i+1}' for i in range(split_sentence.shape[1])]

# Add a period to all sentences that are not the last and do not already end with a period
for col in split_sentence.columns[:-1]:  # Exclude the last column
    split_sentence[col] = split_sentence[col].apply(lambda x: x if pd.isnull(x) or x.endswith('.') else x + '.')

# Ensure the last column does not have an extra period if it's already there
last_col = split_sentence.columns[-1]
split_sentence[last_col] = split_sentence[last_col].apply(lambda x: x if pd.isnull(x) else x.rstrip('.') + '.')

In [7]:
df_split = pd.concat([df, split_sentence], axis=1)
df_split

Unnamed: 0,output_text,input_text,sentence_1,sentence_2,sentence_3,sentence_4,sentence_5,sentence_6,sentence_7
0,cervical spondylosis,I've been having a lot of pain in my neck and ...,I've been having a lot of pain in my neck and ...,I've also been having trouble with my balance ...,I've been coughing a lot and my limbs feel weak.,,,,
1,impetigo,I have a rash on my face that is getting worse...,I have a rash on my face that is getting worse.,"It is red, inflamed, and has blisters that are...",It is really painful.,,,,
2,urinary tract infection,I have been urinating blood. I sometimes feel ...,I have been urinating blood.,I sometimes feel sick to my stomach when I uri...,I often feel like I have a fever.,,,,
3,arthritis,I have been having trouble with my muscles and...,I have been having trouble with my muscles and...,My neck is really tight and my muscles feel weak.,I have swollen joints and it is hard to move a...,It is also really uncomfortable to walk.,,,
4,dengue,I have been feeling really sick. My body hurts...,I have been feeling really sick.,My body hurts a lot and I have no appetite.,I have also developed rashes on my arms and face.,The back of my eyes hurt a lot.,,,
...,...,...,...,...,...,...,...,...,...
1060,dengue,I have been experiencing muscle pain that make...,I have been experiencing muscle pain that make...,I have lost my apetite and feel vomiting.,My legs and back pain a lot.,I have been feeling very weak and tired.,.,,
1061,psoriasis,"I have red, irritated skin on my arms, face, a...","I have red, irritated skin on my arms, face, a...",It's often itchy and uncomfortable.,My nails are also inflamed and have small dent...,I've never seen anything like this before.,,,
1062,bronchial asthma,"I've been having a hard time breathing, and I'...","I've been having a hard time breathing, and I'...",I'm also feeling really tired and weak.,,,,,
1063,bronchial asthma,I've been coughing a lot for a few days now. I...,I've been coughing a lot for a few days now.,"It's been hard for me to catch my breath, and ...",I've also been producing a lot of mucus when I...,,,,


## Create the test set for the evaluation

In [8]:
df_more_than_2_sentence = df_split[df_split.iloc[:, 2:].notnull().sum(axis=1)>2]

In [9]:
df_more_than_2_sentence["output_text"].unique()

array(['cervical spondylosis', 'impetigo', 'urinary tract infection',
       'arthritis', 'dengue', 'common cold', 'malaria', 'allergy',
       'bronchial asthma', 'migraine', 'drug reaction',
       'gastroesophageal reflux disease', 'pneumonia', 'psoriasis',
       'diabetes', 'jaundice', 'chicken pox', 'typhoid',
       'peptic ulcer disease', 'varicose veins', 'fungal infection',
       'hypertension'], dtype=object)

In [10]:
len(df_more_than_2_sentence["output_text"].unique())

22

In [11]:
df_more_than_2_sentence.groupby("output_text")["output_text"].count()

output_text
allergy                            34
arthritis                          30
bronchial asthma                   37
cervical spondylosis               11
chicken pox                        25
common cold                        47
dengue                             32
diabetes                           25
drug reaction                      22
fungal infection                   15
gastroesophageal reflux disease    22
hypertension                       11
impetigo                           24
jaundice                           40
malaria                            46
migraine                            9
peptic ulcer disease               32
pneumonia                          46
psoriasis                          29
typhoid                            32
urinary tract infection            29
varicose veins                     20
Name: output_text, dtype: int64

In [13]:
df_sampled = df_more_than_2_sentence.groupby("output_text", group_keys=False)\
                                    .sample(n=3, replace=True, random_state=1)

df_sampled

Unnamed: 0,output_text,input_text,sentence_1,sentence_2,sentence_3,sentence_4,sentence_5,sentence_6,sentence_7
294,allergy,"My skin is itchy and red. Sometimes, it can pe...",My skin is itchy and red.,"Sometimes, it can peel.","My lips and cheeks swell, and it is really ann...",I sometimes get headaches and watery eyes beca...,,,
173,allergy,I have a sore throat and I am sneezing all the...,I have a sore throat and I am sneezing all the...,"Sometimes my face swells up, especially my lip...",I can't stop sneezing once I start.,,,,
199,allergy,"I feel sick to my stomach, lightheaded, and di...","I feel sick to my stomach, lightheaded, and di...",My throat is swollen and I can't breathe well.,I sometimes feel pain in my chest and nausea a...,,,,
455,arthritis,I've been feeling really weak in my muscles an...,I've been feeling really weak in my muscles an...,My joints have been swelling up and it's hard ...,Walking has been really painful too.,,,,
125,arthritis,"I've been having a lot of pain in my joints, e...","I've been having a lot of pain in my joints, e...",It's hard to move around and I'm always feelin...,I've also been having trouble sleeping because...,,,,
...,...,...,...,...,...,...,...,...,...
941,urinary tract infection,I've been having a hard time peeing. It hurts ...,I've been having a hard time peeing.,"It hurts when I go, and it's bloody.","My head hurts, and my urine smells really bad.","I can't control when I have to go, and it's re...",,,
444,urinary tract infection,"I've been having to pee a lot, and it hurts wh...","I've been having to pee a lot, and it hurts wh...","Sometimes I see blood in my pee, and it smells...",I'm worried that I have an infection.,What should I do?.,,,
231,varicose veins,I have been having cramps in my calves when I ...,I have been having cramps in my calves when I ...,I have also noticed bruise marks on my calves.,I feel tired very soon.,,,,
489,varicose veins,I have some red and inflamed skin on my legs. ...,I have some red and inflamed skin on my legs.,I think I can see some of the swollen blood ve...,I'm really worried about it.,,,,


In [14]:
# add "no more information" after the last sentence for each row
for index, row in df_sampled.iterrows():
    has_null = False
    for col in df_sampled.columns[3:]:
        if pd.isnull(row[col]):
            df_sampled.at[index, col] = "no more information"
            has_null = True
            break
            
    if not has_null:
        # If no null columns found, add a new column for this row
        next_col_num = len([col for col in df_sampled.columns if col.startswith('sentence_')]) + 1
        new_col = f'sentence_{next_col_num}'
        if new_col not in df_sampled.columns:
            df_sampled[new_col] = None
        df_sampled.at[index, new_col] = "no more information"

df_sampled

Unnamed: 0,output_text,input_text,sentence_1,sentence_2,sentence_3,sentence_4,sentence_5,sentence_6,sentence_7,sentence_8
294,allergy,"My skin is itchy and red. Sometimes, it can pe...",My skin is itchy and red.,"Sometimes, it can peel.","My lips and cheeks swell, and it is really ann...",I sometimes get headaches and watery eyes beca...,no more information,,,
173,allergy,I have a sore throat and I am sneezing all the...,I have a sore throat and I am sneezing all the...,"Sometimes my face swells up, especially my lip...",I can't stop sneezing once I start.,no more information,,,,
199,allergy,"I feel sick to my stomach, lightheaded, and di...","I feel sick to my stomach, lightheaded, and di...",My throat is swollen and I can't breathe well.,I sometimes feel pain in my chest and nausea a...,no more information,,,,
455,arthritis,I've been feeling really weak in my muscles an...,I've been feeling really weak in my muscles an...,My joints have been swelling up and it's hard ...,Walking has been really painful too.,no more information,,,,
125,arthritis,"I've been having a lot of pain in my joints, e...","I've been having a lot of pain in my joints, e...",It's hard to move around and I'm always feelin...,I've also been having trouble sleeping because...,no more information,,,,
...,...,...,...,...,...,...,...,...,...,...
941,urinary tract infection,I've been having a hard time peeing. It hurts ...,I've been having a hard time peeing.,"It hurts when I go, and it's bloody.","My head hurts, and my urine smells really bad.","I can't control when I have to go, and it's re...",no more information,,,
444,urinary tract infection,"I've been having to pee a lot, and it hurts wh...","I've been having to pee a lot, and it hurts wh...","Sometimes I see blood in my pee, and it smells...",I'm worried that I have an infection.,What should I do?.,no more information,,,
231,varicose veins,I have been having cramps in my calves when I ...,I have been having cramps in my calves when I ...,I have also noticed bruise marks on my calves.,I feel tired very soon.,no more information,,,,
489,varicose veins,I have some red and inflamed skin on my legs. ...,I have some red and inflamed skin on my legs.,I think I can see some of the swollen blood ve...,I'm really worried about it.,no more information,,,,


In [33]:
df_sampled.to_csv("sampled_df.csv", index = False)