<a href="https://colab.research.google.com/github/faithNassiwa/predictive-diagnosis-assistant/blob/main/models/Dataset2_Preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Preprocessing Training Dataset

In [19]:
import pandas as pd
import numpy as np


In [20]:
# Mount google drive to access folder with data
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [25]:
# Load training dataset
training_path = '/content/drive/MyDrive/DS5500/Data/Dataset2/release_train_patients'
training_df = pd.read_csv(training_path)
training_df.head()

Unnamed: 0,AGE,DIFFERENTIAL_DIAGNOSIS,SEX,PATHOLOGY,EVIDENCES,INITIAL_EVIDENCE
0,18,"[['Bronchitis', 0.19171203430383882], ['Pneumo...",M,URTI,"['E_48', 'E_50', 'E_53', 'E_54_@_V_161', 'E_54...",E_91
1,21,"[['HIV (initial infection)', 0.518950056440760...",M,HIV (initial infection),"['E_9', 'E_27', 'E_50', 'E_51', 'E_53', 'E_54_...",E_50
2,19,"[['Bronchitis', 0.11278064619119596], ['Pneumo...",F,Pneumonia,"['E_53', 'E_54_@_V_179', 'E_54_@_V_192', 'E_55...",E_77
3,34,"[['URTI', 0.23859396799565236], ['Cluster head...",F,URTI,"['E_48', 'E_53', 'E_54_@_V_183', 'E_55_@_V_89'...",E_53
4,36,"[['URTI', 0.23677812769175735], ['Influenza', ...",M,URTI,"['E_49', 'E_50', 'E_53', 'E_54_@_V_183', 'E_55...",E_201


In [21]:
import json

# Load evidence json file
evidence_path = '/content/drive/MyDrive/DS5500/Data/Dataset2/release_evidences.json'

with open(evidence_path) as f:
  evidence_dict = json.load(f)

print(evidence_dict)

{'E_91': {'name': 'E_91', 'code_question': 'E_91', 'question_fr': 'Avez-vous objectivé ou ressenti de la fièvre?', 'question_en': 'Do you have a fever (either felt or measured with a thermometer)?', 'is_antecedent': False, 'default_value': 0, 'value_meaning': {}, 'possible-values': [], 'data_type': 'B'}, 'E_55': {'name': 'E_55', 'code_question': 'E_53', 'question_fr': 'Avez-vous de la douleur quelque part?', 'question_en': 'Do you feel pain somewhere?', 'is_antecedent': False, 'default_value': 'V_123', 'value_meaning': {'V_123': {'fr': 'nulle part', 'en': 'nowhere'}, 'V_14': {'fr': 'aile iliaque(D)', 'en': 'iliac wing(R)'}, 'V_15': {'fr': 'aile iliaque(G)', 'en': 'iliac wing(L)'}, 'V_16': {'fr': 'aine(D)', 'en': 'groin(R)'}, 'V_17': {'fr': 'aine(G)', 'en': 'groin(L)'}, 'V_19': {'fr': 'aisselle(G)', 'en': 'axilla(L)'}, 'V_18': {'fr': 'aisselle(D)', 'en': 'axilla(R)'}, 'V_20': {'fr': 'amygdale(D)', 'en': 'tonsil(R)'}, 'V_21': {'fr': 'amygdale(G)', 'en': 'tonsil(L)'}, 'V_22': {'fr': 'anus

In [22]:
# Get evidence codes and corresponding questions
evidence_mapping = {}
for key, value in evidence_dict.items():
  evidence_mapping[key] = value['question_en']

print(evidence_mapping)


{'E_91': 'Do you have a fever (either felt or measured with a thermometer)?', 'E_55': 'Do you feel pain somewhere?', 'E_53': 'Do you have pain somewhere, related to your reason for consulting?', 'E_57': 'Does the pain radiate to another location?', 'E_54': 'Characterize your pain:', 'E_59': 'How fast did the pain appear?', 'E_56': 'How intense is the pain?', 'E_58': 'How precisely is the pain located?', 'E_159': 'Did you lose consciousness?', 'E_133': 'Where is the affected region located?', 'E_129': 'Do you have any lesions, redness or problems on your skin that you believe are related to the condition you are consulting for?', 'E_130': 'What color is the rash?', 'E_134': 'How intense is the pain caused by the rash?', 'E_132': 'Is the rash swollen?', 'E_136': 'How severe is the itching?', 'E_135': 'Is the lesion (or are the lesions) larger than 1cm?', 'E_131': 'Do your lesions peel off?', 'E_154': 'Is your skin much paler than usual?', 'E_155': 'Do you feel your heart is beating fast 

In [23]:
# Get evidence value codes and corresponding answers
value_mapping = {}
for key, value in evidence_dict.items():
  for k,v in value['value_meaning'].items():
    value_mapping[k] = v['en']

print(value_mapping)

{'V_123': 'nowhere', 'V_14': 'iliac wing(R)', 'V_15': 'iliac wing(L)', 'V_16': 'groin(R)', 'V_17': 'groin(L)', 'V_19': 'axilla(L)', 'V_18': 'axilla(R)', 'V_20': 'tonsil(R)', 'V_21': 'tonsil(L)', 'V_22': 'anus', 'V_26': 'back of the neck', 'V_25': 'back of head', 'V_30': 'biceps(R)', 'V_31': 'biceps(L)', 'V_32': 'mouth', 'V_33': 'thyroid cartilage', 'V_34': 'ankle(R)', 'V_35': 'ankle(L)', 'V_36': 'clitoris', 'V_37': 'coccyx', 'V_38': 'cervical spine', 'V_39': 'thoracic spine', 'V_40': 'lumbar spine', 'V_41': 'commissure(R)', 'V_42': 'commissure(L)', 'V_49': 'iliac crest(R)', 'V_50': 'iliac crest(L)', 'V_51': 'thigh(R)', 'V_52': 'thigh(L)', 'V_57': 'lower teeth(R)', 'V_58': 'lower teeth(L)', 'V_59': 'upper teeth(R)', 'V_60': 'upper teeth(L)', 'V_67': 'finger (index)(R)', 'V_68': 'finger (index)(L)', 'V_69': 'finger (middle)(R)', 'V_70': 'finger (middle)(L)', 'V_63': 'finger (ring finger)(R)', 'V_64': 'finger (ring finger)(L)', 'V_65': 'finger (little finger)(R)', 'V_66': 'finger (little 

In [None]:
# Extract key columns from training df
data_temp = training_df[['AGE', 'SEX', 'PATHOLOGY']].copy()
data_temp.head()


Unnamed: 0,AGE,SEX,PATHOLOGY
0,18,M,URTI
1,21,M,HIV (initial infection)
2,19,F,Pneumonia
3,34,F,URTI
4,36,M,URTI


In [26]:
# Preprocess training dataset -- Took 2 minutes

import re
import ast

data_dict = {}
for index, row in training_df.iterrows():
  data_entry = {}
  evidences = ast.literal_eval(row['EVIDENCES']) # convert str containing python object
  for evidence in evidences:
    if evidence in evidence_mapping: # evidence has no categorical value appended
      mapped_evidence = evidence_mapping[evidence]
      data_entry[mapped_evidence] = 1
    else:
      evidence_key_search = re.findall(r'E_\d+', evidence)
      value_key_search = re.findall(r'V_\d+', evidence)
      if evidence_key_search and value_key_search:
        evidence_key = evidence_key_search[0]
        value_key = value_key_search[0]
        column_name = evidence_mapping[evidence_key]
        data_entry[column_name] = value_mapping[value_key]
  data_dict[index] = data_entry


In [27]:
# Snippet of data_dict generated
data_dict[0]

{'Do you live with 4 or more people?': 1,
 'Have you had significantly increased sweating?': 1,
 'Do you have pain somewhere, related to your reason for consulting?': 1,
 'Characterize your pain:': 'heavy',
 'Do you feel pain somewhere?': 'temple(L)',
 'Does the pain radiate to another location?': 'nowhere',
 'Do you have a cough that produces colored or more abundant sputum than usual?': 1,
 'Do you smoke cigarettes?': 1,
 'Do you have a fever (either felt or measured with a thermometer)?': 1,
 'Do you have a sore throat?': 1,
 'Do you have a cough?': 1,
 'Have you traveled out of the country in the last 4 weeks?': 'N',
 'Are you exposed to secondhand cigarette smoke on a daily basis?': 1}

In [29]:
# Add data_dict to dataframe -- About a minute
data_temp2 = pd.DataFrame.from_dict(data_dict, orient='index')
data_temp2.head()


Unnamed: 0,Do you live with 4 or more people?,Have you had significantly increased sweating?,"Do you have pain somewhere, related to your reason for consulting?",Characterize your pain:,Do you feel pain somewhere?,Does the pain radiate to another location?,Do you have a cough that produces colored or more abundant sputum than usual?,Do you smoke cigarettes?,Do you have a fever (either felt or measured with a thermometer)?,Do you have a sore throat?,...,Have you breastfed one of your children for more than 9 months?,Have you felt confused or disorientated lately?,"In the last month, have you been in contact with anyone infected with the Ebola virus?",Have you noticed any unusual bleeding or bruising related to your consultation today?,Do you live in the suburbs?,Do you ever temporarily stop breathing while you’re asleep?,Do you have a decrease in appetite?,Does your mother suffer from asthma?,Do you live in a rural area?,Are you of Asian descent?
0,1.0,1.0,1.0,heavy,temple(L),nowhere,1.0,1.0,1.0,1.0,...,,,,,,,,,,
3,1.0,,1.0,heavy,temple(L),nowhere,,,,1.0,...,,,,,,,,,,
35,1.0,,1.0,burning,under the jaw,nowhere,,,1.0,,...,,,,,,,,,,
37,1.0,,1.0,heavy,temple(L),nowhere,,1.0,1.0,1.0,...,,,,,,,,,,
42,1.0,,1.0,burning,under the jaw,nowhere,,,1.0,,...,,,,,,,,,,


In [None]:
# Combine dataframes on index
resulting_df = data_temp.join(data_temp2)
resulting_df.head()


Unnamed: 0,AGE,SEX,PATHOLOGY,Do you live with 4 or more people?,Have you had significantly increased sweating?,"Do you have pain somewhere, related to your reason for consulting?",Characterize your pain:,Do you feel pain somewhere?,Does the pain radiate to another location?,Do you have a cough that produces colored or more abundant sputum than usual?,...,Have you breastfed one of your children for more than 9 months?,Have you felt confused or disorientated lately?,"In the last month, have you been in contact with anyone infected with the Ebola virus?",Have you noticed any unusual bleeding or bruising related to your consultation today?,Do you live in the suburbs?,Do you ever temporarily stop breathing while you’re asleep?,Do you have a decrease in appetite?,Does your mother suffer from asthma?,Do you live in a rural area?,Are you of Asian descent?
0,18,M,URTI,1.0,1.0,1.0,heavy,temple(L),nowhere,1.0,...,,,,,,,,,,
1,21,M,HIV (initial infection),,1.0,1.0,exhausting,temple(L),nowhere,,...,,,,,,,,,,
2,19,F,Pneumonia,,,1.0,sharp,posterior chest wall(L),nowhere,1.0,...,,,,,,,,,,
3,34,F,URTI,1.0,,1.0,heavy,temple(L),nowhere,,...,,,,,,,,,,
4,36,M,URTI,,1.0,1.0,heavy,temple(L),nowhere,,...,,,,,,,,,,


In [None]:
# Fill NaN cells with zeros
resulting_df_filled = resulting_df.fillna(0)
resulting_df_filled.head()

Unnamed: 0,AGE,SEX,PATHOLOGY,Do you live with 4 or more people?,Have you had significantly increased sweating?,"Do you have pain somewhere, related to your reason for consulting?",Characterize your pain:,Do you feel pain somewhere?,Does the pain radiate to another location?,Do you have a cough that produces colored or more abundant sputum than usual?,...,Have you breastfed one of your children for more than 9 months?,Have you felt confused or disorientated lately?,"In the last month, have you been in contact with anyone infected with the Ebola virus?",Have you noticed any unusual bleeding or bruising related to your consultation today?,Do you live in the suburbs?,Do you ever temporarily stop breathing while you’re asleep?,Do you have a decrease in appetite?,Does your mother suffer from asthma?,Do you live in a rural area?,Are you of Asian descent?
0,18,M,URTI,1.0,1.0,1.0,heavy,temple(L),nowhere,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,21,M,HIV (initial infection),0.0,1.0,1.0,exhausting,temple(L),nowhere,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,19,F,Pneumonia,0.0,0.0,1.0,sharp,posterior chest wall(L),nowhere,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,34,F,URTI,1.0,0.0,1.0,heavy,temple(L),nowhere,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,36,M,URTI,0.0,1.0,1.0,heavy,temple(L),nowhere,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Export dataframe to CSV -- Don't Re-Run
resulting_df_filled.to_csv('/content/drive/MyDrive/DS5500/Data/Dataset2/processed_train.csv')

In [None]:
# Get number of patient records / length of df
len(resulting_df_filled)

1025602

In [None]:
# Get unique pathologies / diagnoses
diagnosis_list = resulting_df_filled['PATHOLOGY'].unique()
print(diagnosis_list)
print(f'{len(diagnosis_list)} unique diagnoses')

['URTI' 'HIV (initial infection)' 'Pneumonia' 'Chronic rhinosinusitis'
 'Viral pharyngitis' 'Anemia' 'Atrial fibrillation' 'Allergic sinusitis'
 'Larygospasm' 'Cluster headache' 'Anaphylaxis' 'Spontaneous pneumothorax'
 'Acute pulmonary edema' 'Tuberculosis' 'Myasthenia gravis' 'Panic attack'
 'Scombroid food poisoning' 'Epiglottitis' 'Inguinal hernia' 'Boerhaave'
 'Pancreatic neoplasm' 'Bronchitis' 'SLE' 'Acute laryngitis'
 'Unstable angina' 'Bronchiectasis' 'Possible NSTEMI / STEMI' 'Chagas'
 'Localized edema' 'Sarcoidosis' 'Spontaneous rib fracture' 'GERD'
 'Bronchospasm / acute asthma exacerbation'
 'Acute COPD exacerbation / infection' 'Guillain-Barré syndrome'
 'Influenza' 'Pulmonary embolism' 'Stable angina' 'Pericarditis'
 'Acute rhinosinusitis' 'Whooping cough' 'Myocarditis'
 'Acute dystonic reactions' 'Pulmonary neoplasm' 'Acute otitis media'
 'PSVT' 'Croup' 'Ebola' 'Bronchiolitis']
49 unique diagnoses


In [6]:
# Read the training dataset
df = pd.read_csv('/content/drive/MyDrive/DS5500/Data/Dataset2/processed_train.csv', low_memory=False)
df.head()

Unnamed: 0.1,Unnamed: 0,AGE,SEX,PATHOLOGY,Do you live with 4 or more people?,Have you had significantly increased sweating?,"Do you have pain somewhere, related to your reason for consulting?",Characterize your pain:,Do you feel pain somewhere?,Does the pain radiate to another location?,...,Have you breastfed one of your children for more than 9 months?,Have you felt confused or disorientated lately?,"In the last month, have you been in contact with anyone infected with the Ebola virus?",Have you noticed any unusual bleeding or bruising related to your consultation today?,Do you live in the suburbs?,Do you ever temporarily stop breathing while you’re asleep?,Do you have a decrease in appetite?,Does your mother suffer from asthma?,Do you live in a rural area?,Are you of Asian descent?
0,0,18,M,URTI,1.0,1.0,1.0,heavy,temple(L),nowhere,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,21,M,HIV (initial infection),0.0,1.0,1.0,exhausting,temple(L),nowhere,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,19,F,Pneumonia,0.0,0.0,1.0,sharp,posterior chest wall(L),nowhere,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,34,F,URTI,1.0,0.0,1.0,heavy,temple(L),nowhere,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,36,M,URTI,0.0,1.0,1.0,heavy,temple(L),nowhere,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
from pprint import pprint
columns_list = df.columns.to_list()
pprint(columns_list)

['Unnamed: 0',
 'AGE',
 'SEX',
 'PATHOLOGY',
 'Do you live with 4 or more people?',
 'Have you had significantly increased sweating?',
 'Do you have pain somewhere, related to your reason for consulting?',
 'Characterize your pain:',
 'Do you feel pain somewhere?',
 'Does the pain radiate to another location?',
 'Do you have a cough that produces colored or more abundant sputum than '
 'usual?',
 'Do you smoke cigarettes?',
 'Do you have a fever (either felt or measured with a thermometer)?',
 'Do you have a sore throat?',
 'Do you have a cough?',
 'Have you traveled out of the country in the last 4 weeks?',
 'Are you exposed to secondhand cigarette smoke on a daily basis?',
 'Do you have swollen or painful lymph nodes?',
 'Have you ever had a sexually transmitted infection?',
 'Have you had diarrhea or an increase in stool frequency?',
 'Have you had unprotected sex with more than one partner in the last 6 '
 'months?',
 'Do you have any lesions, redness or problems on your skin that 

# Preprocessing Test Dataset and Validate Dataset

In [50]:
import re
import ast
def preprocess_df(df, evidence_mapping, value_mapping):
  """
  Inputs:
    df: English version of the New Dataset for Automatic Medical Diagnosis
    evidence_mapping: dictionary containing evidence code mapped to question_en
    value_mapping: dictionary containing value code mapped to answer_en
  Outputs:
    data_dict: dictionary containing processed data
  """
  data_dict = {}
  for index, row in df.iterrows():
    data_entry = {}
    evidences = ast.literal_eval(row['EVIDENCES']) # convert str containing python object
    for evidence in evidences:
      if evidence in evidence_mapping: # evidence has no categorical value appended
        mapped_evidence = evidence_mapping[evidence]
        data_entry[mapped_evidence] = 1
      else:
        evidence_key_search = re.findall(r'E_\d+', evidence)
        value_key_search = re.findall(r'V_\d+', evidence)
        if evidence_key_search and value_key_search:
          evidence_key = evidence_key_search[0]
          value_key = value_key_search[0]
          column_name = evidence_mapping[evidence_key]
          data_entry[column_name] = value_mapping[value_key]
    data_dict[index] = data_entry
  return data_dict

## Testing Dataset

In [51]:
test_df = pd.read_csv('/content/drive/MyDrive/DS5500/Data/Dataset2/release_test_patients')
test_df.head()


Unnamed: 0,AGE,DIFFERENTIAL_DIAGNOSIS,SEX,PATHOLOGY,EVIDENCES,INITIAL_EVIDENCE
0,49,"[['Bronchitis', 0.20230062181160519], ['GERD',...",F,GERD,"['E_53', 'E_54_@_V_112', 'E_54_@_V_161', 'E_54...",E_201
1,2,"[['Bronchospasm / acute asthma exacerbation', ...",M,Bronchitis,"['E_53', 'E_54_@_V_181', 'E_55_@_V_55', 'E_55_...",E_53
2,49,"[['Acute dystonic reactions', 0.62670508481658...",M,Acute dystonic reactions,"['E_15', 'E_128', 'E_147', 'E_168', 'E_172', '...",E_128
3,64,"[['Bronchitis', 0.2748608320637265], ['Acute l...",M,Acute laryngitis,"['E_48', 'E_49', 'E_53', 'E_54_@_V_181', 'E_55...",E_53
4,70,"[['URTI', 0.21257615919851483], ['Influenza', ...",F,URTI,"['E_41', 'E_50', 'E_53', 'E_54_@_V_161', 'E_54...",E_201


In [66]:
# Preprocess Test Dataset - Took about 20 seconds
data_dict = preprocess_df(test_df, evidence_mapping, value_mapping) # Took about 20 secs
data_dict[0]

{'Do you have pain somewhere, related to your reason for consulting?': 1,
 'Characterize your pain:': 'burning',
 'Do you feel pain somewhere?': 'hypochondrium(R)',
 'Does the pain radiate to another location?': 'upper chest',
 'Are you significantly overweight compared to people of the same height as you?': 1,
 'Do you drink alcohol excessively or do you have an addiction to alcohol?': 1,
 'Do you have a hiatal hernia?': 1,
 'Have you recently had stools that were black (like coal)?': 1,
 'Do you think you are pregnant or are you currently pregnant?': 1,
 'Do you have a burning sensation that starts in your stomach then goes up into your throat, and can be associated with a bitter taste in your mouth?': 1,
 'Do you have a cough?': 1,
 'Have you traveled out of the country in the last 4 weeks?': 'N',
 'Are your symptoms worse when lying down and alleviated while sitting up?': 1}

In [53]:
# Extract key columns from training df
data_temp = test_df[['AGE', 'SEX', 'PATHOLOGY']].copy()
data_temp.head()

Unnamed: 0,AGE,SEX,PATHOLOGY
0,49,F,GERD
1,2,M,Bronchitis
2,49,M,Acute dystonic reactions
3,64,M,Acute laryngitis
4,70,F,URTI


In [54]:
# Add data_dict to dataframe
data_temp2 = pd.DataFrame.from_dict(data_dict, orient='index')
data_temp2.head()

Unnamed: 0,"Do you have pain somewhere, related to your reason for consulting?",Characterize your pain:,Do you feel pain somewhere?,Does the pain radiate to another location?,Are you significantly overweight compared to people of the same height as you?,Do you drink alcohol excessively or do you have an addiction to alcohol?,Do you have a hiatal hernia?,Have you recently had stools that were black (like coal)?,Do you think you are pregnant or are you currently pregnant?,"Do you have a burning sensation that starts in your stomach then goes up into your throat, and can be associated with a bitter taste in your mouth?",...,Have you been unintentionally losing weight or have you lost your appetite?,Did you vomit after coughing?,Are you of Asian descent?,Have you been in contact with someone who has had pertussis (whoooping cough)?,Do you wheeze while inhaling or is your breathing noisy after coughing spells?,Do you have a decrease in appetite?,Do you live in a rural area?,Do you live in the suburbs?,Does your mother suffer from asthma?,Do you ever temporarily stop breathing while you’re asleep?
0,1.0,burning,hypochondrium(R),upper chest,1.0,1.0,1.0,1.0,1.0,1.0,...,,,,,,,,,,
1,1.0,burning,pharynx,nowhere,,,,,,,...,,,,,,,,,,
3,1.0,burning,pharynx,nowhere,,,,,,,...,,,,,,,,,,
4,1.0,heavy,occiput,nowhere,,,,,,,...,,,,,,,,,,
5,1.0,sensitive,temple(L),nowhere,,,,,,,...,,,,,,,,,,


In [55]:
# Combine dataframes on index
resulting_df = data_temp.join(data_temp2)
resulting_df.head()

Unnamed: 0,AGE,SEX,PATHOLOGY,"Do you have pain somewhere, related to your reason for consulting?",Characterize your pain:,Do you feel pain somewhere?,Does the pain radiate to another location?,Are you significantly overweight compared to people of the same height as you?,Do you drink alcohol excessively or do you have an addiction to alcohol?,Do you have a hiatal hernia?,...,Have you been unintentionally losing weight or have you lost your appetite?,Did you vomit after coughing?,Are you of Asian descent?,Have you been in contact with someone who has had pertussis (whoooping cough)?,Do you wheeze while inhaling or is your breathing noisy after coughing spells?,Do you have a decrease in appetite?,Do you live in a rural area?,Do you live in the suburbs?,Does your mother suffer from asthma?,Do you ever temporarily stop breathing while you’re asleep?
0,49,F,GERD,1.0,burning,hypochondrium(R),upper chest,1.0,1.0,1.0,...,,,,,,,,,,
1,2,M,Bronchitis,1.0,burning,pharynx,nowhere,,,,...,,,,,,,,,,
2,49,M,Acute dystonic reactions,,,,,,,,...,,,,,,,,,,
3,64,M,Acute laryngitis,1.0,burning,pharynx,nowhere,,,,...,,,,,,,,,,
4,70,F,URTI,1.0,heavy,occiput,nowhere,,,,...,,,,,,,,,,


In [56]:
# Fill NaN cells with zeros
resulting_df_filled = resulting_df.fillna(0)
resulting_df_filled.head()

Unnamed: 0,AGE,SEX,PATHOLOGY,"Do you have pain somewhere, related to your reason for consulting?",Characterize your pain:,Do you feel pain somewhere?,Does the pain radiate to another location?,Are you significantly overweight compared to people of the same height as you?,Do you drink alcohol excessively or do you have an addiction to alcohol?,Do you have a hiatal hernia?,...,Have you been unintentionally losing weight or have you lost your appetite?,Did you vomit after coughing?,Are you of Asian descent?,Have you been in contact with someone who has had pertussis (whoooping cough)?,Do you wheeze while inhaling or is your breathing noisy after coughing spells?,Do you have a decrease in appetite?,Do you live in a rural area?,Do you live in the suburbs?,Does your mother suffer from asthma?,Do you ever temporarily stop breathing while you’re asleep?
0,49,F,GERD,1.0,burning,hypochondrium(R),upper chest,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,M,Bronchitis,1.0,burning,pharynx,nowhere,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,49,M,Acute dystonic reactions,0.0,0,0,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,64,M,Acute laryngitis,1.0,burning,pharynx,nowhere,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,70,F,URTI,1.0,heavy,occiput,nowhere,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [57]:
# Export dataframe to CSV
resulting_df_filled.to_csv('/content/drive/MyDrive/DS5500/Data/Dataset2/processed_test.csv')

In [58]:
len(resulting_df_filled)

134529

## Validate Dataset

In [49]:
validate_df = pd.read_csv('/content/drive/MyDrive/DS5500/Data/Dataset2/release_validate_patients')
validate_df.head()

Unnamed: 0,AGE,DIFFERENTIAL_DIAGNOSIS,SEX,PATHOLOGY,EVIDENCES,INITIAL_EVIDENCE
0,55,"[['Anemia', 0.25071110167158567], ['Atrial fib...",F,Anemia,"['E_7', 'E_24', 'E_26', 'E_53', 'E_54_@_V_180'...",E_154
1,10,"[['Guillain-Barré syndrome', 0.135558991316712...",F,Panic attack,"['E_16', 'E_29', 'E_50', 'E_53', 'E_54_@_V_182...",E_171
2,68,"[['Influenza', 0.1900250899717378], ['Viral ph...",F,Influenza,"['E_50', 'E_53', 'E_54_@_V_183', 'E_54_@_V_198...",E_53
3,13,"[['Anemia', 0.18697604010451876], ['Atrial fib...",M,Anemia,"['E_7', 'E_24', 'E_26', 'E_53', 'E_54_@_V_180'...",E_53
4,48,"[['Boerhaave', 1.0]]",M,Boerhaave,"['E_53', 'E_54_@_V_71', 'E_54_@_V_112', 'E_54_...",E_53


In [67]:
# Preprocess Validate Dataset -- Took about 20 seconds
data_dict = preprocess_df(validate_df, evidence_mapping, value_mapping)
data_dict[0]

{'Do you have a poor diet?': 1,
 'Have you ever had a diagnosis of anemia?': 1,
 'Do you have any family members who have been diagnosed with anemia?': 1,
 'Do you have pain somewhere, related to your reason for consulting?': 1,
 'Characterize your pain:': 'a cramp',
 'Do you feel pain somewhere?': 'temple(L)',
 'Does the pain radiate to another location?': 'nowhere',
 'Do you feel slightly dizzy or lightheaded?': 1,
 'Do you feel lightheaded and dizzy or do you feel like you are about to faint?': 1,
 'Do you feel so tired that you are unable to do your usual activities or are you stuck in your bed all day long?': 1,
 'Do you constantly feel fatigued or do you have non-restful sleep?': 1,
 'Do you have chronic kidney failure?': 1,
 'Have you recently had stools that were black (like coal)?': 1,
 'Are you taking any new oral anticoagulants ((NOACs)?': 1,
 'Is your skin much paler than usual?': 1,
 'Have you traveled out of the country in the last 4 weeks?': 'South East Asia',
 'Is your 

In [60]:
# Extract key columns from training df
data_temp = validate_df[['AGE', 'SEX', 'PATHOLOGY']].copy()
data_temp.head()

Unnamed: 0,AGE,SEX,PATHOLOGY
0,55,F,Anemia
1,10,F,Panic attack
2,68,F,Influenza
3,13,M,Anemia
4,48,M,Boerhaave


In [61]:
# Add data_dict to dataframe
data_temp2 = pd.DataFrame.from_dict(data_dict, orient='index')
data_temp2.head()


Unnamed: 0,Do you have a poor diet?,Have you ever had a diagnosis of anemia?,Do you have any family members who have been diagnosed with anemia?,"Do you have pain somewhere, related to your reason for consulting?",Characterize your pain:,Do you feel pain somewhere?,Does the pain radiate to another location?,Do you feel slightly dizzy or lightheaded?,Do you feel lightheaded and dizzy or do you feel like you are about to faint?,Do you feel so tired that you are unable to do your usual activities or are you stuck in your bed all day long?,...,Do you have a decrease in appetite?,Do you wheeze while inhaling or is your breathing noisy after coughing spells?,Have you breastfed one of your children for more than 9 months?,Have you felt confused or disorientated lately?,"In the last month, have you been in contact with anyone infected with the Ebola virus?",Have you noticed any unusual bleeding or bruising related to your consultation today?,Does your mother suffer from asthma?,Do you live in a rural area?,Do you live in the suburbs?,Do you ever temporarily stop breathing while you’re asleep?
0,1.0,1.0,1.0,1.0,a cramp,temple(L),nowhere,1.0,1.0,1.0,...,,,,,,,,,,
3,1.0,1.0,1.0,1.0,tugging,temple(L),nowhere,1.0,1.0,,...,,,,,,,,,,
35,1.0,1.0,1.0,1.0,exhausting,forehead,nowhere,,1.0,1.0,...,,,,,,,,,,
54,1.0,1.0,,1.0,exhausting,forehead,nowhere,1.0,1.0,1.0,...,,,,,,,,,,
99,1.0,,1.0,1.0,tugging,temple(R),nowhere,,1.0,,...,,,,,,,,,,


In [62]:
# Combine dataframes on index
resulting_df = data_temp.join(data_temp2)
resulting_df.head()

Unnamed: 0,AGE,SEX,PATHOLOGY,Do you have a poor diet?,Have you ever had a diagnosis of anemia?,Do you have any family members who have been diagnosed with anemia?,"Do you have pain somewhere, related to your reason for consulting?",Characterize your pain:,Do you feel pain somewhere?,Does the pain radiate to another location?,...,Do you have a decrease in appetite?,Do you wheeze while inhaling or is your breathing noisy after coughing spells?,Have you breastfed one of your children for more than 9 months?,Have you felt confused or disorientated lately?,"In the last month, have you been in contact with anyone infected with the Ebola virus?",Have you noticed any unusual bleeding or bruising related to your consultation today?,Does your mother suffer from asthma?,Do you live in a rural area?,Do you live in the suburbs?,Do you ever temporarily stop breathing while you’re asleep?
0,55,F,Anemia,1.0,1.0,1.0,1.0,a cramp,temple(L),nowhere,...,,,,,,,,,,
1,10,F,Panic attack,,,,1.0,a cramp,breast(R),nowhere,...,,,,,,,,,,
2,68,F,Influenza,,,,1.0,exhausting,pharynx,nowhere,...,,,,,,,,,,
3,13,M,Anemia,1.0,1.0,1.0,1.0,tugging,temple(L),nowhere,...,,,,,,,,,,
4,48,M,Boerhaave,,,,1.0,sickening,epigastric,scapula(R),...,,,,,,,,,,


In [63]:
# Fill NaN cells with zeros
resulting_df_filled = resulting_df.fillna(0)
resulting_df_filled.head()

Unnamed: 0,AGE,SEX,PATHOLOGY,Do you have a poor diet?,Have you ever had a diagnosis of anemia?,Do you have any family members who have been diagnosed with anemia?,"Do you have pain somewhere, related to your reason for consulting?",Characterize your pain:,Do you feel pain somewhere?,Does the pain radiate to another location?,...,Do you have a decrease in appetite?,Do you wheeze while inhaling or is your breathing noisy after coughing spells?,Have you breastfed one of your children for more than 9 months?,Have you felt confused or disorientated lately?,"In the last month, have you been in contact with anyone infected with the Ebola virus?",Have you noticed any unusual bleeding or bruising related to your consultation today?,Does your mother suffer from asthma?,Do you live in a rural area?,Do you live in the suburbs?,Do you ever temporarily stop breathing while you’re asleep?
0,55,F,Anemia,1.0,1.0,1.0,1.0,a cramp,temple(L),nowhere,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,10,F,Panic attack,0.0,0.0,0.0,1.0,a cramp,breast(R),nowhere,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,68,F,Influenza,0.0,0.0,0.0,1.0,exhausting,pharynx,nowhere,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,13,M,Anemia,1.0,1.0,1.0,1.0,tugging,temple(L),nowhere,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,48,M,Boerhaave,0.0,0.0,0.0,1.0,sickening,epigastric,scapula(R),...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [64]:
# Export dataframe to CSV
resulting_df_filled.to_csv('/content/drive/MyDrive/DS5500/Data/Dataset2/processed_validate.csv')

In [65]:
len(resulting_df_filled)


132448