In [2]:
import tensorflow as tf
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler, MultiLabelBinarizer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
import re
from sklearn.linear_model import LinearRegression
from fancyimpute import IterativeImputer
from sklearn.neighbors import NearestNeighbors
from sklearn.impute import KNNImputer
from sklearn.metrics import confusion_matrix, accuracy_score

In [2]:
# Widen the notebook container to 90% of the browser window.
# `display` and `HTML` come from the public IPython.display module; importing
# `display` from IPython.core.display is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

In [3]:
# Show the TensorFlow version used by this kernel
tf.__version__

'2.2.0'

### Import Dataset

In [3]:
# Render every column when a frame is displayed (no truncation of wide tables)
pd.set_option('display.max_columns', None)

# Read the raw line-list records and preview the first rows
dataset = pd.read_csv('Covid19_LineList_Records.csv')
dataset.head()

Unnamed: 0.1,Unnamed: 0,age,gender,caseConfirmationDate,livesInGroundZero,traveledToGroundZero,recordSource,lineListSource,hasTravelHistory,id,location.id,locationType,groundZeroExposure,notes,relevantTravelHistoryLocation,sequenceAvailable,symptomStartDate,hospitalAdmissionDate,relevantTravelHistoryDates,outcome,didDie,didRecover,outcomeDate,symptoms,ageRange,chronicDisease
0,0,27.0,female,2020-04-04T00:00:00Z,False,False,https://www.prensalibre.com/guatemala/comunita...,OPEN,False,00002e80-d790-4d77-aa02-7da86338c41b,GuatemalaCity_GuatemalaCity_Guatemala,,,,,,,,,,,,,,,
1,1,,,2020-03-26T00:00:00Z,False,False,,OPEN,False,00004cec-61fe-43a0-b4c4-cbb2a2c4e932,NewcastleuponTyneTyneandWear_England_UnitedKin...,,,,,,,,,,,,,,,
2,2,,,2020-03-25T00:00:00Z,False,False,https://coronavirus.health.ny.gov/county-count...,OPEN,False,00009339-ab52-4de7-afad-3a8ca35f700e,NewYork_UnitedStates,,,,,,,,,,,,,,,
3,3,,,2020-03-22T00:00:00Z,False,False,https://coronavirus.health.ny.gov/county-count...,OPEN,False,0000ef82-6365-4ee6-bb44-00b8575dbf3c,NewYorkCity_NewYork_UnitedStates,,,,,,,,,,,,,,,
4,4,,,2020-03-29T00:00:00Z,False,False,https://www.mercurynews.com/2020/03/20/map-cor...,OPEN,False,000174f4-8b80-4832-b7d5-333e4a3ccfcf,California_UnitedStates,,,,,,,,,,,,,,,


In [4]:
# Summarise the raw frame: column names, non-null counts and dtypes
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 266871 entries, 0 to 266870
Data columns (total 26 columns):
 #   Column                         Non-Null Count   Dtype  
---  ------                         --------------   -----  
 0   Unnamed: 0                     266871 non-null  int64  
 1   age                            11959 non-null   float64
 2   gender                         15510 non-null   object 
 3   caseConfirmationDate           263353 non-null  object 
 4   livesInGroundZero              266871 non-null  bool   
 5   traveledToGroundZero           266871 non-null  bool   
 6   recordSource                   247324 non-null  object 
 7   lineListSource                 266871 non-null  object 
 8   hasTravelHistory               266871 non-null  bool   
 9   id                             266871 non-null  object 
 10  location.id                    266848 non-null  object 
 11  locationType                   6868 non-null    object 
 12  groundZeroExposure            

### Clean The Dataset

In [5]:
# Select the patient-level columns we care about, then discard every row
# whose outcome ("didDie") or gender is missing
patient_level = (
    dataset[["age", "gender", "symptoms", "location.id", "chronicDisease", "didDie"]]
    .dropna(subset=["didDie", "gender"])
)

patient_level

Unnamed: 0,age,gender,symptoms,location.id,chronicDisease,didDie
183,43.0,male,,XiningCity_Qinghai_China,,False
875,48.0,female,,HougangStreet61__Singapore,,False
925,58.0,female,,NationalCentreforInfectiousDiseases__Singapore,,False
996,46.0,male,fever,WenzhouCity_Zhejiang_China,,False
1012,70.0,male,,Tokyo_Japan,,False
...,...,...,...,...,...,...
264532,70.0,male,Severe,Banjul__Gambia,Diabetes,False
264567,47.0,female,,ChangiAirport__Singapore,,False
265358,65.0,male,acute respiratory distress syndrome:pneumonia,Quezon_MetroManila_Philippines,hypertension:chronic kidney disease,True
265720,57.0,male,cardiogenic shock:acute coronary syndrome:hear...,Manila_MetroManila_Philippines,hypertension,True


In [22]:
#VISUALIZE SYMPTOMS COLUMN
#patient_level["symptoms"].unique()

### We notice that some values in the "symptoms" and "chronicDisease" columns are comma- or colon-separated strings. Some strings are also misspelled and/or duplicated with slightly different wording.

In [7]:
# Ordered normalisation rules: (canonical label, substrings that trigger it).
# Rules are applied one after another, so a later rule sees the output of an
# earlier one (e.g. 'coronary heart disease' is later folded into 'heart disease').
SYMPTOM_RULES = [
    ('acute respiratory distress', ['respiratory']),
    ('acute kidney injury', ['acute renal failure', 'acute kidney injury']),
    ('fatigue', ['fatigue', 'somnolence', 'fatigure']),
    ('shortness of breath', [' breathing', 'dyspnea', 'shortness of breath']),
    ('cold chills', ['chill']),
    ('acute respiratory distress', ['distress']),
    ('pneumonia', ['pneumonia']),
    ('septic shock', ['sepsis']),
    ('arrhythmia', ['arrhythmia']),
    ('asymptomatic', ['none']),
    ('cough', ['cough']),
    ('runny nose', ['running nose', 'runny nose']),
    ('malaise', ['malaise']),
    ('muscular soreness', ['myalgias', 'mialgia', 'myalgia', 'muscular soreness']),
    ('fever', ['fever']),
    ('sore throat', ['sore throat']),
    ('headache', ['headache']),
]

CHRONIC_DISEASE_RULES = [
    ('hypertension', ['Hypertension', 'hypertenstion', 'hypertension']),
    ('diabetes', ['diabetes', 'Diabetes']),
    ('COPD', ['chronic obstructive pulmonary disease']),
    ('asthma', ['asthma']),
    ('coronary heart disease', ['coronary']),
    ('kidney disease', ['kidney', 'renal']),
    ('cancer', ['cancer']),
    ("parkinson's disease", ["Parkinson's disease"]),
    ('heart disease', ['cardi', 'heart', 'atrial', 'vent']),
    ('cerebrovascular disease', ['cereb', 'encephalomalacia']),
    ('thyroid disease', ['thyroid']),
    ('enlarged prostate', ['prostat']),
    ('dyslipidemia', ['dislipidemia']),
]


def _split_multi_value(series):
    """Turn ':' / ';' / ',' separated strings into lists of raw labels."""
    return (
        series
        .str.replace(':', ',', regex=False)
        .str.replace(';', ',', regex=False)
        .str.split(',')
    )


def _canonical_label(label, rules):
    """Run one raw label through the ordered rules and return the result.

    Whenever any trigger substring of a rule occurs in the current label, the
    label is replaced by that rule's canonical spelling. Labels that match no
    rule are returned unchanged. Matching is case-sensitive.
    """
    for canonical, needles in rules:
        if any(needle in label for needle in needles):
            label = canonical
    return label


def _canonicalize_column(label_lists, rules):
    """Apply `_canonical_label` to every label in every list of a column."""
    return [[_canonical_label(label, rules) for label in labels] for labels in label_lists]


#CHOOSE COLUMNS TO MAKE INTO NEW DF AND DROP ROWS WITH UNKNOWN OUTCOME OR GENDER
patient_level2 = dataset[["age", "gender", "symptoms", "location.id", "chronicDisease", "didDie"]]
patient_level2 = patient_level2.dropna(subset = ["didDie", "gender"])

#REPLACE NAN VALUES: MISSING SYMPTOMS BECOME "asymptomatic", MISSING CHRONIC DISEASES BECOME "none"
patient_level2['symptoms'] = patient_level2['symptoms'].fillna('asymptomatic')
patient_level2['chronicDisease'] = patient_level2['chronicDisease'].fillna('none')

#STRIP ALL CHARACTERS BEFORE THE LAST "_" IN THE LOCATION.ID COLUMN TO RETURN JUST THE NAME OF THE COUNTRY
patient_level2['location.id'] = patient_level2['location.id'].str.split('_').str[-1].str.strip()

#SPLIT THE MULTI-VALUE STRINGS INTO LISTS, THEN MERGE LABELS THAT MEAN THE SAME THING
#BUT ARE SPELLED OR WORDED DIFFERENTLY
patient_level2['symptoms'] = _canonicalize_column(
    _split_multi_value(patient_level2['symptoms']), SYMPTOM_RULES)
patient_level2['chronicDisease'] = _canonicalize_column(
    _split_multi_value(patient_level2['chronicDisease']), CHRONIC_DISEASE_RULES)

#RENAME LOCATION.ID COLUMN
patient_level2 = patient_level2.rename(columns={'location.id': 'country'})

patient_level2

Unnamed: 0,age,gender,symptoms,country,chronicDisease,didDie
183,43.0,male,[asymptomatic],China,[none],False
875,48.0,female,[asymptomatic],Singapore,[none],False
925,58.0,female,[asymptomatic],Singapore,[none],False
996,46.0,male,[fever],China,[none],False
1012,70.0,male,[asymptomatic],Japan,[none],False
...,...,...,...,...,...,...
264532,70.0,male,[Severe],Gambia,[diabetes],False
264567,47.0,female,[asymptomatic],Singapore,[none],False
265358,65.0,male,"[acute respiratory distress, pneumonia]",Philippines,"[hypertension, kidney disease]",True
265720,57.0,male,"[cardiogenic shock, acute coronary syndrome, h...",Philippines,[hypertension],True


In [8]:
#CREATE DUMMY VARIABLES FOR EACH VALUE IN THE LISTS THAT MAKE UP THE SYMPTOMS COLUMN
# The lists are joined with '|' (get_dummies' default separator) and expanded
# into one 0/1 column per distinct label. `axis` / `columns` are passed by
# keyword: the positional forms are rejected by recent pandas releases.
symptomDummies = pd.concat(
    [patient_level2.drop(columns='symptoms'),
     patient_level2['symptoms'].str.join('|').str.get_dummies()],
    axis=1,
)

#STRIP LEADING SPACES FROM COLUMN NEW NAMES
symptomDummies.columns = symptomDummies.columns.str.strip()

#REMOVE SPECIFIC DUMMY COLUMNS WE DON'T WANT (WE'RE ONLY KEEPING THE MOST COMMON SYMPTOMS)
symptomDummies = symptomDummies.drop(columns=[
    'severe', 'gasp', 'grasp', 'conjunctivitis', 'weak', 'afebrile', 'Severe', 'dizziness',
    'multiple electrolyte imbalance', 'multiple organ failure', 'myocardial dysfunction',
    'primary myelofibrosis', 'chest discomfort', 'cardiopulmonary arrest', 'dysphagia',
    'discomfort', 'sputum', 'little sputum', 'expectoration', 'systemic weakness', 'anorexia',
    'lesions on chest radiographs', 'cardiogenic shock', 'kidney failure and hypertension',
    'obnubilation', 'gastritis', 'eye irritation', 'emesis', 'myocardial infarction',
    'acute coronary syndrome', 'colds', 'hypoxia', 'heart failure', 'acute kidney injury',
    'arrhythmia', 'acute myocardial infarction', 'chest pain', 'congestive heart failure',
])

#SAME EXPANSION FOR THE CHRONIC DISEASE LISTS
symptom_cd_dummies = pd.concat(
    [symptomDummies.drop(columns='chronicDisease'),
     symptomDummies['chronicDisease'].str.join('|').str.get_dummies()],
    axis=1,
)

#STRIP LEADING SPACES FROM COLUMN NEW NAMES
symptom_cd_dummies.columns = symptom_cd_dummies.columns.str.strip()

#DROP RARE / UNINFORMATIVE CONDITIONS AND THE COUNTRY COLUMN
symptom_cd_dummies = symptom_cd_dummies.drop(columns=[
    '"thought to have had other pre-existing conditions"', 'hemorrhage of digestive tract',
    'upper git bleeding', 'taking medicine of Madopar', 'hip replacement', 'hypertensive',
    'impaired fasting glucose', 'country',
])

#MOVE didDie COLUMN TO THE END
# Re-assigning a popped column appends it as the last column, whatever the
# current number of columns is (no hard-coded position).
move_column = symptom_cd_dummies.pop("didDie")
symptom_cd_dummies[move_column.name] = move_column

#CHANGE VALUES IN THE didDie COLUMN TO BINARY
# astype(int) fails loudly if a value other than True/False slipped through.
die = {True: 1, False: 0}
symptom_cd_dummies['didDie'] = symptom_cd_dummies['didDie'].map(die).astype(int)

symptom_cd_dummies

Unnamed: 0,age,gender,acute respiratory distress,asymptomatic,cold chills,cough,diarrhea,fatigue,fever,headache,malaise,muscular soreness,pneumonia,runny nose,septic shock,shortness of breath,sore throat,Tuberculosis,COPD,asthma,atherosclerosis,cancer,cerebrovascular disease,chronic bronchitis,diabetes,dyslipidemia,enlarged prostate,heart disease,hepatitis B,hypertension,kidney disease,none,parkinson's disease,thyroid disease,didDie
183,43.0,male,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
875,48.0,female,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
925,58.0,female,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
996,46.0,male,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
1012,70.0,male,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
264532,70.0,male,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
264567,47.0,female,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
265358,65.0,male,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1
265720,57.0,male,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1


In [9]:
#GET A COUNT OF VALUES FOR SYMPTOMS AND CHRONIC DISEASES
# Reshape the 0/1 indicator columns to long form (one row per cell) ...
counts = (
    symptom_cd_dummies
    .drop(columns=['age', 'gender'])
    .melt(var_name='columns', value_name='index')
)
# ... tabulate how often each indicator is 0 or 1, and order the columns by
# their number of 1s, most frequent first
counts = (
    pd.crosstab(index=counts['index'], columns=counts['columns'])
    .sort_values(by=1, axis=1, ascending=False)
)
counts

columns,none,asymptomatic,didDie,fever,hypertension,pneumonia,cough,acute respiratory distress,diabetes,septic shock,kidney disease,heart disease,fatigue,sore throat,shortness of breath,runny nose,headache,muscular soreness,asthma,COPD,cold chills,cancer,enlarged prostate,malaise,cerebrovascular disease,dyslipidemia,diarrhea,chronic bronchitis,thyroid disease,hepatitis B,parkinson's disease,atherosclerosis,Tuberculosis
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1
0,120,209,454,551,561,575,577,579,584,623,624,624,626,627,630,632,633,634,634,635,635,636,638,638,638,639,639,639,639,640,640,640,641
1,522,433,188,91,81,67,65,63,58,19,18,18,16,15,12,10,9,8,8,7,7,6,4,4,4,3,3,3,3,2,2,2,1


In [10]:
#CHANGE VALUES IN THE gender COLUMN TO BINARY SO IT CAN BE USED IN LINEAR REG TO PREDICT MISSING AGE VALUES
#maleFemale = {'male': 1, 'female': 0}
#symptom_cd_dummies.gender = [maleFemale[item] for item in symptom_cd_dummies.gender]

### Data Preprocessing for machine learning model

In [11]:
# Target vector: the last column; feature matrix: every column before it
y = symptom_cd_dummies.iloc[:, -1].to_numpy()
x = symptom_cd_dummies.iloc[:, :-1].to_numpy()

# Replace the gender strings held in feature column 1 with integer codes
le = LabelEncoder()
x[:, 1] = le.fit_transform(x[:, 1])

# Inspect feature column 0 (age)
x[:,0]

array([43.0, 48.0, 58.0, 46.0, 70.0, 44.0, 26.0, 47.0, 42.0, 38.0, 56.0,
       30.0, 39.0, 69.0, 53.0, 21.0, 65.0, 38.0, 50.0, 16.0, 22.0, 79.0,
       81.0, 48.0, 28.0, 47.0, 69.0, 49.0, 50.0, 36.0, 42.0, 44.0, 42.0,
       30.0, 23.0, 38.0, nan, 73.0, nan, nan, nan, 82.0, 48.0, 27.0, 52.0,
       nan, 72.0, 72.0, 44.0, 32.0, 81.0, 65.0, 53.0, nan, 73.0, 29.0,
       52.0, 26.0, 29.0, 79.0, 74.0, 82.0, 63.0, 65.0, nan, 47.0, 42.0,
       44.0, 35.0, 56.0, 48.0, nan, 51.0, nan, 39.0, 21.0, nan, 66.0,
       30.0, 29.0, 59.0, 77.0, 46.0, 67.0, 79.0, 30.0, 1.0, 55.0, 36.0,
       62.0, 56.0, 58.0, 27.0, 31.0, 45.0, nan, 41.0, 30.0, 76.0, 46.0,
       37.0, 37.0, 25.0, 59.0, 28.0, 44.0, 32.0, 71.0, 38.0, 45.0, nan,
       73.0, 57.0, 39.0, 63.0, 69.0, 78.0, 36.0, 63.0, 39.0, 74.0, 4.0,
       34.0, 63.0, 53.0, 46.0, 54.0, 32.0, 54.0, 40.0, 8.0, 30.0, 27.0,
       85.0, nan, 28.0, 56.0, 50.0, 71.0, nan, 71.0, 60.0, 34.0, 35.0,
       70.0, 20.0, 84.0, 32.0, 52.0, 73.0, 34.0, 54.0, 68.0, 3

In [12]:
#IMPUTE MISSING VALUES IN THE AGE COLUMN
# Missing entries of x are filled by a KNNImputer configured with 2 neighbours.
# NOTE(review): the imputer is fitted on the full feature matrix before the
# train/test split, so test rows can influence values imputed for training
# rows — consider fitting it on the training split only.
imputer = KNNImputer(n_neighbors=2)

x = imputer.fit_transform(x)

x

array([[43.,  1.,  0., ...,  1.,  0.,  0.],
       [48.,  0.,  0., ...,  1.,  0.,  0.],
       [58.,  0.,  0., ...,  1.,  0.,  0.],
       ...,
       [65.,  1.,  1., ...,  0.,  0.,  0.],
       [57.,  1.,  0., ...,  0.,  0.,  0.],
       [77.,  0.,  0., ...,  1.,  0.,  0.]])

### Split the cleaned dataset into Train and Test sets

In [13]:
# Hold out 20% of the rows for testing; random_state is fixed at 0
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 0)

### Feature Scaling

In [14]:
# Fit the scaler on the training split only, then apply the same fitted
# transform to the test split
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

### Build the Artificial Neural Network

In [15]:
# Build the ANN as a Sequential stack of two Dense layers:
#   1. the single hidden layer: 6 units, ReLU activation
#   2. the output layer: 1 unit, sigmoid activation
ann = tf.keras.models.Sequential([
    tf.keras.layers.Dense(units=6, activation='relu'),
    tf.keras.layers.Dense(units=1, activation='sigmoid'),
])

### Training The ANN

In [16]:
#COMPILING THE ANN
# Adam optimizer, binary cross-entropy loss, accuracy reported as a metric
ann.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

#TRAINING THE ANN
# 100 epochs over the scaled training split in batches of 32 rows; the value
# returned by fit() is the cell's output
ann.fit(x_train, y_train, batch_size = 32, epochs = 100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<tensorflow.python.keras.callbacks.History at 0x12b27e630>

### Predicting Test Set Results

In [17]:
# Threshold the model output for the test split at 0.5 to get hard labels
y_pred = ann.predict(x_test) > 0.5

# Side-by-side view: column 0 = predicted label, column 1 = actual label
compare = pd.DataFrame(np.hstack((y_pred.reshape(-1, 1), y_test.reshape(-1, 1))))
compare.head()

Unnamed: 0,0,1
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0


### Making Confusion Matrix

In [18]:
# Confusion matrix and overall accuracy of the thresholded predictions on the
# test split (the accuracy is the cell's displayed value)
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test, y_pred)

[[82  4]
 [13 30]]


0.8682170542635659

### Predicting a new single result:
#### Possible features (in order of index): 
    0. age
    1. gender (binary value)
#### Possible symptoms (in order of index, as binary values):
    2. acute respiratory distress   7. fatigue               12. pneumonia 
    3. asymptomatic                 8. fever                 13. runny nose
    4. cold chills                  9. headache              14. septic shock
    5. cough                        10. malaise               15. shortness of breath
    6. diarrhea                     11. muscular soreness    16. sore throat
   
    
#### Possible chronic conditions (in order of index, as binary values):
    17. tuberculosis                 
    18. COPD                         26. enlarged prostate
    19. asthma                       27. heart disease
    20. atherosclerosis              28. hepatitis B
    21. cancer                       29. hypertension
    22. cerebrovascular disease      30. kidney disease
    23. chronic bronchitis           31. none (not known)
    24. diabetes                     32. parkinson's disease
    25. dyslipidemia                 33. thyroid disease

In [19]:
#PREDICTING THE OUTCOME OF A 65y/o MALE PATIENT WITH COPD
# Scale the single feature row with the fitted scaler, then print the hard
# (thresholded at 0.5) prediction followed by the raw model output
copd_patient = sc.transform([[65,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]])
print(ann.predict(copd_patient) > 0.5)
print(ann.predict(copd_patient))

[[False]]
[[0.04028921]]


## Making a program that allows users to make predictions on a per patient basis based on input

In [24]:
from IPython.display import display, Markdown

# Layout of one feature row: index 0 = age, index 1 = gender,
# indices 2..33 = symptom / pre-existing-condition flags (see the lists printed below)
N_FEATURES = 34
FIRST_FLAG_INDEX = 2

#INITIALIZE LIST TO HOLD MATRIX OF FEATURES
mylist = list(range(N_FEATURES))

#INPUT PATIENT AGE AND GENDER
display(Markdown("### Demographics:"))
age = int(input("PROMPT 1: What is the patient's age? \n-\n"))
# Normalise case/whitespace and reject anything else, so that a typo such as
# 'Male' or 'mael' is not silently encoded as female
gender = input("\nPROMPT 2: What is the patient's sex? (write 'male' or 'female')\n-\n").strip().lower()
if gender not in ('male', 'female'):
    raise ValueError("Patient sex must be 'male' or 'female', got {!r}".format(gender))
genderNum = 1 if gender=='male' else 0

#PROMPT USER INPUT OF SYMPTOMS
display(Markdown("### Health Profile:"))
display(Markdown("#### Symptoms:"))
print("2. acute respiratory distress   3. asymptomatic   4. cold chills   5. cough\n6. diarrhea   7. fatigue   8. fever.  9.headache.  10. malaise\n11. muscle soreness   12. pneumonia   13. runny nose   14. septic shock   15. shortness of breath\n16. sore throat")

#PROMPT USER INPUT OF PRE-EXISTING CONDITIONS
display(Markdown("#### Pre-existing Conditions:"))
print("17. tuberculosis   18.COPD   19.asthma   20. atherosclerosis   21. Cancer   22. cerebrovascular disease\n23. chronic bronchitis  24. diabetes   25. dyslipidemia   26. enlarged prostate   27. heart disease\n28. Hepatitis B   29. hypertension   30. kidney disease.  31. none   32. parkinson's disease.  33. thyroid disease\n")

symptoms = [int(token) for token in input("\nPROMPT 3: Enter the number next to the patient's corresponding symptoms, and/or pre-existing conditions as listed, \nseparated by a space:\n\n").split()]

# Numbers outside the listed range do not correspond to any feature; report
# them instead of dropping them silently
unknown = [number for number in symptoms if not FIRST_FLAG_INDEX <= number < N_FEATURES]
if unknown:
    raise ValueError("Unknown symptom/condition number(s): {}".format(unknown))

# One flag per feature index: 1 if the user selected it, else 0
selected = set(symptoms)
mylist2 = [1 if el in selected else 0 for el in mylist]

# The first two slots hold the demographics rather than flags
mylist2[0] = age
mylist2[1] = genderNum

#RETURN THE PROBABILITY PREDICTION
probability = ann.predict(sc.transform([mylist2]))
probabilityPercent = probability.round(2)

#RETURN THE PREDICTION IN FORM OF TRUE/FALSE
hardPrediction = probability>0.5

#RETURN THE 'TRUE/FALSE' VALUE OF 'hardPredction' WITHOUT THE BRACKETS
hardPrediction = ''.join(map(str, hardPrediction[0]))

class color:
   # ANSI escape sequences used to style the printed report
   BOLD = '\033[1m'
   RED = '\033[91m'
   GREEN = '\033[92m'
   UNDERLINE = '\033[4m'
   END = '\033[0m'

def final_prediction(finalResult):
    """Print the risk verdict for a hard prediction.

    finalResult: the hard prediction, either the string 'True'/'False' or a
    boolean; 'True' means the model predicts a fatal infection.
    """
    verdict = str(finalResult)
    print(color.BOLD + verdict + color.END)
    if verdict == 'True':
        print("Our model predicts that a fatal infection is likely for this patient")
        print(color.RED + "HIGH RISK" + color.END)
    else:
        print("Our model predicts that a fatal infection is not likely for this patient")
        print(color.GREEN + "LOW RISK" + color.END)

print(color.UNDERLINE + "\nPredicted probability of mortality:" + color.END)
print(*probability[0])
# Format from a Python float with one decimal so float32 artefacts
# (e.g. 56.999996) never reach the report
print(color.BOLD + "{0:.1f} %\n".format(float(probabilityPercent[0][0]) * 100) + color.END)
print(color.UNDERLINE + "Fatal infection prediction:" + color.END)
final_prediction(hardPrediction)


### Demographics:

PROMPT 1: What is the patient's age? 
-
47

PROMPT 2: What is the patient's sex? (write 'male' or 'female')
-
male


### Health Profile:

#### Symptoms:

2. acute respiratory distress   3. asymptomatic   4. cold chills   5. cough
6. diarrhea   7. fatigue   8. fever.  9.headache.  10. malaise
11. muscle soreness   12. pneumonia   13. runny nose   14. septic shock   15. shortness of breath
16. sore throat


#### Pre-existing Conditions:

17. tuberculosis   18.COPD   19.asthma   20. atherosclerosis   21. Cancer   22. cerebrovascular disease
23. chronic bronchitis  24. diabetes   25. dyslipidemia   26. enlarged prostate   27. heart disease
28. Hepatitis B   29. hypertension   30. kidney disease.  31. none   32. parkinson's disease.  33. thyroid disease


PROMPT 3: Enter the number next to the patient's corresponding symptoms, and/or pre-existing conditions as listed, 
separated by a space:

7 18
[4m
Predicted probability of mortality:[0m
0.605568
[1m61.0 %
[0m
[4mFatal infection prediction:[0m
[1mTrue[0m
Our model predicts that a fatal infection is likely for this patient
[91mHIGH RISK[0m
