# Introduction

The purpose of this notebook is to provide a detailed socio-demographic profile of heart-disease mortality rates across state/county in the United States.

### Library required

In [None]:
# Importing libraries
import os #Operating system library
import pandas as pd #data science framework library
import json #json format library
from mpl_toolkits.mplot3d import Axes3D
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import glob
!pip install dask  
!pip install "dask[complete]"
from dask import dataframe as dd

!pip install pycountry_convert 
#Print style setup
from IPython.display import display_html
def display_side_by_side(*args):
    """Render several DataFrames next to each other in the notebook output.

    Parameters
    ----------
    *args
        Objects exposing ``to_html()`` (pandas DataFrames in this notebook).

    Returns
    -------
    None
        The combined HTML is sent to the notebook display.
    """
    html_str = ''.join(df.to_html() for df in args)
    # Only the opening tag is rewritten. Replacing the bare word 'table' would
    # also corrupt the closing </table> tags and any cell text containing it.
    display_html(html_str.replace('<table', '<table style="display:inline"'), raw=True)


from matplotlib import ticker 
# import pycountry_convert as pc
import folium
import branca
from datetime import datetime, timedelta,date
from scipy.interpolate import make_interp_spline, BSpline
import plotly.express as px
import json, requests
#import calmap
import seaborn as sns

from keras.layers import Input, Dense, Activation, LeakyReLU
from keras import models
plt.style.use('fivethirtyeight')

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

### Loading dataset

In [None]:
# Columns kept from the CDC mortality file
cols = [
    'resident_status',
    'month_of_death',
    'sex',
    'detail_age',
    'place_of_death_and_decedents_status',
    'marital_status',
    'day_of_week_of_death',
    'current_data_year',
    'injury_at_work',
    'manner_of_death',
    'autopsy',
    '39_cause_recode',
    'education_2003_revisionD',
    'raceD',
    'FIPS',
]

# Read the FIPS-imputed 2015 CDC file and keep only rows without any missing value
df2015 = pd.read_csv(
    '/content/gdrive/MyDrive/CDC/cdc_dataset/CDC_2015_fipsImputed.csv',
    delimiter=',',
    usecols=cols,
).dropna(how='any')


  #We have selected 'GEOID' as refrence variable 
# Column names of the census comparison-profile tables.
# This list is not passed to the read_csv calls below, so every column of each file is loaded.
cp_cols = ['index', 'TBLID','GEOID', 'GEONAME','PROFTBL','PROFLN', 'TITLE', 'EST_1418','EST_0913','SIG90_1418_0913']
# Census datasets: four profile tables, read with latin-1 encoding
cp02 = pd.read_csv('/content/gdrive/MyDrive/CDC/cdc_dataset/cp02.csv', encoding='latin-1')
cp03 = pd.read_csv('/content/gdrive/MyDrive/CDC/cdc_dataset/cp03.csv', encoding='latin-1')
cp04 = pd.read_csv('/content/gdrive/MyDrive/CDC/cdc_dataset/cp04.csv', encoding='latin-1')
cp05 = pd.read_csv('/content/gdrive/MyDrive/CDC/cdc_dataset/cp05.csv', encoding='latin-1')

# Stack the four tables row-wise. Each file's own row labels are kept,
# so index values repeat across the combined frame.
cp_df1 = pd.concat([cp02, cp03, cp04, cp05])

# Most facility-level columns are dropped. 'Provider Zip Code' is kept as the key,
# together with the columns considered relevant as social determinants.
flcols = ['Provider Zip Code', 'County', 'Provider City', 'Number of All Beds',
          'Total Number of Occupied Beds','Able to Test or Obtain Resources to Test All Current Residents Within Next 7 Days',
          'Able to Test or Obtain Resources to Test All Staff and/or Personnel Within Next 7 Days','Shortage of Nursing Staff',
          'Shortage of Clinical Staff', 'Number of Residents Staying in this Facility for At Least 1 Day This Week']

# Data.CMS.gov facility-level datasets for 2020 and 2021
fl2020 = pd.read_csv('/content/gdrive/MyDrive/CDC/cdc_dataset/faclevel_2020.csv', usecols = flcols, header=0)
fl2021 = pd.read_csv('/content/gdrive/MyDrive/CDC/cdc_dataset/faclevel_2021.csv', usecols = flcols, header=0)
# fl2022 = pd.read_csv('/content/gdrive/MyDrive/CDC/cdc_dataset/faclevel_2022.csv', usecols = flcols) #No values

# Stack the yearly files row-wise. Both files share the same columns, so axis=1
# would place them side by side and duplicate every column name.
fclevel_df1 = pd.concat([fl2020, fl2021], axis=0, ignore_index=True)


# Candidate column subset for the PLACES files (currently disabled, so all columns are loaded):
#plc_cols= ['Year',	'StateAbbr',	'StateDesc',	'LocationName',	'Category',	'Measure',	'TotalPopulation',	'MeasureId',	'DataValueTypeID','Data_Value_Unit',	'Short_Question_Text', 'Geolocation']


# PLACES county-level files, 2020 and 2021 releases
plc2020 = pd.read_csv('/content/gdrive/MyDrive/CDC/cdc_dataset/PLACES__Local_Data_for_Better_Health__County_Data_2020_release.csv',  header = 0)
plc2021 = pd.read_csv('/content/gdrive/MyDrive/CDC/cdc_dataset/PLACES__Local_Data_for_Better_Health__County_Data_2021_release.csv', header = 0)


# Stack both releases row-wise. Row labels of each file are kept, so index values repeat.
plc_df1 = pd.concat([plc2020, plc2021])

In [None]:
df2015.shape

### Preprocessing

In [None]:
# Collect and print the column names of every loaded dataset
cols1, cols2, cols3, cols4 = (
    list(frame.columns) for frame in (df2015, cp_df1, fclevel_df1, plc_df1)
)
print(f'df2015 Columns are {cols1} and\n\n cp_df1 Columns {cols2}\n\n fclevel_df1 Columns and {cols3}\n\n plc_df1 Columns and {cols4}\n\n')
from tabulate import tabulate

In [None]:
# Row and column counts of every loaded dataset
(nRow, nCol), (nRow2, nCol2), (nRow3, nCol3), (nRow4, nCol4) = (
    frame.shape for frame in (df2015, cp_df1, fclevel_df1, plc_df1)
)

print(f'df2015 has {nRow} rows and {nCol} columns and \n\n cp_df1 has {nRow2} rows and {nCol2} columns and \n\n fclevel_df1 has {nRow3} rows and {nCol3} columns and \n\n plc_df1 has {nRow4} rows and {nCol4} columns')

In [None]:
# #Print data type of datasets
dfs = [df2015, cp_df1,fclevel_df1,plc_df1]

for df in dfs:
    print(df.dtypes)

### cleaning

## df2015

In [None]:

from tabulate import tabulate

# headers='keys' takes the labels from the frame itself. The hand-written `cols`
# list follows neither the frame's column order (read_csv ignores the order of
# usecols) nor its width (tabulate also prints the index column).
print(tabulate(df2015[1:3], headers='keys'))

In [None]:
# #Missing data
# total = df2015.isnull().sum().sort_values(ascending=False)
# percent = (df2015.isnull().sum()/df2015.isnull().count()).sort_values(ascending=False)
# missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
# missing_data.head(20)

In [None]:
#Check duplicate rows
df2015.duplicated().sum()

In [None]:
#Exctracting duplicate rows
df2015.loc[df2015.duplicated(),:]

In [None]:
#Drop Duplication Rows
df2015.drop_duplicates(inplace=True)

In [None]:
df2015.shape

In [None]:
2258768-2257564
#1204 rows have been drpped

In [None]:
df2015.columns

In [None]:
# Rename to short lowercase column names
df2015 = df2015.rename(columns={'detail_age':'age', 'place_of_death_and_decedents_status':'place_of_death','education_2003_revisionD':'education', '39_cause_recode':'heart_disease','raceD':'race'})


def _recode(series, mapping):
    """Return `series` with every value that is a key of `mapping` replaced by
    its label; values not present in `mapping` are kept unchanged."""
    return series.map(lambda value: mapping.get(value, value))


# Code -> label tables (codes are assumed to be numeric in the source file)
EDUCATION_LABELS = {1: '8_grade_and_less', 2: '9_12grade', 3: 'highschool',
                    4: 'college_credit', 5: 'associate_degree', 6: 'bachelor',
                    7: 'master', 8: 'doctor'}
RACE_LABELS = {1: 'White', 2: 'Other', 3: 'Black'}
PLACE_OF_DEATH_LABELS = {3: 'Medical_Center', 4: 'Home', 5: 'Hospice',
                         6: 'NursingHome', 7: 'Other'}
MANNER_OF_DEATH_LABELS = {1: 'Accidents', 2: 'Suicide', 3: 'Homicide',
                          4: 'Pending_investigation', 6: 'Self-Inflicted',
                          7: 'Natural'}
AUTOPSY_CODES = {'Y': 1, 'y': 1, 'N': 0, 'n': 0}   # 1 = autopsy performed
SEX_CODES = {'M': 1, 'F': 0}                       # 1 = male, 0 = female
MONTH_TO_QUARTER = {month: 'Q%d' % ((month - 1) // 3 + 1) for month in range(1, 13)}

# Profiling construction.
# Whole columns are assigned (no chained `df[col].loc[...] = ...`), so the
# result does not depend on pandas' view/copy behaviour.
df2015['education'] = _recode(df2015['education'], EDUCATION_LABELS)
df2015['race'] = _recode(df2015['race'], RACE_LABELS)

# Resident profile: 1 = resident (code 1), 0 = every other status.
# The earlier version set code 1 to 1 and then overwrote everything except
# code 4 with 0, which erased the residents again.
# NOTE(review): confirm that "1 vs. rest" is the intended split.
df2015['resident_status'] = (df2015['resident_status'] == 1).astype(int)

# Quarter profile: month of death aggregated to the quarter of the year
# ('' for any value outside 1-12)
df2015['quarter'] = df2015['month_of_death'].map(lambda month: MONTH_TO_QUARTER.get(month, ''))

# Place of death profile: labels are applied while the codes are still numeric;
# the column is converted to string only after the unknown code 9 is removed below.
df2015['place_of_death'] = _recode(df2015['place_of_death'], PLACE_OF_DEATH_LABELS)

# Manner of death, autopsy and gender profiles
df2015['manner_of_death'] = _recode(df2015['manner_of_death'], MANNER_OF_DEATH_LABELS)
df2015['autopsy'] = _recode(df2015['autopsy'], AUTOPSY_CODES)
df2015['sex'] = _recode(df2015['sex'], SEX_CODES)


# Conditions construction

# Heart-disease flag: 1 when the cause recode lies in 19..22, otherwise 0.
# The column is popped and re-added so it ends up as the last column.
cause_code = df2015.pop('heart_disease').astype(int)
df2015['heart_disease'] = cause_code.between(19, 22).astype(int)


# Remove incomplete / unknown records
df2015 = df2015.dropna(axis=0, how='any')
keep_rows = (
    (df2015['autopsy'] != 'U')              # U is unknown
    & (df2015['education'] != 9)            # 9 is unknown
    & (df2015['marital_status'] != 'U')     # U is unknown
    & (df2015['injury_at_work'] != 'U')     # U is unknown
    & (df2015['age'] != 120)                # age recorded as 120 years
    & (df2015['place_of_death'] != 9)       # 9 is unknown
)
df2015 = df2015.loc[keep_rows].copy()
df2015['place_of_death'] = df2015['place_of_death'].astype(str)

In [None]:
# Visualize the records in the dataset as part of the exploratory analysis

# Subset of records whose cause of death is heart disease
hd_death = df2015[df2015['heart_disease'] == 1]
plt.figure(figsize = (20,16))
hd_share = len(hd_death) / len(df2015)
print('Heart disease caused death are {}% of the total death'.format(round(100 * hd_share)))


def _plot_share_panel(grid_pos, column, title, color, colspan=1, label_offset=150,
                      legend_loc='upper right', tick_rotation=0, show_counts=False):
    """Draw one bar panel comparing heart-disease deaths with all deaths.

    Parameters
    ----------
    grid_pos : tuple
        (row, col) position inside the 3x3 grid.
    column : str
        Column of df2015 / hd_death to break down.
    title, color : str
        Panel title and colour of the heart-disease bars.
    colspan : int
        Number of grid columns the panel spans.
    label_offset : int
        Vertical distance between a bar top and its percentage label.
    legend_loc : str
        Matplotlib legend location.
    tick_rotation : int
        Rotation of the x tick labels in degrees.
    show_counts : bool
        Also print the absolute heart-disease count under the percentage.
    """
    plt.subplot2grid((3, 3), grid_pos, colspan=colspan)
    categories = hd_death[column].unique()
    # value_counts() is ordered by frequency while unique() is ordered by first
    # appearance; reindexing keeps each bar height attached to its own category.
    total_counts = df2015[column].value_counts().reindex(categories)
    hd_counts = hd_death[column].value_counts().reindex(categories)

    plt.bar(x=categories, height=total_counts.to_numpy(), label='Total', color='grey', alpha=0.8)
    plt.bar(x=categories, height=hd_counts.to_numpy(), label='Heart Disease', color=color, alpha=0.8)

    for category in categories:
        share = hd_counts[category] / total_counts[category]
        # '{:.0%}' rounds to the nearest percent (int(x * 100) truncated, e.g. 0.29 -> 28)
        plt.text(category, hd_counts[category] + label_offset, '{:.0%}'.format(share), ha='center', va='bottom')
        if show_counts:
            plt.text(category, hd_counts[category] + label_offset, '%s' % hd_counts[category], ha='center', va='top')

    plt.legend(loc=legend_loc)
    plt.xticks(categories, rotation=tick_rotation)
    plt.title(title)
    plt.ylabel('number')


# Age: counts are sorted by age so the lines run left to right instead of in frequency order
plt.subplot2grid((3,3),(0,0))
hd_death.age.value_counts().sort_index().plot(kind='line',label = 'Heart Disease',color = 'red')
df2015.age.value_counts().sort_index().plot(kind='line',label = 'Total',color = 'grey')
#plt.xlim(0,20)
plt.legend(loc = 'upper left')
plt.title('Age distribution')
plt.ylabel('number')

# Categorical breakdowns
_plot_share_panel((0, 1), 'sex', 'Sex distribution', 'pink', show_counts=True)
_plot_share_panel((0, 2), 'marital_status', 'Marital status distribution', 'lightblue')
_plot_share_panel((1, 0), 'education', 'Educations distribution', 'lightgreen',
                  colspan=2, label_offset=100)
_plot_share_panel((1, 2), 'resident_status', 'Resident status distribution', 'orange',
                  legend_loc='upper left')
_plot_share_panel((2, 0), 'race', 'Race distribution', 'brown')
_plot_share_panel((2, 1), 'place_of_death', 'place_of_death distribution', 'gold',
                  tick_rotation=-45)
_plot_share_panel((2, 2), 'quarter', 'Quarter distribution', 'lightblue')

plt.tight_layout()
plt.show()

## cp_df1


In [None]:
# cp_df1 contains population estimates per geography together with
# characteristics of social determinant factors
from tabulate import tabulate

# headers='keys' labels every printed column with the frame's own column name,
# so the header row cannot drift from the data the way a hand-written list can.
print(tabulate(cp_df1[2:10], headers='keys'))

In [None]:
# Working name for the census frame (this is the same object as cp_df1, not a copy)
cp = cp_df1

# Keep only the selected TITLE rows and the three columns needed downstream.
# .copy() makes socio an independent frame, so the column assignments in the
# following cells never act on a slice of cp_df1.
# NOTE(review): 'Unemployement Rate' is presumably spelled 'Unemployment Rate'
# in the census files, in which case this entry never matches - confirm.
socio = cp.loc[
    cp.TITLE.isin(['Employed', 'Unemployed', 'Unemployement Rate', 'Median household income (dollars)', 'Mean household income (dollars)', 'No health insurance coverage']),
    ['TITLE','GEOID', 'EST_1418'],
].copy()

socio.head()

In [None]:
#Rename columns just for handy working
socio['TITLE'] = socio['TITLE'].replace({'Median household income (dollars)':'median_incom','Mean household income (dollars)':'mean_incom', 'No health insurance coverage':'no_health_insur'})
socio.head()

In [None]:
#Shape 
socio.shape

In [None]:
# Reduce GEOID values such as 05000US01001 to the number after "US" so they can
# be matched against the FIPS column; going through int drops leading zeros.
socio['GEOID'] = (
    socio['GEOID']
    .astype(str)
    .map(lambda geoid: str(int(geoid.split("US")[1])))
)

socio.head()

In [None]:
# Strip formatting noise from every string cell of socio
import re


def _strip_noise(value):
    """Remove thousands separators, the "(X)" placeholder and spaces from a
    string cell; non-string cells are returned unchanged.

    The decimal point is kept on purpose: deleting it would turn an estimate
    such as "58.3" into "583" before the float conversion.
    """
    if not isinstance(value, str):
        return value
    for token in (",", "(X)", " "):
        value = value.replace(token, "")
    return value


for c in socio.columns:
    socio[c] = socio[c].apply(_strip_noise)

# Number of EST_1418 entries that still contain non-blank text
socio["EST_1418"].str.contains(r'\S+').sum()
# socio.head()

In [None]:
socio.shape

In [None]:
# Turn placeholder entries of EST_1418 into missing values

# Whole-cell markers that carry no estimate
special_characters = ['!','#','$','%', '&','@','[',']',' ',']','_','-']

# Rows whose estimate is an empty string are removed
socio = socio[socio['EST_1418'] != ''].copy()

# Cells consisting only of "N" or of one special marker become NaN. Replacing
# them with '' (as before) left empty strings behind after the '' rows had
# already been removed, and those cannot be converted to float.
placeholders = ['N'] + special_characters
socio["EST_1418"] = socio["EST_1418"].mask(socio["EST_1418"].isin(placeholders))

# print final sample string
print("Final String:",socio["EST_1418"])


In [None]:
#Convert EST_1418 to float
socio['EST_1418'] = socio['EST_1418'].astype(float)

### Socio Features  

In [None]:
#We have nan value, we need to compute impute before any chnage 
socio.head()

In [None]:
# Feature: per-GEOID count and mean of the EST_1418 values of the
# 'Employed' and 'mean_incom' rows.
# NOTE(review): the mean pools rows with different TITLEs - confirm this is intended.
employed = socio.loc[socio.TITLE.isin(['Employed','mean_incom']), ['TITLE','GEOID', 'EST_1418']]

employed_mean = employed.groupby('GEOID').EST_1418.agg(['count','mean']).reset_index()
employed_mean = employed_mean.rename(columns={'count':'employed_occurance','mean':'employed_avg_pop'})

# Feature: per-GEOID count and mean of the EST_1418 values of the
# 'Unemployed', 'mean_incom' and 'no_health_insur' rows
unemployed = socio.loc[socio.TITLE.isin(['Unemployed','mean_incom', 'no_health_insur']), ['TITLE','GEOID', 'EST_1418']]
unemployed_mean = unemployed.groupby('GEOID').EST_1418.agg(['count','mean']).reset_index()
unemployed_mean = unemployed_mean.rename(columns={'count':'unemployed_occurance','mean':'unemployed_avg_pop'})


# Combine both summaries on GEOID. A positional concat(axis=1) pairs rows by
# index and duplicates the GEOID column, so geographies present in only one
# summary would shift every row after them.
employement_pop = employed_mean.merge(unemployed_mean, on='GEOID', how='outer')
employement_pop

In [None]:
# Feature: per-GEOID count and mean of the 'no_health_insur' estimates
uninsured_pop = socio.loc[
    socio.TITLE.isin(['no_health_insur']),
    ['TITLE','GEOID', 'EST_1418'],
]
uninsured_pop_avg = (
    uninsured_pop
    .groupby('GEOID')['EST_1418']
    .agg(['count','mean'])
    .reset_index()
    .rename(columns={'count':'no_health_occurance','mean':'no_health_avg_pop'})
)
uninsured_pop_avg

In [None]:
# Feature: per-GEOID count and mean of the 'mean_incom' estimates
income_avg = socio.loc[socio.TITLE.isin(['mean_incom']), ['TITLE','GEOID', 'EST_1418']]
# Aggregate the income rows selected above. The earlier version grouped the
# `unemployed` frame here, so avg_Income merely repeated unemployed_avg_pop.
income_mean = income_avg.groupby('GEOID').EST_1418.agg(['count','mean']).reset_index()
# Rename columns
income_mean = income_mean.rename(columns={'count':'incom_occurance','mean':'avg_Income'})
income_mean

In [None]:
#Check NA, Duplicate, and specific value
# socio.dtypes
(socio == '').sum()

In [None]:
# socio['EST_1418'] == socio['EST_1418']
print(socio['TITLE']== 'mean_incom')


### Merge socio to df1 












In [None]:
# #Missing data
# total = socio.isnull().sum().sort_values(ascending=False)
# percent = (socio.isnull().sum()/socio.isnull().count()).sort_values(ascending=False)
# missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
# missing_data.head(20)

In [None]:
#Check duplicate rows
socio.duplicated().sum()

In [None]:
#Exctracting duplicate rows
socio.loc[socio.duplicated(),:]

In [None]:
#Drop Duplication Rows
socio.drop_duplicates(inplace=True)

In [None]:
socio.shape
#We dropped 91 rows

In [None]:
# Merge employed_mean
df1 = df2015.copy()  # real copy: the FIPS conversion below must not modify df2015

# Both join keys are converted to integer
df1['FIPS']=df1['FIPS'].astype(int)
employed_mean['GEOID']=employed_mean['GEOID'].astype(int)

# Left-join on the county code. The right-hand key is renamed to FIPS so the
# merge does not append a redundant GEOID column to df1.
df1 = pd.merge(df1, employed_mean.rename(columns={'GEOID': 'FIPS'}), how='left', on='FIPS')

In [None]:
# Merge unemployed_mean

# Both join keys are converted to integer
df1['FIPS']=df1['FIPS'].astype(int)
unemployed_mean['GEOID']=unemployed_mean['GEOID'].astype(int)

# Left-join on the county code. The right-hand key is renamed to FIPS so no
# extra GEOID column is added (repeated GEOID columns get _x/_y suffixes and
# eventually collide with each other).
df1 = pd.merge(df1, unemployed_mean.rename(columns={'GEOID': 'FIPS'}), how='left', on='FIPS')


In [None]:
# Merge income_mean

# Both join keys are converted to integer
df1['FIPS']=df1['FIPS'].astype(int)
income_mean['GEOID']=income_mean['GEOID'].astype(int)

# Left-join on the county code. The right-hand key is renamed to FIPS so no
# extra GEOID column is added to df1.
df1 = pd.merge(df1, income_mean.rename(columns={'GEOID': 'FIPS'}), how='left', on='FIPS')

In [None]:
# Merge uninsured population

# Both join keys are converted to integer
df1['FIPS']=df1['FIPS'].astype(int)
uninsured_pop_avg['GEOID']=uninsured_pop_avg['GEOID'].astype(int)

# Left-join on the county code. The right-hand key is renamed to FIPS: a fourth
# GEOID column would clash with GEOID_x / GEOID_y columns left by earlier
# merges, which pandas rejects as duplicate column names.
df1 = pd.merge(df1, uninsured_pop_avg.rename(columns={'GEOID': 'FIPS'}), how='left', on='FIPS')


In [None]:
df1.columns

In [None]:
#Print df1
df1.head()

In [None]:
df1.shape

In [None]:
#Check duplicate rows
df1.duplicated().sum()

In [None]:
#we have small proportion of duplication in the ros, i decided to drop it after investagtion
df1.drop_duplicates(inplace=True)

## faclevel

In [None]:
# faclevel dataset
# NOTE(review): from here on only the 2021 file is used; fclevel_df1 is
# rebound to fl2021 (same object, not a copy) - confirm 2020 is meant to be dropped.
fclevel_df1 = fl2021
from tabulate import tabulate

# headers='keys' uses the frame's own column labels; read_csv does not follow
# the order of the usecols list, so `flcols` may label the wrong columns.
print(tabulate(fclevel_df1[1:10], headers='keys'))

In [None]:
#Rename and use lowercapital letters
fclevel_df1 = fclevel_df1.rename(columns={'Provider Zip Code': 'zipcode', 'County': 'county',
                                          'Provider City':'provider_city','Number of All Beds':'number_of_beds',
                                          'Total Number of Occupied Beds':'total_occu_beds',
                                          'Able to Test or Obtain Resources to Test All Current Residents Within Next 7 Days':'test_ava_resident',
                                          'Able to Test or Obtain Resources to Test All Staff and/or Personnel Within Next 7 Days':'test_ava_staff',
                                          'Shortage of Nursing Staff': 'shortage_nurse','Shortage of Clinical Staff':'shortage_staff',
                                          'Number of Residents Staying in this Facility for At Least 1 Day This Week':'number_resident_staying_seven_days'})

In [None]:
# #missing data
# total = fclevel_df1.isnull().sum().sort_values(ascending=False)
# percent = (fclevel_df1.isnull().sum()/fclevel_df1.isnull().count()).sort_values(ascending=False)
# missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
# missing_data.head(20)

In [None]:
fclevel_df1['test_ava_resident']== 'Y'

### We have observed large proportion of missing value in:


*   Number of Residents Staying in this Facility for At Least 1 Day This Week: 57%
### We have observed small proportion of missing value in:
*   Able to Test or Obtain Resources to Test All Staff and/or Personnel Within Next 7 Days: 14%
*   Able to Test or Obtain Resources to Test All Current Residents Within Next 7 Days: 14%
*   Shortage of Clinical Staff: ~2%	
*   Shortage of Nursing Staff: ~2%
*   Number of All Beds: ~1%
*   Total Number of Occupied Beds: ~1%



In [None]:
#Check duplicate rows
fclevel_df1.duplicated().sum()

In [None]:
#Exctracting duplicate rows
fclevel_df1.loc[fclevel_df1.duplicated(),:]
#After investgation the duplication is nothing could harm our data 

In [None]:
#Willl not Drop Duplication Rows
# fclevel_df1.drop_duplicates(inplace=True)

In [None]:
# Delete ',', '.', "(X)", spaces and 'NaN ' from every string cell of the
# facility frame; non-string cells are left untouched.
import re
for token in (",", ".", "(X)", "(X)", ' ', 'NaN '):
    for c in fclevel_df1.columns:
        fclevel_df1[c] = fclevel_df1[c].apply(
            lambda x, token=token: x.replace(token, "") if isinstance(x, str) else x
        )

# socio["EST_1418"].replace(' ','')
# socio["EST_1418"].str.contains(r'\S+').sum()
# socio.head()

In [None]:
# Replace the literal string "NaN" in number_of_beds with a real missing value
fclevel_df1["number_of_beds"] = fclevel_df1["number_of_beds"].map(
    lambda bed_count: np.nan if bed_count == "NaN" else bed_count
)

# NOTE(review): a column filled with itself is unchanged, so missing values remain.
fclevel_df1["number_of_beds"] = fclevel_df1["number_of_beds"].fillna(fclevel_df1["number_of_beds"])

# Special characters list (not used in this cell)
special_characters = ['!','#','$','%', '&','@','[',']',' ',']','_','-']
# Keep the rows whose bed count is not the string "NaN"
fclevel_df1 = fclevel_df1.loc[fclevel_df1["number_of_beds"] != "NaN"]

In [None]:
fclevel_df1.head()

In [None]:
# Check NA: count real missing values per column.
# Comparing with the string 'NaN' only finds cells that literally contain that
# text and never matches an actual missing value.
fclevel_df1.isna().sum()

In [None]:
fclevel_df1.head()

###  faclevel Feature 

In [None]:
# Yes/No profiling of the facility flags: 'Y' -> 1, 'N' -> 0, anything else unchanged
#   test_ava_resident : testing available for residents
#   test_ava_staff    : testing available for staff
#   shortage_nurse    : shortage of nursing staff
#   shortage_staff    : shortage of clinical staff
YES_NO_CODES = {'Y': 1, 'N': 0}
FLAG_COLUMNS = ['test_ava_resident', 'test_ava_staff', 'shortage_nurse', 'shortage_staff']

# The columns are rebuilt and assigned as a whole. The chained form
# `df[col].loc[mask] = value` writes into a temporary Series and is not
# guaranteed to reach the frame.
fclevel_df1 = fclevel_df1.assign(**{
    flag_col: fclevel_df1[flag_col].map(lambda answer: YES_NO_CODES.get(answer, answer))
    for flag_col in FLAG_COLUMNS
})

In [None]:
#Shortage in test resources 
# fclevel_df1 = fclevel_df1[(fclevel_df1['test_ava_resident'] == 0) & (fclevel_df1['test_ava_staff'] == 0)]
fclevel_df1.head()

In [None]:
# Shortage in human resources
# fclevel_df1 = fclevel_df1[(fclevel_df1['shortage_nurse'] == 1) & (fclevel_df1['shortage_staff'] == 1)]
fclevel_df1

In [None]:
# Feature: number of records and average number of beds per zip code
beds_avg = (
    fclevel_df1
    .groupby('zipcode')['number_of_beds']
    .agg(beds_occurance='count', beds_avg='mean')
    .reset_index()
)
beds_avg

In [None]:
# Feature: number of records and average number of occupied beds per zip code
beds_occ_avg = (
    fclevel_df1
    .groupby('zipcode')['total_occu_beds']
    .agg(beds_ocup_occurance='count', beds_occ_avg='mean')
    .reset_index()
)
beds_occ_avg

In [None]:
# Feature: number of records and average of number_resident_staying_seven_days per zip code
avg_number_resident_staying_seven_days = (
    fclevel_df1
    .groupby('zipcode')['number_resident_staying_seven_days']
    .agg(resi_staying_occurance='count', resi_staying_avg='mean')
    .reset_index()
)
avg_number_resident_staying_seven_days

In [None]:
fclevel_df1.head()

In [None]:
(fclevel_df1 == '').sum()

### Merge 2

In [None]:
# Merge bed average into df1

# Both join keys are converted to integer
df1['FIPS']=df1['FIPS'].astype(int)
beds_avg['zipcode']=beds_avg['zipcode'].astype(int)

# NOTE(review): this matches df1's FIPS values against facility zip codes.
# If FIPS holds county codes, equal numbers do not denote the same area - confirm the key.
# The right-hand key is renamed to FIPS so no redundant zipcode column is added.
df1 = pd.merge(df1, beds_avg.rename(columns={'zipcode': 'FIPS'}), how='left', on='FIPS')

In [None]:
# Merge beds_occ_avg into df1

# Both join keys are converted to integer
df1['FIPS']=df1['FIPS'].astype(int)
beds_occ_avg['zipcode']=beds_occ_avg['zipcode'].astype(int)

# NOTE(review): this matches df1's FIPS values against facility zip codes.
# If FIPS holds county codes, equal numbers do not denote the same area - confirm the key.
# The right-hand key is renamed to FIPS so no redundant zipcode column is added.
df1 = pd.merge(df1, beds_occ_avg.rename(columns={'zipcode': 'FIPS'}), how='left', on='FIPS')

In [None]:
# Merge avg_number_resident_staying_seven_days into df1

# Both join keys are converted to integer
df1['FIPS']=df1['FIPS'].astype(int)
avg_number_resident_staying_seven_days['zipcode']=avg_number_resident_staying_seven_days['zipcode'].astype(int)

# NOTE(review): this matches df1's FIPS values against facility zip codes.
# If FIPS holds county codes, equal numbers do not denote the same area - confirm the key.
# The right-hand key is renamed to FIPS so no redundant zipcode column is added.
df1 = pd.merge(
    df1,
    avg_number_resident_staying_seven_days.rename(columns={'zipcode': 'FIPS'}),
    how='left',
    on='FIPS',
)

In [None]:
df1.FIPS.unique().shape

In [None]:
df1.dropna().shape

In [None]:
df1.dropna()

In [None]:
df1.dropna()

In [None]:
df1.duplicated().sum()

In [None]:
df1.isnull()

In [None]:
df1.dropna(how='all')

In [None]:
df1.isnull().all()

In [None]:
df1.dropna()

## plc_df1

In [None]:
plc_df1.head()

### health_socio Feature 


#### Condition used for health profiling
*   Diagnosed diabetes among adults aged >=18 years
*   Coronary heart disease among adults aged >=18 years
*   Cholesterol screening among adults aged >=18 years
*   Binge drinking among adults aged >= 18 years
*   Taking medicine for high blood pressure control among adults aged >=18 years with high blood pressure






In [None]:
# New column 'risk_factor': 1 for rows whose Measure is one of the
# heart-disease related measures below, '' for every other row.
# NOTE(review): the strings must match the Measure column exactly (including
# the space in '>= 18') - confirm against the PLACES files.
RISK_MEASURES = [
    'Coronary heart disease among adults aged >=18 years',
    'Diagnosed diabetes among adults aged >=18 years',
    'Cholesterol screening among adults aged >=18 years',
    'Binge drinking among adults aged >= 18 years',
    'Taking medicine for high blood pressure control among adults aged >=18 years with high blood pressure',
]

# The whole column is assigned at once. The chained form
# `df[col].loc[mask] = 1` writes into a temporary Series and is not guaranteed
# to reach the frame.
plc_df1['risk_factor'] = plc_df1['Measure'].isin(RISK_MEASURES).map({True: 1, False: ''})

In [None]:
#Rename and use lowercapital letters
plc_df2 = plc_df1.rename(columns={'risk_factor': 'risk_factor', 'Year': 'year','LocationID':'locationid'})

In [None]:
# Keep only the three columns needed for the merge
plc_df2 = plc_df2.loc[:, ['risk_factor', 'year', 'locationid']]

# NOTE(review): the selection below is evaluated and then discarded, so no year
# filter is applied to plc_df2; it also compares year with the string '2015'.
plc_df2.loc[plc_df2['year'] == '2015', 'year']
plc_df2.head()


In [None]:
# #missing data
# total = plc_df2.isnull().sum().sort_values(ascending=False)
# percent = (plc_df2.isnull().sum()/plc_df2.isnull().count()).sort_values(ascending=False)
# missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
# missing_data.head(20)
plc_df2.shape

In [None]:
# Count missing values per column. Comparing against the string 'NaN' only
# finds cells holding that literal text, not real missing entries.
plc_df2.isna().sum()

### Socio Econ Feature


*   Current lack of health insurance among adults aged 18-64 years



In [None]:
df1.columns

### Merge 3

In [None]:
# Show the rows recorded for year 2019
print(plc_df2.loc[plc_df2['year'] == 2019])

In [None]:
# Coerce 'locationid' to numeric (non-numeric entries become NaN), drop the
# rows that could not be parsed, and store the ids as integers so they can be
# matched against df1['FIPS'].
plc_df2 = (
    plc_df2
    .assign(locationid=pd.to_numeric(plc_df2['locationid'], errors='coerce'))
    .dropna(subset=['locationid'])
    .astype({'locationid': int})
)
assert plc_df2['locationid'].notna().all()

# Left-merge the risk-factor rows onto df1 (df1.FIPS == plc_df2.locationid).
# NOTE(review): if plc_df2 holds several rows per locationid (it keeps
# 'risk_factor' and 'year' per source row), this merge repeats every matching
# df1 row once per plc_df2 row - confirm that this multiplication is intended.
df1 = pd.merge(df1, plc_df2, how='left', left_on=['FIPS'], right_on=['locationid'])
df1.head()

In [None]:
df1['beds_avg'].isna().all()

In [None]:
plc_df2[plc_df2["locationid"] == 12105]

In [None]:
# Optional check for rows without bed data:
# print(df1[df1['beds_avg'].isnull()])
# Cap the frame at its first 1,000,000 rows for computation purposes
df1 = df1.head(1000000)

# Final Dataset

In [None]:
df1.shape

In [None]:
df1.columns

In [None]:
# Columns kept for the final dataset
FINAL_COLUMNS = [
    'resident_status', 'sex', 'age', 'place_of_death', 'marital_status',
    'current_data_year', 'manner_of_death', 'autopsy', 'education', 'race',
    'FIPS', 'quarter', 'heart_disease',
    'employed_avg_pop', 'unemployed_avg_pop', 'avg_Income',
    'beds_avg', 'beds_occ_avg', 'resi_staying_avg',
    'risk_factor', 'no_health_avg_pop',
]
df1 = df1.loc[:, FINAL_COLUMNS]

In [None]:
df1.columns

In [None]:
# Count missing values per column; a comparison with the string 'NaN' would
# only match cells containing that literal text.
df1.isna().sum()

In [None]:
df.isna().sum()

In [None]:
df = df1

In [None]:
# Imputation: replace missing 'employed_avg_pop' values with the column mean
# (the mean itself is computed over the non-missing entries).
df["employed_avg_pop"] = df["employed_avg_pop"].fillna(df["employed_avg_pop"].mean())

# Remove the bed / occupancy columns. errors='ignore' keeps the cell
# re-runnable once the columns are already gone.
df = df.drop(columns=["beds_avg", 'beds_occ_avg', 'resi_staying_avg'], errors='ignore')


In [None]:
#missing data
# total = df1.isnull().sum().sort_values(ascending=False)
# percent = (df1.isnull().sum()/df1.isnull().count()).sort_values(ascending=False)
# missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
# missing_data.head(20)

# df1 = df1.dropna()

In [None]:
# print(df1.dropna(axis=0, how='all'))
df1['age']

# Prepare training and test data

In [None]:
# Rename the target column 'heart_disease' to 'disease'
df = df.rename({'heart_disease': 'disease'}, axis='columns')

In [None]:
df['age'].head()

In [None]:
# One-hot encode the categorical columns, in this order. Each column is cast
# to str first, so every distinct value (missing values included, once
# stringified) gets its own indicator column named '<column>_<value>'. The
# source column is dropped and its indicators are appended on the right.
CATEGORICAL_COLUMNS = [
    'marital_status',
    'resident_status',
    'place_of_death',
    'manner_of_death',
    'sex',
    'race',
    'education',
    'quarter',
    'autopsy',
    'risk_factor',
]

for column in CATEGORICAL_COLUMNS:
    indicators = pd.get_dummies(df[column].astype(str), prefix=column)
    df = pd.concat([df.drop(columns=[column]), indicators], axis=1)

In [None]:
df.columns

In [None]:
df['age']==48

In [None]:
# Drop FIPS and the indicator columns judged to have low significance
LOW_SIGNIFICANCE_COLUMNS = [
    'FIPS',
    'marital_status_S',
    'resident_status_4',
    'place_of_death_2',
    'place_of_death_4',
    'place_of_death_5',
    'place_of_death_7',
    'place_of_death_9',
    'manner_of_death_Accidents',
    'manner_of_death_5.0',
]
df = df.drop(columns=LOW_SIGNIFICANCE_COLUMNS)

In [None]:
# Cast every column to 32-bit floats, then list the resulting dtypes
df = df.astype('float32')
df.dtypes

In [None]:
df['age'].head()

In [None]:

# Move the two label columns to the end of the frame: pull their values out,
# drop the columns, then append them again as disease followed by age.
disease, age = (df[label].to_numpy() for label in ("disease", "age"))
df.drop(columns=["disease", "age"], inplace=True)
df["disease"], df["age"] = disease, age

In [None]:
df

In [None]:
df['age']

# Models

# Multi-Task Learning (MTL) - Disease & Age

## Import

In [None]:
import numpy as np
from fastai import *
from fastai.vision import *
from fastai.layers import MSELossFlat, CrossEntropyFlat 
import matplotlib.pyplot as plt
import pandas as pd


## Artificial dataset (to replace with the real dataset)

### Import data

In [None]:
df.columns

In [None]:
df.isnull().sum()

In [None]:
# Give the long high-school indicator column a short name
df = df.rename({'education_High school graduate (includes equivalency)': 'high_school'}, axis='columns')

In [None]:
df.columns

In [None]:
# Expected layout of df: all feature columns first, then "disease" and "age"
# as the last two columns.

# Class index -> human-readable disease label
dic_disease = dict(enumerate(["No heart disease", "Heart disease"]))

# Number of distinct values in the disease column
nb_diseases = np.unique(df["disease"]).size

from sklearn.model_selection import train_test_split

# 80 / 20 shuffled split with a fixed seed
df_train, df_valid = train_test_split(
    df,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

# Everything except the two trailing label columns is a feature
nb_features = df.shape[1] - 2
df.head(5)

### Torch Dataset object

In [None]:
class MultiTaskDataset(Dataset):
  """Dataset yielding ``(features, (disease, scaled_log_age))`` for one row.

  The frame must hold the feature columns first, followed by the disease
  column and then the age column.

  Args:
    df: DataFrame laid out as described above.
    n_features: number of leading feature columns. Defaults to every column
      except the last two.
    disease_names: optional mapping from class index to label used by
      ``show``. When omitted, the module-level ``dic_disease`` is used.
  """

  # exp(4.75) is about 115.6, so log(age) / 4.75 lies in [0, 1] for ages 1..115
  AGE_LOG_SCALE = 4.75

  def __init__(self, df, n_features=None, disease_names=None):
    self.df = df
    # Take the values once here; evaluating df.values inside __getitem__
    # may rebuild the whole array for every single sample.
    self.data = df.values
    self.n_features = df.shape[1] - 2 if n_features is None else n_features
    self.disease_names = disease_names

  def __len__(self):
    return len(self.df)

  def __getitem__(self, idx):
    """Return the feature tensor and the (disease, scaled log-age) targets."""
    row = self.data[idx]
    x = torch.tensor(row[:self.n_features], dtype=torch.float32)
    disease = torch.tensor(int(row[self.n_features]), dtype=torch.int64)
    age = torch.tensor(float(row[self.n_features + 1]), dtype=torch.float32)

    # Ages below 1 are clamped to 1 first: log(0) would give -inf and turn
    # the regression loss into inf/nan.
    age = age.clamp_(min=1.0).log_().div(self.AGE_LOG_SCALE)
    return x, (disease, age)

  def show(self, idx):
    """Print the disease label and the age (in years) stored at ``idx``."""
    x, y = self[idx]
    disease, age = y
    names = dic_disease if self.disease_names is None else self.disease_names
    # Round rather than truncate: decoding log(48) in float32 can land just
    # below 48 and would otherwise print 47.
    years = int(round(age.mul(self.AGE_LOG_SCALE).exp().item()))
    print("Disease: {}, Age: {}".format(names[disease.item()], years))

### Create the DataBunch

In [None]:
# Settings shared by the training and validation DataLoaders
batch_size = 128  # samples per mini-batch
num_workers = 2  # worker count passed to each DataLoader

In [None]:
# Wrap the two splits in datasets and loaders, then bundle them for fastai
train_ds = MultiTaskDataset(df_train)
valid_ds = MultiTaskDataset(df_valid)
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True,
                      num_workers=num_workers)
# Validation batches are only scored, so their order is kept fixed
valid_dl = DataLoader(valid_ds, batch_size=batch_size, shuffle=False,
                      num_workers=num_workers)
data = DataBunch(train_dl, valid_dl)

Example: retrieve the information of a patient by its index in the training set

In [None]:
train_ds.show(1)

In [None]:
train_ds.__getitem__(0)

## Multi-Task Model

### Model

In [None]:
class MultiTaskModel(nn.Module):
  """Shared two-layer trunk with one output head per task.

  Args:
    nb_diseases: number of disease classes (width of the classification head).
    n_features: size of the input feature vector. Defaults to the
      module-level ``nb_features``.

  ``forward`` returns ``[disease_logits, age]``: raw class scores of shape
  (batch, nb_diseases) and a sigmoid-squashed age estimate of shape (batch, 1).
  """

  def __init__(self, nb_diseases, n_features=None):
    super(MultiTaskModel, self).__init__()
    if n_features is None:
      n_features = nb_features  # notebook-level default

    self.linear1 = torch.nn.Linear(n_features, 200)
    self.activation = torch.nn.ReLU()
    # NOTE(review): no activation follows linear2, so linear2 and each head
    # compose into a single affine map - confirm this is intended.
    self.linear2 = torch.nn.Linear(200, 10)

    # one head per task
    self.fc1 = torch.nn.Linear(10, nb_diseases)
    self.fc2 = torch.nn.Linear(10, 1)

  def forward(self, x):
    x = self.linear1(x)
    x = self.activation(x)
    x = self.linear2(x)

    # Raw logits: the cross-entropy loss applies its own log-softmax, so
    # squashing the scores with a sigmoid first would cap how confident the
    # classifier can become. argmax-based predictions are unaffected.
    disease = self.fc1(x)
    age = torch.sigmoid(self.fc2(x))  # scaled log-age between 0 and 1
    return [disease, age]

### Loss

In [None]:
class MultiTaskLossWrapper(nn.Module):
  """Sum of the disease and age losses, each scaled by a learnable term.

  Task ``i`` contributes ``exp(-log_vars[i]) * loss_i + log_vars[i]``.
  ``log_vars`` holds ``task_num`` entries initialised to zero; only entries
  0 (disease) and 1 (age) are read.
  """

  def __init__(self, task_num):
    super(MultiTaskLossWrapper, self).__init__()
    self.task_num = task_num
    # NOTE(review): this parameter belongs to the loss module, not to the
    # model - confirm the optimizer actually receives it.
    self.log_vars = nn.Parameter(torch.zeros((task_num)))

  def _weigh(self, task, raw_loss):
    # Scale one task loss by exp(-log_var) and add log_var as a penalty
    log_var = self.log_vars[task]
    return torch.exp(-log_var) * raw_loss + log_var

  def forward(self, preds, disease, age):
    disease_term = self._weigh(0, CrossEntropyFlat()(preds[0], disease))
    age_term = self._weigh(1, MSELossFlat()(preds[1], age))
    return disease_term + age_term

### Define the metrics and build the Learner

In [None]:
def acc_disease(preds, disease, age):
  """Accuracy of the disease head (``preds[0]``); ``age`` is unused."""
  disease_scores = preds[0]
  return accuracy(disease_scores, disease)


def rmse_age(preds, disease, age):
  """Root mean squared error of the age head (``preds[1]``); ``disease`` is unused."""
  age_estimates = preds[1]
  return root_mean_squared_error(age_estimates, age)

# One metric per task: disease accuracy and age RMSE
metrics = [acc_disease, rmse_age]

# The model sizes its classification head from the number of disease classes
model = MultiTaskModel(nb_diseases)

task_num = 2  # number of tasks (disease, age)
loss_func = MultiTaskLossWrapper(task_num).to(data.device)  # keep the loss module on the same device as the data

# Learner ties together the data, the model, the combined loss and the metrics
learn = Learner(data, model, loss_func=loss_func,
                callback_fns=ShowGraph, metrics=metrics)

# Split the model into three layer groups (linear1, linear2, both heads)
# so that discriminative learning rates can be used
learn.split([learn.model.linear1,
             learn.model.linear2,
             nn.ModuleList([learn.model.fc1, learn.model.fc2])]);

# Train only the heads first (last layer group)
learn.freeze()

## Fit

### Find learning rate

In [None]:
# Run the learning-rate finder, then plot what the recorder captured
learn.lr_find()
learn.recorder.plot()

In [None]:
learn.recorder.losses

In [None]:
# Learning rates chosen by eye from the finder plot above.
# NOTE(review): `lr` is not referenced by any cell visible here; only `max_lr`
# is passed to fit_one_cycle.
lr = 1e-5
max_lr = 1e-2

### Fit one cycle

In [None]:
n_epoch = 15

# Checkpoint under the name "stage-1" whenever the validation loss improves
stage1_checkpoint = callbacks.SaveModelCallback(
    learn,
    every="improvement",
    monitor="valid_loss",
    name="stage-1",
)
learn.fit_one_cycle(n_epoch, max_lr=max_lr, callbacks=[stage1_checkpoint])

### Adjust learning rate

In [None]:
learn.load("stage-1")

In [None]:
# Unfreeze the model, then rerun the learning-rate finder and plot its record
learn.unfreeze()
learn.lr_find()
learn.recorder.plot()

### Fit another cycle

In [None]:
# Settings for the second training cycle; max_lr is passed to fit_one_cycle.
# NOTE(review): presumably the slice spreads learning rates across the layer
# groups created by learn.split - confirm against the fastai documentation.
max_lr = slice(1e-6, 1e-1)
n_epoch = 30

In [None]:
learn.unfreeze()

# Checkpoint under the name "stage-2" whenever the validation loss improves
stage2_checkpoint = callbacks.SaveModelCallback(
    learn,
    every="improvement",
    monitor="valid_loss",
    name="stage-2",
)
learn.fit_one_cycle(n_epoch, max_lr=max_lr, callbacks=[stage2_checkpoint])

In [None]:
learn = learn.load("stage-2")

## Save model & make predictions

### Save/Load

In [None]:
# Move the trained network to the CPU (inference runs there) and save its weights
trained_model = learn.model.cpu()
torch.save(trained_model.state_dict(), "saved_model")

# To load the weights back into a fresh model:
#   trained_model = MultiTaskModel(nb_diseases)
#   trained_model.load_state_dict(torch.load("saved_model"))

### Create predictor object

In [None]:
class Predictor():
  """Turn one feature vector into a (disease label, age in years) prediction.

  Args:
    model: callable returning ``[disease_scores, scaled_log_age]`` for a
      batch of feature vectors.
    dic_disease: mapping from class index to disease label.
  """

  def __init__(self, model, dic_disease):
    self.model = model
    self.disease = dic_disease

  def predict(self, x):
    """Predict for a single sample.

    Args:
      x: 1-D sequence, array or tensor of feature values.

    Returns:
      Tuple ``(disease_label, age)`` with the age rounded to whole years.
    """
    x = torch.as_tensor(x, dtype=torch.float32)
    # Inference only: no autograd graph is needed
    with torch.no_grad():
      preds = self.model(x.unsqueeze(0))

    # argmax over the class scores; a softmax would not change the winner
    disease = self.disease[preds[0].argmax(dim=1).item()]
    # Undo the log(age) / 4.75 scaling. Round instead of truncating so that a
    # decoded value such as 47.9999 is reported as 48.
    age = int(round(torch.exp(preds[1] * 4.75).item()))

    return disease, age

In [None]:
predictor = Predictor(trained_model, dic_disease)

### Make predictions

In [None]:
patient_id = 14  # positional row index into df
# Keep only the feature columns of that row (the label columns come last)
unknown_patient = df.iloc[patient_id, :nb_features].to_numpy()
predictor.predict(unknown_patient)