In [1]:
from sklearn.ensemble import RandomForestRegressor
import pandas as pd
import seaborn as sns
from datetime import datetime
from sklearn.preprocessing import LabelEncoder

In [2]:
# Load the household transmission case data; the first CSV column becomes the row index
transmission_df = pd.read_csv(
    "./Resources/household_transmission_case.csv",
    index_col=0,
)
# Show the dtype pandas inferred for every column
transmission_df.dtypes

device_id                    object
household_size                int64
total_household_children      int64
total_household_adults        int64
state                        object
profile_id_index             object
yyyymmdd_index                int64
profile_id_contact           object
yyyymmdd_contact              int64
serial_interval             float64
age_index                   float64
age_contact                 float64
secondary_transmission        int64
temperature_c_index         float64
dtype: object

In [3]:
# Parse the integer "yyyymmdd_index" values (e.g. 20210515) into datetimes.
# The column is cast to str once and parsed in a single vectorized call instead of
# invoking pd.to_datetime row by row through .apply, which is far slower on a large frame.
transmission_df['date_time'] = pd.to_datetime(
    transmission_df['yyyymmdd_index'].astype(str), format='%Y%m%d'
)

In [4]:
# Derive the ISO-8601 week number of each index date from the "date_time" column.
# The vectorized .dt accessor replaces the per-row lambda; the result is cast to int64
# so the column keeps a plain integer dtype rather than pandas' nullable UInt32.
transmission_df["week_num"] = (
    transmission_df["date_time"].dt.isocalendar().week.astype("int64")
)
transmission_df[["week_num"]]

Unnamed: 0,week_num
1,2
2,19
3,26
4,36
5,41
...,...
180377,21
180378,27
180379,35
180380,50


In [5]:
# Keep only the rows whose index case had a fever (index temperature of at least 38 °C).
# Rows with a missing temperature fail the comparison and are dropped as well.
# .copy() makes the result an independent frame, so the column assignments performed in
# the following cells do not raise pandas' SettingWithCopyWarning.
transmission_df = transmission_df[transmission_df["temperature_c_index"] >= 38].copy()
transmission_df

Unnamed: 0,device_id,household_size,total_household_children,total_household_adults,state,profile_id_index,yyyymmdd_index,profile_id_contact,yyyymmdd_contact,serial_interval,age_index,age_contact,secondary_transmission,temperature_c_index,date_time,week_num
2,000233CC-EDC5-4EFC-A5F8-2A5CE351D067,3,1,2,VA,fb44600f-ac7f-46a7-be65-aebc920d35e9,20210515,,0,,1.0,,0,38.1,2021-05-15,19
3,000233CC-EDC5-4EFC-A5F8-2A5CE351D067,3,1,2,VA,fb44600f-ac7f-46a7-be65-aebc920d35e9,20210629,,0,,2.0,,0,39.8,2021-06-29,26
4,000233CC-EDC5-4EFC-A5F8-2A5CE351D067,3,1,2,VA,fb44600f-ac7f-46a7-be65-aebc920d35e9,20210907,,0,,2.0,,0,38.7,2021-09-07,36
5,00025CC1-48AC-4499-93E4-83BB4749D074,5,2,1,NC,8a3e0dc4-df44-45a9-b6cd-c2b922c235b7,20211015,,0,,7.0,,0,38.2,2021-10-15,41
7,0003361B-CB1A-4794-87E9-0615432C687A,3,1,2,NJ,e1ff44fa-b1a6-4160-a0c0-95efd8aa21c8,20210717,,0,,3.0,,0,38.1,2021-07-17,28
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
180375,FFFDFDF4-DFB8-4185-8F31-E1A6E1B119BB,5,4,1,CA,a4bd0f29-e7b4-48b0-8424-83379a7a024a,20210330,,0,,1.0,,0,39.2,2021-03-30,13
180376,FFFDFDF4-DFB8-4185-8F31-E1A6E1B119BB,5,4,1,CA,a4bd0f29-e7b4-48b0-8424-83379a7a024a,20210508,,0,,1.0,,0,38.6,2021-05-08,18
180377,FFFDFDF4-DFB8-4185-8F31-E1A6E1B119BB,5,4,1,CA,a4bd0f29-e7b4-48b0-8424-83379a7a024a,20210524,,0,,1.0,,0,39.2,2021-05-24,21
180379,FFFEE544-73D6-4E11-B418-CCE420AA585F,4,2,2,SC,171ab6af-bdb0-4186-844f-16466e8b0f9d,20210902,,0,,0.0,,0,38.0,2021-09-02,35


In [6]:
# Make sure both age columns hold numeric values
# (the dtypes listing at load time already reports them as float64)
transmission_df = transmission_df.assign(
    age_index=pd.to_numeric(transmission_df["age_index"]),
    age_contact=pd.to_numeric(transmission_df["age_contact"]),
)

In [7]:
# Split the fever-only rows for the age imputation model:
#   train - rows where the index case's age is known (age_index is used as the target)
#   test  - rows where the contact's age is missing and therefore has to be predicted
train = transmission_df.loc[transmission_df["age_index"].notna()]
test = transmission_df.loc[transmission_df["age_contact"].isna()]

In [8]:
# Target: the known age of the index case
y = train.loc[:, "age_index"]

# Features: household composition plus the index case's temperature
X = train.loc[
    :,
    ["household_size", "total_household_adults", "total_household_children", "temperature_c_index"],
]

In [9]:
# Create the RandomForestRegressor used to impute missing contact ages.
# n_estimators=2000 builds 2000 trees; n_jobs=-1 uses every available CPU core.
# random_state pins the forest's randomness so a Restart & Run All reproduces
# the same predicted ages instead of slightly different values on every run.
rfr = RandomForestRegressor(n_estimators=2000, n_jobs=-1, random_state=42)

In [10]:
# Train the forest on the household/temperature features with age_index as the target
rfr.fit(X=X, y=y)

RandomForestRegressor(n_estimators=2000, n_jobs=-1)

In [11]:
# Predict an age for every row in `test` (rows whose age_contact is missing),
# using the same four feature columns the model was fitted on
predictedAges = rfr.predict(
    test.loc[
        :,
        ["household_size", "total_household_adults", "total_household_children", "temperature_c_index"],
    ]
)

In [12]:
# Start the "age_contact_pred" column as a copy of the recorded contact ages;
# its missing entries are filled with model predictions in the next step
transmission_df = transmission_df.assign(
    age_contact_pred=transmission_df["age_contact"]
)

In [13]:
# Replace the missing entries of "age_contact_pred" with the predicted ages.
# The mask selects the same rows, in the same order, as `test` (age_contact is null),
# so the predictions line up positionally with the rows being filled.
transmission_df.loc[
    transmission_df["age_contact_pred"].isna(),
    "age_contact_pred",
] = predictedAges

In [14]:
# Preview the first five rows of the dataframe, now including the predicted contact ages
transmission_df.head(n=5)

Unnamed: 0,device_id,household_size,total_household_children,total_household_adults,state,profile_id_index,yyyymmdd_index,profile_id_contact,yyyymmdd_contact,serial_interval,age_index,age_contact,secondary_transmission,temperature_c_index,date_time,week_num,age_contact_pred
2,000233CC-EDC5-4EFC-A5F8-2A5CE351D067,3,1,2,VA,fb44600f-ac7f-46a7-be65-aebc920d35e9,20210515,,0,,1.0,,0,38.1,2021-05-15,19,15.501666
3,000233CC-EDC5-4EFC-A5F8-2A5CE351D067,3,1,2,VA,fb44600f-ac7f-46a7-be65-aebc920d35e9,20210629,,0,,2.0,,0,39.8,2021-06-29,26,9.389346
4,000233CC-EDC5-4EFC-A5F8-2A5CE351D067,3,1,2,VA,fb44600f-ac7f-46a7-be65-aebc920d35e9,20210907,,0,,2.0,,0,38.7,2021-09-07,36,13.826325
5,00025CC1-48AC-4499-93E4-83BB4749D074,5,2,1,NC,8a3e0dc4-df44-45a9-b6cd-c2b922c235b7,20211015,,0,,7.0,,0,38.2,2021-10-15,41,6.2608
7,0003361B-CB1A-4794-87E9-0615432C687A,3,1,2,NJ,e1ff44fa-b1a6-4160-a0c0-95efd8aa21c8,20210717,,0,,3.0,,0,38.1,2021-07-17,28,15.501666


In [15]:
# Preserve the row-level secondary-transmission flag in its own column,
# because "secondary_transmission" itself is summed per group further below
transmission_df = transmission_df.assign(
    secondary_transmission_case=transmission_df["secondary_transmission"]
)

In [16]:
# Collapse the data to one row per index case (profile_id_index) and index date (date_time).
#   secondary_transmission      -> summed over all rows of the group
#   every other listed column   -> value taken from the group's first row
# The aggregation is named with the string 'sum' rather than the Python builtin `sum`:
# pandas maps the builtin to the same grouped sum, but passing the builtin is deprecated
# (FutureWarning since pandas 2.1), so the string form keeps the result and avoids the warning.
# profile_id_index is both a grouping key and an aggregated column; in the displayed output
# it appears at its position in this mapping, while date_time is inserted as the first column.
tranmission_groups_df = transmission_df.groupby(['profile_id_index', 'date_time'], as_index=False).agg({
    'secondary_transmission': 'sum',
    'secondary_transmission_case': 'first',
    'household_size' : 'first',
    'profile_id_index': 'first',
    'week_num': 'first',
    'state': 'first',
    'total_household_children': 'first',
    'total_household_adults': 'first',
    'temperature_c_index': 'first',
    'age_contact_pred': 'first',
    'age_index': 'first'
})
tranmission_groups_df

Unnamed: 0,date_time,secondary_transmission,secondary_transmission_case,household_size,profile_id_index,week_num,state,total_household_children,total_household_adults,temperature_c_index,age_contact_pred,age_index
0,2021-11-19,0,0,3,0002d79f-2ace-42ef-a2d5-d9a4853665ef,46,CA,3,0,38.200000,5.090660,2.0
1,2021-12-26,0,0,4,00048cf0-165f-459c-8caa-e76487a763e8,51,MI,2,2,38.000000,15.007055,5.0
2,2021-08-28,0,0,2,0004d794-d96a-43b5-bccf-c92241af2b1c,34,TX,1,0,38.000000,3.432289,6.0
3,2021-11-17,0,0,2,0004d794-d96a-43b5-bccf-c92241af2b1c,46,TX,1,0,41.474998,5.311519,6.0
4,2021-11-09,0,0,3,0004f9b8-f795-4b3f-b132-f87972f254eb,45,NC,2,1,39.000000,8.611310,11.0
...,...,...,...,...,...,...,...,...,...,...,...,...
130615,2021-11-11,0,0,4,ffff8c1c-2260-41cf-bcc7-f3f851b730da,45,GA,2,2,38.400000,13.779814,0.0
130616,2021-11-15,0,0,4,ffff8c1c-2260-41cf-bcc7-f3f851b730da,46,GA,2,2,38.100000,14.642188,0.0
130617,2021-11-17,0,0,4,ffff8c1c-2260-41cf-bcc7-f3f851b730da,46,GA,2,2,38.100000,14.642188,0.0
130618,2021-12-03,0,0,4,ffff8c1c-2260-41cf-bcc7-f3f851b730da,48,OH,2,2,38.200000,14.658349,0.0


In [17]:
# Summary statistics for the index-case ages in the grouped frame
tranmission_groups_df["age_index"].to_frame().describe()

Unnamed: 0,age_index
count,130356.0
mean,16.290083
std,19.148799
min,-1.0
25%,2.0
50%,7.0
75%,29.0
max,116.0


In [18]:
# Summary statistics for the contact ages (recorded values plus model predictions)
tranmission_groups_df["age_contact_pred"].to_frame().describe()

Unnamed: 0,age_contact_pred
count,130620.0
mean,16.637565
std,12.974282
min,0.0
25%,8.82909
50%,13.31358
75%,16.081912
max,102.0


In [19]:
# Bucket the index-case ages into three labelled groups.
# With right=False every bin is closed on the left and open on the right:
#   [-1, 17.9999) -> 1,  [17.9999, 64.999999) -> 2,  [64.999999, 150) -> 3
# so whole-number ages up to 17 get label 1, ages 18-64 get label 2 and ages 65+ get label 3.
# NOTE(review): the -1 lower edge also puts an age_index of -1 (the minimum reported by
# describe()) into group 1 - confirm that -1 is not a missing-value sentinel.
bins = [-1, 17.9999, 64.999999, 150]
labels = [1, 2, 3]
tranmission_groups_df['agegroup_index'] = pd.cut(
    x=tranmission_groups_df['age_index'],
    bins=bins,
    labels=labels,
    right=False,
)
tranmission_groups_df

Unnamed: 0,date_time,secondary_transmission,secondary_transmission_case,household_size,profile_id_index,week_num,state,total_household_children,total_household_adults,temperature_c_index,age_contact_pred,age_index,agegroup_index
0,2021-11-19,0,0,3,0002d79f-2ace-42ef-a2d5-d9a4853665ef,46,CA,3,0,38.200000,5.090660,2.0,1
1,2021-12-26,0,0,4,00048cf0-165f-459c-8caa-e76487a763e8,51,MI,2,2,38.000000,15.007055,5.0,1
2,2021-08-28,0,0,2,0004d794-d96a-43b5-bccf-c92241af2b1c,34,TX,1,0,38.000000,3.432289,6.0,1
3,2021-11-17,0,0,2,0004d794-d96a-43b5-bccf-c92241af2b1c,46,TX,1,0,41.474998,5.311519,6.0,1
4,2021-11-09,0,0,3,0004f9b8-f795-4b3f-b132-f87972f254eb,45,NC,2,1,39.000000,8.611310,11.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
130615,2021-11-11,0,0,4,ffff8c1c-2260-41cf-bcc7-f3f851b730da,45,GA,2,2,38.400000,13.779814,0.0,1
130616,2021-11-15,0,0,4,ffff8c1c-2260-41cf-bcc7-f3f851b730da,46,GA,2,2,38.100000,14.642188,0.0,1
130617,2021-11-17,0,0,4,ffff8c1c-2260-41cf-bcc7-f3f851b730da,46,GA,2,2,38.100000,14.642188,0.0,1
130618,2021-12-03,0,0,4,ffff8c1c-2260-41cf-bcc7-f3f851b730da,48,OH,2,2,38.200000,14.658349,0.0,1


In [20]:
# Bucket the contact ages (recorded or predicted) into the same three labelled groups
# used for the index cases. With right=False every bin is left-closed / right-open:
#   [-1, 17.9999) -> 1,  [17.9999, 64.999999) -> 2,  [64.999999, 150) -> 3
# Predicted ages are fractional, so e.g. 17.5 lands in group 1 and 17.9999 or above in group 2.
bins = [-1, 17.9999, 64.999999, 150]
labels = [1, 2, 3]
tranmission_groups_df['agegroup_contact_pred'] = pd.cut(
    x=tranmission_groups_df['age_contact_pred'],
    bins=bins,
    labels=labels,
    right=False,
)
tranmission_groups_df

Unnamed: 0,date_time,secondary_transmission,secondary_transmission_case,household_size,profile_id_index,week_num,state,total_household_children,total_household_adults,temperature_c_index,age_contact_pred,age_index,agegroup_index,agegroup_contact_pred
0,2021-11-19,0,0,3,0002d79f-2ace-42ef-a2d5-d9a4853665ef,46,CA,3,0,38.200000,5.090660,2.0,1,1
1,2021-12-26,0,0,4,00048cf0-165f-459c-8caa-e76487a763e8,51,MI,2,2,38.000000,15.007055,5.0,1,1
2,2021-08-28,0,0,2,0004d794-d96a-43b5-bccf-c92241af2b1c,34,TX,1,0,38.000000,3.432289,6.0,1,1
3,2021-11-17,0,0,2,0004d794-d96a-43b5-bccf-c92241af2b1c,46,TX,1,0,41.474998,5.311519,6.0,1,1
4,2021-11-09,0,0,3,0004f9b8-f795-4b3f-b132-f87972f254eb,45,NC,2,1,39.000000,8.611310,11.0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
130615,2021-11-11,0,0,4,ffff8c1c-2260-41cf-bcc7-f3f851b730da,45,GA,2,2,38.400000,13.779814,0.0,1,1
130616,2021-11-15,0,0,4,ffff8c1c-2260-41cf-bcc7-f3f851b730da,46,GA,2,2,38.100000,14.642188,0.0,1,1
130617,2021-11-17,0,0,4,ffff8c1c-2260-41cf-bcc7-f3f851b730da,46,GA,2,2,38.100000,14.642188,0.0,1,1
130618,2021-12-03,0,0,4,ffff8c1c-2260-41cf-bcc7-f3f851b730da,48,OH,2,2,38.200000,14.658349,0.0,1,1


In [21]:
# Encode each state abbreviation as an integer code so it can be used as a model feature.
# LabelEncoder numbers the distinct values in sorted order (e.g. CA -> 4, TX -> 43 in the output).
tranmission_groups_df['state_encode'] = LabelEncoder().fit_transform(
    tranmission_groups_df['state']
)
tranmission_groups_df

Unnamed: 0,date_time,secondary_transmission,secondary_transmission_case,household_size,profile_id_index,week_num,state,total_household_children,total_household_adults,temperature_c_index,age_contact_pred,age_index,agegroup_index,agegroup_contact_pred,state_encode
0,2021-11-19,0,0,3,0002d79f-2ace-42ef-a2d5-d9a4853665ef,46,CA,3,0,38.200000,5.090660,2.0,1,1,4
1,2021-12-26,0,0,4,00048cf0-165f-459c-8caa-e76487a763e8,51,MI,2,2,38.000000,15.007055,5.0,1,1,22
2,2021-08-28,0,0,2,0004d794-d96a-43b5-bccf-c92241af2b1c,34,TX,1,0,38.000000,3.432289,6.0,1,1,43
3,2021-11-17,0,0,2,0004d794-d96a-43b5-bccf-c92241af2b1c,46,TX,1,0,41.474998,5.311519,6.0,1,1,43
4,2021-11-09,0,0,3,0004f9b8-f795-4b3f-b132-f87972f254eb,45,NC,2,1,39.000000,8.611310,11.0,1,1,27
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
130615,2021-11-11,0,0,4,ffff8c1c-2260-41cf-bcc7-f3f851b730da,45,GA,2,2,38.400000,13.779814,0.0,1,1,10
130616,2021-11-15,0,0,4,ffff8c1c-2260-41cf-bcc7-f3f851b730da,46,GA,2,2,38.100000,14.642188,0.0,1,1,10
130617,2021-11-17,0,0,4,ffff8c1c-2260-41cf-bcc7-f3f851b730da,46,GA,2,2,38.100000,14.642188,0.0,1,1,10
130618,2021-12-03,0,0,4,ffff8c1c-2260-41cf-bcc7-f3f851b730da,48,OH,2,2,38.200000,14.658349,0.0,1,1,35


In [22]:
# Count the distinct values in the "state" column of the grouped frame
tranmission_groups_df['state'].nunique()

51

In [23]:
# Write the grouped frame to CSV without the row index.
# Note the file is named transmission_df.csv although it holds tranmission_groups_df.
tranmission_groups_df.to_csv(
    path_or_buf="./Resources/transmission_df.csv",
    index=False,
)

In [24]:
# Number of grouped rows per state, listed from most to least frequent
tranmission_groups_df['state'].value_counts()

CA    17694
TX    12739
PA     6456
FL     6444
NY     6197
OH     4163
GA     4106
IL     3601
MI     3586
NJ     3470
CT     3459
NC     3390
VA     3208
MA     3203
IN     2560
MN     2493
TN     2411
AZ     2152
MO     2139
MD     1956
WI     1883
WA     1875
SC     1867
CO     1865
LA     1768
AL     1607
KY     1480
KS     1402
OR     1351
AR     1277
OK     1099
ID      951
UT      898
MS      831
NE      810
NV      778
IA      708
WV      629
NH      576
HI      440
DE      373
RI      369
ME      325
NM      292
MT      277
DC      264
AK      240
ND      202
VT      154
WY      135
SD      126
Name: state, dtype: int64

In [25]:
# Sanity check: smallest age the model predicted for rows with a missing age_contact
predictedAges.min()

0.15337142857142858

In [26]:
# Sanity check: largest age the model predicted for rows with a missing age_contact
predictedAges.max()

72.76721666666667