In [68]:
from sklearn.cluster import AgglomerativeClustering, DBSCAN
from sklearn.preprocessing import StandardScaler
from pathlib import Path
from tqdm import tqdm
import pandas as pd
#import matplotlib.pyplot as plt
import numpy as np
import ast
import os
import sys

In [69]:
tqdm.pandas()

In [70]:
actions_df_path = Path("./actions_day_1.csv")

In [71]:
actions_df = pd.read_csv(actions_df_path)

In [72]:
def get_group_size(row):
    return max(ast.literal_eval(row))

gs = actions_df["groupSize"].progress_apply(get_group_size)

100%|██████████| 558860/558860 [00:05<00:00, 109460.92it/s]


In [73]:
mask = (np.array(gs) == 1)

In [20]:
min_gs = min(gs); max_gs = max(gs)

def normalize_gs(group_size):
    return (group_size - min_gs) / (max_gs - min_gs)

gs_normalized = list(map(normalize_gs, gs))

In [74]:
duration = list(actions_df["end_time"] - actions_df["time"])

In [81]:
min_duration = min(duration); max_duration = max(duration)

def normalize_duration(d):
    return ((d - min_duration) / (max_duration - min_duration)) * (max_gs - min_gs) + min_gs
duration_normalized = list(map(normalize_duration, duration))

In [83]:
max(duration_normalized)

85.0

In [19]:
#plt.plot(list(range(len(duration))), duration_normalized, '*')
#plt.show()

In [21]:
#plt.plot(list(range(len(gs))), gs_normalized, '*')
#plt.show()

In [75]:
actions_df.head(5)

Unnamed: 0.1,Unnamed: 0,person,time,container,end_time,groupSize,contactPeople,almostContactPeople
0,0,21,122385.0,sec_490124,124837.0,[1],[],
1,1,21,125232.0,sec_425380,126637.0,[1],[],
2,2,21,127214.0,home_5,139237.0,[1],[],
3,3,21,139380.0,PT!100110007:7!120200199-1_202458!120062097-1_...,139860.0,[2],[9437416],
4,4,21,140160.0,PT!100100180:180!120178341-1_235191!120178340-...,141059.0,[1],[],


In [85]:
from collections import defaultdict
action_risk = defaultdict(int)

for row_num, row in tqdm(actions_df.iterrows()):
    action_risk[row["container"]] += duration_normalized[row_num]
    action_risk[row["container"]] += gs[row_num]
    

558860it [00:47, 11808.20it/s]


In [86]:
max(list(action_risk.values()))

10173.393365380034

In [87]:
from collections import defaultdict
action_risk_average = defaultdict(int)

for row_num, row in tqdm(actions_df.iterrows()):
    action_risk_average[row["container"]] += (duration_normalized[row_num] + gs_normalized[row_num]) / 2
    

558860it [00:40, 13790.90it/s]


# Create timelines

In [104]:
timelines = {}

for group_id, group in tqdm(actions_df.groupby("person")):
    temp_line = []
    
    for row_id, row in group.iterrows():
        row_duration = normalize_duration(row["end_time"] - row["time"])
        row_group_size = normalize_gs(get_group_size(row["groupSize"]))
        temp_line.append({
            "container": row["container"],
            "duration": row_duration,
            "groupSize": get_group_size(row["groupSize"])
        })
    
    if timelines.get(group_id, None):
        print(f"There is an existing {group_id}")
    
    timelines[group_id] = temp_line
    
    

100%|██████████| 108246/108246 [01:26<00:00, 1251.44it/s]


In [105]:
timelines[21]

[{'container': 'sec_490124', 'duration': 4.267621721955166, 'groupSize': 1},
 {'container': 'sec_425380', 'duration': 2.8723525772214553, 'groupSize': 1},
 {'container': 'home_5', 'duration': 17.022274046927798, 'groupSize': 1},
 {'container': 'PT!100110007:7!120200199-1_202458!120062097-1_202075_14:03:00',
  'duration': 1.6396649374137358,
  'groupSize': 2},
 {'container': 'PT!100100180:180!120178341-1_235191!120178340-1_235190_14:56:00',
  'duration': 2.1980391223644755,
  'groupSize': 1},
 {'container': 'sec_349262', 'duration': 5.569606396649374, 'groupSize': 1},
 {'container': 'PT!100100132:132!118429989-1_266853!118429989-1_266853_15:57:00',
  'duration': 2.0394555232973204,
  'groupSize': 1},
 {'container': 'PT!100110007:7!120062240-1_203260!120062240-1_203260_16:35:00',
  'duration': 1.5583741849507402,
  'groupSize': 2}]

In [106]:
probs = np.array([1, 2, 3, 5, 7, 8, 8])

In [107]:
probs = np.power(probs, np.e) / np.sum(np.power(probs, np.e))
probs

array([0.00114274, 0.00752026, 0.02264115, 0.09077074, 0.22654947,
       0.32568782, 0.32568782])

In [108]:
timelines[21]

[{'container': 'sec_490124', 'duration': 4.267621721955166, 'groupSize': 1},
 {'container': 'sec_425380', 'duration': 2.8723525772214553, 'groupSize': 1},
 {'container': 'home_5', 'duration': 17.022274046927798, 'groupSize': 1},
 {'container': 'PT!100110007:7!120200199-1_202458!120062097-1_202075_14:03:00',
  'duration': 1.6396649374137358,
  'groupSize': 2},
 {'container': 'PT!100100180:180!120178341-1_235191!120178340-1_235190_14:56:00',
  'duration': 2.1980391223644755,
  'groupSize': 1},
 {'container': 'sec_349262', 'duration': 5.569606396649374, 'groupSize': 1},
 {'container': 'PT!100100132:132!118429989-1_266853!118429989-1_266853_15:57:00',
  'duration': 2.0394555232973204,
  'groupSize': 1},
 {'container': 'PT!100110007:7!120062240-1_203260!120062240-1_203260_16:35:00',
  'duration': 1.5583741849507402,
  'groupSize': 2}]

In [109]:
action_risks_prob = defaultdict(int)

for person_id, actions_list in tqdm(timelines.items()):
    probs = []
    actions = []
    
    for action in actions_list:
        probs.append(action["duration"] + action["groupSize"])
        actions.append(action["container"])
    
    probs = np.array(probs)
    
    probs = np.power(probs, np.e) / np.sum(np.power(probs, np.e))
    
    for i, action in enumerate(actions):
        action_risks_prob[action] += probs[i]

100%|██████████| 108246/108246 [00:01<00:00, 62156.28it/s]


In [110]:
action_risks_prob["sec_425380"]

0.4728262982084884

In [111]:
infections_file_path = Path("./infectionEvents.txt")

In [112]:
infections = pd.read_csv(infections_file_path, sep="\t")

In [113]:
infections.head(1)

Unnamed: 0,time,infector,infected,infectionType,date,groupSize,facility
0,284037.0,7861927,7874412,home_home,2020-01-26,8,home_3512319


In [114]:
infection_risks = []

for row_id, row in tqdm(infections.iterrows()):
    infection_risks.append(action_risk[row["facility"]])

28248it [00:02, 13725.72it/s]


In [115]:
np.argmin(infection_risks), infections.iloc[60]

(83,
 time                 539424.0
 infector              5744755
 infected              5744756
 infectionType       home_home
 date               2020-01-29
 groupSize                   2
 facility         home_2594069
 Name: 60, dtype: object)

In [116]:
from collections import Counter

In [117]:
cnt = Counter(infection_risks)

In [118]:
duration_normalized[60], gs_normalized[60]

(3.709247537004426, 0.0)

In [119]:
infection_all = defaultdict(lambda : defaultdict(int))
infection_locations = defaultdict(int)

for row_id, row in tqdm(infections.iterrows()):
    infection_all[row["infected"]][row["facility"]] = 1
    infection_locations[row["facility"]] = 1

28248it [00:02, 12239.08it/s]


In [120]:
assert len(duration_normalized) == len(gs_normalized)

X = list(zip(duration_normalized, gs_normalized))
Y = []

for row_id, row in tqdm(actions_df.iterrows()):
    if infection_all[row["person"]][row["container"]] == 1:
        Y.append(1)
    else:
        Y.append(0)
X = np.array(X)
Y = np.array(Y)

558860it [00:43, 12927.86it/s]


In [52]:
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

In [53]:
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.ensemble import RandomForestRegressor

#gpr = GaussianProcessRegressor()
rfr = RandomForestRegressor(n_estimators=200)

In [54]:
rfr.fit(X_train, Y_train)

RandomForestRegressor(n_estimators=200)

In [55]:
from sklearn.metrics import f1_score

In [56]:
Y_predict = rfr.predict(X_test)

In [57]:
Y_pred = np.array(Y_predict)

Y_pred = Y_pred > 0.5

In [58]:
Y_pred = np.array(Y_pred, dtype=int)

In [59]:
f1_score(Y_test, Y_pred)

0.09455150422847637

In [60]:
infection_all[21]

defaultdict(int,
            {'sec_490124': 0,
             'sec_425380': 0,
             'home_5': 0,
             'PT!100110007:7!120200199-1_202458!120062097-1_202075_14:03:00': 0,
             'PT!100100180:180!120178341-1_235191!120178340-1_235190_14:56:00': 0,
             'sec_349262': 0,
             'PT!100100132:132!118429989-1_266853!118429989-1_266853_15:57:00': 0,
             'PT!100110007:7!120062240-1_203260!120062240-1_203260_16:35:00': 0})

In [122]:
result_prediction = []

for person_id, actions in tqdm(timelines.items()):
    risk = -1
    riskiest_index = -1
    
    for i, action in enumerate(actions):
        if action_risks_prob[action["container"]] > risk:
            riskiest_index = i
        
    if infection_all[person_id][actions[riskiest_index]["container"]] == 1:
        result_prediction.append(1)
    else:
        result_prediction.append(0)

100%|██████████| 108246/108246 [00:00<00:00, 212067.62it/s]


In [123]:
sum(result_prediction) / len(infections)

0.1110875106202209

In [65]:
result_prediction_average = []

for person_id, actions in tqdm(timelines.items()):
    risk = -1
    riskiest_index = -1
    
    for i, action in enumerate(actions):
        if action_risks_prob[action["container"]] > risk:
            riskiest_index = i
        
    if infection_all[person_id][actions[riskiest_index]["container"]] == 1:
        result_prediction_average.append(1)
    else:
        result_prediction_average.append(0)

100%|██████████| 108246/108246 [00:00<00:00, 208333.41it/s]


In [66]:
sum(result_prediction_average) / len(infections)

0.11115831209289154

In [67]:
sum(result_prediction_average)

3140