In [1]:
from sklearn.cluster import AgglomerativeClustering, DBSCAN
from sklearn.preprocessing import StandardScaler
from pathlib import Path
from tqdm import tqdm
import pandas as pd
#import matplotlib.pyplot as plt
import numpy as np
import ast
import os
import sys

In [2]:
tqdm.pandas()

In [3]:
actions_df_path = Path("./actions_day_1.csv")

In [4]:
actions_df = pd.read_csv(actions_df_path)

In [5]:
def get_group_size(row):
    return max(ast.literal_eval(row))

gs = actions_df["groupSize"].progress_apply(get_group_size)

100%|██████████| 558860/558860 [00:05<00:00, 109143.14it/s]


In [6]:
duration = list(actions_df["end_time"] - actions_df["time"])

## Keep the duration on the same scale as gs, since the weight is going to be transformed to a probability this doesn't matter much

In [9]:
min_duration = min(duration); max_duration = max(duration)
min_gs = min(gs); max_gs = max(gs)

def normalize_duration(d):
    return (d - min_duration) / (max_duration - min_duration) * (max_gs - min_gs) + min_gs
duration_normalized = list(map(normalize_duration, duration))

In [13]:
actions_df.head(5)

Unnamed: 0.1,Unnamed: 0,person,time,container,end_time,groupSize,contactPeople,almostContactPeople
0,0,21,122385.0,sec_490124,124837.0,[1],[],
1,1,21,125232.0,sec_425380,126637.0,[1],[],
2,2,21,127214.0,home_5,139237.0,[1],[],
3,3,21,139380.0,PT!100110007:7!120200199-1_202458!120062097-1_...,139860.0,[2],[9437416],
4,4,21,140160.0,PT!100100180:180!120178341-1_235191!120178340-...,141059.0,[1],[],


In [14]:
from collections import defaultdict
action_risk = defaultdict(int)

for row_num, row in tqdm(actions_df.iterrows()):
    action_risk[row["container"]] += duration_normalized[row_num]
    action_risk[row["container"]] += gs[row_num]
    

558860it [00:46, 11893.39it/s]


In [15]:
max(list(action_risk.values()))

10173.393365380034

# Create timelines

In [17]:
timelines = {}

for group_id, group in tqdm(actions_df.groupby("person")):
    temp_line = []
    
    for row_id, row in group.iterrows():
        row_duration = normalize_duration(row["end_time"] - row["time"])
        row_group_size = get_group_size(row["groupSize"])
        temp_line.append({
            "container": row["container"],
            "duration": row_duration,
            "groupSize": row_group_size
        })
    
    if timelines.get(group_id, None):
        print(f"There is an existing {group_id}")
    
    timelines[group_id] = temp_line
    
    

100%|██████████| 108246/108246 [01:16<00:00, 1414.59it/s]


In [18]:
timelines[21]

[{'container': 'sec_490124', 'duration': 4.267621721955166, 'groupSize': 1},
 {'container': 'sec_425380', 'duration': 2.8723525772214553, 'groupSize': 1},
 {'container': 'home_5', 'duration': 17.022274046927798, 'groupSize': 1},
 {'container': 'PT!100110007:7!120200199-1_202458!120062097-1_202075_14:03:00',
  'duration': 1.6396649374137358,
  'groupSize': 2},
 {'container': 'PT!100100180:180!120178341-1_235191!120178340-1_235190_14:56:00',
  'duration': 2.1980391223644755,
  'groupSize': 1},
 {'container': 'sec_349262', 'duration': 5.569606396649374, 'groupSize': 1},
 {'container': 'PT!100100132:132!118429989-1_266853!118429989-1_266853_15:57:00',
  'duration': 2.0394555232973204,
  'groupSize': 1},
 {'container': 'PT!100110007:7!120062240-1_203260!120062240-1_203260_16:35:00',
  'duration': 1.5583741849507402,
  'groupSize': 2}]

# Action risk as a probability of the location, based on a person's actions

In [23]:
action_per_person = defaultdict(float)

for person_id, timeline in tqdm(timelines.items()):
    actions = []
    places  = []
    
    for action in timeline:
        actions.append(action["duration"] + action['groupSize'])
        places.append(action['container'])
    
    actions = np.array(actions)
    actions = np.power(actions, np.e) / np.sum(np.power(actions, np.e)) ##Softmax
    
    for i, place in enumerate(places):
        action_per_person[place] += actions[i]

100%|██████████| 108246/108246 [00:01<00:00, 58136.07it/s]


In [26]:
infections_file_path = Path("./infectionEvents.txt")

In [27]:
infections = pd.read_csv(infections_file_path, sep="\t")

In [28]:
infections.head(1)

Unnamed: 0,time,infector,infected,infectionType,date,groupSize,facility
0,284037.0,7861927,7874412,home_home,2020-01-26,8,home_3512319


In [29]:
infection_risks = []

for row_id, row in tqdm(infections.iterrows()):
    infection_risks.append(action_per_person[row["facility"]])

28248it [00:02, 14006.79it/s]


In [30]:
np.argmin(infection_risks), infections.iloc[60]

(83,
 time                 539424.0
 infector              5744755
 infected              5744756
 infectionType       home_home
 date               2020-01-29
 groupSize                   2
 facility         home_2594069
 Name: 60, dtype: object)

In [31]:
from collections import Counter

In [32]:
cnt = Counter(infection_risks)

In [33]:
duration_normalized[60], gs[60]

(3.709247537004426, 1)

In [34]:
infection_all = defaultdict(lambda : defaultdict(int))
infection_locations = defaultdict(int)

for row_id, row in tqdm(infections.iterrows()):
    infection_all[row["infected"]][row["facility"]] = 1
    infection_locations[row["facility"]] = 1

28248it [00:02, 11642.46it/s]


In [37]:
assert len(duration_normalized) == len(gs)

X = list(zip(duration_normalized, gs))
Y = []

for row_id, row in tqdm(actions_df.iterrows()):
    if infection_all[row["person"]][row["container"]] == 1:
        Y.append(1)
    else:
        Y.append(0)
X = np.array(X)
Y = np.array(Y)

558860it [00:42, 13173.57it/s]


In [38]:
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

In [39]:
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.ensemble import RandomForestRegressor

#gpr = GaussianProcessRegressor()
rfr = RandomForestRegressor(n_estimators=200)

In [40]:
rfr.fit(X_train, Y_train)

RandomForestRegressor(n_estimators=200)

In [41]:
from sklearn.metrics import f1_score

In [42]:
Y_predict = rfr.predict(X_test)

In [43]:
Y_pred = np.array(Y_predict)

Y_pred = Y_pred > 0.5

In [44]:
Y_pred = np.array(Y_pred, dtype=int)

In [45]:
f1_score(Y_test, Y_pred)

0.0931386056764949

In [46]:
infection_all[21]

defaultdict(int,
            {'sec_490124': 0,
             'sec_425380': 0,
             'home_5': 0,
             'PT!100110007:7!120200199-1_202458!120062097-1_202075_14:03:00': 0,
             'PT!100100180:180!120178341-1_235191!120178340-1_235190_14:56:00': 0,
             'sec_349262': 0,
             'PT!100100132:132!118429989-1_266853!118429989-1_266853_15:57:00': 0,
             'PT!100110007:7!120062240-1_203260!120062240-1_203260_16:35:00': 0})

In [50]:
result_prediction = []

for person_id, actions in tqdm(timelines.items()):
    risk = -1
    riskiest_index = -1
    
    for i, action in enumerate(actions):
        if action_per_person[action["container"]] > risk:
            riskiest_index = i
            risk = action_per_person[action["container"]]
        
    if infection_all[person_id][actions[riskiest_index]["container"]] == 1:
        result_prediction.append(1)
    else:
        result_prediction.append(0)

100%|██████████| 108246/108246 [00:00<00:00, 233566.87it/s]


# Accuracy

In [51]:
sum(result_prediction) / len(infections)

0.38593882752761255

1