In [None]:
import os
from datetime import date, timedelta
from timeit import default_timer as timer

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
# Root directory of the LTACH epidemiology data. Set the LTACH_EPI_DIR
# environment variable to point somewhere else (e.g. when the network volume
# is mounted at a different location); the default is the original mount point.
epi_dir = os.environ.get(
    "LTACH_EPI_DIR",
    '/Volumes/umms-esnitkin/Project_KPC_LTACH/Analysis/LTACH_transmission_modeling',
)

In [None]:
# read cleaned data:
def _read_preprocessed(stem):
    """Read `{epi_dir}/preprocessed/{stem}.csv` (first CSV column as index)
    and relabel its columns 0..366."""
    frame = pd.read_csv(f"{epi_dir}/preprocessed/{stem}.csv", index_col=0)
    frame.columns = np.arange(367)
    return frame


infections_cleaned = _read_preprocessed("infections")
df_facility = _read_preprocessed("facility_trace")
df_floor = _read_preprocessed("floor_trace")
df_room = _read_preprocessed("room_trace")

## Data Description

In [None]:
# Dimensions (rows, columns) of the cleaned infection table.
infections_cleaned.shape

In [None]:
# Total number of non-missing entries in the whole table.
infections_cleaned.count().sum()

In [None]:
# Summary statistics of the number of non-missing entries per row.
infections_cleaned.count(axis=1).describe()

### Number of Visits

In [None]:
# This cell used to evaluate a bare `r`, which is only bound as a loop
# variable in later cells and therefore raised NameError on a fresh kernel
# (Restart & Run All). Inspect a row explicitly instead; the last row is what
# `r` holds after iterating over the table with iterrows().
infections_cleaned.iloc[-1]

In [None]:
# Count each patient's visits and collect the length of every visit.
# A visit is a maximal run of consecutive non-NaN entries in a patient's row;
# its length is the number of columns in that run.
n_visits = {}
visit_lengths = []
# Label of the final column (366 for the 367-column table); None if there are
# no columns at all, in which case the loops below never compare against it.
last_t = infections_cleaned.columns[-1] if len(infections_cleaned.columns) else None
for i, r in infections_cleaned.iterrows():
    k = 0
    admitted = False
    entry_time = 0
    for t, v in r.items():
        present = not np.isnan(v)
        if present and not admitted:
            admitted = True
            k += 1
            entry_time = t
        if admitted and not present:
            # First absent column after a run: the run covered entry_time..t-1.
            admitted = False
            visit_lengths.append(t - entry_time)
        elif admitted and t == last_t:
            # Still present in the final column: that column belongs to the
            # visit, so the run covers entry_time..t inclusive. (Previously
            # this was recorded as t - entry_time, one column short, giving
            # length 0 to a visit that starts in the final column.)
            admitted = False
            visit_lengths.append(t - entry_time + 1)
    n_visits[i] = k
n_visits = pd.Series(n_visits)
visit_lengths = np.array(visit_lengths)

In [None]:
# Median visit length.
np.median(visit_lengths)

In [None]:
# Lower and upper quartiles of visit length.
np.quantile(visit_lengths, [.25, .75])

In [None]:
# Fraction of visits with length below 7.
np.mean(np.asarray(visit_lengths) < 7)

In [None]:
# Reference value 1/7 (about 0.143).
# NOTE(review): presumably meant for comparison with the visit-length figures in the neighbouring cells — confirm intent.
1/7

In [None]:
# Mean visit length.
np.asarray(visit_lengths).mean()

In [None]:
# Share of patients having each number of visits.
visit_count_freq = pd.Series(n_visits).value_counts()
visit_count_freq / visit_count_freq.sum()

In [None]:
# Total number of visits across all patients.
n_visits.sum()

#### mobility statistics

In [None]:
# For every row of df_floor, count how many times the floor code switches to a
# non-zero value different from the last non-zero value seen. Zeros never
# count and never reset the remembered floor.
n_floors = {}
for patient, floor_trace in df_floor.iterrows():
    entries = 0
    current_floor = 0
    for f in floor_trace.values:
        if f != current_floor and f != 0:
            entries += 1
            current_floor = f
    n_floors[patient] = entries
n_floors = pd.Series(n_floors)

In [None]:
# Cumulative count of patients by n_floors value, divided by 260.
# NOTE(review): 260 is hard-coded; presumably the number of patients (rows of df_floor) — confirm.
n_floors.value_counts().sort_index().cumsum() / 260 # more than half of patients move floors twice or more during their stay

In [None]:
# NOTE(review): in the visible part of this notebook `ftrace` is first assigned
# further down, in the "Data Preprocessing" section, so this cell raises
# NameError on a fresh kernel unless that section has been run first.
ftrace.head()

In [None]:
# Column by column, compare the set of rows marked present (> 0) in ftrace
# with the set marked present in df_floor, and report every mismatch.
for t in range(367):
    facility_day = ftrace[t]
    floor_day = df_floor.iloc[:, t]
    in_facility = set(facility_day[facility_day > 0].index)
    on_floor = set(floor_day[floor_day > 0].index)
    if in_facility == on_floor:
        continue
    print(t)
    problem = in_facility.symmetric_difference(on_floor)
    print(problem)
    # raw facility codes of one mismatched row for columns t..t+2
    print(df_ftrace.loc[list(problem)[0], t:t+2])
    print("***")
        

In [None]:
# Number of cells where exactly one of the two tables marks the patient present.
np.logical_xor(ftrace.values > 0, df_floor.values > 0).sum()

#### Room Trace

In [None]:
# Load the raw room trace. This rebinds `df_room`, replacing the preprocessed
# table loaded at the top of the notebook; the columns are not relabelled here.
df_room = pd.read_csv(f"{epi_dir}/2019-12-18_room_trace.csv", index_col=0)

In [None]:
# Sorted distinct values appearing anywhere in the room trace.
np.unique(df_room.values) # 95 possible rooms!

In [None]:
# Number of cells holding each distinct value, skipping the smallest value.
# NOTE(review): presumably the skipped value is 0 = "not in any room" — confirm.
np.unique(df_room.values, return_counts=True)[1][1:]

#### question: how many people share rooms, anyway?

# Data Preprocessing

In [None]:
# Load the raw facility trace, keep its original column labels in `dates`
# (presumably calendar dates — confirm), then relabel the columns 0..366.
df_ftrace = pd.read_csv(f"{epi_dir}/2019-12-18_facility_trace.csv", index_col=0)
dates = df_ftrace.columns
df_ftrace.columns = np.arange(367)

In [None]:
# facility trace
df_ftrace = pd.read_csv(f"{epi_dir}/2019-12-18_facility_trace.csv", index_col=0)
df_ftrace.columns = np.arange(367)

# recover facility trace
ftrace = (df_ftrace > 0).astype(int)

# recover infections
# how does this handle people who are 
infections = df_ftrace.replace(1, np.nan).ffill(axis=1)
infections = infections.replace({0:np.nan, 1.25:0, 1.5:1})

In [None]:
# how many infection events occur after admission?
new_infections = (infections.diff(axis=1) == 1).astype(int).sum(0)
print(new_infections.sum())
sns.lineplot(new_infections)
plt.show()

In [None]:
# how many infections happen upon admission?
# infections

In [None]:
# Per-column totals: Nt = rows marked present, I = sum of infection statuses,
# S = the remainder.
Nt = ftrace.sum(axis=0)
I = infections.sum(axis=0)
S = Nt - I
for series, label in ((Nt, "Total"), (S, "Susceptible"), (I, "Infected")):
    sns.lineplot(series, label=label)
plt.show()

In [None]:
# deal with: people who get infected twice!
# are there many?
# new goal: assume someone can't recover during a stay
# but they may leave and come back recovered (?)
infections_cleaned = infections.copy()
doubles = set()
for i, row in infections_cleaned.iterrows():
    infected = False
    for j, v in row.items():
        if infected:
            if v == 0:
                doubles.add(i)
                row[j] = 1
        if v == 1:
            infected = True
        if np.isnan(v):
            infected = False

print(list(doubles))

In [None]:
# Number of visits per row: a visit starts at every non-NaN entry that is
# either in the first column or directly preceded by a NaN.
n_visits = {}
for i, row in infections_cleaned.iterrows():
    present = row.notna().to_numpy()
    present_before = np.concatenate(([False], present[:-1]))
    n_visits[i] = int(np.sum(present & ~present_before))
n_visits = pd.Series(n_visits)

In [None]:
# Share of patients having each number of visits.
n_visits.value_counts().div(len(n_visits))

In [None]:
# Total number of visits across all rows.
n_visits.sum()

One possible issue: the data make it look as if we start with an unusually high number of infections. We might mitigate this by starting 100 days into the year, which seems easier than modeling a time-varying pre-admission colonization rate.

Or, we hardcode initial infections (back to the original plan)

### Understanding precolonization

In [None]:
# multiple visits
first_test = {}
i = 0
for _, r in df_ftrace.reset_index(drop=True).iterrows():
    entry_time = 0
    entered = False
    recorded = False
    for j, v in r.items():
        if v > 0 and not entered:
            entry_time = j
            entered = True
        if entered and v == 0:
            i += 1
            entered = False
            recorded = False
        if v > 1 and not recorded:
            first_test[i] = j - entry_time
            recorded = True
    if entered:
        i += 1
first_test = pd.Series(first_test)      

In [None]:
# Distribution of the admission-to-first-test offset, as a share of all
# recorded first tests. The divisor used to be 367, which is the number of
# day columns rather than the number of first tests, so the result was only a
# proportion if those two numbers happened to coincide.
first_test.value_counts() / len(first_test)

build dataframe of colonizations upon arrival

What I'll do: if someone is tested within 3 days of arrival (95% of first tests), record that first test result as their colonization status on the day of admission.

In [None]:
# For every visit in df_ftrace, take the first code above 1 and, if it occurs
# fewer than 3 columns after the visit's first column, store it at the visit's
# first column. All other cells stay empty (NaN). Finally recode
# 1.5 -> 1 and 1.25 -> 0.
df_precol = pd.DataFrame(index=df_ftrace.index, columns=df_ftrace.columns)
for n, trace in df_ftrace.iterrows():
    entry_time = 0
    entered = False
    recorded = False
    for t, v in trace.items():
        if v > 0:
            if not entered:
                entry_time = t
                entered = True
            if v > 1 and not recorded:
                if t - entry_time < 3:
                    df_precol.loc[n, entry_time] = v
                # only the first code above 1 of a visit is considered
                recorded = True
        elif v == 0 and entered:
            entered = False
            recorded = False
df_precol.replace({1.5:1, 1.25:0}, inplace=True)

In [None]:
# Total number of non-missing cells in df_precol.
df_precol.count().sum()

In [None]:
# Per-column sum of the values recorded in df_precol.
screened_infected = df_precol.sum(axis=0)
sns.lineplot(screened_infected)
plt.show()

## Simulator II: Fixed Precolonization

In [None]:
def crkp_simulator2(logbeta, seed):
    """Simulate infection status over time with arrival statuses taken from data.

    Reads two module-level objects: ``ftrace`` (its ``.values`` are used as a
    per-row, per-step presence indicator) and ``df_precol`` (column ``t``
    supplies the status given to rows that become present at step ``t``).

    Parameters
    ----------
    logbeta : float
        Log of the transmission rate; ``beta = exp(logbeta)``.
    seed : int
        Passed to ``np.random.seed``, i.e. reseeds NumPy's global generator.

    Returns
    -------
    numpy.ndarray
        One value per step: the NaN-ignoring sum of statuses over all rows,
        divided by the number of rows ``N`` of ``ftrace``.
    """
    np.random.seed(seed)
    n_rows, n_steps = ftrace.shape
    beta = np.exp(logbeta)

    presence = ftrace.values
    status = np.empty((n_rows, n_steps))

    # State carried over from the previous step; everyone starts absent and
    # with status 0.
    was_present = np.zeros(n_rows)
    prev_status = np.zeros(n_rows)
    infected = np.zeros(n_rows)

    for t in range(n_steps):
        present = presence[:, t]

        # Absent rows get NaN; every other row starts from its previous status.
        status[:, t] = np.where(1 - present, np.nan, prev_status)

        # Rows that just became present take the value df_precol holds for
        # this step, whatever it is (including a missing value).
        arrived = present * (1 - was_present)
        status[:, t] = np.where(arrived, df_precol[t], status[:, t])

        # Rows that were already present and are not infected draw a
        # Bernoulli infection event. The hazard is the same for every row:
        # current infected count times beta, normalised by N (not the end of
        # the world to normalize by size of population).
        remained = present * was_present
        hazard = infected.sum() * beta * np.ones(n_rows)
        p_infect = 1 - np.exp(-hazard / n_rows)
        at_risk = remained * (1 - infected)
        status[:, t] = np.where(
            at_risk, np.random.binomial(1, p_infect, n_rows), status[:, t]
        )

        prev_status = status[:, t]
        was_present = present
        infected = (prev_status == 1).astype(int)

    return np.nansum(status, axis=0) / n_rows

In [None]:
# NOTE(review): 260 is hard-coded; presumably equal to ftrace.shape[0], the N
# that crkp_simulator2 divides by — confirm.
N = 260
# Simulator output at logbeta = -6, rescaled by N.
# NOTE(review): presumably meant as the level explained by arrival statuses
# alone, since transmission is very weak at this beta — confirm.
precols = crkp_simulator2(-6, 1) * N

In [None]:
# Observed vs simulated level over time, each divided by 260, with the
# per-column presence total (also / 260) as a dashed reference line.
I_rep = crkp_simulator2(-1.8, 1)

observed_frac = I / 260
population_frac = ftrace.sum() / 260
sns.lineplot(observed_frac, label="observed")
sns.lineplot(I_rep, label="simulated")
sns.lineplot(population_frac, color="k", linestyle="--", label="pop")
plt.show()

In [None]:
# Same comparison after subtracting `precols` from both curves.
observed_excess = I - precols
simulated_excess = I_rep * 260 - precols
sns.lineplot(observed_excess, label="observed")
sns.lineplot(simulated_excess, label="simulated")
# sns.lineplot(ftrace.sum() / 260, color="k", linestyle="--", label="pop")
plt.show()