In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

In [None]:
# Show every column when displaying wide DataFrames (None removes the limit).
pd.options.display.max_columns = None

In [None]:
## load data
# Root of the project share; every input below is read relative to it.
epi_dir = '/Volumes/umms-esnitkin/Project_KPC_LTACH/Analysis/LTACH_transmission_modeling'
raw_dir = f"{epi_dir}/data"
resampled_dir = f"{epi_dir}/preprocessed/resampled"

# Path only; the workbook itself is read in a later cell.
cluster_file = f"{raw_dir}/2021-01-26_cluster_isolate_summary_table.xlsx"
isolates = pd.read_csv(f"{raw_dir}/2019-12-18_patient_isolate_date_lookup_df.csv", index_col=0)

# The five resampled tables live in one directory and use the same read options.
infections, screening, facility_trace, floor_trace, room_trace = (
    pd.read_csv(f"{resampled_dir}/{table}.csv", index_col=0)
    for table in ("infections", "screening", "facility_trace", "floor_trace", "room_trace")
)

In [None]:
# Display the workbook path; `cluster_file` already holds this exact string.
cluster_file

In [None]:
# Read the cluster/isolate summary workbook; its first column becomes the index.
df = pd.read_excel(cluster_file, index_col=0)

In [None]:
# Preview the first rows of the cluster table.
df.head()

## data exploration

In [None]:
# Number of distinct patient ids in the cluster table.
df["patient.id"].nunique()

In [None]:
# Share of patients that appear in fewer than two rows of the cluster table.
rows_per_patient = df["patient.id"].value_counts()
rows_per_patient.lt(2).mean()
# most patients only have one associated isolate...

In [None]:
# Distribution of the number of distinct cluster ids assigned to each patient.
clusters_per_patient = df.groupby("patient.id")["cluster.id"].nunique()
clusters_per_patient.value_counts()
# 87% of patients have just one cluster assignment, so discarding secondary
# assignments should not be a big stretch

In [None]:
# Share of patients with exactly one cluster assignment, computed from the
# table so it stays correct if the input changes (was the literal 131 / 151).
# NOTE(review): assumes 131 and 151 were the single-cluster and total patient
# counts read off the previous cell's output — confirm.
(df.groupby("patient.id")["cluster.id"].nunique() == 1).mean()

cluster size

In [None]:
# Largest clusters by patient count: keep one row per (patient, cluster) pair,
# then count how often each cluster id occurs.
patient_cluster_pairs = df.drop_duplicates(subset=["patient.id", "cluster.id"])
patient_cluster_pairs["cluster.id"].value_counts().head()

In [None]:
# Number of distinct cluster ids in the raw cluster table.
df["cluster.id"].nunique()

## Data Processing

In [None]:
# first step: reduce down to one cluster per patient
df_clean = df.sort_values(["patient.id", "trace.format.culture.date"]).drop_duplicates(["patient.id"])
# there are other strategies for acheiving 1-1 clustering: merging into bigger clusters, keeping smaller clusters
# e.g. df.sort_values(["patient.id", "clust.pt.count"]).drop_duplicates(["patient.id"])

In [None]:
# second step: keep only clusters that have an index patient
indexed = df_clean.groupby("cluster.id")["is.index.pt"].any()
indexed.name = "has.index.patient"
df_clean = df_clean.join(indexed, on="cluster.id")
df_clean = df_clean[df_clean["has.index.patient"] == True]

In [None]:
# third step: get rid of clusters that now have only one patient
singleton = df_clean.groupby("cluster.id")["patient.id"].nunique() == 1
singleton.name = "singleton"
df_clean = df_clean.join(singleton, on="cluster.id")
df_clean = df_clean[df_clean["singleton"] == False]

In [None]:
# Patient -> cluster lookup: the cluster id column, indexed by patient id.
cluster_assignments = df_clean.set_index("patient.id").loc[:, "cluster.id"]

In [None]:
# Number of distinct cluster ids remaining after cleaning.
np.unique(cluster_assignments).size

In [None]:
# Remaining cluster ids in ascending order.
np.sort(cluster_assignments.unique())

# give this a try: convert the cluster ids to consecutive integers starting at 2, with "1" set aside for "no cluster"

In [None]:
# Re-label the cluster ids as consecutive integers starting at 2, following
# the sorted order returned by np.unique.
cluster_reindexing = {
    int(original_id): new_id
    for new_id, original_id in enumerate(np.unique(cluster_assignments), start=2)
}
cluster_assignments_remapped = cluster_assignments.map(cluster_reindexing)

In [None]:
# create time series for cluster assignments

# Same row and column labels as `infections`; filled in one patient (row) at a time.
df_cluster = pd.DataFrame(index=infections.index, columns=infections.columns)

for patient_id in infections.index:
    patient_row = infections.loc[patient_id]
    if patient_id in cluster_assignments_remapped.index:
        # Clustered patient: swap the value 1 for the patient's remapped cluster code.
        patient_row = patient_row.replace(1, cluster_assignments_remapped.loc[patient_id])
    df_cluster.loc[patient_id] = patient_row

In [None]:
# Count the distinct cluster codes (values > 1) present at each time step.
# The previous len(c[c > 1]) counted clustered *patients*, which did not match
# the plot title and duplicated the `clustered` series computed further down.
cluster_counts = dict()
for t in df_cluster.columns:
    c = df_cluster[t]
    cluster_counts[int(t)] = c[c > 1].nunique()
cluster_counts = pd.Series(cluster_counts)

plt.plot(cluster_counts)
plt.ylim((0, None))
plt.ylabel("Number of Clusters")
plt.title("Number of Clusters in the Facility over Time")
plt.show()

In [None]:
# number clustered vs. unclustered over time
# Codes in df_cluster: 0 is counted as uninfected, 1 as unclustered, and
# anything above 1 as clustered. Keys are the column labels cast to int.
timesteps = list(df_cluster.columns)
unclustered = pd.Series({int(t): (df_cluster[t] == 1).sum() for t in timesteps})
clustered = pd.Series({int(t): (df_cluster[t] > 1).sum() for t in timesteps})
uninfected = pd.Series({int(t): (df_cluster[t] == 0).sum() for t in timesteps})

In [None]:
# Overlay the three per-timestep patient counts on a single set of axes.
for series, label in (
    (unclustered, "Unclustered"),
    (clustered, "Clustered"),
    (uninfected, "Uninfected"),
):
    plt.plot(series, label=label)
plt.legend()
plt.ylabel("Number of Patients")
plt.ylim((0, None))

Why would there be more patients belonging to "no cluster" at the beginning of the study?

In [None]:
# who are the "index patients"? 

In [None]:
# Number of patients with at least one value equal to 1 in `infections`.
infections.eq(1).any(axis=1).sum()

In [None]:
# Number of distinct patient ids in the raw cluster table, for comparison
# with the count from `infections` above.
df["patient.id"].nunique()

In [None]:
# Patients with at least one value equal to 1 anywhere in the screening table.
has_positive_screen = screening.eq(1).any(axis=1)
screened = set(screening.index[has_positive_screen])

In [None]:
# Patient ids flagged as index patients in the cleaned cluster table.
index_pts = set(df_clean.loc[df_clean["is.index.pt"] == True, "patient.id"])

In [None]:
# `<` on sets is a proper-subset test: True only if every index patient is in
# `screened` and `screened` contains at least one additional patient.
index_pts < screened

## what is the overlap between genomics and the "infections" data?

how many infections are covered by the cleaned clustering data?

In [None]:
# Infections table read from preprocessed/infections.csv; its columns are
# relabelled 0..n-1 as integers. The width is taken from the data instead of
# being hard-coded (was 367), so a file with a different number of columns no
# longer raises a length-mismatch error.
infections_cleaned = pd.read_csv(f"{epi_dir}/preprocessed/infections.csv", index_col=0)
infections_cleaned.columns = np.arange(infections_cleaned.shape[1])

In [None]:
# Ids of patients with at least one value equal to 1 in `infections_cleaned`.
ever_positive = infections_cleaned.eq(1).any(axis=1)
infecteds = np.array(infections_cleaned.index[ever_positive])

In [None]:
# Number of patients in `infecteds`.
len(infecteds)

In [None]:
# Ratio of distinct patients in the raw cluster table to the number of
# patients in `infecteds`.
df["patient.id"].nunique() / len(infecteds)
# in the raw cluster data: 42% of patients are unclustered
# NOTE(review): the 42% is presumably one minus the ratio above — confirm.

In [None]:
# NOTE(review): hard-coded counts. Presumably 110 is the number of patients in
# `infecteds` without a row in the cluster table and 259 is len(infecteds) —
# confirm, and consider computing both from the data so they cannot go stale.
110 / 259

In [None]:
# Ratio of distinct patients in the cleaned cluster table to the number of
# patients in `infecteds`.
df_clean["patient.id"].nunique() / len(infecteds)
# in the cleaned data, this drops down to 40%
# could possibly get this back above 50% if I knew
# how to handle clusters without index patients

In [None]:
# what is the balance between index patients, cluster acquisitions, and unclustered patients?

In [None]:
# Row counts per value of the "is.index.pt" flag in the cleaned table.
df_clean["is.index.pt"].value_counts()
# 67 acquisitions...this might be enough for a signal..!

#### breakdown of unclustered

In [None]:
# Patients in `infecteds` that have no row in the raw cluster table.
# NOTE: this rebinds `unclustered`, which held a per-timestep Series in the
# plotting section earlier; re-running those plot cells after this one would
# pick up this set instead.
unclustered = set(infecteds).difference(df["patient.id"])

In [None]:
# Debug check: type of the column label that idxmax returns for patient 5,
# i.e. the label of the first non-missing entry in that row.
type(infections.loc[5].notna().idxmax())

In [None]:
# Split the unclustered patients by the column label of their first
# non-missing entry in `infections`.
# NOTE(review): `first_label` is a column label, not an infection value, so
# "adm_positive" counts patients whose first non-missing column is not 0.
# Confirm this matches the intended meaning of positive on admission.
adm_negative = 0
adm_positive = 0
for patient_id in unclustered:
    first_label = int(infections.loc[patient_id].notna().idxmax())
    if first_label != 0:
        adm_positive += 1
    else:
        adm_negative += 1

In [None]:
# Display both tallies from the loop above as a (positive, negative) tuple.
adm_positive, adm_negative

In [None]:
# Ratio of (distinct cluster-table patients + 43) to the number of patients in
# `infecteds`.
# NOTE(review): 43 is hard-coded; presumably one of the admission tallies
# computed above — confirm which one and reference the variable instead.
(df["patient.id"].nunique() + 43) / len(infecteds)

## evidence of room/floor transmission

In [None]:
# Take the cluster code at position 5 of the remapped assignments and collect
# the ids of every patient carrying that same code.
i = 5
cluster = cluster_assignments_remapped.iloc[i]
in_same_cluster = cluster_assignments_remapped.eq(cluster)
patient_ids = np.array(cluster_assignments_remapped.index[in_same_cluster])

roughly speaking: when one patient acquires CRKP, has a room or floor recently been shared?

one issue: the resampling measure may discard some fine detail (brief periods of overlap).
another issue: my epidemiological model only considers the most recent contact network (previous timestep).
is there room for modifying it to include some sort of time lag?

In [None]:
# Rows of `infections` for the patients in the selected cluster.
infections.loc[patient_ids]

In [None]:
# Rows of `floor_trace` for the same patients.
floor_trace.loc[patient_ids]

In [None]:
# Rows of `room_trace` for the same patients.
room_trace.loc[patient_ids]

In [None]:
# Remapped cluster code of the first entry in the assignments Series.
cluster_assignments_remapped.iloc[0]

In [None]:
# Display the full floor trace table (pandas truncates long output on display).
floor_trace

### get counts in each cluster over time