In [8]:
import re
from datetime import date, timedelta
from pathlib import Path
from timeit import default_timer as timer

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

## Summary

Use census data to augment room and floor trace data to cover "never infected" patients. Extend facility trace data as well.

In [2]:
# Directory holding the raw inputs read below (the census workbook and the trace CSVs).
# NOTE(review): hard-coded absolute path, presumably a mounted share — confirm it is
# available on the machine running this notebook.
epi_dir = '/Volumes/umms-esnitkin/Project_KPC_LTACH/Analysis/LTACH_transmission_modeling/data'

Goal: augment the floor and room trace data with patients who were never infected

In [3]:
# Load the census workbook and order the rows by WGS_ID, then by From_Date.
df_census = (
    pd.read_excel(f"{epi_dir}/2019-02-25_KPCLTACH_C_Census.xlsx")
    .sort_values(["WGS_ID", "From_Date"])
)

In [4]:
# Preview the first rows to check the columns and the sort order.
df_census.head()

Unnamed: 0,WGS_ID,WGS_StudyID,Room,Bed,Admit_Date,DischargeDate,LOS,Room_Bed,From_Date,Thru_Date
5,C-1,1,DE/DE.SCU,1,2012-02-22,2012-10-23,244,DE.318-A,2012-06-18,2012-06-25
7,C-1,1,DE/DE.SCU,1,2012-02-22,2012-10-23,244,DE.111-B,2012-06-26,2012-07-04
3,C-1,1,DE/DE.SCU,1,2012-02-22,2012-10-23,244,DE.109-B,2012-07-05,2012-07-07
4,C-1,1,DE/DE.SCU,1,2012-02-22,2012-10-23,244,DE.117-B,2012-07-08,2012-07-23
8,C-1,1,DE/DE.SCU,1,2012-02-22,2012-10-23,244,DE.SCU8-008,2012-07-24,2012-09-17


In [5]:
# Facility trace from the earlier preprocessing run; rows are indexed by the first CSV column.
df_ftrace = pd.read_csv(f"{epi_dir}/2019-12-18_facility_trace.csv", index_col=0)
# read_csv yields string column labels; relabel them 0..n-1 as integers so they
# line up with the integer day columns of the traces built later. The count is
# taken from the file itself instead of a hard-coded 367.
df_ftrace.columns = np.arange(df_ftrace.shape[1])

# Floor and room traces from the same run (column labels are left as read).
df_floor = pd.read_csv(f"{epi_dir}/2019-12-18_floor_trace.csv", index_col=0)
df_room = pd.read_csv(f"{epi_dir}/2019-12-18_room_trace.csv", index_col=0)

NameError: name 'np' is not defined

In [6]:
def floor_encoder(room):
    """Translate a room label into an integer floor code.

    The labels 'TMPLOAROOM-1' and 'DE.TMP-1' map to 5. For any other label
    the character at index 3 decides the result: "S" maps to 6, and every
    other character is converted with ``int`` (a non-digit therefore raises
    ``ValueError``; a label shorter than four characters raises
    ``IndexError``).
    """
    fixed_floor_rooms = ('TMPLOAROOM-1', 'DE.TMP-1')
    if room in fixed_floor_rooms:
        return 5
    marker = room[3]
    return 6 if marker == "S" else int(marker)

In [9]:
# Build per-patient, per-day floor and room traces from the census rows.
# Number of day columns in the traces.
# NOTE(review): the window spans earliest Thru_Date .. latest From_Date (rather
# than earliest From_Date .. latest Thru_Date) — confirm this is the intended
# study window.
T = (df_census["From_Date"].max() - df_census["Thru_Date"].min()).days + 1
patient_ids = np.sort(df_census["WGS_StudyID"].unique())
# One row per patient ID, one column per day offset 0..T-1; cells are filled below.
floor_trace = pd.DataFrame(index=patient_ids, columns=range(T))
room_trace = pd.DataFrame(index=patient_ids, columns=range(T))

# Day 0 of the traces; census dates are converted to day offsets from it.
min_date = df_census["Thru_Date"].min()

# Maps a room label (after the suffix strip below) to its integer room code.
room_lookup = dict()
# Next room code to hand out; codes are assigned in the order rooms are visited.
R = 1
for n in patient_ids:
    stays = df_census[df_census["WGS_StudyID"] == n]
    # t is the current day offset, s the position of the current row in `stays`.
    t = 0
    s = 0
    while (t < T):
        while s < len(stays):
            if t >= T:
                break
            # Offsets are negative when the date precedes min_date.
            from_date = (stays.iloc[s]["From_Date"] - min_date).days
            thru_date = (stays.iloc[s]["Thru_Date"] - min_date).days
            room = stays.iloc[s]["Room_Bed"]
            # Remove any "-A" / "-B" (presumably the bed letter) from the label.
            room = re.sub("-[AB]", "", room)
            floor_code = floor_encoder(room)
            # This lookup runs on every pass, including for stays that turn out
            # to have ended before day t, so such rooms still receive a code.
            r = room_lookup.get(room)
            if r is None:
                room_code = R
                room_lookup[room] = R
                R += 1
                # Code 8 is never assigned. NOTE(review): the reason is not
                # stated in this notebook.
                if R == 8: # skip 8
                    R += 1
            else:
                room_code = r
            if t < from_date:
                # Day falls before this stay starts: record 0.
                floor_trace.loc[n, t] = 0
                room_trace.loc[n, t] = 0
            elif t <= thru_date:
                # Day falls inside the stay (both end dates inclusive).
                floor_trace.loc[n, t] = floor_code
                room_trace.loc[n, t] = room_code
            else:
                # Stay ended before day t: move to the next stay, same day.
                s += 1
                continue
            t += 1
        if t >= T:
            break
        # No stays left for this patient: pad the remaining days with 0.
        floor_trace.loc[n, t] = 0
        room_trace.loc[n, t] = 0
        t += 1

In [52]:
# Build the new facility trace: recode the original one, then append the never-infected patients.

In [57]:
# Blank out the entries equal to 1 and carry the previous value forward along
# each row, then recode what remains: 0 -> NaN, 1.25 -> 0, 1.5 -> 1.
infections = (
    df_ftrace
    .replace(1, np.nan)
    .ffill(axis=1)
    .replace({0:np.nan, 1.25:0, 1.5:1})
)

In [66]:
# Show the recoded frame (pandas truncates the rendering of a large frame).
infections

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,357,358,359,360,361,362,363,364,365,366
1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
284,,,,,,,,,,,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
286,,,,,,,,,,,...,,,,,,,,,,
287,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,,,,,,,,,,
295,,,,,,,,,,,...,,,,,,,,,,


In [87]:
n_infected = infections.shape[0]
# Rows of floor_trace whose patient ID is absent from the original facility trace.
# They are selected by index label: a positional slice (floor_trace[n_infected:])
# is only correct when the original patients are exactly the first n_infected
# rows of floor_trace, which fails as soon as their IDs have gaps.
is_new_patient = ~floor_trace.index.isin(infections.index)
# 1 on days with a non-zero floor code, 0 otherwise ...
infections_plus = (floor_trace.loc[is_new_patient] > 0).astype(int)
# ... then recoded to the convention of `infections`: those days become 0, all other days NaN.
infections_plus = infections_plus.replace({0: np.nan, 1:0})
# Original rows first, added rows after them.
infections_combined = pd.concat([infections, infections_plus])

In [20]:
# Consistency check against the original room trace: every day that is non-zero
# in the original must also be non-zero in the rebuilt trace, i.e. the rebuilt
# visits include all original visits. Differences are reported per patient.
for pid in df_floor.index:
    original = df_room.loc[pid].values
    rebuilt = room_trace.loc[pid].values
    if (original == rebuilt).all():
        continue
    had_visit = original != 0
    has_visit = rebuilt != 0
    # Element-wise implication on booleans: had_visit -> has_visit.
    assert (had_visit <= has_visit).all()
    if (had_visit == has_visit).all():
        # Same occupied days, so the difference is in the codes themselves.
        print(f"room code mismatch for patient {pid}")
    else:
        print(f"found new visit(s) for patient {pid}")

found new visit(s) for patient 38
found new visit(s) for patient 65
found new visit(s) for patient 84
found new visit(s) for patient 85
found new visit(s) for patient 101
found new visit(s) for patient 129
room code mismatch for patient 141
room code mismatch for patient 144
found new visit(s) for patient 149
found new visit(s) for patient 161
found new visit(s) for patient 172
found new visit(s) for patient 173
found new visit(s) for patient 183
room code mismatch for patient 189
room code mismatch for patient 196
found new visit(s) for patient 197
room code mismatch for patient 199
found new visit(s) for patient 218
found new visit(s) for patient 223
found new visit(s) for patient 248
room code mismatch for patient 252
room code mismatch for patient 259
found new visit(s) for patient 264
room code mismatch for patient 295


In [25]:
# Same consistency check for the floor trace. Only code mismatches on identical
# occupied days are reported; additional visits are accepted silently.
for pid in df_floor.index:
    original = df_floor.loc[pid].values
    rebuilt = floor_trace.loc[pid].values
    if (original == rebuilt).all():
        continue
    had_visit = original != 0
    has_visit = rebuilt != 0
    # Element-wise implication on booleans: had_visit -> has_visit.
    assert (had_visit <= has_visit).all()
    if (had_visit == has_visit).all():
        print(f"floor code mismatch for patient {pid}")

floor code mismatch for patient 199


In [26]:
# Sorted distinct values found anywhere in the original room trace.
z = np.unique(df_room.to_numpy())

In [27]:
# Number of distinct values in df_room (every distinct cell value is counted).
len(z)

96

In [40]:
# how many people are sharing rooms?

In [48]:
# For each code in z except the first (z is sorted, so that is the smallest
# value), count per day column how many rows of room_trace carry the code and
# keep the largest daily count.
# NOTE(review): the codes come from df_room while the counts use room_trace —
# confirm both use the same room numbering.
room_codes = z[1:]
max_occupancy = pd.Series(
    [(room_trace == code).sum(0).max() for code in room_codes],
    index=room_codes,
)

In [49]:
# Tally the room codes by their peak single-day occupancy.
max_occupancy.value_counts()

2    64
1    29
3     2
Name: count, dtype: int64

In [89]:
output_dir = '/Volumes/umms-esnitkin/Project_KPC_LTACH/Analysis/LTACH_transmission_modeling/preprocessed'

# The augmented files go into their own sub-folder; create it on first run so
# to_csv does not fail on a missing directory. Parents are deliberately not
# created: if output_dir itself is missing this raises instead of building the
# whole tree somewhere unintended.
Path(output_dir, "augmented").mkdir(exist_ok=True)

infections_combined.to_csv(f"{output_dir}/augmented/facility_trace.csv")
floor_trace.to_csv(f"{output_dir}/augmented/floor_trace.csv")
room_trace.to_csv(f"{output_dir}/augmented/room_trace.csv")
# Disabled writes, kept for reference:
# ftrace.to_csv(f"{output_dir}/facility_trace.csv")
# # infections_cleaned.to_csv(f"{output_dir}/infections.csv")
# with open(f"{output_dir}/observed_data.npy", "wb") as f:
#     np.save(f, observed_data)
# df_screen.to_csv(f"{output_dir}/screening.csv")
# df_floor.to_csv(f"{output_dir}/floor_trace.csv")
# df_room.to_csv(f"{output_dir}/room_trace.csv")