In [2]:
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import pickle
%matplotlib inline

In [None]:
RNN_data = pd.read_csv("data/RLdata_for_RNN_01242024.csv")
RNN_data.drop(['Unnamed: 0'], axis = 1, inplace = True)
RNN_data.loc[RNN_data['Age.FirstDose'] == '>89', 'Age.FirstDose'] = 90
RNN_data.rename(columns = {"severe_infection_next": "sev_inf_next"}, inplace = True)
where_severe_inf = np.where(RNN_data.sev_inf_next == 1)
RNN_data.inf_next.iloc[where_severe_inf] = 1
RNN_data

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


Unnamed: 0,id,action,Age.FirstDose,Gender,Race,Visits,imm_baseline,windex,numVax,variant,sev_inf_next,inf_next
0,00023D48-0BD3-471F-A8AB-E93279677E61,0,65,M,Caucasian,56,0,3,0,none,0,0
1,00023D48-0BD3-471F-A8AB-E93279677E61,0,65,M,Caucasian,56,0,3,0,none,0,0
2,00023D48-0BD3-471F-A8AB-E93279677E61,0,65,M,Caucasian,56,0,3,0,none,0,0
3,00023D48-0BD3-471F-A8AB-E93279677E61,0,65,M,Caucasian,56,0,3,0,none,0,0
4,00023D48-0BD3-471F-A8AB-E93279677E61,0,65,M,Caucasian,56,0,3,0,none,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
2183163,FFFF8453-0E47-46EF-B174-BD631AFDF434,0,31,M,Caucasian,0,0,0,2,omicron,0,0
2183164,FFFF8453-0E47-46EF-B174-BD631AFDF434,0,31,M,Caucasian,0,0,0,2,omicron,0,0
2183165,FFFF8453-0E47-46EF-B174-BD631AFDF434,0,31,M,Caucasian,0,0,0,2,omicron,0,0
2183166,FFFF8453-0E47-46EF-B174-BD631AFDF434,1,31,M,Caucasian,0,0,0,3,omicron,0,0


In [3]:
scaler = StandardScaler()
RNN_data_demographics = RNN_data.drop_duplicates(subset = ["id"])
age = RNN_data_demographics[["Age.FirstDose"]].astype(int)
normalized_age = scaler.fit_transform(age)
age["normalized_age"] = normalized_age
age_map = age.sort_values("Age.FirstDose").drop_duplicates(subset = ["Age.FirstDose"])
age_map = dict(zip(age_map["Age.FirstDose"], age_map["normalized_age"]))

In [4]:
RNN_data["Age.FirstDose"] = RNN_data["Age.FirstDose"].astype(int)
RNN_data["Age.FirstDose"].replace(age_map, inplace = True)
RNN_data

Unnamed: 0,id,action,Age.FirstDose,Gender,Race,Visits,imm_baseline,windex,numVax,variant,sev_inf_next,inf_next
0,00023D48-0BD3-471F-A8AB-E93279677E61,0,0.880583,M,Caucasian,56,0,3,0,none,0,0
1,00023D48-0BD3-471F-A8AB-E93279677E61,0,0.880583,M,Caucasian,56,0,3,0,none,0,0
2,00023D48-0BD3-471F-A8AB-E93279677E61,0,0.880583,M,Caucasian,56,0,3,0,none,0,0
3,00023D48-0BD3-471F-A8AB-E93279677E61,0,0.880583,M,Caucasian,56,0,3,0,none,0,0
4,00023D48-0BD3-471F-A8AB-E93279677E61,0,0.880583,M,Caucasian,56,0,3,0,none,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
2183163,FFFF8453-0E47-46EF-B174-BD631AFDF434,0,-0.653687,M,Caucasian,0,0,0,2,omicron,0,0
2183164,FFFF8453-0E47-46EF-B174-BD631AFDF434,0,-0.653687,M,Caucasian,0,0,0,2,omicron,0,0
2183165,FFFF8453-0E47-46EF-B174-BD631AFDF434,0,-0.653687,M,Caucasian,0,0,0,2,omicron,0,0
2183166,FFFF8453-0E47-46EF-B174-BD631AFDF434,1,-0.653687,M,Caucasian,0,0,0,3,omicron,0,0


In [5]:
race_dummies = pd.get_dummies(RNN_data["Race"])
gender_dummies = (RNN_data["Gender"] == "M") + 0
variant_dummies = pd.get_dummies(RNN_data["variant"])

visits_dummies = pd.get_dummies(
    pd.cut(RNN_data["Visits"], bins = [0, 5, 10, 20, 50, 1000], include_lowest = True, right = False)
                                        )
windex_dummies = pd.get_dummies(
    pd.cut(RNN_data["windex"], bins = [0, 1, 3, 5, 100], include_lowest = True, right = False)
                                        )

race_dummies = race_dummies[['Caucasian', 'African American', 'Other']]
variant_dummies = variant_dummies[['none', 'delta', 'omicron']]

In [6]:
covariates = pd.concat([RNN_data[["id", "action", "Age.FirstDose", "imm_baseline", "numVax"]],
                        gender_dummies,
                        race_dummies.iloc[:, 1:],
                        visits_dummies.iloc[:, 1:],
                        windex_dummies.iloc[:, 1:],
                        variant_dummies.iloc[:, 1:]], axis = 1)

In [7]:
covariates

Unnamed: 0,id,action,Age.FirstDose,imm_baseline,numVax,Gender,African American,Other,"[5, 10)","[10, 20)","[20, 50)","[50, 1000)","[1, 3)","[3, 5)","[5, 100)",delta,omicron
0,00023D48-0BD3-471F-A8AB-E93279677E61,0,0.880583,0,0,1,0,0,0,0,0,1,0,1,0,0,0
1,00023D48-0BD3-471F-A8AB-E93279677E61,0,0.880583,0,0,1,0,0,0,0,0,1,0,1,0,0,0
2,00023D48-0BD3-471F-A8AB-E93279677E61,0,0.880583,0,0,1,0,0,0,0,0,1,0,1,0,0,0
3,00023D48-0BD3-471F-A8AB-E93279677E61,0,0.880583,0,0,1,0,0,0,0,0,1,0,1,0,0,0
4,00023D48-0BD3-471F-A8AB-E93279677E61,0,0.880583,0,0,1,0,0,0,0,0,1,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2183163,FFFF8453-0E47-46EF-B174-BD631AFDF434,0,-0.653687,0,2,1,0,0,0,0,0,0,0,0,0,0,1
2183164,FFFF8453-0E47-46EF-B174-BD631AFDF434,0,-0.653687,0,2,1,0,0,0,0,0,0,0,0,0,0,1
2183165,FFFF8453-0E47-46EF-B174-BD631AFDF434,0,-0.653687,0,2,1,0,0,0,0,0,0,0,0,0,0,1
2183166,FFFF8453-0E47-46EF-B174-BD631AFDF434,1,-0.653687,0,3,1,0,0,0,0,0,0,0,0,0,0,1


In [None]:
demographics = covariates.drop_duplicates(subset = ["id"]).drop(["action", "numVax", "delta", "omicron"], axis = 1)
demographics.to_csv("data/demographics.csv")

In [8]:
n = len(RNN_data['id'].unique())
p = covariates.shape[1] - 1
t = RNN_data[['id', 'action']].groupby('id').count().max().item()
covariates_rnn = np.zeros((n, t, p))
outcomes_rnn = np.zeros((n, t, 2))
seq_length = np.zeros(n)

In [None]:
grouped_data = RNN_data.groupby('id')
for i, (pt_id, group) in enumerate(grouped_data):
    covariates_i = np.array(covariates.loc[covariates['id'] == pt_id, :].iloc[:, 1:])

    time_i = np.arange(group.shape[0])
    outcomes_i = group[['sev_inf_next', 'inf_next']].values
    seq_length[i] = np.min(np.concatenate((np.where(outcomes_i[:, 0])[0], [time_i[-1]]))) + 1

    covariates_rnn[i, time_i, :] = covariates_i
    outcomes_rnn[i, time_i, :] = outcomes_i
    
    if (i + 1) % 100 == 0:
        print("{} / {}".format(i + 1, n))

100 / 81000
200 / 81000
300 / 81000
400 / 81000
500 / 81000
600 / 81000


In [None]:
np.save('covariates_rnn.npy', covariates_rnn)
np.save('outcomes_rnn.npy', outcomes_rnn)
np.save('seq_length.npy', seq_length)