In [None]:
import numpy as np
import pandas as pd
from scipy import stats
import statsmodels.api as sm
import seaborn as sns
from matplotlib import pyplot as plt
from dateutil import parser
import datetime

## Timeseries

In [None]:
# Pull the national and per-state tables from the nytimes/covid-19-data
# GitHub repository, parsing the 'date' column into datetimes on load.
us = pd.read_csv(
    "https://github.com/nytimes/covid-19-data/blob/master/us.csv?raw=true",
    parse_dates=['date'],
)
us_states = pd.read_csv(
    "https://github.com/nytimes/covid-19-data/blob/master/us-states.csv?raw=true",
    parse_dates=['date'],
)
us

In [None]:
# Re-index the national table by date so it can be resampled and shifted.
us_diff = us.set_index('date')

# Sanity check: laying the data out on a daily grid must not add rows, i.e.
# there are no missing calendar days, so a one-row shift is a one-day lag.
assert us_diff.shape == us_diff.resample('1d').asfreq().shape

# Day-over-day change: each row minus the row before it.
us_diff = us_diff.sub(us_diff.shift(periods=1))

# The first row has no predecessor and is NaN after the subtraction; seed it
# with the first day's raw values (every column of `us` after the first one —
# assumes 'date' is the first column of `us`; confirm against the CSV).
us_diff.iloc[0, :] = us.iloc[0, 1:]
us_diff

In [None]:
# Plot the 14-day rolling mean of the day-over-day change in national cases.
# The figure size is set when the figure is created, and the axes are handed
# to seaborn explicitly instead of relying on pyplot's "current axes" state.
fig, ax = plt.subplots(figsize=(12, 8))

# 'date' is the index of `us_diff`; it is referenced by name here.
sns.lineplot(data=us_diff.rolling('14d').mean(), x='date', y='cases', ax=ax)

# Label the figure so it can be understood without reading the code.
ax.set(
    title="US COVID-19 cases: day-over-day change, 14-day rolling mean",
    xlabel="date",
    ylabel="cases (14-day rolling mean of daily change)",
)

plt.show()

## Antibodies

In [None]:
# Relative path of the Excel workbook that the following cells read with
# pd.read_excel (sheets 'Sample Information' and 'Participant Information').
workbook_name = 'data/fake.xlsx'

In [None]:
# Load the 'Sample Information' sheet of the workbook and preview it.
# It is later joined to the participant table on its 'Participant ID' column.
sample_info = pd.read_excel(
    workbook_name,
    sheet_name='Sample Information',
)
sample_info.head(5)

In [None]:
# Load the 'Participant Information' sheet of the same workbook and preview it.
participant_info = pd.read_excel(
    workbook_name,
    sheet_name='Participant Information',
)
participant_info.head(5)

In [None]:
# Key the participant table by 'Participant ID' so it can serve as the
# right-hand side of an index-based join.
indexed = participant_info.set_index(keys='Participant ID')
indexed.head(5)

In [None]:
# Attach the participant-level columns to every sample row. This is a left
# join: each row of `sample_info` is kept and matched to `indexed` through its
# 'Participant ID' value; samples without a match get NaN in the new columns.
df = sample_info.join(other=indexed, on='Participant ID', how='left')
df.head(5)

In [None]:
# Exploratory check: gap between infection and boost, both measured from the
# first vaccination. A negative value means the infection came before the
# boost; NaN means at least one of the two values is missing.
df['Vax1 to Infection'].sub(df['Vax1 to Boost'])

In [None]:
# Derive per-sample timing columns. Everything is computed with vectorized
# column operations rather than a row-wise `apply`, which is both faster and
# well-defined for missing values.

# True when the infection happened before the boost, or when no boost is
# recorded at all (missing 'Vax1 to Boost').
# NOTE(review): a row with a recorded boost but a missing 'Vax1 to Infection'
# evaluates to False here (comparisons with NaN are False) and is therefore
# labelled "Yes" below — confirm that is intended for such participants.
df['Infection Pre-Boost'] = (
    (df['Vax1 to Infection'] < df['Vax1 to Boost'])
    | df['Vax1 to Boost'].isna()
)

# String-valued complement of the flag above: "No" where the infection was
# pre-boost, "Yes" otherwise.
df['Infection Post-Boost'] = np.where(df['Infection Pre-Boost'], "No", "Yes")

# Re-express the sample time relative to the infection and to the boost
# instead of relative to the first vaccination.
df['Days to Infection'] = df['Days to Vax1'] - df['Vax1 to Infection']
df['Days to Boost'] = df['Days to Vax1'] - df['Vax1 to Boost']

# A sample counts as pre-infection when its infection-relative time is zero or
# negative. A missing 'Days to Infection' yields False.
df['Sample Pre-Infection'] = df['Days to Infection'] <= 0

df.head()

In [None]:
# Keep at most one sample per (participant, pre/post-infection) pair: the one
# whose 'Days to Infection' is closest to the target below.
TARGET_DAYS_POST_INFECTION = 28

df_dedup = (
    # The distance to the target lives only inside this chain (`assign` returns
    # a new frame), so the scratch column never leaks onto the shared `df`.
    df.assign(**{
        'Sorting Column': (df['Days to Infection'] - TARGET_DAYS_POST_INFECTION).abs()
    })
    # Stable sort: samples tied on distance keep their original relative order,
    # so the row that survives `keep='first'` is reproducible. Rows with a
    # missing distance sort last.
    .sort_values('Sorting Column', kind='mergesort')
    .drop_duplicates(subset=['Participant ID', 'Sample Pre-Infection'], keep='first')
    .sort_values(by=['Participant ID', 'Days to Infection'])
    .drop(columns='Sorting Column')
)

# Take the log from the de-duplicated frame itself rather than from `df`, so
# the result does not depend on implicit index alignment between two frames.
# Non-positive AUC values produce -inf / NaN here.
df_dedup['Log2AUC'] = np.log2(df_dedup['AUC'])
df_dedup.head()

In [None]:
# Per participant, count the non-null entries in every column of the
# de-duplicated table.
df_dedup.groupby(by='Participant ID').count()

In [None]:
# Show only the participants left with exactly one non-null AUC value after
# de-duplication.
(
    df_dedup.groupby('Participant ID')
            .count()
            .loc[lambda counts: counts['AUC'] == 1]
)

In [None]:
# IDs of participants with exactly one non-null AUC value after
# de-duplication, as a NumPy array.
# NOTE(review): participants with zero non-null AUC values are not captured
# by the `== 1` test — confirm whether they should be excluded as well.
insufficient = (
    df_dedup.groupby('Participant ID')['AUC']
            .count()
            .loc[lambda n_auc: n_auc == 1]
            .index
            .to_numpy()
)
insufficient

In [None]:
# Drop every participant listed in `insufficient`. `isin` does the membership
# test in one vectorized pass instead of scanning the array once per row.
# `.copy()` makes `df_final` an independent frame, so later column assignments
# neither touch `df_dedup` nor trigger SettingWithCopyWarning.
df_final = df_dedup[~df_dedup['Participant ID'].isin(insufficient)].copy()
print(df_final.shape)
df_final.head()