In [52]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import roc_auc_score, roc_curve, classification_report
from sklearn.linear_model import LogisticRegression

from utils import * ### need utils.py in the same folder as this notebook

# Shorthand object for building MultiIndex slices inside .loc[...]
idx = pd.IndexSlice

# Display options. Fully-qualified option keys are used on purpose: set_option
# treats its first argument as a regex that must match exactly one registered
# option. Abbreviated patterns such as 'max.rows' also match
# 'styler.render.max_rows' on pandas versions that define it, and then raise
# OptionError ("Pattern matched multiple keys").
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 300)
pd.set_option('mode.chained_assignment', 'raise')  # chained assignment raises instead of warning
pd.set_option('display.float_format', lambda x: '%.4f' % x)  # suppress scientific notation

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# 0. self-defined utils

In [75]:
### pull a single label column (ILI type by default) for the selected rows
def get_ili_type(df, keep_filter, label_col = ('labels', 'ILI_type')):
    """Return one label column of ``df`` restricted to the rows in ``keep_filter``.

    Parameters
    ----------
    df : DataFrame indexed by ``.loc`` with ``label_col`` as a column key.
    keep_filter : row selector passed straight to ``df.loc`` (the notebook
        passes a boolean mask).
    label_col : column key to extract; defaults to ``('labels', 'ILI_type')``.

    Returns
    -------
    The result of ``df.loc[keep_filter, label_col]``.
    """
    return df.loc[keep_filter, label_col]

### zero-imputation of missing values
### crude, but good enough as a first baseline
def impute_na(df):
    """Return a copy of ``df`` in which every missing value is replaced by 0.

    The input object is left untouched; ``fillna`` is not called in place.
    """
    filled = df.fillna(value=0)
    return filled

# 1. Generating whole data set

In [49]:
rnd = np.random.RandomState(1729)  # fixed seed so the sampled ids are reproducible
max_num = 10000000

### Determine #samples in train, val, and test
###(a,b): a - disease type(1-ili, 2-flu, 3-covid), b - #samples
dict_split = {'train': [(1, 150), (2, 150), (3, 150)], # ili:flu:covid=1
             'val': [(1, 30), (2, 30), (3, 30)],       # ili:flu:covid=1
             'test': [(1, 1000), (2, 100), (3, 30)]    # ili:flu:covid=100:10:3
             }

### Pool of candidate ids 1 .. max_num-1. Built once and reused for every
### (split, disease type) group instead of re-allocating ~10M integers per group.
id_pool = np.arange(1, max_num)

### generating data based on dict_split
### NOTE: rnd.choice samples with replacement (numpy default), so an id could in
### principle be drawn more than once; the row-count assertion below is the
### sanity check on the generated frame.
df_activity = pd.concat([generate_normalized_hr_sample(i, split, ili)
                         for split, groups in dict_split.items()
                         for ili, n in groups
                         for i in rnd.choice(id_pool, n)
                        ])

### summarize generated data
days_per_id = 43  # expected number of rows (days) per generated id
print(df_activity.shape)
n_ids = count_unique_index(df_activity)
print('N =', n_ids)
# integer arithmetic avoids comparing against a float quotient
assert n_ids * days_per_id == df_activity.shape[0], (
    'expected %d rows per id: %d ids vs %d rows'
    % (days_per_id, n_ids, df_activity.shape[0])
)
describe_datetimeindex(df_activity)

(71810, 24)
N = 1670


count                            71810
mean     2020-04-08 00:25:00.359281152
min                2020-01-04 00:00:00
25%                2020-03-01 00:00:00
50%                2020-04-08 00:00:00
75%                2020-05-16 00:00:00
max                2020-07-11 00:00:00
Name: dt, dtype: object

In [50]:
### row counts / percentages per split (test, train, val)
import warnings

# Suppress pandas PerformanceWarning from here on; the filter stays active for
# the rest of the kernel session.
warnings.simplefilter('ignore', pd.errors.PerformanceWarning)

prop_table(df_activity[('labels', 'split')])

Unnamed: 0,"(labels, split)",count,percent
0,test,48590,67.66
1,train,19350,26.95
2,val,3870,5.39
3,Total,71810,100.0


## 1.1 Generating training and val set
* in training and val, we only work on healthy/ill days, i.e. days with training label {0,1}
* so total 21 days per id
* finally {X_train/val, y_train/val_label}

In [64]:
### column keys used to slice df_activity
label_col = ('labels', 'training_labels')  # training_labels in {-1,0,1}; present for train, test and val
split_col = ('labels', 'split')            # train/test/val
type_col = ('labels', 'ILI_type')          # disease type (1-ili, 2-flu, 3-covid)

### lag suffixes and base heart-rate features handed to get_dataset
days_ago = [
    '0days_ago',
    '1days_ago',
    '2days_ago',
    '3days_ago',
    '4days_ago',
]
feature_cols = [
    'heart_rate__not_moving__max',
    'heart_rate__resting_heart_rate',
    'heart_rate__stddev',
    'heart_rate__perc_50th',
]


### ---------- training split ----------
# Row mask: labelled days only (buffer days carry label -1 and are dropped),
# restricted to the train split and to the three known disease types.
keep_train = (
    df_activity[label_col].isin([0, 1])
    & (df_activity[split_col] == 'train')
    & df_activity[type_col].isin([1, 2, 3])
)

X_train, y_train, filter_train = get_dataset(
    df_activity, keep_train, days_ago, feature_cols, label_col
)
y_train_type = get_ili_type(df_activity, keep_train)

## y_train holds only {0: healthy, 1: ill}, which is not enough for a multiclass setting.
## y_train_type holds {1: ili, 2: flu, 3: covid}.
## Their product is 0 on healthy days and the disease type on ill days.
y_train_label = y_train * y_train_type


### ---------- validation split ----------
# Same labelled-days restriction as for training. Unlike keep_train, no
# ILI_type restriction is applied here.
keep_val = (
    df_activity[label_col].isin([0, 1])
    & (df_activity[split_col] == 'val')
)

X_val, y_val, filter_val = get_dataset(
    df_activity, keep_val, days_ago, feature_cols, label_col
)
y_val_type = get_ili_type(df_activity, keep_val)

## same multiclass encoding as for y_train_label
y_val_label = y_val * y_val_type

(9450, 20) (9450,)
Missing rows percent = 13.38%
  (labels, training_labels)  count  percent
0                         0   6300  66.6700
1                         1   3150  33.3300
2                     Total   9450 100.0000
(1890, 20) (1890,)
Missing rows percent = 12.86%
  (labels, training_labels)  count  percent
0                         0   1260  66.6700
1                         1    630  33.3300
2                     Total   1890 100.0000


## 1.2 Generating test set
* same as the repo, in test set we keep all days, i.e. days with training label {-1,0,1}
* so total 43 days per id
* finally {X_test, y_test_label}

In [67]:
# Test split keeps every day, including buffer days whose training label is -1.
keep_test = df_activity[split_col] == 'test'

X_test, y_test, filter_test = get_dataset(
    df_activity, keep_test, days_ago, feature_cols, label_col
)
y_test_type = get_ili_type(df_activity, keep_test)

# Disease type multiplied by the training label; with labels in {-1,0,1} the
# product is negative on buffer days, 0 on healthy days, and the type on ill days.
y_test_label = y_test_type * y_test

(48590, 20) (48590,)
Missing rows percent = 13.22%
  (labels, training_labels)  count  percent
0                        -1  24860  51.1600
1                         0  15820  32.5600
2                         1   7910  16.2800
3                     Total  48590 100.0000


# 2. Imputation of NA values
* NA values occur only in X (in any of train/val/test)
* use impute_na() in section 0