# DISDE Libraries

In [8]:
# Note, WhyShift uses some odd, very specific versions of certain packages. This may need futher configuration
!pip install graphviz
!pip uninstall -y numpy
!pip install --no-cache-dir numpy
!pip install xgboost
!pip install fairlearn
!pip install --no-cache-dir lightgbm
!pip install --upgrade pandas
!pip install \
    'dask>2023.3.2' \
    'distributed>2023.3.2'

[33mDEPRECATION: pytorch-lightning 1.7.7 has a non-standard dependency specifier torch>=1.9.*. pip 24.0 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of pytorch-lightning or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0m[33mDEPRECATION: tsfresh 0.18.0 has a non-standard dependency specifier matrixprofile>=1.1.10<2.0.0. pip 24.0 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of tsfresh or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update

In [9]:
import sys
import os
import seaborn as sns

import numpy as np
import pandas as pd
import pickle
sys.path.insert(0, './')

from whyshift import degradation_decomp, fetch_model, risk_region
import torch 
import random

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, average_precision_score, roc_auc_score, accuracy_score
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import StratifiedKFold
import copy
import warnings
warnings.filterwarnings("ignore")


# WhyShift Model

In [10]:
# Add the following to the WhyShift package

model = fetch_model('hgbc')
#model.fit(source_X_train, source_y_train)

# Load the data

In [11]:
def clean_daily_data(df, columns, pid_name, date_name):
    """
    df: the dataframe with samples
    columns: should be a dictionary, with the columns to filter and the filter values, i.e. {'resting_heart_rate': (20,200)}
    pid_name: name of the pid column
    date_name: name of the date column, assumed not to already be datetime based
    """

    df['pid'] = df[pid_name]
    # Make sure dates are in pandas datetime
    df['date'] = pd.to_datetime(df[date_name])
    
    # Set a multi-index on PID and date
    df = df.set_index(['pid', 'date']).sort_index()

    # Iterate through the set of columns to filter on
    for key, values in columns.items():
        df = df[df[key].between(values[0], values[1])]
        
    # Resample to daily, important for lagged-z-scoring
    df = df.groupby('pid').apply(lambda x: x.droplevel(0).resample('D').asfreq())

    df['day_of_week'] = df.index.get_level_values(1).dayofweek
    
    return df

    

In [5]:
homekit_nightly = pd.read_csv('/homekit2020neurips/fitbit_day_level_activity.csv', index_col=0)

columns = {
    'resting_heart_rate': (20, 200),
    'caloriesOut': (500, 20000),
    'total_asleep_minutes': (60, 16*60*60), #Let's say 16 hours
}

homekit_nightly = clean_daily_data(homekit_nightly, columns, 'participant_id', 'date')

In [12]:
# COVID-RED
covid_red_nightly = pd.read_csv('./data/wd_20230515.csv')

columns = {
    'WDTEMP': (0, 45),
    'WDPULSE': (20, 200),
    'WDSLEEP': (60, 16*60*60), #Let's say 16 hours
}

covid_red_nightly = clean_daily_data(covid_red_nightly, columns, 'SUBJID', 'WDDT')

# Surveys

## Homekit

In [None]:
homekit_surveys = pd.read_csv('/homekit2020neurips/daily_surveys_onehot.csv')
homekit_surveys['date'] = pd.to_datetime(homekit_surveys['timestamp'].apply(lambda x: x[:10]))
homekit_surveys = homekit_surveys.drop_duplicates(subset=['participant_id','date'], keep = 'last')
homekit_surveys = homekit_surveys.set_index(['participant_id', 'date'])
homekit_surveys = homekit_surveys.sort_index()

indeces = list(set(homekit_nightly.index).intersection(homekit_surveys.index))
homekit_nightly[homekit_surveys.columns] = np.nan
homekit_nightly.loc[indeces, homekit_surveys.columns] = homekit_surveys.loc[indeces].values

"""
Fever labels
"""
homekit_nightly['sx_Fever'] = homekit_nightly[['symptom_severity__fever_q_2', 'symptom_severity__fever_q_3']].sum(axis = 1)

"""
Flu labels: fever and one of: sore throat or cough
"""
# Body ache
homekit_nightly['sx_Fever_ILI'] = homekit_nightly[['symptom_severity__fever_q_1','symptom_severity__fever_q_2', 'symptom_severity__fever_q_3']].sum(axis = 1)
homekit_nightly['sx_Sore_throat_ILI'] = homekit_nightly[['symptom_severity__q_sore_throat_1','symptom_severity__q_sore_throat_2', 'symptom_severity__q_sore_throat_3']].sum(axis = 1)
homekit_nightly['sx_Cough_ILI'] = homekit_nightly[['symptom_severity__cough_q_1','symptom_severity__cough_q_2', 'symptom_severity__cough_q_2']].sum(axis = 1)
homekit_nightly['ili_sum'] = homekit_nightly['sx_Fever_ILI'] + homekit_nightly[['sx_Sore_throat_ILI', 'sx_Cough_ILI']].max(axis = 1)

homekit_nightly['sx_ILI'] = np.nan

homekit_nightly.loc[homekit_nightly[homekit_nightly['ili_sum'] == 2].index, 'sx_ILI'] = 1
homekit_nightly.loc[homekit_nightly[homekit_nightly['ili_sum'] == 1].index, 'sx_ILI'] = 0
homekit_nightly.loc[homekit_nightly[homekit_nightly['ili_sum'] == 0].index, 'sx_ILI'] = 0
homekit_nightly['have_flu'] = homekit_nightly['sx_ILI']

"""
Viral positivity label
"""
hk_lab_tests = pd.read_csv('/homekit2020neurips/lab_results_with_triggerdate.csv', index_col = 0)
hk_lab_tests['date'] = pd.to_datetime(pd.to_datetime(hk_lab_tests['trigger_datetime']).dt.date)

flus = ["Influenza A (Flu A)","Influenza B (Flu B)"]
hk_lab_tests = hk_lab_tests.drop_duplicates(subset=['participant_id','date'], keep = 'last')
hk_lab_tests = hk_lab_tests.set_index(['participant_id', 'date'])
hk_lab_tests['type'] = hk_lab_tests['result'].apply(lambda x: type(x))
hk_lab_tests = hk_lab_tests[hk_lab_tests['type'] == type('dummy_string')]
hk_lab_tests = hk_lab_tests.drop('first_report_yn', axis = 1) 

indeces = list(set(homekit_nightly.index).intersection(hk_lab_tests.index))
homekit_nightly[hk_lab_tests.columns] = np.nan
homekit_nightly.loc[indeces, hk_lab_tests.columns] = hk_lab_tests.loc[indeces].values
homekit_nightly['is_pos'] = 0
is_pos_index = homekit_nightly[(homekit_nightly['test_name'].isin(flus)) & (homekit_nightly['result'] == 'Detected')].index
homekit_nightly.loc[is_pos_index, 'is_pos'] = 1

# COVID-RED

In [13]:
"""
Viral positivity
"""
covid_red_nightly['is_pos'] = 0
covid_red_nightly.loc[covid_red_nightly[covid_red_nightly['WDDIAG'] == 'positive'].index, 'is_pos'] = 1

"""
Fever sypmtoms
"""
covid_red_nightly['sx_Fever'] = covid_red_nightly['WDSYMP'].astype(str).apply(lambda x: np.nan if 'nan' in x else (1 if 'fever' in x else 0))
covid_red_nightly['sx_Cough'] = covid_red_nightly['WDSYMP'].astype(str).apply(lambda x: np.nan if 'nan' in x else (1 if 'cough' in x else 0))
covid_red_nightly['sx_Sore_throat'] = covid_red_nightly['WDSYMP'].astype(str).apply(lambda x: np.nan if 'nan' in x else (1 if 'sore_throat' in x else 0))

"""
Flu sypmtoms
"""
covid_red_nightly['ili_sum'] = covid_red_nightly['sx_Fever'] + covid_red_nightly[['sx_Cough', 'sx_Sore_throat']].max(axis = 1)
covid_red_nightly['have_flu'] = np.nan

covid_red_nightly.loc[covid_red_nightly[covid_red_nightly['ili_sum'] == 2].index, 'have_flu'] = 1
covid_red_nightly.loc[covid_red_nightly[covid_red_nightly['ili_sum'] == 1].index, 'have_flu'] = 0
covid_red_nightly.loc[covid_red_nightly[covid_red_nightly['ili_sum'] == 0].index, 'have_flu'] = 0


# Util Functions

In [14]:
def z_score_modalities(df, modalities, length, min_num, offset, pid_name):
    """
    Inputs:
    df: a pandas dataframe, multi-indexed by pid and date
    modalities: iterable of strings, should be the names of columns
    length: the length of the window to z-score by
    min_num: the minumum number of non-missing days in the baseline period
    offset: how many days lagged to start the baseline period
    pid_name: the name of the index level with the pids
    """
    
    grouped_mean = df.groupby(pid_name)[modalities].rolling(window = length, min_periods = min_num).mean().shift(offset)
    grouped_std = df.groupby(pid_name)[modalities].rolling(window = length, min_periods = min_num).std().shift(offset)

    z_modality_names = [mod + "_z" for mod in modalities]
    df[z_modality_names] = (df[modalities].values - grouped_mean.values)/grouped_std.values
    return df

def get_ML_dataset(z_scored_df, modalities, prediction_or_detection, pid_name, ground_truth_column):
    """
    Inputs:
    z_scored_df: a pandas dataframe, multi-indexed by pid and date. Note, since this will be using shift, it should be complete in time, with nan's for any in-between dates without data
        - also note, dates should correspond to wearable data from the NIGHT BEFORE
        - Wearable data should already be z-scored by relevent baseline period
    modalities: iterable of strings, should be the names of columns
    prediction_or_detection: if prediction, take the data from 2 nights and the night before the ground truth day
        - if detection, take data from the night before and the night after detection
    pid_name: the name of the index level with the pids
    """
    z_scored_df['day_of_week'] = z_scored_df.index.get_level_values(1).dayofweek
    z_scored_df[np.arange(0,7).astype(str)] = pd.get_dummies(z_scored_df['day_of_week']).astype(int)
    
    if prediction_or_detection == 'prediction':
                
        shift = 2
        # Assign the correct values to two nights before
        three_before_names = [f"{mod}_3_nights_before" for mod in modalities]
        z_scored_df[three_before_names] = z_scored_df.groupby(pid_name).shift(shift)[modalities].values
        
        shift = 1

        # Assign the correct values to two nights before
        two_before_names = [f"{mod}_2_nights_before" for mod in modalities]
        z_scored_df[two_before_names] = z_scored_df.groupby(pid_name).shift(shift)[modalities].values
        
        # Assign the correct values to the night before columns
        one_before_names = [f"{mod}_1_night_before" for mod in modalities]
        z_scored_df[one_before_names] = z_scored_df[modalities].values      
        

        return z_scored_df[np.hstack([three_before_names, two_before_names, one_before_names, np.arange(0,7).astype(str), ground_truth_column])], np.hstack([three_before_names, two_before_names, one_before_names, np.arange(0,7).astype(str)])
        
    elif prediction_or_detection == 'detection':

        shift = 1

        # Assign the correct values to two nights before
        two_before_names = [f"{mod}_2_nights_before" for mod in modalities]
        z_scored_df[two_before_names] = z_scored_df.groupby(pid_name).shift(shift)[modalities].values
        
        shift = -1

        # Assign the correct values to two nights before
        night_after_names = [f"{mod}_night_after" for mod in modalities]
        z_scored_df[night_after_names] = z_scored_df.groupby(pid_name).shift(shift)[modalities].values
        
        # Assign the correct values to the night before columns
        night_before_names = [f"{mod}_night_before" for mod in modalities]
        z_scored_df[night_before_names] = z_scored_df[modalities].values

        return z_scored_df[np.hstack([two_before_names, night_before_names, night_after_names, np.arange(0,7).astype(str), ground_truth_column])], np.hstack([two_before_names, night_before_names, night_after_names, np.arange(0,7).astype(str)])

def calculate_sample_weights(y):
    class_counts = np.bincount(y)
    # Calculate the total number of samples
    total_samples = len(y)
    
    # Calculate the proportion of each class
    class_proportions = class_counts / total_samples
    
    # Assign weights inversely proportional to the class proportions
    weights = (1.0 / class_proportions[y])/total_samples

    return weights

# Prediction, detection, all possible features

# Homekit2020 Prediction

In [20]:
%%time
optimal_offset = 12
optimal_window_length = 10
minimum_number = 6
prediction_detection = 'prediction'

"""
Homekit dataset training
"""
tasks = ['is_pos', 'have_flu', 'sx_Fever']

for ground_truth_label in tasks:
    print("\n")
    print(f"Task is {ground_truth_label}")

    homekit_cols = ['resting_heart_rate','main_in_bed_minutes','main_efficiency','nap_count','total_asleep_minutes','total_in_bed_minutes','activityCalories','caloriesOut',
 'caloriesBMR','marginalCalories','sedentaryMinutes','lightlyActiveMinutes','fairlyActiveMinutes','veryActiveMinutes']
    
    z_scored_df = z_score_modalities(homekit_nightly, homekit_cols, optimal_offset, minimum_number, optimal_window_length, 'pid')
    z_score_columns = [col + "_z" for col in homekit_cols]
    
    dataset_df, feature_column_names = get_ML_dataset(z_scored_df, z_score_columns, prediction_detection, 'pid', ground_truth_label)
    
    dataset_df_drop = dataset_df[np.hstack([feature_column_names, [ground_truth_label]])].replace([np.inf, -np.inf, None], np.nan).dropna()
    X, y = dataset_df_drop[feature_column_names].values, dataset_df_drop[ground_truth_label].values
    
    positive_ids = dataset_df_drop[dataset_df_drop[ground_truth_label] == 1].index.get_level_values(0).unique()
    user_split_pids = pd.DataFrame(index = dataset_df_drop.index.get_level_values(0).unique(), columns = [ground_truth_label])
    
    # Dummy X for input to stratified K fold splitting
    X = user_split_pids.values
    
    user_split_pids[ground_truth_label] = 0
    user_split_pids.loc[positive_ids] = 1
    
    y = user_split_pids.values
    
    skf = StratifiedKFold(n_splits=5, random_state = 42, shuffle=True)
    skf.get_n_splits(X, y)
    
    accuracy_score_holder = []
    au_roc_holder = []
    av_prec_holder = []
    
    for fold, (train, test) in enumerate(skf.split(X, y)):
        train_pids = user_split_pids.iloc[train].index
        test_pids = user_split_pids.iloc[test].index
    
        X_train = dataset_df_drop.loc[train_pids, feature_column_names].values
        X_test = dataset_df_drop.loc[test_pids, feature_column_names].values
    
        y_train = dataset_df_drop.loc[train_pids, ground_truth_label].values
        y_test = dataset_df_drop.loc[test_pids, ground_truth_label].values
    
        clf = HistGradientBoostingClassifier(random_state = 42, l2_regularization = 0.2, early_stopping = False)
        clf.fit(X_train, y_train)
        
        y_test_predict_prob = clf.predict_proba(X_test)
        y_test_predict = clf.predict(X_test)

        weights = calculate_sample_weights(y_test.astype(int))

        accuracy_score_holder.append(accuracy_score(y_test, y_test_predict, sample_weight = weights))
        au_roc_holder.append(roc_auc_score(y_test, y_test_predict_prob[:,1]))
        av_prec_holder.append(average_precision_score(y_test, y_test_predict_prob[:,1]))
    
    print(f"Average ROC: {np.array(au_roc_holder).mean().round(3)}, Average Precision: {np.array(av_prec_holder).mean().round(4)}, Average accuracy: {np.array(accuracy_score_holder).mean().round(4)}")
    




Task is is_pos
Average ROC: 0.858, Average Precision: 0.002, Average accuracy: 0.5


Task is have_flu
Average ROC: 0.637, Average Precision: 0.0159, Average accuracy: 0.5


Task is sx_Fever
Average ROC: 0.766, Average Precision: 0.0363, Average accuracy: 0.5083
CPU times: user 4min 7s, sys: 4.16 s, total: 4min 11s
Wall time: 38.9 s


# Homekit2020 Detection

In [21]:
%%time
optimal_offset = 12
optimal_window_length = 10
minimum_number = 6
prediction_detection = 'detection'

"""
Homekit dataset training
"""
tasks = ['is_pos', 'have_flu', 'sx_Fever']

for ground_truth_label in tasks:
    print("\n")
    print(f"Task is {ground_truth_label}")

    homekit_cols = ['resting_heart_rate','main_in_bed_minutes','main_efficiency','nap_count','total_asleep_minutes','total_in_bed_minutes','activityCalories','caloriesOut',
 'caloriesBMR','marginalCalories','sedentaryMinutes','lightlyActiveMinutes','fairlyActiveMinutes','veryActiveMinutes']
    
    z_scored_df = z_score_modalities(homekit_nightly, homekit_cols, optimal_offset, minimum_number, optimal_window_length, 'pid')
    z_score_columns = [col + "_z" for col in homekit_cols]
    
    dataset_df, feature_column_names = get_ML_dataset(z_scored_df, z_score_columns, prediction_detection, 'pid', ground_truth_label)
    
    dataset_df_drop = dataset_df[np.hstack([feature_column_names, [ground_truth_label]])].replace([np.inf, -np.inf, None], np.nan).dropna()
    X, y = dataset_df_drop[feature_column_names].values, dataset_df_drop[ground_truth_label].values
    
    positive_ids = dataset_df_drop[dataset_df_drop[ground_truth_label] == 1].index.get_level_values(0).unique()
    user_split_pids = pd.DataFrame(index = dataset_df_drop.index.get_level_values(0).unique(), columns = [ground_truth_label])
    
    # Dummy X for input to stratified K fold splitting
    X = user_split_pids.values
    
    user_split_pids[ground_truth_label] = 0
    user_split_pids.loc[positive_ids] = 1
    
    y = user_split_pids.values
    
    skf = StratifiedKFold(n_splits=5, random_state = 42, shuffle=True)
    skf.get_n_splits(X, y)
    
    accuracy_score_holder = []
    au_roc_holder = []
    av_prec_holder = []
    
    for fold, (train, test) in enumerate(skf.split(X, y)):
        train_pids = user_split_pids.iloc[train].index
        test_pids = user_split_pids.iloc[test].index
    
        X_train = dataset_df_drop.loc[train_pids, feature_column_names].values
        X_test = dataset_df_drop.loc[test_pids, feature_column_names].values
    
        y_train = dataset_df_drop.loc[train_pids, ground_truth_label].values
        y_test = dataset_df_drop.loc[test_pids, ground_truth_label].values
    
        clf = HistGradientBoostingClassifier(random_state = 42, l2_regularization = 0.2, early_stopping = False)
        clf.fit(X_train, y_train)
        
        y_test_predict_prob = clf.predict_proba(X_test)
        y_test_predict = clf.predict(X_test)

        weights = calculate_sample_weights(y_test.astype(int))

        accuracy_score_holder.append(accuracy_score(y_test, y_test_predict, sample_weight = weights))
        au_roc_holder.append(roc_auc_score(y_test, y_test_predict_prob[:,1]))
        av_prec_holder.append(average_precision_score(y_test, y_test_predict_prob[:,1]))
    
    print(f"Average ROC: {np.array(au_roc_holder).mean().round(3)}, Average Precision: {np.array(av_prec_holder).mean().round(4)}, Average accuracy: {np.array(accuracy_score_holder).mean().round(4)}")
    




Task is is_pos
Average ROC: 0.931, Average Precision: 0.0112, Average accuracy: 0.5


Task is have_flu
Average ROC: 0.638, Average Precision: 0.016, Average accuracy: 0.5006


Task is sx_Fever
Average ROC: 0.77, Average Precision: 0.0159, Average accuracy: 0.5
CPU times: user 4min 18s, sys: 5.33 s, total: 4min 24s
Wall time: 40.4 s


# COVID-RED Prediction

In [15]:
%%time
optimal_offset = 12
optimal_window_length = 10
minimum_number = 6
prediction_detection = 'prediction'

"""
COVID-RED dataset training
"""
tasks = ['is_pos', 'have_flu', 'sx_Fever']

for ground_truth_label in tasks:
    print("\n")
    print(f"Task is {ground_truth_label}")

    covid_red_cols = ["WDPULSE", "WDRESP","WDTEMP", "WDPULSEV", "WDOXI", "WDSLEEP"]
    
    z_scored_df = z_score_modalities(covid_red_nightly, covid_red_cols, optimal_offset, minimum_number, optimal_window_length, 'pid')
    z_score_columns = [col + "_z" for col in covid_red_cols]
    
    dataset_df, feature_column_names = get_ML_dataset(z_scored_df, z_score_columns, prediction_detection, 'pid', ground_truth_label)
    
    dataset_df_drop = dataset_df[np.hstack([feature_column_names, [ground_truth_label]])].replace([np.inf, -np.inf, None], np.nan).dropna()
    X, y = dataset_df_drop[feature_column_names].values, dataset_df_drop[ground_truth_label].values
    
    positive_ids = dataset_df_drop[dataset_df_drop[ground_truth_label] == 1].index.get_level_values(0).unique()
    user_split_pids = pd.DataFrame(index = dataset_df_drop.index.get_level_values(0).unique(), columns = [ground_truth_label])
    
    # Dummy X for input to stratified K fold splitting
    X = user_split_pids.values
    
    user_split_pids[ground_truth_label] = 0
    user_split_pids.loc[positive_ids] = 1
    
    y = user_split_pids.values
    
    skf = StratifiedKFold(n_splits=5, random_state = 42, shuffle=True)
    skf.get_n_splits(X, y)
    
    accuracy_score_holder = []
    au_roc_holder = []
    av_prec_holder = []
    
    for fold, (train, test) in enumerate(skf.split(X, y)):
        train_pids = user_split_pids.iloc[train].index
        test_pids = user_split_pids.iloc[test].index
    
        X_train = dataset_df_drop.loc[train_pids, feature_column_names].values
        X_test = dataset_df_drop.loc[test_pids, feature_column_names].values
    
        y_train = dataset_df_drop.loc[train_pids, ground_truth_label].values
        y_test = dataset_df_drop.loc[test_pids, ground_truth_label].values
    
        clf = HistGradientBoostingClassifier(random_state = 42, l2_regularization = 0.2, early_stopping = False)
        clf.fit(X_train, y_train)
        
        y_test_predict_prob = clf.predict_proba(X_test)
        y_test_predict = clf.predict(X_test)

        weights = calculate_sample_weights(y_test.astype(int))

        accuracy_score_holder.append(accuracy_score(y_test, y_test_predict, sample_weight = weights))
        au_roc_holder.append(roc_auc_score(y_test, y_test_predict_prob[:,1]))
        av_prec_holder.append(average_precision_score(y_test, y_test_predict_prob[:,1]))
    
    print(f"Average ROC: {np.array(au_roc_holder).mean().round(3)}, Average Precision: {np.array(av_prec_holder).mean().round(4)}, Average accuracy: {np.array(accuracy_score_holder).mean().round(4)}")
    




Task is is_pos
Average ROC: 0.628, Average Precision: 0.0014, Average accuracy: 0.5


Task is have_flu
Average ROC: 0.657, Average Precision: 0.0306, Average accuracy: 0.4999


Task is sx_Fever
Average ROC: 0.686, Average Precision: 0.0955, Average accuracy: 0.5091
CPU times: user 1min 49s, sys: 4.33 s, total: 1min 53s
Wall time: 51.8 s


# COVID-RED Detection

In [23]:
%%time
optimal_offset = 12
optimal_window_length = 10
minimum_number = 6
prediction_detection = 'detection'

"""
COVID-RED dataset training
"""
tasks = ['is_pos', 'have_flu', 'sx_Fever']

for ground_truth_label in tasks:
    print("\n")
    print(f"Task is {ground_truth_label}")

    covid_red_cols = ["WDPULSE", "WDRESP","WDTEMP", "WDPULSEV", "WDOXI", "WDSLEEP"]
    
    z_scored_df = z_score_modalities(covid_red_nightly, covid_red_cols, optimal_offset, minimum_number, optimal_window_length, 'pid')
    z_score_columns = [col + "_z" for col in covid_red_cols]
    
    dataset_df, feature_column_names = get_ML_dataset(z_scored_df, z_score_columns, prediction_detection, 'pid', ground_truth_label)
    
    dataset_df_drop = dataset_df[np.hstack([feature_column_names, [ground_truth_label]])].replace([np.inf, -np.inf, None], np.nan).dropna()
    X, y = dataset_df_drop[feature_column_names].values, dataset_df_drop[ground_truth_label].values
    
    positive_ids = dataset_df_drop[dataset_df_drop[ground_truth_label] == 1].index.get_level_values(0).unique()
    user_split_pids = pd.DataFrame(index = dataset_df_drop.index.get_level_values(0).unique(), columns = [ground_truth_label])
    
    # Dummy X for input to stratified K fold splitting
    X = user_split_pids.values
    
    user_split_pids[ground_truth_label] = 0
    user_split_pids.loc[positive_ids] = 1
    
    y = user_split_pids.values
    
    skf = StratifiedKFold(n_splits=5, random_state = 42, shuffle=True)
    skf.get_n_splits(X, y)
    
    accuracy_score_holder = []
    au_roc_holder = []
    av_prec_holder = []
    
    for fold, (train, test) in enumerate(skf.split(X, y)):
        train_pids = user_split_pids.iloc[train].index
        test_pids = user_split_pids.iloc[test].index
    
        X_train = dataset_df_drop.loc[train_pids, feature_column_names].values
        X_test = dataset_df_drop.loc[test_pids, feature_column_names].values
    
        y_train = dataset_df_drop.loc[train_pids, ground_truth_label].values
        y_test = dataset_df_drop.loc[test_pids, ground_truth_label].values
    
        clf = HistGradientBoostingClassifier(random_state = 42, l2_regularization = 0.2, early_stopping = False)
        clf.fit(X_train, y_train)
        
        y_test_predict_prob = clf.predict_proba(X_test)
        y_test_predict = clf.predict(X_test)

        weights = calculate_sample_weights(y_test.astype(int))

        accuracy_score_holder.append(accuracy_score(y_test, y_test_predict, sample_weight = weights))
        au_roc_holder.append(roc_auc_score(y_test, y_test_predict_prob[:,1]))
        av_prec_holder.append(average_precision_score(y_test, y_test_predict_prob[:,1]))
    
    print(f"Average ROC: {np.array(au_roc_holder).mean().round(3)}, Average Precision: {np.array(av_prec_holder).mean().round(4)}, Average accuracy: {np.array(accuracy_score_holder).mean().round(4)}")
    




Task is is_pos
Average ROC: 0.638, Average Precision: 0.0041, Average accuracy: 0.5


Task is have_flu
Average ROC: 0.7, Average Precision: 0.0479, Average accuracy: 0.4999


Task is sx_Fever
Average ROC: 0.709, Average Precision: 0.0998, Average accuracy: 0.5107
CPU times: user 3min 12s, sys: 3.62 s, total: 3min 15s
Wall time: 33.6 s


In [None]:
%%time

prediction_detection = 'prediction'
ground_truth_label = 'sx_Fever'
#ground_truth_label = 'is_pos'
#ground_truth_label = 'have_flu'

optimal_offset = 12
optimal_window_length = 10
minimum_number = 6

"""
COVID-RED dataset
"""
covid_red_generalization_columns = ["WDPULSE", "WDSLEEP"]
z_scored_df = z_score_modalities(covid_red_df, covid_red_generalization_columns, optimal_offset, minimum_number, optimal_window_length, 'SUBJID')
z_score_columns = [col + "_z" for col in covid_red_generalization_columns]
dataset_df, feature_column_names = get_ML_dataset(z_scored_df, z_score_columns, 'prediction', 'SUBJID', 'sx_Fever')
dataset_df[['is_pos', 'have_flu']] = covid_red_df[['is_pos', 'have_flu']].values
covid_red_df_drop = dataset_df[np.hstack([feature_column_names, ['sx_Fever', 'is_pos', 'have_flu']])].replace([np.inf, -np.inf, None], np.nan).dropna()

"""
Homekit dataset
"""
homekit_generalization_columns = ['resting_heart_rate','total_asleep_minutes']
z_scored_df = z_score_modalities(homekit_nightly, homekit_generalization_columns, optimal_offset, minimum_number, optimal_window_length, 'participant_id')
z_score_columns = [col + "_z" for col in homekit_generalization_columns]
dataset_df, feature_column_names = get_ML_dataset(z_scored_df, z_score_columns, 'prediction', 'participant_id', 'sx_Fever')
dataset_df[['is_pos', 'have_flu']] = homekit_nightly[['is_pos', 'have_flu']].values
homekit_df_drop = dataset_df[np.hstack([feature_column_names, ['sx_Fever', 'is_pos', 'have_flu']])].replace([np.inf, -np.inf, None], np.nan).dropna()

# Homekit -> COVID-RED

In [None]:
%%time
optimal_offset = 12
optimal_window_length = 10
minimum_number = 6

"""
Homekit dataset training
"""
tasks = ['is_pos', 'have_flu', 'sx_Fever']
prediction_detection = 'prediction'

for ground_truth_label in tasks:
    print("\n")
    print(f"Task is {ground_truth_label}")

    """
    Homekit generalization testing
    """
    homekit_generalization_columns = ['resting_heart_rate','total_asleep_minutes']
    z_scored_df = z_score_modalities(homekit_nightly, homekit_generalization_columns, optimal_offset, minimum_number, optimal_window_length, 'pid')
    z_score_columns = [col + "_z" for col in homekit_generalization_columns]
    dataset_df, homekit_feature_column_names = get_ML_dataset(z_scored_df, z_score_columns, prediction_detection, 'pid', ground_truth_label)
    homekit_dataset_df_drop = dataset_df[np.hstack([homekit_feature_column_names, [ground_truth_label]])].replace([np.inf, -np.inf, None], np.nan).dropna()
    X, y = homekit_dataset_df_drop[homekit_feature_column_names].values, homekit_dataset_df_drop[ground_truth_label].values    
    
    positive_ids = homekit_dataset_df_drop[homekit_dataset_df_drop[ground_truth_label] == 1].index.get_level_values(0).unique()
    user_split_pids = pd.DataFrame(index = homekit_dataset_df_drop.index.get_level_values(0).unique(), columns = [ground_truth_label])
    
    # Dummy X for input to stratified K fold splitting
    X = user_split_pids.values
    
    user_split_pids[ground_truth_label] = 0
    user_split_pids.loc[positive_ids] = 1
    
    y = user_split_pids.values
    
    skf = StratifiedKFold(n_splits=5, random_state = 42, shuffle=True)
    skf.get_n_splits(X, y)
    
    f1_score_holder = []
    au_roc_holder = []
    av_prec_holder = []
    concept_shift_holder_covid_red = []
    #concept_shift_holder_homekit = []

    """
    COVID-RED dataset
    """
    
    covid_red_generalization_columns = ["WDPULSE", "WDSLEEP"]
    
    z_scored_df = z_score_modalities(covid_red_nightly, covid_red_generalization_columns, optimal_offset, minimum_number, optimal_window_length, 'pid')
    z_score_columns = [col + "_z" for col in covid_red_generalization_columns]
    
    dataset_df, covid_red_feature_column_names = get_ML_dataset(z_scored_df, z_score_columns, prediction_detection, 'pid', ground_truth_label)
    
    covid_red_dataset_df_drop = dataset_df[np.hstack([covid_red_feature_column_names, [ground_truth_label]])].replace([np.inf, -np.inf, None], np.nan).dropna()
    covid_red_X, covid_red_y = covid_red_dataset_df_drop[covid_red_feature_column_names].values, covid_red_dataset_df_drop[ground_truth_label].values


    for fold, (train, test) in enumerate(skf.split(X, y)):
        train_pids = user_split_pids.iloc[train].index
        test_pids = user_split_pids.iloc[test].index
    
        X_train = homekit_dataset_df_drop.loc[train_pids, homekit_feature_column_names].values
        X_test = homekit_dataset_df_drop.loc[test_pids, homekit_feature_column_names].values
    
        y_train = homekit_dataset_df_drop.loc[train_pids, ground_truth_label].values
        y_test = homekit_dataset_df_drop.loc[test_pids, ground_truth_label].values
    
        clf = HistGradientBoostingClassifier(random_state = 42, l2_regularization = 0.2, early_stopping = False)
        clf.fit(X_train, y_train)
        
        y_test_predict_prob = clf.predict_proba(X_test)
        y_test_predict = clf.predict(X_test)
        
        au_roc_holder.append(roc_auc_score(y_test, y_test_predict_prob[:,1]))
        av_prec_holder.append(average_precision_score(y_test, y_test_predict_prob[:,1]))

        """
        Determine proportion of shift due to P(Y|X) shift
        """
        
        """
        Concept shift: COVID-RED
        """
        model = copy.deepcopy(clf)
        target_X, target_y = covid_red_X, covid_red_y
        source_X, source_y = X_test, y_test
    
        p2p, q2q, p2s, s2q = degradation_decomp(source_X, source_y, target_X, target_y, model, data_sum=20000, K=8, draw_calibration=False)
        concept_shift_holder_covid_red.append(np.abs(p2s-s2q)/np.abs(p2p-q2q))


        
    print(f"Average ROC: {np.array(au_roc_holder).mean().round(3)}, Average Precision: {np.array(av_prec_holder).mean().round(4)}")
    print(f"Average proportion of concept shift. COVID-RED: {np.array(concept_shift_holder_covid_red).mean().round(3)}, ")

    clf = HistGradientBoostingClassifier(random_state = 42, l2_regularization = 0.2, early_stopping = False)
    clf.fit(homekit_dataset_df_drop[homekit_feature_column_names].values, homekit_dataset_df_drop[ground_truth_label].values)
    
    """
    COVID-RED predictions
    """
    y_test_predict_prob = clf.predict_proba(covid_red_X)
    y_test_predict = clf.predict(covid_red_X)
    
    print('\n')
    #print(f"F1 score, {f1_score(covid_red_y, y_test_predict)}")
    print(f"Average precision, {average_precision_score(covid_red_y, y_test_predict_prob[:,1]).round(4)}")
    print(f"AUC ROC, {roc_auc_score(covid_red_y, y_test_predict_prob[:,1]).round(3)}")
  

# COVID-RED -> Homekit

In [None]:
%%time
optimal_offset = 12
optimal_window_length = 10
minimum_number = 6

"""
COVID-RED dataset training
"""
tasks = ['is_pos', 'have_flu', 'sx_Fever']
prediction_detection = 'prediction'

for ground_truth_label in tasks:
    print("\n")
    print(f"Task is {ground_truth_label}")

    """
    COVID-RED dataset
    """
    
    covid_red_generalization_columns = ["WDPULSE", "WDSLEEP"]
    
    z_scored_df = z_score_modalities(covid_red_nightly, covid_red_generalization_columns, optimal_offset, minimum_number, optimal_window_length, 'pid')
    z_score_columns = [col + "_z" for col in covid_red_generalization_columns]
    dataset_df, covid_red_feature_column_names = get_ML_dataset(z_scored_df, z_score_columns, prediction_detection, 'pid', ground_truth_label)
    covid_red_dataset_df_drop = dataset_df[np.hstack([covid_red_feature_column_names, [ground_truth_label]])].replace([np.inf, -np.inf, None], np.nan).dropna()
    
    X, y = covid_red_dataset_df_drop[covid_red_feature_column_names].values, covid_red_dataset_df_drop[ground_truth_label].values
    
    positive_ids = covid_red_dataset_df_drop[covid_red_dataset_df_drop[ground_truth_label] == 1].index.get_level_values(0).unique()
    user_split_pids = pd.DataFrame(index = covid_red_dataset_df_drop.index.get_level_values(0).unique(), columns = [ground_truth_label])
    
    # Dummy X for input to stratified K fold splitting
    X = user_split_pids.values
    
    user_split_pids[ground_truth_label] = 0
    user_split_pids.loc[positive_ids] = 1
    
    y = user_split_pids.values
    
    skf = StratifiedKFold(n_splits=5, random_state = 42, shuffle=True)
    skf.get_n_splits(X, y)
    
    f1_score_holder = []
    au_roc_holder = []
    av_prec_holder = []
    #concept_shift_holder_covid_red = []
    concept_shift_holder_homekit = []

    """
    Homekit generalization testing
    """
    homekit_generalization_columns = ['resting_heart_rate','total_asleep_minutes']
    z_scored_df = z_score_modalities(homekit_nightly, homekit_generalization_columns, optimal_offset, minimum_number, optimal_window_length, 'pid')
    z_score_columns = [col + "_z" for col in homekit_generalization_columns]
    dataset_df, homekit_feature_column_names = get_ML_dataset(z_scored_df, z_score_columns, prediction_detection, 'pid', ground_truth_label)
    homekit_dataset_df_drop = dataset_df[np.hstack([homekit_feature_column_names, [ground_truth_label]])].replace([np.inf, -np.inf, None], np.nan).dropna()
    homekit_X, homekit_y = homekit_dataset_df_drop[homekit_feature_column_names].values, homekit_dataset_df_drop[ground_truth_label].values

    
    for fold, (train, test) in enumerate(skf.split(X, y)):
        train_pids = user_split_pids.iloc[train].index
        test_pids = user_split_pids.iloc[test].index
    
        X_train = covid_red_dataset_df_drop.loc[train_pids, covid_red_feature_column_names].values
        X_test = covid_red_dataset_df_drop.loc[test_pids, covid_red_feature_column_names].values
    
        y_train = covid_red_dataset_df_drop.loc[train_pids, ground_truth_label].values
        y_test = covid_red_dataset_df_drop.loc[test_pids, ground_truth_label].values
    
        clf = HistGradientBoostingClassifier(random_state = 42, l2_regularization = 0.2, early_stopping = False)
        clf.fit(X_train, y_train)
        
        y_test_predict_prob = clf.predict_proba(X_test)
        y_test_predict = clf.predict(X_test)
        
        au_roc_holder.append(roc_auc_score(y_test, y_test_predict_prob[:,1]))
        av_prec_holder.append(average_precision_score(y_test, y_test_predict_prob[:,1]))

        """
        Determine proportion of shift due to P(Y|X) shift
        """
    
        """
        Concept shift: Homekit
        """
        model = copy.deepcopy(clf)
        target_X, target_y = homekit_X, homekit_y
        source_X, source_y = X_test, y_test
    
        p2p, q2q, p2s, s2q = degradation_decomp(source_X, source_y, target_X, target_y, model, data_sum=20000, K=8, draw_calibration=False)
        concept_shift_holder_homekit.append(np.abs(p2s-s2q)/np.abs(p2p-q2q))
        

        
    print(f"Average ROC: {np.array(au_roc_holder).mean().round(3)}, Average Precision: {np.array(av_prec_holder).mean().round(4)}")
    print(f"Average proportion of concept shift. Homekit: {np.array(concept_shift_holder_homekit).mean().round(3)}, ")

    clf = HistGradientBoostingClassifier(random_state = 42, l2_regularization = 0.2, early_stopping = False)
    clf.fit(covid_red_dataset_df_drop[covid_red_feature_column_names].values, covid_red_dataset_df_drop[ground_truth_label].values)
    
    """
    Homekit prediction
    """
    y_test_predict_prob = clf.predict_proba(homekit_X)
    y_test_predict = clf.predict(homekit_X)
    
    print('\n')
    print("HGBC COVID-RED -> Homekit")
    #print(f"F1 score, {f1_score(homekit_y, y_test_predict)}")
    print(f"Average precision, {average_precision_score(homekit_y, y_test_predict_prob[:,1]).round(3)}")
    print(f"AUC ROC, {roc_auc_score(homekit_y, y_test_predict_prob[:,1]).round(3)}")
