In [6]:
import os
import pickle
import pandas as pd
from scipy.stats import spearmanr
from sklearn.metrics import median_absolute_error
import matplotlib.pyplot as plt
import seaborn as sns

def return_rho(df):
    """Return Spearman's rank correlation between observed and predicted values.

    Parameters
    ----------
    df : object exposing ``y_test`` and ``y_pred`` attributes
        Both are handed unchanged to ``scipy.stats.spearmanr``.

    Returns
    -------
    The correlation coefficient; the accompanying p-value is discarded.
    """
    # spearmanr returns a (correlation, p-value) pair; only the first is needed.
    return spearmanr(df.y_test, df.y_pred)[0]

def return_mae(df):
    """Return the median absolute error between observed and predicted values.

    Note that "mae" here is the *median* (not mean) absolute error, as computed
    by ``sklearn.metrics.median_absolute_error``.

    Parameters
    ----------
    df : object exposing ``y_test`` and ``y_pred`` attributes
        ``y_test`` is passed as the true values, ``y_pred`` as the predictions.

    Returns
    -------
    The value returned by ``median_absolute_error``.
    """
    return median_absolute_error(df.y_test, df.y_pred)

def return_overall_rho(experiment, model_name):
    """Compute one pooled Spearman rho per test fold for a fitted model.

    For folds 1 through 5 this loads the pickled model
    ``experiment-<experiment>/models/<model_name>_<fold>.pkl`` together with
    ``X_test_<fold>.csv`` and ``y_test_<fold>.csv`` from
    ``experiment-<experiment>/data`` (all relative to the current working
    directory), predicts on the test features and correlates the predictions
    with the observed values over the whole fold.

    Parameters
    ----------
    experiment : int or str
        Identifier used to build the ``experiment-<experiment>`` folder name.
    model_name : str
        Prefix of the pickled model files (e.g. ``"lasso"``).

    Returns
    -------
    pandas object
        The per-fold rho values, accumulated by concatenating one
        single-element Series per fold onto an initially empty DataFrame.
    """
    experiment_dir = os.path.join(os.getcwd(), "experiment-" + str(experiment))
    models_dir = os.path.join(experiment_dir, "models")
    data_dir = os.path.join(experiment_dir, "data")

    multiple_rho = pd.DataFrame()

    for fold in range(1, 6):
        model_path = os.path.join(models_dir, model_name + "_" + str(fold) + ".pkl")

        # Close the file deterministically instead of leaking the handle.
        # NOTE: unpickling can execute arbitrary code; only load model files
        # produced by this project.
        with open(model_path, "rb") as model_file:
            model = pickle.load(model_file)

        X_test = pd.read_csv(os.path.join(data_dir, "X_test_" + str(fold) + ".csv"), index_col=0)
        y_test = pd.read_csv(os.path.join(data_dir, "y_test_" + str(fold) + ".csv"), index_col=0)

        y_pred = pd.DataFrame(model.predict(X_test))
        y_pred.columns = ["y_pred"]
        y_test.columns = ["y_test"]

        # reset_index gives y_test a fresh positional index so it lines up
        # row-by-row with the freshly built y_pred frame.
        df = pd.concat([y_test.reset_index(), y_pred], axis=1)

        # Accumulated incrementally on purpose: this keeps the shape of the
        # returned object exactly as before.
        multiple_rho = pd.concat([multiple_rho, pd.Series(return_rho(df))], axis=0)

    return multiple_rho

def return_overall_mae(experiment, model_name):
    """Compute one pooled median absolute error per test fold for a fitted model.

    For folds 1 through 5 this loads the pickled model
    ``experiment-<experiment>/models/<model_name>_<fold>.pkl`` together with
    ``X_test_<fold>.csv`` and ``y_test_<fold>.csv`` from
    ``experiment-<experiment>/data`` (all relative to the current working
    directory), predicts on the test features and computes the median absolute
    error over the whole fold.

    Parameters
    ----------
    experiment : int or str
        Identifier used to build the ``experiment-<experiment>`` folder name.
    model_name : str
        Prefix of the pickled model files (e.g. ``"lasso"``).

    Returns
    -------
    pandas object
        The per-fold error values, accumulated by concatenating one
        single-element Series per fold onto an initially empty DataFrame.
    """
    experiment_dir = os.path.join(os.getcwd(), "experiment-" + str(experiment))
    models_dir = os.path.join(experiment_dir, "models")
    data_dir = os.path.join(experiment_dir, "data")

    multiple_mae = pd.DataFrame()

    for fold in range(1, 6):
        model_path = os.path.join(models_dir, model_name + "_" + str(fold) + ".pkl")

        # Close the file deterministically instead of leaking the handle.
        # NOTE: unpickling can execute arbitrary code; only load model files
        # produced by this project.
        with open(model_path, "rb") as model_file:
            model = pickle.load(model_file)

        X_test = pd.read_csv(os.path.join(data_dir, "X_test_" + str(fold) + ".csv"), index_col=0)
        y_test = pd.read_csv(os.path.join(data_dir, "y_test_" + str(fold) + ".csv"), index_col=0)

        y_pred = pd.DataFrame(model.predict(X_test))
        y_pred.columns = ["y_pred"]
        y_test.columns = ["y_test"]

        # reset_index gives y_test a fresh positional index so it lines up
        # row-by-row with the freshly built y_pred frame.
        df = pd.concat([y_test.reset_index(), y_pred], axis=1)

        # Accumulated incrementally on purpose: this keeps the shape of the
        # returned object exactly as before.
        multiple_mae = pd.concat([multiple_mae, pd.Series(return_mae(df))], axis=0)

    return multiple_mae

def return_multiple_rho(experiment, model_name):
    """Compute Spearman rho separately for every ``id`` group in every test fold.

    For folds 1 through 5 this loads the pickled model
    ``experiment-<experiment>/models/<model_name>_<fold>.pkl`` together with
    ``X_test_<fold>.csv`` and ``y_test_<fold>.csv`` from
    ``experiment-<experiment>/data`` (all relative to the current working
    directory), predicts on the test features, groups the rows by the ``id``
    column and correlates predictions with observations within each group.

    Parameters
    ----------
    experiment : int or str
        Identifier used to build the ``experiment-<experiment>`` folder name.
    model_name : str
        Prefix of the pickled model files (e.g. ``"lasso"``).

    Returns
    -------
    pandas object
        The per-group rho values of all folds stacked on top of each other,
        accumulated by concatenating each fold's groupby result onto an
        initially empty DataFrame.
    """
    experiment_dir = os.path.join(os.getcwd(), "experiment-" + str(experiment))
    models_dir = os.path.join(experiment_dir, "models")
    data_dir = os.path.join(experiment_dir, "data")

    multiple_rho = pd.DataFrame()

    for fold in range(1, 6):
        model_path = os.path.join(models_dir, model_name + "_" + str(fold) + ".pkl")

        # Close the file deterministically instead of leaking the handle.
        # NOTE: unpickling can execute arbitrary code; only load model files
        # produced by this project.
        with open(model_path, "rb") as model_file:
            model = pickle.load(model_file)

        X_test = pd.read_csv(os.path.join(data_dir, "X_test_" + str(fold) + ".csv"), index_col=0)
        y_test = pd.read_csv(os.path.join(data_dir, "y_test_" + str(fold) + ".csv"), index_col=0)

        y_pred = pd.DataFrame(model.predict(X_test))
        y_pred.columns = ["y_pred"]
        y_test.columns = ["y_test"]

        # reset_index turns the y_test index into a regular column (the
        # grouping below expects it to be called "id") and aligns the rows
        # positionally with y_pred.
        df = pd.concat([y_test.reset_index(), y_pred], axis=1)

        # Accumulated incrementally on purpose: this keeps the shape of the
        # returned object exactly as before.
        multiple_rho = pd.concat([multiple_rho, df.groupby("id").apply(return_rho)], axis=0)

    return multiple_rho

def return_multiple_mae(experiment, model_name):
    """Compute the median absolute error for every ``id`` group in every test fold.

    For folds 1 through 5 this loads the pickled model
    ``experiment-<experiment>/models/<model_name>_<fold>.pkl`` together with
    ``X_test_<fold>.csv`` and ``y_test_<fold>.csv`` from
    ``experiment-<experiment>/data`` (all relative to the current working
    directory), predicts on the test features, groups the rows by the ``id``
    column and computes the median absolute error within each group.

    Parameters
    ----------
    experiment : int or str
        Identifier used to build the ``experiment-<experiment>`` folder name.
    model_name : str
        Prefix of the pickled model files (e.g. ``"lasso"``).

    Returns
    -------
    pandas object
        The per-group error values of all folds stacked on top of each other,
        accumulated by concatenating each fold's groupby result onto an
        initially empty DataFrame.
    """
    experiment_dir = os.path.join(os.getcwd(), "experiment-" + str(experiment))
    models_dir = os.path.join(experiment_dir, "models")
    data_dir = os.path.join(experiment_dir, "data")

    multiple_mae = pd.DataFrame()

    for fold in range(1, 6):
        model_path = os.path.join(models_dir, model_name + "_" + str(fold) + ".pkl")

        # Close the file deterministically instead of leaking the handle.
        # NOTE: unpickling can execute arbitrary code; only load model files
        # produced by this project.
        with open(model_path, "rb") as model_file:
            model = pickle.load(model_file)

        X_test = pd.read_csv(os.path.join(data_dir, "X_test_" + str(fold) + ".csv"), index_col=0)
        y_test = pd.read_csv(os.path.join(data_dir, "y_test_" + str(fold) + ".csv"), index_col=0)

        y_pred = pd.DataFrame(model.predict(X_test))
        y_pred.columns = ["y_pred"]
        y_test.columns = ["y_test"]

        # reset_index turns the y_test index into a regular column (the
        # grouping below expects it to be called "id") and aligns the rows
        # positionally with y_pred.
        df = pd.concat([y_test.reset_index(), y_pred], axis=1)

        # Accumulated incrementally on purpose: this keeps the shape of the
        # returned object exactly as before.
        multiple_mae = pd.concat([multiple_mae, df.groupby("id").apply(return_mae)], axis=0)

    return multiple_mae

def return_true_pred_plot(experiment, model_name):
    """Show one observed-vs-predicted scatter plot per test fold.

    For folds 1 through 5 this loads the pickled model
    ``experiment-<experiment>/models/<model_name>_<fold>.pkl`` and the matching
    test CSVs from ``experiment-<experiment>/data`` (relative to the current
    working directory), predicts on the test features and scatters predictions
    against observations. Only points whose observed and predicted values both
    lie strictly between 3 and 12 are drawn, matching the axis limits.

    Every figure is titled with the median of the per-``id`` median absolute
    errors returned by ``return_multiple_mae``.

    Parameters
    ----------
    experiment : int or str
        Identifier used to build the ``experiment-<experiment>`` folder name.
    model_name : str
        Prefix of the pickled model files (e.g. ``"lasso"``).

    Returns
    -------
    None
        The figures are displayed with ``plt.show()``.
    """
    lower, upper = 3, 12

    experiment_dir = os.path.join(os.getcwd(), "experiment-" + str(experiment))
    models_dir = os.path.join(experiment_dir, "models")
    data_dir = os.path.join(experiment_dir, "data")

    # The title does not depend on the fold, so evaluate it once instead of
    # reloading all five models on every iteration.
    title = "Median MAE: " + str(return_multiple_mae(experiment, model_name).median().iloc[0])

    for fold in range(1, 6):
        model_path = os.path.join(models_dir, model_name + "_" + str(fold) + ".pkl")

        # Close the file deterministically instead of leaking the handle.
        # NOTE: unpickling can execute arbitrary code; only load model files
        # produced by this project.
        with open(model_path, "rb") as model_file:
            model = pickle.load(model_file)

        X_test = pd.read_csv(os.path.join(data_dir, "X_test_" + str(fold) + ".csv"), index_col=0)
        y_test = pd.read_csv(os.path.join(data_dir, "y_test_" + str(fold) + ".csv"), index_col=0)

        y_pred = pd.DataFrame(model.predict(X_test))
        y_pred.columns = ["y_pred"]
        y_test.columns = ["y_test"]

        df = pd.concat([y_test.reset_index(), y_pred], axis=1)

        # Keep only the points that fall inside the plotted window.
        in_window = (
            (df.y_test > lower) & (df.y_pred > lower)
            & (df.y_test < upper) & (df.y_pred < upper)
        )
        df = df[in_window]

        plt.scatter(df.y_test, df.y_pred)
        plt.xlim(lower, upper)
        plt.ylim(lower, upper)
        plt.xlabel("Self-reported sleep duration")
        # Previously this label was passed to plt.show() and never drawn.
        plt.ylabel("Estimated sleep duration")
        # The title must be set before show(); afterwards it would end up on
        # the next, still empty, figure.
        plt.title(title)
        plt.show()
        
def results_single_study(row):
    """Summarise the per-``id`` rho and MAE values of one experiment/model pair.

    Parameters
    ----------
    row : sequence
        ``row[0]`` is the experiment identifier and the first element of
        ``row[1]`` is the model name. This is the shape of the tuples yielded
        by ``DataFrame.iterrows()``.

    Returns
    -------
    pandas.DataFrame
        A single row, indexed by the model name, holding the median, minimum
        and maximum of both metrics plus the share of rho values above 0.5 and
        above 0.8. MAE values are multiplied by 60.
    """
    experiment = row[0]
    study = row[1]
    # Take the first element by position: plain integer indexing on a
    # label-indexed Series is deprecated in recent pandas. Non-pandas
    # sequences are still accepted.
    model_name = study.iloc[0] if hasattr(study, "iloc") else study[0]

    rhos = return_multiple_rho(experiment, model_name)
    maes = return_multiple_mae(experiment, model_name)
    rhos.columns = ["rho"]
    maes.columns = ["mae"]

    # NOTE(review): the factor 60 presumably converts hours to minutes - confirm.
    # NOTE(review): the "n rho > ..." entries are fractions of all rows, not counts.
    n_rows = rhos.shape[0]
    summary = pd.DataFrame({"median rho":rhos.median().values[0],
                            "median mae":maes.median().values[0] * 60,
                            "min rho":rhos.min().values[0],
                            "min mae":maes.min().values[0] * 60,
                            "max rho":rhos.max().values[0],
                            "max mae":maes.max().values[0] * 60,
                            "n rho > 0.5":sum(rhos.rho > 0.5)/n_rows,
                            "n rho > 0.8":sum(rhos.rho > 0.8)/n_rows}, index = [model_name])

    return summary

def results_multiple_studies(x):
    """Stack the one-row summaries of several experiment/model pairs.

    Parameters
    ----------
    x : pandas.DataFrame
        Each ``(index, row)`` pair from ``x.iterrows()`` is passed to
        ``results_single_study``.

    Returns
    -------
    pandas.DataFrame
        The individual summaries concatenated row-wise in iteration order, or
        an empty DataFrame when ``x`` has no rows.
    """
    # Collect first and concatenate once, rather than growing a frame inside
    # the loop starting from an empty seed.
    summaries = [results_single_study(row) for row in x.iterrows()]

    if not summaries:
        # pd.concat refuses an empty list; keep returning an empty frame.
        return pd.DataFrame()

    return pd.concat(summaries, axis = 0)

def table_2():
    """Build the model comparison table and write it to ``table_2.csv``.

    Experiments 1-4 are paired with the models ``lasso``, ``svr``, ``rf`` and
    ``gbr`` respectively, summarised with ``results_multiple_studies`` and
    written to the folder named by the first ``markdown_path`` entry of
    ``study_parameters.json``.

    Returns
    -------
    None
    """
    study_parameters = pd.read_json("study_parameters.json")

    # The index carries the experiment number, the single column the model name.
    models = pd.DataFrame({"model":["lasso","svr","rf","gbr"]}, index = [1,2,3,4])
    table = results_multiple_studies(models)

    # "markdown_path" is a whole column; pick its first entry (as in_text
    # does) so that a plain string path is passed to to_csv.
    markdown_path = study_parameters["markdown_path"].iloc[0]
    table.to_csv(markdown_path + "table_2.csv")

In [7]:
# Median of the per-fold pooled Spearman rho for each experiment/model pair.
for experiment, model_name in ((1, "lasso"), (2, "svr"), (3, "rf"), (4, "gbr")):
    print(return_overall_rho(experiment, model_name).median())

Unnamed: 0,median rho,median mae,min rho,min mae,max rho,max mae,n rho > 0.5,n rho > 0.8
lasso,0.421696,43.59749,-0.176114,12.861592,0.813427,114.439629,0.369697,0.006061
svr,0.459122,41.393907,-0.140587,14.968655,0.765002,131.960998,0.406061,0.0
rf,0.575685,40.317754,-0.265405,12.406942,0.864547,113.387202,0.593939,0.054545
gbr,0.442623,41.900506,-0.111895,21.680395,0.80493,153.770172,0.381818,0.006061


In [10]:
# Evaluate each experiment/model pair once and reuse the per-fold rho values
# for all three summaries, instead of reloading every model three times.
overall_rhos = [
    return_overall_rho(experiment, model_name)
    for experiment, model_name in ((1, "lasso"), (2, "svr"), (3, "rf"), (4, "gbr"))
]

# Printed in the same order as before: a label, then one value per model.
for statistic in ("median", "min", "max"):
    print(statistic)
    for rhos in overall_rhos:
        print(getattr(rhos, statistic)())

median
0    0.502969
dtype: float64
0    0.522789
dtype: float64
0    0.633169
dtype: float64
0    0.52719
dtype: float64
min
0    0.327956
dtype: float64
0    0.387164
dtype: float64
0    0.429495
dtype: float64
0    0.391846
dtype: float64
max
0    0.560205
dtype: float64
0    0.594827
dtype: float64
0    0.668028
dtype: float64
0    0.550469
dtype: float64


In [8]:
# Evaluate each experiment/model pair once and reuse the per-fold MAE values
# for all three summaries, instead of reloading every model three times.
overall_maes = [
    return_overall_mae(experiment, model_name)
    for experiment, model_name in ((1, "lasso"), (2, "svr"), (3, "rf"), (4, "gbr"))
]

# Printed in the same order as before: all medians, then minima, then maxima.
# NOTE(review): the factor 60 presumably converts hours to minutes - confirm.
for statistic in ("median", "min", "max"):
    for maes in overall_maes:
        print(getattr(maes, statistic)() * 60)

0    44.659119
dtype: float64
0    41.617546
dtype: float64
0    40.037562
dtype: float64
0    42.280398
dtype: float64
0    41.352583
dtype: float64
0    39.248674
dtype: float64
0    39.77171
dtype: float64
0    40.713238
dtype: float64
0    46.615669
dtype: float64
0    44.920639
dtype: float64
0    42.823851
dtype: float64
0    43.382687
dtype: float64


In [2]:
import matplotlib.pyplot as plt
from scipy import stats
from scipy.stats import spearmanr
import pandas as pd
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt

In [19]:
# Pool the test-set predictions study by study and print the Spearman
# correlation and the median absolute error (times 60) after each study.
study_parameters = pd.read_json("study_parameters.json")
root = os.getcwd()
data = pd.DataFrame()
for row, study in study_parameters.iterrows():
    # The model prefix is the same for every fold, so look it up once per study.
    # NOTE(review): os.listdir order is arbitrary; this assumes every file in
    # the folder shares the same "<model>_" prefix - confirm.
    model_name = os.listdir(study["model_output_path"])[0].split("_")[0]

    # NOTE(review): only folds 1-4 are read here, whereas the helper functions
    # defined earlier in this notebook iterate over folds 1-5 - confirm.
    for i in range(1,5,1):

        filename = study["model_output_path"] + model_name + "_" + str(i) + ".pkl"

        # Close the file deterministically instead of leaking the handle.
        # NOTE: unpickling can execute arbitrary code; only load model files
        # produced by this project.
        with open(filename, 'rb') as model_file:
            model = pickle.load(model_file)
        X_test = pd.read_csv(study["data_output_path"] + "/X_test_" + str(i) + ".csv", index_col = 0)
        y_test = pd.read_csv(study["data_output_path"] + "/y_test_" + str(i) + ".csv", index_col = 0)

        y_pred = model.predict(X_test)
        y_pred = pd.DataFrame(y_pred)
        y_pred.columns = ["y_pred"]
        y_test.columns = ["y_test"]

        df = pd.concat([y_test.reset_index(), y_pred], axis = 1)
        data = pd.concat([data, df], axis = 0)

    # NOTE(review): `data` is never reset between studies, so each printed pair
    # covers every study processed so far, not just the current one - confirm
    # that this is intended.
    data = data.reset_index(drop=True)


    print(spearmanr(data.y_pred, data.y_test))
    print(median_absolute_error(data.y_pred, data.y_test) * 60)

SpearmanrResult(correlation=0.4815674529841081, pvalue=2.3217569362313565e-304)
0.7483246414728804
SpearmanrResult(correlation=0.49551148394291317, pvalue=0.0)
0.7242570061198159
SpearmanrResult(correlation=0.5234972443252375, pvalue=0.0)
0.7097378897333018
SpearmanrResult(correlation=0.5108585821452533, pvalue=0.0)
0.7100100482648761


In [3]:
import pandas as pd
def in_text(data_path="data.csv",
            mdna_path="/home/haalbers/dissertation/mobiledna-clean.csv",
            baseline_path="/home/haalbers/dissertation/baseline-longitudinal-clean.csv",
            study_parameters_path="study_parameters.json"):
    """Compute the descriptive values and write them to ``in_text_values.csv``.

    Parameters
    ----------
    data_path : str
        CSV with one row per observation and an ``id`` column.
    mdna_path : str
        CSV from which the ``id`` and ``startTime`` columns are read.
    baseline_path : str
        CSV with ``id``, ``sex`` and ``age`` columns (its first column is used
        as the index).
    study_parameters_path : str
        JSON file whose first ``markdown_path`` entry names the output folder.

    The defaults reproduce the previously hard-coded locations; pass other
    paths to run the analysis on a different machine.

    Returns
    -------
    None
        The values are written to ``<markdown_path>in_text_values.csv``.
    """

    # Read data
    df = pd.read_csv(data_path, low_memory=False)
    mdna = pd.read_csv(mdna_path, usecols = ["id", "startTime"], low_memory=False)
    baseline = pd.read_csv(baseline_path, index_col = 0, low_memory=False)
    study_parameters = pd.read_json(study_parameters_path)

    # Add date so we can count days person has been in the study
    mdna["date"] = pd.to_datetime(mdna.startTime).dt.date

    # Select included participants
    baseline = baseline[baseline.id.isin(df.id.unique().tolist())]

    # Get general descriptives
    n_participants = df.id.nunique()
    n_observations = df.shape[0]
    observations_per_id = df.id.value_counts()
    median_compliance = observations_per_id.median()
    std_compliance = observations_per_id.std()
    # 24 hours times the median number of distinct dates per id
    hours_of_logging = ( 24 * mdna.groupby('id').date.nunique().median() )
    # Average only the columns used below: a mean over every column fails on
    # non-numeric columns in recent pandas.
    baseline = baseline.groupby('id')[["sex", "age"]].mean().reset_index()
    # NOTE(review): this is the share of the most frequent `sex` value, which
    # is the percentage of women only if they form the largest group; the
    # denominator comes from `df`, not from `baseline` - confirm.
    percentage_female = baseline.sex.value_counts().max()/n_participants * 100
    median_age = baseline.age.median()
    std_age = baseline.age.std()

    # Generate table with in-text values
    in_text_values = pd.DataFrame({"variable_name" : ["n_participants", "n_observations", "median_compliance", "std_compliance", "hours_of_logging", "median_age", "std_age",
                                                      "percentage_female"],
                                   "value" : [n_participants, n_observations, median_compliance, std_compliance, hours_of_logging, median_age, std_age, percentage_female]})

    # Write to file
    in_text_values.to_csv(study_parameters["markdown_path"].iloc[0] + "in_text_values.csv")
    
# Compute the in-text descriptives and write them to in_text_values.csv.
in_text()