In [1230]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline

In [1231]:
!pip install category_encoders
import category_encoders as ce



In [1232]:
pd.options.mode.chained_assignment = None

In [1233]:
df = pd.read_csv('https://docs.google.com/spreadsheets/d/e/2PACX-1vTMIUP6_JIoxWSAFCe1h6Hz12r-41t6qHv5cCXIBmYJUK2KS188pKkZnkr4jJRpIcC3mRZV36z21oNv/pub?gid=0&single=true&output=csv')

In [1234]:
header_list = ['Date','Weight','Fat','Sleep Debt','REM','Deep Sleep','Snore',
     'Meditate','Spanish','Push-ups','Pull-ups','Sit-ups','Coffee','Handstands',
     'Acro','Swing','Strain','Calories','AHR','MHR','HRV','RHR','Recovery',
     'Carbs','Journal','Spinal Mobility','Flexibility','Notes','DOW',
     'Weight_AVR','Fat_AVR','Sleep Debt_AVR','REM_AVR','Deep Sleep_AVR',
     'Strain_AVR','Calories_AVR','AHR_AVR','MHR_AVR','HRV_AVR','RHR_AVR',
     'Recovery_AVR','Weight_PASS','Fat_PASS','Sleep Debt_PASS','REM_PASS',
     'Deep Sleep_PASS','Strain_PASS','Calories_PASS','AHR_PASS','MHR_PASS',
     'HRV_PASS','RHR_PASS','Recovery_PASS']

In [1235]:
df = df.reindex(columns = header_list)

In [1236]:
df['Date'] = pd.to_datetime(df['Date'])

for i in range(len(df)):
    df['DOW'][i] = df['Date'][i].day_name()

In [1237]:
features = ['Weight','Fat','Sleep Debt','REM','Deep Sleep','Strain','Calories','AHR','MHR','HRV','RHR','Recovery']
special_features = ['Sleep Debt','AHR','RHR']
drops = ['Acro','Meditate','Snore','Coffee','Handstands','Spanish','Push-ups',
         'Pull-ups','Sit-ups','Swing','Carbs','Journal','Spinal Mobility',
         'Flexibility','Notes']
week_day = ['Sunday','Monday','Tuesday','Wednesday','Thursday','Friday','Saturday']

In [1238]:
for each in features:
    df[each] = df[each].interpolate(method='linear')

In [1239]:
df = df.drop(labels=drops,axis=1)

In [1240]:
df['Weight'].mode() #most common weight

0    151.4
dtype: float64

In [1241]:
for i in range(0,df.shape[0]-1):
    for j in range(len(features)):
        string = features[j] + '_AVR'
        df[string] = df.iloc[:,j+1].expanding(min_periods=7).mean()
        #print(i,":",features[j],":",df[string][i])

features = ['Weight','Fat','Sleep Debt','REM','Deep Sleep','Strain','Calories','AHR','MHR','HRV','RHR','Recovery']

special_features = ['Sleep Debt','AHR','RHR']

In [1242]:
for i in range(0,df.shape[0]-1):
    for j in range(len(features)):
            score = features[j] + '_PASS'
            avr = features[j] + '_AVR'

            if features[j] not in [special_features]:
                
                if df[features[j]][i] >= df[avr][i]:
                    df[score][i] = "Y"
                    
                else:
                    df[score][i] = "N"
                    
            elif features[j] in [special_features]:
                    
                if df[features[j]][i] <= df[avr][i]:
                    df[score][i] = "Y"
                    
                else:
                    df[score][i] = "N"
                   

In [1243]:
df.describe()

Unnamed: 0,Weight,Fat,Sleep Debt,REM,Deep Sleep,Strain,Calories,AHR,MHR,HRV,...,Sleep Debt_AVR,REM_AVR,Deep Sleep_AVR,Strain_AVR,Calories_AVR,AHR_AVR,MHR_AVR,HRV_AVR,RHR_AVR,Recovery_AVR
count,587.0,587.0,587.0,587.0,587.0,587.0,587.0,587.0,587.0,587.0,...,581.0,581.0,581.0,581.0,581.0,581.0,581.0,581.0,581.0,581.0
mean,152.286882,13.389949,85.996593,75.486371,66.22402,11.609284,2895.15247,79.293867,160.012777,33.425894,...,94.838157,53.839581,53.223175,12.276892,3388.58208,79.006962,159.461727,31.555181,65.002221,48.32074
std,3.429221,0.790793,27.206873,34.653679,31.572887,2.246374,908.638589,4.403359,6.61813,8.518731,...,9.139078,16.338589,9.088731,0.528289,283.77539,0.811472,0.883935,2.05096,0.485542,2.950243
min,146.8,12.2,0.0,2.0,2.0,4.6,449.0,71.0,132.0,10.0,...,75.928571,21.428571,43.514411,11.609284,2895.15247,77.808383,157.975716,21.7,64.177388,45.386157
25%,148.932692,12.562019,71.0,49.061644,42.150685,10.0,2153.5,76.0,156.45602,29.0,...,88.288727,39.107877,45.577874,11.741953,3132.744898,78.418831,158.715328,31.391346,64.709256,46.394089
50%,151.4,13.2,85.0,76.0,71.0,11.9,2765.0,79.0,159.726027,33.298246,...,90.011076,55.846801,50.569883,12.279836,3441.47678,78.872727,159.543364,31.951847,64.929245,47.132634
75%,155.821795,14.182692,100.857143,98.811111,85.0,13.265789,3660.719298,82.0,162.0,37.038406,...,100.174583,68.528281,59.103125,12.501991,3530.685978,79.219406,160.146098,32.871041,65.195544,48.638408
max,159.6,15.0,192.0,200.0,165.0,16.8,5660.0,93.0,193.0,81.0,...,114.622348,75.486371,84.75,13.306667,3907.733333,81.608696,162.266667,33.430034,68.857143,56.166667


In [1244]:
test = df[df['Date'] >= '12/03/2019']
train = df[df['Date'] <= '08/06/2019']
val = df[(df['Date'] <= '12/02/2019') & (df['Date'] >= '08/07/2019')]

In [1245]:
target = 'Sleep Debt_PASS'
# Get a dataframe with all train columns except the target
train_features = train.drop(columns=[target])

# Get a list of the numeric features
numeric_features = train_features.select_dtypes(include='number').columns.tolist()

# Get a series with the cardinality of the nonnumeric features
cardinality = train_features.select_dtypes(exclude='number').nunique()

# Get a list of all categorical features with cardinality <= 50
categorical_features = cardinality[cardinality <= 25].index.tolist()

# Combine the lists 
features = numeric_features + categorical_features

In [1246]:
X_train = train[features]
y_train = train[target]
X_val = val[features]
y_val = val[target]
X_test = test[features]

In [1247]:
y_train.value_counts(normalize=True)

N    0.64878
Y    0.35122
Name: Sleep Debt_PASS, dtype: float64

In [1248]:
pipeline = make_pipeline(
    ce.OneHotEncoder(use_cat_names=True), 
    SimpleImputer(strategy='mean'), 
    RandomForestClassifier(max_features='sqrt',n_estimators=100,n_jobs=-1, random_state=42)
)

In [1249]:
pipeline.fit(X_train, y_train)
print('Train Accuracy', pipeline.score(X_train, y_train))
print('Validation Accuracy', pipeline.score(X_val, y_val))

Train Accuracy 1.0
Validation Accuracy 0.5677966101694916


In [1250]:
y_pred = pipeline.predict(X_test)

In [1251]:
X_test.head()

Unnamed: 0,Weight,Fat,Sleep Debt,REM,Deep Sleep,Strain,Calories,AHR,MHR,HRV,...,Fat_PASS,REM_PASS,Deep Sleep_PASS,Strain_PASS,Calories_PASS,AHR_PASS,MHR_PASS,HRV_PASS,RHR_PASS,Recovery_PASS
528,157.2,14.4,89.0,94.0,102.0,9.5,1775.0,76.0,160.0,38.0,...,Y,Y,Y,N,N,N,N,Y,N,Y
529,157.2,14.4,43.0,123.0,137.0,9.5,2146.0,80.0,160.0,36.0,...,Y,Y,Y,N,N,Y,N,Y,N,Y
530,155.5,14.1,96.0,95.0,70.0,11.4,1851.0,82.0,156.0,30.0,...,Y,Y,Y,N,N,Y,N,N,N,Y
531,156.8,14.4,98.0,97.0,88.0,6.1,1713.0,77.0,153.0,46.0,...,Y,Y,Y,N,N,N,N,Y,Y,Y
532,156.0,14.3,89.0,101.0,81.0,8.2,1834.0,79.0,151.0,28.0,...,Y,Y,Y,N,N,N,N,N,N,Y


In [1252]:
y_pred

array(['Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',
       'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',
       'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',
       'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y',
       'Y', 'Y', 'Y', 'Y', 'Y', 'Y', 'Y'], dtype=object)

In [1253]:
print('Test Accuracy', pipeline.score(X_test, y_pred))

Test Accuracy 1.0


In [1254]:
df['Sleep Debt'][538], df['Sleep Debt_PASS'][538], test['Sleep Debt'][538], X_test['Sleep Debt'][538], np.array(y_pred)[10],X_test['Sleep Debt_AVR'][538]

(40.0, 'N', 40.0, 40.0, 'Y', 87.54359925788498)

In [1255]:
df['RHR'][538], df['RHR_PASS'][538], test['RHR'][538], X_test['RHR'][538], np.array(y_pred)[10],X_test['RHR_AVR'][538]

(65.0, 'Y', 65.0, 65.0, 'Y', 64.6799628942486)

In [1256]:
df['AHR'][538], df['AHR_PASS'][538], test['AHR'][538], X_test['AHR'][538], np.array(y_pred)[10],X_test['AHR_AVR'][538]

(78.0, 'N', 78.0, 78.0, 'Y', 79.06771799628943)