In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

df = pd.read_csv("../data/cleaned_dailyActivity.csv")

df.head()

Unnamed: 0,Id,ActivityDate,TotalSteps,TotalDistance,TrackerDistance,LoggedActivitiesDistance,VeryActiveDistance,ModeratelyActiveDistance,LightActiveDistance,SedentaryActiveDistance,VeryActiveMinutes,FairlyActiveMinutes,LightlyActiveMinutes,SedentaryMinutes,Calories,is_active_day
0,1503960366,2016-03-25,11004,7.11,7.11,0.0,2.57,0.46,4.07,0.0,33,12,205,804,1819,1
1,1503960366,2016-03-26,17609,11.55,11.55,0.0,6.92,0.73,3.91,0.0,89,17,274,588,2154,1
2,1503960366,2016-03-27,12736,8.53,8.53,0.0,4.66,0.16,3.71,0.0,56,5,268,605,1944,1
3,1503960366,2016-03-28,13231,8.93,8.93,0.0,3.19,0.79,4.95,0.0,39,20,224,1080,1932,1
4,1503960366,2016-03-29,12041,7.85,7.85,0.0,2.16,1.09,4.61,0.0,28,28,243,763,1886,1


In [16]:
threshold = df['TotalSteps'].quantile(0.75)

df['is_active'] = (df['TotalSteps'] >= threshold).astype(int)

print('Active Steps Threshold:', threshold)

Active Steps Threshold: 10198.0


In [28]:
features = ['VeryActiveMinutes',
            'TotalSteps',
            'Calories']

In [18]:
X = df[features]
y = df['is_active']

print(X.shape, y.shape)

(457, 3) (457,)


In [19]:
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 123, stratify = y)

In [20]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators = 100, random_state = 123)

In [23]:
cv_scores = cross_val_score(rf, X_train, y_train, cv = 10, scoring = 'f1')

print('F1 Scores (10-Fold CV:', cv_scores)
print('Mean Training Cross-Validation Accuracy: ', round(cv_scores.mean() * 100, 2), '%')

F1 Scores (10-Fold CV: [1.         0.88888889 1.         1.         1.         1.
 1.         1.         1.         1.        ]
Mean Training Cross-Validation Accuracy:  98.89 %


In [24]:
param_grid = {

    'n_estimators' : [100, 250, 500],
    'max_depth'    : [None, 5, 10],
    'min_samples_split' : [2, 5, 10]
}

grid_search = GridSearchCV(

    estimator = rf,
    param_grid = param_grid,
    cv = 10
)

In [25]:
grid_search.fit(X_train, y_train)

print('Best Parameters: ', grid_search.best_params_)

print('Best CV Score: ', grid_search.best_score_)

Best Parameters:  {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 250}
Best CV Score:  0.9972972972972972


In [26]:
model = grid_search.best_estimator_

In [27]:
import joblib

joblib.dump(model, '../models/rf_model.pkl')
joblib.dump(list(X.columns), '../models/rf_features.pkl')
joblib.dump(y_test, '../models/y_test.pkl')
joblib.dump(X_test, '../models/x_test.pkl')

['../models/x_test.pkl']