In [40]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#pd.set_option("display.max_rows", 1000)
pd.set_option("display.max_columns", 100)
np.set_printoptions(suppress=True)

In [2]:
# Locations of the prepared activity-log CSV splits, keyed by role.
file_paths = dict(
    in_train='../../../data/prepared/activity_log_train__ml_prepared.csv',
    in_test='../../../data/prepared/activity_log_test__ml_prepared.csv',
    in_missing='../../../data/prepared/activity_log_missing__ml_prepared.csv',
)

# Load the training split into the working frame.
df = pd.read_csv(file_paths['in_train'])

# Preview the first rows.
df.head()

Unnamed: 0,elapsed_distance,moving_time,avg_speed,ride_cruise_speed,ride_avg_power,peak_20min_power,training_window_0,training_window_1,training_window_2,training_window_3,training_window_4,training_window_5,training_window_6,training_window_7,training_window_8,training_window_9,training_window_10,training_window_11,training_window_12,simple_exertion
0,0.301947,0.179185,0.5026,-0.104944,0.559036,0.975233,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
1,-0.558354,-0.76912,0.86486,0.342861,0.788333,0.903988,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0
2,-0.595791,-0.801851,0.888048,0.804365,1.269131,0.616208,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,3.0
3,-0.125927,-0.29564,0.614986,-0.118132,0.211381,0.31075,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0
4,-1.127194,-1.180276,-0.693269,0.249776,-0.217549,-0.983908,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0


In [3]:
target_col = 'simple_exertion'

In [26]:
df[target_col].value_counts()

2.0    73
3.0    34
1.0    19
Name: simple_exertion, dtype: int64

In [27]:
df[target_col].value_counts(normalize=True)

2.0    0.579365
3.0    0.269841
1.0    0.150794
Name: simple_exertion, dtype: float64

### Extract X and y

In [19]:
# Target vector and feature matrix (every column except the target), both as NumPy arrays.
y = df[target_col].values
X = df.drop(columns=target_col).values

In [79]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, cross_val_predict, StratifiedKFold
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score

In [80]:
def stratified_train_classifier(X, y, clf, metric='accuracy', k_folds=5):
    """Cross-validate ``clf`` on stratified folds and print the mean and std. dev. of a metric.

    Parameters
    ----------
    X : numpy.ndarray
        Feature matrix, one row per sample; rows are selected with integer index arrays.
    y : numpy.ndarray
        Class labels aligned with the rows of ``X``.
    clf : estimator
        Classifier exposing ``fit`` and ``predict``. ``predict_proba`` is required
        for ``metric='roc_auc'`` on targets with more than two classes.
        NOTE: the estimator is refit in place on every fold, so after the call it
        holds the fit from the last fold, not a fit on the full data.
    metric : {'accuracy', 'f1', 'roc_auc'}, default 'accuracy'
        Score computed on each held-out fold. With more than two classes, 'f1' is
        macro-averaged and 'roc_auc' is one-vs-rest on predicted probabilities.
    k_folds : int, default 5
        Number of stratified folds.

    Returns
    -------
    None or int
        ``None`` after printing the summary; ``-1`` if ``metric`` is not recognised.
    """
    # Validate the metric up front so an unsupported name costs no work.
    if metric not in ('f1', 'roc_auc', 'accuracy'):
        return -1

    # f1_score and roc_auc_score default to binary settings and raise on
    # targets with more than two classes, so the call has to adapt to the target.
    multiclass = np.unique(y).size > 2

    skf = StratifiedKFold(n_splits=k_folds)

    scores = []
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        clf.fit(X_train, y_train)

        if metric == 'roc_auc':
            if multiclass:
                # Multiclass AUC needs per-class probabilities, not hard labels.
                y_score = clf.predict_proba(X_test)
                score = roc_auc_score(y_test, y_score, multi_class='ovr', labels=clf.classes_)
            elif hasattr(clf, 'predict_proba'):
                # Binary AUC is ranked on the probability of the greater label (classes_[1]).
                score = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
            else:
                # No probabilities available: fall back to hard predictions.
                score = roc_auc_score(y_test, clf.predict(X_test))
        elif metric == 'f1':
            f1_average = 'macro' if multiclass else 'binary'
            score = f1_score(y_test, clf.predict(X_test), average=f1_average)
        else:
            score = accuracy_score(y_test, clf.predict(X_test))
        scores.append(score)

    print(f'For metric "{metric}", the mean = {np.mean(scores)}, std.dev. = {np.std(scores)}')
        
        

# 1. Logistic Regression

In [46]:
clf_OvA_logreg = OneVsRestClassifier(LogisticRegression(random_state=42))

In [47]:
clf_OvA_logreg.fit(X, y)

OneVsRestClassifier(estimator=LogisticRegression(random_state=42))

In [48]:
clf_OvA_logreg.score(X,y)

0.6666666666666666

In [83]:
#clf_OvA_logreg.decision_function(X[0:])

In [84]:
stratified_train_classifier(X[:, 0:6],y, clf_OvA_logreg)

For metric "accuracy", the mean = 0.6193846153846154, std.dev. = 0.06237584117372831


# 2. Random Forest

In [64]:
clf_rf = RandomForestClassifier(random_state=42)

In [57]:
clf_rf.fit(X,y)

RandomForestClassifier(random_state=42)

In [76]:
cross_val_score(clf_rf, X[:, 0:6], y, scoring='accuracy', cv=10).mean()

0.625

In [82]:
stratified_train_classifier(X[:, 0:6],y, clf_rf)

For metric "accuracy", the mean = 0.6270769230769231, std.dev. = 0.09609462791848927
