### Contents

- [EDA](#EDA)

### Header

In [1]:
# user configuration

In [2]:
# import libraries

# maths
import numpy as np
import pandas as pd

# visual
#from matplotlib_venn import venn2
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import pydotplus

# modelling
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, Lasso, ElasticNet
from sklearn.model_selection import train_test_split,cross_val_score,GridSearchCV
from sklearn.preprocessing import StandardScaler,PolynomialFeatures,LabelEncoder
from sklearn.metrics import roc_auc_score, roc_curve, confusion_matrix,accuracy_score,r2_score,mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.utils import resample, shuffle
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier,export_graphviz
from sklearn.ensemble import AdaBoostClassifier,RandomForestClassifier, RandomForestRegressor, GradientBoostingClassifier
from sklearn.externals.six import StringIO 

# Others
import warnings
warnings.filterwarnings("ignore")
from IPython.display import Image



In [3]:
# file paths

folder = 'fitrec/'

input_path = '../data/1_input/' + folder
clean_path = '../data/2_clean/' + folder
preprocess_path = '../data/3_preprocess/' + folder
output_path = '../data/4_output/' + folder

### Import Data

In [4]:
# import data

file = 'endomondoHR_proper_summary.csv'
in_path = clean_path + file

df = pd.read_csv(in_path)

### Functions

### Pre-process Data

In [5]:
df.columns

Index(['id', 'userId', 'gender', 'sport', 'time_start', 'time_end', 'time_dur',
       'lat_start', 'lat_end', 'lon_start', 'lon_end', 'alt_avg', 'alt_min',
       'alt_q1', 'alt_q3', 'alt_max', 'hr_avg', 'hr_min', 'hr_q1', 'hr_q3',
       'hr_max', 'hr_outof', 'hr_fatburn', 'hr_cardio', 'hr_peak', 'spd_avg',
       'spd_min', 'spd_q1', 'spd_q3', 'spd_max'],
      dtype='object')

In [6]:
# remove rows with abnormal heartrate

print(len(df))

cond_1 = df['hr_avg'] > 60
cond_2 = df['hr_min'] > 0
cond_3 = df['hr_q1'] > 0
cond_4 = df['hr_q3'] > 0
cond_5 = df['hr_max'] > 0

df = df[cond_1 & cond_2 & cond_3 & cond_4 & cond_5]

print(len(df))

167783
167220


In [7]:
# drop rows if speed is nan

# print(len(df))
# df.dropna(subset=['spd_avg'],inplace=True)
# print(len(df))

In [8]:
df['sport'].value_counts()

bike                       71652
run                        70469
mountain bike              10710
bike (transport)            7658
indoor cycling              1725
walk                        1246
orienteering                 867
cross-country skiing         788
core stability training      443
fitness walking              292
skate                        259
roller skiing                238
hiking                       237
kayaking                      92
circuit training              89
weight training               74
rowing                        71
gymnastics                    66
soccer                        51
downhill skiing               42
treadmill running             28
snowshoeing                   16
swimming                      14
golf                          12
badminton                     10
elliptical                    10
horseback riding              10
tennis                         8
basketball                     8
aerobics                       7
climbing  

In [9]:
# merge similar sports

df['sport'].replace('treadmill running','run',inplace=True)
df['sport'].replace(['treadmill walking','fitness walking'],'walk',inplace=True)
df['sport'].replace(['bike','mountain bike','bike (transport)','indoor cycling'],'cycle',inplace=True)
df['sport'].replace(['windsurfing','kite surfing'],'surfing',inplace=True)
df['sport'].replace(['cross-country skiing','downhill skiing','roller skiing'],'skiing',inplace=True)
df['sport'].replace('pilates','yoga',inplace=True)

In [10]:
df['sport'].value_counts()

cycle                      91745
run                        70497
walk                        1539
skiing                      1068
orienteering                 867
core stability training      443
skate                        259
hiking                       237
kayaking                      92
circuit training              89
weight training               74
rowing                        71
gymnastics                    66
soccer                        51
snowshoeing                   16
swimming                      14
golf                          12
horseback riding              10
elliptical                    10
badminton                     10
tennis                         8
basketball                     8
aerobics                       7
climbing                       5
yoga                           4
table tennis                   4
stair climing                  3
snowboarding                   3
rugby                          3
surfing                        2
sailing   

In [11]:
# select columns for feature selection

# cols = ['sport', 'time_dur', 'alt_avg', 'alt_min', 'alt_q1', 'alt_q3','alt_max', 'hr_avg', 'hr_min', 'hr_q1', 'hr_q3', 'hr_max']

#cols = ['sport','hr_avg', 'hr_min', 'hr_q1', 'hr_q3', 'hr_max']
#cols = ['sport','hr_avg', 'hr_min', 'hr_q1', 'hr_q3', 'hr_max','spd_avg', 'spd_min', 'spd_q1', 'spd_q3', 'spd_max']

cols = ['sport','hr_outof', 'hr_fatburn', 'hr_cardio', 'hr_peak']
#cols = ['sport','hr_outof', 'hr_fatburn', 'hr_cardio', 'hr_peak','spd_avg', 'spd_min', 'spd_q1', 'spd_q3', 'spd_max']

df_model = df[cols]

In [12]:
# select only sports with minimal rows

count = df_model['sport'].value_counts()

#count_cond = count[count > 1].index
#count_cond = count[count >= 5].index
#count_cond = count[count >= 10].index
count_cond = count[count >= 50].index
#count_cond = count[count >= 100].index

count_mask = df_model['sport'].isin(count_cond)
df_model = df_model[count_mask]

In [13]:
print(df_model.shape)
df_model.head()

(167098, 5)


Unnamed: 0,sport,hr_outof,hr_fatburn,hr_cardio,hr_peak
0,cycle,0.0,0.012,0.466,0.522
1,cycle,0.0,0.018,0.628,0.354
2,cycle,0.0,0.06,0.782,0.158
3,cycle,0.0,0.034,0.644,0.322
4,cycle,0.0,0.01,0.072,0.918


In [14]:
df_model['sport'].value_counts()

cycle                      91745
run                        70497
walk                        1539
skiing                      1068
orienteering                 867
core stability training      443
skate                        259
hiking                       237
kayaking                      92
circuit training              89
weight training               74
rowing                        71
gymnastics                    66
soccer                        51
Name: sport, dtype: int64

### Plot df_model

In [15]:
# order = df_model.groupby('sport')['hr_max'].median().sort_values(ascending=False).index

# plt.figure(figsize=(20,15))
# #plt.xlim(0,300)

# sns.boxplot(data=df_model,x='hr_max',y='sport',order=order);

In [16]:
# order = df_model.groupby('sport')['hr_avg'].median().sort_values(ascending=False).index

# plt.figure(figsize=(20,15))
# #plt.xlim(0,300)

# sns.boxplot(data=df_model,x='hr_avg',y='sport',order=order);

In [17]:
# order = df_model.groupby('sport')['hr_min'].median().sort_values(ascending=False).index

# plt.figure(figsize=(20,15))
# #plt.xlim(0,300)

# sns.boxplot(data=df_model,x='hr_min',y='sport',order=order);

### Create Features and Target

In [18]:
target = 'sport'
features = [x for x in df_model.columns if x != target]

X = df_model[features].values
y = df_model[target].values

le = LabelEncoder()
y = le.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, test_size=0.2,stratify=y,random_state=3050)

In [19]:
ss = StandardScaler()
ss.fit(X_train)

X_train = ss.transform(X_train)
X_test = ss.transform(X_test)

In [20]:
# check

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(133678, 4)
(33420, 4)
(133678,)
(33420,)


### Logistic Regresion Model

In [21]:
# init model

logreg = LogisticRegression()

In [22]:
# perform cross validation

score = cross_val_score(logreg,X,y,cv=5)
print('score:',score.mean(),score)

# score: 0.6545580125487176 [0.6302521  0.66461016 0.66203607 0.65096449 0.66492724]

score: 0.7066983631754369 [0.71767486 0.70475421 0.71399503 0.69416764 0.70290007]


In [23]:
# fit model

logreg.fit(X_train, y_train)

# score model
print("train r2:",logreg.score(X_train, y_train))
print("test r2:",logreg.score(X_test, y_test))

# train r2: 0.6561339556433755
# test r2: 0.6566573926868045

train r2: 0.7094735109741319
test r2: 0.7059844404548175


In [33]:
# confusion matrix
print('rows:actual columns:predicted')

#y_pred = logreg.predict(X_test)
#cm = confusion_matrix(y_test, y_pred)
#pd.DataFrame(data=cm, columns=le.classes_, index=le.classes_)

y_pred = logreg.predict_proba(X_test)
pd.DataFrame(y_pred,columns=le.classes_) 

rows:actual columns:predicted


Unnamed: 0,circuit training,core stability training,cycle,gymnastics,hiking,kayaking,orienteering,rowing,run,skate,skiing,soccer,walk,weight training
0,0.000228,0.003012,0.280455,0.000094,0.000305,0.000538,0.011998,0.000411,0.696316,0.000595,0.004996,0.000623,0.000329,0.000098
1,0.001323,0.005679,0.892962,0.002797,0.006199,0.001864,0.001416,0.000665,0.029962,0.003553,0.010550,0.000263,0.040955,0.001812
2,0.000303,0.002405,0.387487,0.000086,0.000369,0.000419,0.007802,0.000438,0.592983,0.000874,0.005434,0.000470,0.000789,0.000142
3,0.000256,0.001755,0.364436,0.000049,0.000256,0.000280,0.007009,0.000403,0.618639,0.000816,0.004924,0.000412,0.000653,0.000112
4,0.000235,0.002698,0.300330,0.000083,0.000298,0.000472,0.010652,0.000411,0.678119,0.000641,0.005002,0.000570,0.000388,0.000102
5,0.000305,0.001174,0.451244,0.000032,0.000245,0.000173,0.004412,0.000405,0.534127,0.001123,0.005004,0.000292,0.001328,0.000137
6,0.000243,0.002484,0.318844,0.000076,0.000295,0.000428,0.009650,0.000411,0.660780,0.000683,0.005021,0.000531,0.000447,0.000106
7,0.000435,0.001540,0.634617,0.000077,0.000485,0.000264,0.003251,0.000464,0.347545,0.001559,0.006061,0.000275,0.003178,0.000249
8,0.000316,0.001187,0.468877,0.000034,0.000259,0.000177,0.004247,0.000410,0.516348,0.001165,0.005088,0.000288,0.001459,0.000145
9,0.000459,0.001661,0.646876,0.000089,0.000539,0.000291,0.003285,0.000475,0.334469,0.001610,0.006239,0.000282,0.003456,0.000269


### KNN Model

In [None]:
# init model

knn = KNeighborsClassifier()

#n_neighbors=2
#n_neighbors=3
#n_neighbors=4
#n_neighbors=5

In [None]:
# perform cross validation

score = cross_val_score(knn,X,y,cv=5)
print('score:',score.mean(),score)

In [None]:
# fit model

knn.fit(X_train, y_train)

# score model
print("train r2:",knn.score(X_train, y_train))
print("test r2:",knn.score(X_test, y_test))

In [None]:
# confusion matrix
print('rows:actual columns:predicted')

# y_pred = knn.predict(X_test)
# cm = confusion_matrix(y_test, y_pred)
# pd.DataFrame(data=cm, columns=le.classes_, index=le.classes_)

y_pred = knn.predict_proba(X_test)
pd.DataFrame(y_pred,columns=le.classes_)

### DTC Model

In [None]:
# init model

dtc = DecisionTreeClassifier(max_depth=10,random_state=3050)

In [None]:
# perform cross validation

score = cross_val_score(dtc,X,y,cv=5)
print(score.mean(),score)

In [None]:
# fit model
dtc = dtc.fit(X_train,y_train)

# score model
print("train r2:",dtc.score(X_train, y_train))
print("test r2:",dtc.score(X_test, y_test))

In [None]:
# confusion matrix
print('rows:actual columns:predicted')

# y_pred = dtc.predict(X_test)
# cm = confusion_matrix(y_test, y_pred)
# pd.DataFrame(data=cm, columns=le.classes_, index=le.classes_)

y_pred = dtc.predict_proba(X_test)
pd.DataFrame(y_pred,columns=le.classes_)

In [None]:
# # initialize the output file object
# dot_data = StringIO() 

# # my fit DecisionTreeRegressor object here is: dtr1
# # for feature_names i put the columns of my Xr matrix
# export_graphviz(dtc, 
#                 out_file=dot_data,  
#                 filled=True, 
#                 rounded=True,
#                 special_characters=True,
#                 feature_names=df_model[features].columns
#                )  

# graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  
# Image(graph.create_png())

### RFC Model

In [None]:
# init model

rfc = RandomForestClassifier(n_estimators=10,max_depth=10,n_jobs=2, random_state=3050)

In [None]:
# perform cross validation

score = cross_val_score(rfc,X,y,cv=5)
print(score.mean(),score)

In [None]:
# fit model
rfc = rfc.fit(X_train,y_train)

# score model
print("train r2:",rfc.score(X_train, y_train))
print("test r2:",rfc.score(X_test, y_test))

In [None]:
# confusion matrix
print('rows:actual columns:predicted')

# y_pred = rfc.predict(X_test)
# cm = confusion_matrix(y_test, y_pred)
# pd.DataFrame(data=cm, columns=le.classes_, index=le.classes_)

y_pred = rfc.predict_proba(X_test)
pd.DataFrame(y_pred,columns=le.classes_)

### GridSearch Model

In [None]:
# init models

estimators = {
    'lr': LogisticRegression(),
    'knn': KNeighborsClassifier(n_neighbors=5),
    'dtc': DecisionTreeClassifier(),
    'rfc': RandomForestClassifier(),
    'abc': AdaBoostClassifier(),
    'gbc': GradientBoostingClassifier()
}.items()

In [None]:
# init model parameters

params = {
    'lr': {

    },
    'knn': {
        'knn__n_neighbors': [3,5,7,9]
    },

    'dtc': {
        'dtc__max_features': ['auto', 'log2', None],
        'dtc__max_depth': [None, 5, 10, 15],
        'dtc__min_samples_split': np.linspace(0.1, 0.5, 5)
    },
    'rfc': {
        'rfc__n_estimators': [10, 20, 50, 100],
        'rfc__max_depth': [None, 5, 10, 15],
        'rfc__max_features': ['auto', 'log2', None],
        'rfc__min_samples_split': np.linspace(0.1, 0.5, 5)
    },
    'abc': {

    }, 
    'gbc': {

    }
}

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, test_size=0.2,stratify=y,random_state=3050)

In [None]:
models = []
parameters = []
best_score = []
roc_auc = []

for k,v in estimators:
    
    pipe = Pipeline([
        ('sc', StandardScaler()),
        (k,v)])
    
    param = params[k]
    
    gridsearch = GridSearchCV(
        estimator=pipe,
        param_grid=param,
        verbose=1,
        cv= 5,
        n_jobs=3,
        return_train_score= True
        #scoring = 'roc_auc'
    )

    gridsearch.fit(X_train, y_train)
    
    model = gridsearch.best_estimator_
    cv_score = gridsearch.cv_results_
    best_params = gridsearch.best_params_

    # predict y
    #y_pred = model.predict(X_test)
    y_pred = model.predict_proba(X_test)
    
    # print results
    print("Model: ", k)
    print("Best parameters:", best_params)
    print("Best score:", gridsearch.best_score_)
    #print("AUC/ROC test:", roc_auc_score(y_test,y_pred))
    display(pd.DataFrame(cv_score, columns = cv_score.keys()))
    
    
    # append info to list
    models.append(k)
    best_score.append(gridsearch.best_score_)
    parameters.append(best_params)
    #roc_auc.append(roc_auc_score(y_test,y_pred))