### Contents

- [Header](#Header)
- [Import Data](#Import-Data)
- [Functions](#Functions)


- [Pre-process Data](#Pre-process-Data)
- [Plot df_model](#Plot-df_model)


- [Create Features and Target](#Create-Features-and-Target)
- [Handle Imbalanced Data](#Handle-Imbalanced-Data)


- [Logistic Regresion Model](#Logistic-Regresion-Model)
- [KNN Model](#KNN-Model)
- [DTC Model](#DTC-Model)
- [RTC Model](#RTC-Model)
- [GridSearch Model](#GridSearch-Model)

### Header

In [1]:
# import libraries

# maths
import numpy as np
import pandas as pd

# visual
#from matplotlib_venn import venn2
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import pydotplus

# modelling
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, Lasso, ElasticNet
from sklearn.model_selection import train_test_split,cross_val_score,GridSearchCV
from sklearn.preprocessing import StandardScaler,PolynomialFeatures,LabelEncoder
from sklearn.metrics import roc_auc_score, roc_curve, confusion_matrix,accuracy_score,r2_score,mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.utils import resample, shuffle
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier,export_graphviz
from sklearn.ensemble import AdaBoostClassifier,RandomForestClassifier, RandomForestRegressor, GradientBoostingClassifier
from sklearn.externals.six import StringIO
from imblearn.over_sampling import SMOTE

# Others
import warnings
warnings.filterwarnings("ignore")
from IPython.display import Image

Using TensorFlow backend.


Couldn't import dot_parser, loading of dot files will not be possible.


In [2]:
# file paths

raw_path = '../../data/0_raw/fitrec/' 
input_path = '../../data/1_input/fitrec/'
clean_path = '../../data/2_clean/fitrec/' 
preprocess_path = '../../data/3_preprocess/fitrec/' 
output_path = '../../data/4_output/fitrec/'

sports_path = '../../data/1_input/sports/' 

### Import Data

In [3]:
# import data

#file = 'endomondoHR_proper_summary.csv'
file = 'endomondoHR_proper_dist_spd_summary.csv'

in_path = clean_path + file

df = pd.read_csv(in_path)

In [4]:
# import sports.xlsx

path = sports_path + 'sports.xlsx'
df_sports = pd.read_excel(path)
df_sports

Unnamed: 0,sport,sport_rename,type,venue,location_valid,distance_valid,speed_valid,speed_max
0,aerobics,aerobics,gym,indoor,0,0,0,0
1,badminton,badminton,racket,indoor,0,1,0,0
2,basketball,basketball,team,outdoor/indoor,1,1,0,0
3,bike,bike,aerobic,outdoor,1,1,1,244
4,bike (transport),bike,aerobic,outdoor,1,1,1,244
5,indoor cycling,bike,aerobic,indoor,0,1,1,244
6,mountain bike,bike,aerobic,outdoor,1,1,1,244
7,circuit training,circuit training,aerobic,outdoor/indoor,0,0,0,0
8,climbing,climbing,aerobic,outdoor/indoor,1,0,0,0
9,core stability training,core stability training,indoor,indoor,0,0,0,0


### Functions

### Pre-process Data

In [5]:
df.columns

Index(['id', 'userId', 'gender', 'sport', 'url', 'time_start', 'time_end',
       'time_dur', 'lat_start', 'lon_start', 'lat_end', 'lon_end', 'alt_avg',
       'alt_min', 'alt_05', 'alt_25', 'alt_75', 'alt_95', 'alt_max', 'hr_avg',
       'hr_min', 'hr_05', 'hr_25', 'hr_75', 'hr_95', 'hr_max', 'hr_outof',
       'hr_fatburn', 'hr_cardio', 'hr_peak', 'spd_avg', 'spd_min', 'spd_05',
       'spd_25', 'spd_75', 'spd_95', 'spd_max', 'impute'],
      dtype='object')

In [6]:
# remove rows with abnormal heartrate

print(len(df))

cond_1 = df['hr_min'] > 40
cond_2 = df['hr_avg'] > 50
cond_3 = df['hr_max'] > 60

df = df[cond_1 & cond_2 & cond_3]

print(len(df))

167783
167017


In [7]:
# remove rows with abnormal workout duration

print(len(df))

# time_dur in minutes
time_dur_mask = df['time_dur'] < 24 * 60
df = df[time_dur_mask]

print(len(df))

167017
167013


In [8]:
# drop rows if speed is nan

# print(len(df))
# df.dropna(subset=['spd_avg'],inplace=True)
# print(len(df))

In [9]:
df['sport'].value_counts()

bike                       71577
run                        70389
mountain bike              10703
bike (transport)            7653
indoor cycling              1689
walk                        1245
orienteering                 866
cross-country skiing         786
core stability training      445
fitness walking              292
skate                        257
roller skiing                238
hiking                       237
kayaking                      92
circuit training              89
weight training               74
rowing                        71
gymnastics                    66
soccer                        51
downhill skiing               42
treadmill running             28
snowshoeing                   16
swimming                      14
golf                          12
badminton                     10
elliptical                    10
horseback riding              10
basketball                     8
tennis                         8
aerobics                       7
climbing  

In [10]:
# select only sports with speed_valid = 1

valid_mask = df_sports['speed_valid'] == 1
valid_sport_list = df_sports[valid_mask]['sport']
valid_sport_list = list(valid_sport_list)

# overwrite: compare specific sports
#valid_sport_list = ['kayaking','rowing']

valid_mask_2 = df['sport'].isin(valid_sport_list)
df = df[valid_mask_2]
df['sport'].unique()

array(['bike', 'bike (transport)', 'run', 'mountain bike', 'rowing',
       'orienteering', 'kayaking', 'indoor cycling', 'skate',
       'cross-country skiing', 'walk', 'hiking', 'treadmill running',
       'swimming', 'snowshoeing', 'snowboarding', 'fitness walking',
       'roller skiing', 'horseback riding', 'downhill skiing',
       'treadmill walking', 'sailing', 'kite surfing', 'windsurfing'],
      dtype=object)

In [11]:
# merge similar sports

for idx,row in df_sports.iterrows():
    
    sport = row['sport'].rstrip()
    sport_rename = row['sport_rename']
    print(sport,sport_rename)
    
    df['sport'].replace(sport,sport_rename,inplace=True)

aerobics aerobics
badminton badminton
basketball basketball
bike bike
bike (transport) bike
indoor cycling bike
mountain bike bike
circuit training circuit training
climbing climbing
core stability training core stability training
elliptical elliptical
golf golf
gymnastics gymnastics
hiking hiking
horseback riding horseback riding
kayaking kayaking
martial arts martial arts
orienteering orienteering
rowing rowing       
rugby rugby
run run
treadmill running run
sailing sailing
skate skate
cross-country skiing skiing
downhill skiing skiing
roller skiing skiing
snowboarding snowboarding
snowshoeing snowshoeing
soccer soccer
squash squash
stair climing stair climbing
kite surfing surfing
windsurfing surfing
swimming swimming
table tennis table tennis
tennis tennis
fitness walking walk
treadmill walking walk
walk walk
weight training weight training
pilates yoga
yoga yoga


In [12]:
# merge similar sports

# df['sport'].replace('treadmill running','run',inplace=True)
# df['sport'].replace(['treadmill walking','fitness walking'],'walk',inplace=True)
# df['sport'].replace(['bike','mountain bike','bike (transport)','indoor cycling'],'cycle',inplace=True)
# df['sport'].replace(['windsurfing','kite surfing'],'surfing',inplace=True)
# df['sport'].replace(['cross-country skiing','downhill skiing','roller skiing'],'skiing',inplace=True)
# df['sport'].replace('pilates','yoga',inplace=True)

In [13]:
df['sport'].value_counts()

bike                91622
run                 70417
walk                 1538
skiing               1066
orienteering          866
skate                 257
hiking                237
kayaking               92
rowing                 71
snowshoeing            16
swimming               14
horseback riding       10
snowboarding            3
surfing                 2
sailing                 1
Name: sport, dtype: int64

In [14]:
# select columns for feature selection

# cols = ['sport', 'time_dur', 'alt_avg', 'alt_min', 'alt_25', 'alt_75','alt_max', 'hr_avg', 'hr_min', 'hr_25', 'hr_75', 'hr_max']

#cols = ['sport','hr_avg', 'hr_min', 'hr_25', 'hr_75', 'hr_max']
#cols = ['sport','hr_avg', 'hr_min', 'hr_05', 'hr_25', 'hr_75', 'hr_95','hr_max']
#cols = ['sport','hr_avg', 'hr_min', 'hr_25', 'hr_75', 'hr_max','spd_avg']
#cols = ['sport','hr_avg', 'hr_min', 'hr_25', 'hr_75', 'hr_max','spd_avg', 'spd_min', 'spd_25', 'spd_75', 'spd_max']

cols = ['sport','hr_outof', 'hr_fatburn', 'hr_cardio', 'hr_peak']
#cols = ['sport','hr_outof', 'hr_fatburn', 'hr_cardio', 'hr_peak','spd_avg']
#cols = ['sport','hr_outof', 'hr_fatburn', 'hr_cardio', 'hr_peak','spd_95']
#cols = ['sport','hr_outof', 'hr_fatburn', 'hr_cardio', 'hr_peak','spd_avg','spd_25','spd_75']
#cols = ['sport','hr_outof', 'hr_fatburn', 'hr_cardio', 'hr_peak','spd_avg', 'spd_min', 'spd_25', 'spd_75', 'spd_max']

df_model = df[cols]

In [15]:
# select only sports with minimal rows

count = df_model['sport'].value_counts()

#count_cond = count[count > 1].indexkn
#count_cond = count[count >= 5].inde
#count_cond = count[count >= 10].index
#count_cond = count[count >= 50].index
count_cond = count[count >= 70].index
#count_cond = count[count >= 100].index
#count_cond = count[count >= 200].index
#count_cond = count[count >= 800].index
#count_cond = count[count >= 1500].index
#count_cond = count[count >= 70000].index

count_mask = df_model['sport'].isin(count_cond)
df_model = df_model[count_mask]

In [16]:
print(df_model.shape)
df_model.head()

(166166, 5)


Unnamed: 0,sport,hr_outof,hr_fatburn,hr_cardio,hr_peak
0,bike,0.0,0.013333,0.464444,0.522222
1,bike,0.0,0.02,0.591111,0.388889
2,bike,0.0,0.057778,0.782222,0.16
3,bike,0.0,0.037778,0.637778,0.324444
4,bike,0.0,0.011111,0.08,0.908889


In [17]:
df_model['sport'].value_counts()

bike             91622
run              70417
walk              1538
skiing            1066
orienteering       866
skate              257
hiking             237
kayaking            92
rowing              71
Name: sport, dtype: int64

### Plot df_model

In [18]:
# order = df_model.groupby('sport')['hr_max'].median().sort_values(ascending=False).index

# plt.figure(figsize=(20,15))
# #plt.xlim(0,300)

# sns.boxplot(data=df_model,x='hr_max',y='sport',order=order);

In [19]:
# order = df_model.groupby('sport')['hr_avg'].median().sort_values(ascending=False).index

# plt.figure(figsize=(20,15))
# #plt.xlim(0,300)

# sns.boxplot(data=df_model,x='hr_avg',y='sport',order=order);

In [20]:
# order = df_model.groupby('sport')['hr_min'].median().sort_values(ascending=False).index

# plt.figure(figsize=(20,15))
# #plt.xlim(0,300)

# sns.boxplot(data=df_model,x='hr_min',y='sport',order=order);

### Create Features and Target

In [21]:
df_model.head()

Unnamed: 0,sport,hr_outof,hr_fatburn,hr_cardio,hr_peak
0,bike,0.0,0.013333,0.464444,0.522222
1,bike,0.0,0.02,0.591111,0.388889
2,bike,0.0,0.057778,0.782222,0.16
3,bike,0.0,0.037778,0.637778,0.324444
4,bike,0.0,0.011111,0.08,0.908889


In [22]:
# doing scaling and encoding before create X and y

cols = df_model.columns[1:]

ss = StandardScaler()
df_model[cols] = ss.fit_transform(df_model[cols])

le = LabelEncoder()
df_model['sport'] = le.fit_transform(df_model['sport'])

In [23]:
# create feature and target. next perform train_test_split

X = df_model.drop(columns='sport')
y = df_model['sport']

#le = LabelEncoder()
#y = le.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, test_size=0.3,stratify=y,random_state=3050)

In [24]:
# ss = StandardScaler()
# ss.fit(X_train)

# X_train = ss.transform(X_train)
# X_test = ss.transform(X_test)

In [25]:
df_model.head()

Unnamed: 0,sport,hr_outof,hr_fatburn,hr_cardio,hr_peak
0,0,-0.296564,-0.789622,-0.059314,0.757804
1,0,-0.296564,-0.763293,0.383794,0.335254
2,0,-0.296564,-0.614095,1.052342,-0.390124
3,0,-0.296564,-0.693082,0.547044,0.131021
4,0,-0.296564,-0.798399,-1.404184,1.983201


### Handle Imbalanced Data

In [26]:
# check before upsample/downsample

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(116316, 4)
(49850, 4)
(116316,)
(49850,)


In [27]:
# check before upsample/downsample

unique, counts = np.unique(y_train, return_counts=True)
dict(zip(unique, counts))

{0: 64135, 1: 166, 2: 64, 3: 606, 4: 50, 5: 49292, 6: 180, 7: 746, 8: 1077}

In [28]:
# sm = SMOTE(random_state=3050)
# X_train, y_train = sm.fit_sample(X_train, y_train.ravel())

In [29]:
# get majority class index and row count

# sport_counts = df_model['sport'].value_counts()
# print(sport_counts)

# major_class_index = sport_counts.index[0]
# major_class_count = sport_counts.values[0]
# print(major_class_index,major_class_count)

In [30]:
# concatenate our training data back together

Xy_train = pd.concat([X_train, y_train], axis=1)

sport_counts = Xy_train['sport'].value_counts()
print(sport_counts)

print(Xy_train.shape)
Xy_train.head()

0    64135
5    49292
8     1077
7      746
3      606
6      180
1      166
2       64
4       50
Name: sport, dtype: int64
(116316, 5)


Unnamed: 0,hr_outof,hr_fatburn,hr_cardio,hr_peak,sport
51326,-0.296564,-0.719411,1.705343,-0.897185,5
137700,-0.296564,-0.631648,1.627604,-0.897185,5
9228,-0.266916,0.684807,-0.11373,-0.383082,0
160851,-0.237268,0.939321,0.220544,-0.897185,0
60074,0.918985,2.598054,-1.551886,-0.897185,0


In [31]:
# perform upsampling and downsampling

sample_size = 13000

df_all_sample = pd.DataFrame()

sport_list = df_model['sport'].unique()
sport_list

for sport in sport_list:
    
    cond = Xy_train['sport'] == sport
    df_sport = Xy_train[cond]
    
    # perform downsampling
    if sport_counts[sport] >= sample_size:    
        print('downsampling',sport,sport_counts[sport])
        df_sample = df_sport.sample(sample_size,replace=False,random_state=3050)
        
    # perform upsampling
    # sport_counts[sport] < sample_size: 
    else:
        print('upsampling',sport,sport_counts[sport])
        df_sample = df_sport.sample(sample_size,replace=True,random_state=3050)
        
    df_all_sample = pd.concat([df_all_sample, df_sample], axis=0)
    
X_train = df_all_sample.drop(columns='sport').values
y_train = df_all_sample['sport'].values

downsampling 0 64135
downsampling 5 49292
upsampling 4 50
upsampling 3 606
upsampling 2 64
upsampling 6 180
upsampling 7 746
upsampling 8 1077
upsampling 1 166


In [32]:
# # perform upsampling for minority classes

# df_all_sample = pd.DataFrame()

# sport_list = df_model['sport'].unique()

# for sport in sport_list:
    
#     if sport != major_class_index:
        
#         cond = df_model['sport'] == sport
#         df_sport = df_model[cond]
#         #print(sport,len(df_sport))
        
#         df_sample = df_sport.sample(major_class_count,replace=True,random_state=3050)
#         df_all_sample = pd.concat([df_all_sample, df_sample], axis=0)
        
# cond = df_model['sport'] == major_class_index
# df_top = df_model[cond]
# df_all_sample = pd.concat([df_all_sample, df_top], axis=0)

# X_train = df_all_sample.drop(columns='sport').values
# y_train = df_all_sample['sport'].values

In [33]:
# check after upsample/downsample

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(117000, 4)
(49850, 4)
(117000,)
(49850,)


In [34]:
# check after upsample/downsample

unique, counts = np.unique(y_train, return_counts=True)
dict(zip(unique, counts))

{0: 13000,
 1: 13000,
 2: 13000,
 3: 13000,
 4: 13000,
 5: 13000,
 6: 13000,
 7: 13000,
 8: 13000}

### Logistic Regresion Model

In [35]:
# init model

logreg = LogisticRegression()

In [36]:
# perform cross validation

score = cross_val_score(logreg,X,y,cv=5)
print('score:',score.mean(),score)

# sample_size = 10000
# score: 0.9171444825052808 [0.91460533 0.91459767 0.93424281 0.89779799 0.92447862]

score: 0.7066121187166571 [0.71581576 0.70643278 0.7155367  0.69163407 0.70364129]


In [37]:
# fit model

logreg.fit(X_train, y_train)

# score model
print("train r2:",logreg.score(X_train, y_train))
print("test r2:",logreg.score(X_test, y_test))

# sample_size = 10000
#train r2: 0.40617857142857144
#test r2: 0.6280887896132905

train r2: 0.2572051282051282
test r2: 0.3629087261785356


In [38]:
# confusion matrix
print('rows:actual columns:predicted')

y_pred = logreg.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
pd.DataFrame(data=cm, columns=le.classes_, index=le.classes_)

# y_pred = logreg.predict_proba(X_test)
# pd.DataFrame(y_pred,columns=le.classes_) 

rows:actual columns:predicted


Unnamed: 0,bike,hiking,kayaking,orienteering,rowing,run,skate,skiing,walk
bike,7447,297,308,3827,618,7488,2183,143,5176
hiking,4,1,6,2,1,2,5,1,49
kayaking,2,1,2,6,2,3,2,0,10
orienteering,30,0,2,128,1,87,3,1,8
rowing,2,0,0,1,4,7,2,0,5
run,2002,127,144,7294,177,10164,366,30,821
skate,24,0,1,8,2,11,4,0,27
skiing,62,3,3,78,4,81,23,2,64
walk,46,11,6,1,2,40,15,1,339


### KNN Model

In [39]:
# init model

knn = KNeighborsClassifier(n_neighbors=3)

In [40]:
# perform cross validation

score = cross_val_score(knn,X,y,cv=5)
print('score:',score.mean(),score)

score: 0.7130585421717655 [0.70907669 0.70279215 0.72477506 0.7111947  0.71745411]


In [41]:
# fit model

knn.fit(X_train, y_train)

# score model
print("train r2:",knn.score(X_train, y_train))
print("test r2:",knn.score(X_test, y_test))

# sample_size = 10000
#train r2: 0.9733142857142857
#test r2: 0.8036935840928582

train r2: 0.9192649572649573
test r2: 0.5861785356068204


In [42]:
# confusion matrix
print('rows:actual columns:predicted')

y_pred = knn.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
pd.DataFrame(data=cm, columns=le.classes_, index=le.classes_)

# y_pred = knn.predict_proba(X_test)
# pd.DataFrame(y_pred,columns=le.classes_)

rows:actual columns:predicted


Unnamed: 0,bike,hiking,kayaking,orienteering,rowing,run,skate,skiing,walk
bike,16054,374,169,937,124,5844,399,1716,1870
hiking,28,4,1,0,2,5,2,3,26
kayaking,12,1,0,1,0,7,1,3,3
orienteering,84,1,1,23,1,130,3,11,6
rowing,11,0,0,0,0,5,0,3,2
run,5140,71,61,1168,91,12908,188,949,549
skate,36,0,2,3,0,20,2,5,9
skiing,155,4,1,13,1,98,6,25,17
walk,138,35,13,9,6,33,4,18,205


### DTC Model

In [43]:
# init model

dtc = DecisionTreeClassifier(max_depth=10,random_state=3050)

In [44]:
# perform cross validation

score = cross_val_score(dtc,X,y,cv=5)
print(score.mean(),score)

# sample_size = 10000
# 0.9436500430477148 [0.94146502 0.94489979 0.94803446 0.93923528 0.94461566]

0.7556058779112285 [0.75754385 0.74960886 0.76783726 0.74766777 0.75537165]


In [45]:
# fit model
dtc = dtc.fit(X_train,y_train)

# score model
print("train r2:",dtc.score(X_train, y_train))
print("test r2:",dtc.score(X_test, y_test))

# sample_size = 10000
# print("train r2:",dtc.score(X_train, y_train))
# print("test r2:",dtc.score(X_test, y_test))

train r2: 0.5811025641025641
test r2: 0.3655366098294885


In [46]:
# confusion matrix
print('rows:actual columns:predicted')

y_pred = dtc.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
pd.DataFrame(data=cm, columns=le.classes_, index=le.classes_)

# y_pred = dtc.predict_proba(X_test)
# pd.DataFrame(y_pred,columns=le.classes_)

rows:actual columns:predicted


Unnamed: 0,bike,hiking,kayaking,orienteering,rowing,run,skate,skiing,walk
bike,8864,2342,1626,2146,808,3588,4182,2218,1713
hiking,8,21,6,2,1,1,4,8,20
kayaking,3,3,4,5,1,3,4,1,4
orienteering,36,3,15,86,14,65,18,21,2
rowing,4,0,4,1,2,2,3,2,3
run,2129,462,660,4742,561,8998,2069,922,582
skate,15,5,2,4,0,15,20,6,10
skiing,83,18,16,46,6,51,51,32,17
walk,49,93,27,3,7,21,43,23,195


In [47]:
# # initialize the output file object
# dot_data = StringIO() 

# # my fit DecisionTreeRegressor object here is: dtr1
# # for feature_names i put the columns of my Xr matrix
# export_graphviz(dtc, 
#                 out_file=dot_data,  
#                 filled=True, 
#                 rounded=True,
#                 special_characters=True,
#                 feature_names=df_model[features].columns
#                )  

# graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  
# Image(graph.create_png())

### RFC Model

In [48]:
# init model

rfc = RandomForestClassifier(n_estimators=10,max_depth=10,n_jobs=2, random_state=3050)

In [49]:
# perform cross validation

score = cross_val_score(rfc,X,y,cv=5)
print(score.mean(),score)

# sample_size = 10000
# 0.9467134535825069 [0.94526366 0.94657493 0.95105606 0.94357348 0.94709913]

0.7580132050501989 [0.75856674 0.75105307 0.77256176 0.75043635 0.75744809]


In [50]:
# fit model
rfc = rfc.fit(X_train,y_train)

# score model
print("train r2:",rfc.score(X_train, y_train))
print("test r2:",rfc.score(X_test, y_test))

# sample_size = 10000
# train r2: 0.8223142857142857
# test r2: 0.7444007897728406

train r2: 0.704034188034188
test r2: 0.4438114343029087


In [51]:
# confusion matrix
print('rows:actual columns:predicted')

y_pred = rfc.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
pd.DataFrame(data=cm, columns=le.classes_, index=le.classes_)

# y_pred = rfc.predict_proba(X_test)
# pd.DataFrame(y_pred,columns=le.classes_)

rows:actual columns:predicted


Unnamed: 0,bike,hiking,kayaking,orienteering,rowing,run,skate,skiing,walk
bike,10884,2159,436,1655,936,3742,2981,3083,1611
hiking,8,23,2,0,2,3,8,4,21
kayaking,7,3,1,2,2,5,2,2,4
orienteering,57,2,5,78,6,83,14,13,2
rowing,6,0,1,2,4,4,1,1,2
run,2855,258,148,3823,424,10845,1093,1199,480
skate,16,6,2,7,0,12,19,6,9
skiing,98,20,5,31,8,73,29,44,12
walk,50,83,20,5,6,25,22,24,226


### GridSearch Model

In [52]:
# init models

estimators = {
    'lr': LogisticRegression(),
    'knn': KNeighborsClassifier(),
    'dtc': DecisionTreeClassifier(),
    'rfc': RandomForestClassifier(),
    'abc': AdaBoostClassifier(),
    'gbc': GradientBoostingClassifier()
}.items()

In [53]:
# init model parameters

params = {
    'lr': {

    },
    'knn': {
        'knn__n_neighbors': [3,5,7,9]
    },

    'dtc': {
        'dtc__max_features': ['auto', 'log2', None],
        'dtc__max_depth': [None, 5, 10, 15],
        'dtc__min_samples_split': np.linspace(0.1, 0.5, 5)
    },
    'rfc': {
        'rfc__n_estimators': [10, 20, 50, 100],
        'rfc__max_depth': [None, 5, 10, 15],
        'rfc__max_features': ['auto', 'log2', None],
        'rfc__min_samples_split': np.linspace(0.1, 0.5, 5)
    },
    'abc': {

    }, 
    'gbc': {

    }
}

In [54]:
models = []
parameters = []
best_score = []
roc_auc = []

for k,v in estimators:
    
    pipe = Pipeline([
        ('sc', StandardScaler()),
        (k,v)])
    
    gridsearch = GridSearchCV(
        estimator=pipe,
        param_grid=params[k],
        verbose=1,
        cv= 5,
        n_jobs=-1,
        return_train_score= True
        #scoring = 'roc_auc'
    )

    gridsearch.fit(X_train, y_train)
    
    model = gridsearch.best_estimator_
    cv_score = gridsearch.cv_results_
    best_params = gridsearch.best_params_

    # predict y
    #y_pred = model.predict(X_test)
    y_pred = model.predict_proba(X_test)
    
    # print results
    print("Model: ", k)
    print("Best parameters:", best_params)
    print("Best score:", gridsearch.best_score_)
    #print("AUC/ROC test:", roc_auc_score(y_test,y_pred))
    display(pd.DataFrame(cv_score, columns = cv_score.keys()))    
    
    # append info to list
    models.append(k)
    best_score.append(gridsearch.best_score_)
    parameters.append(best_params)
    #roc_auc.append(roc_auc_score(y_test,y_pred))

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    3.6s finished


Model:  lr
Best parameters: {}
Best score: 0.2561111111111111


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,1.457658,0.156664,0.008777,0.001163,{},0.254145,0.252393,0.255897,0.257991,0.260128,0.256111,0.002735,1,0.256335,0.25656,0.25484,0.255556,0.25719,0.256096,0.000817


Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:   35.5s finished


Model:  knn
Best parameters: {'knn__n_neighbors': 3}
Best score: 0.8810940170940171


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_knn__n_neighbors,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,0.389558,0.013029,1.29753,0.01095,3,{'knn__n_neighbors': 3},0.879231,0.88406,0.877308,0.884615,...,0.881094,0.002818,1,0.916549,0.920214,0.915342,0.917201,0.915865,0.917034,0.001709
1,0.352076,0.024216,1.320581,0.012614,5,{'knn__n_neighbors': 5},0.872222,0.877778,0.875769,0.876026,...,0.87553,0.001811,2,0.899028,0.902607,0.901581,0.898761,0.9025,0.900895,0.001674
2,0.334965,0.01664,1.31322,0.010658,7,{'knn__n_neighbors': 7},0.864316,0.868248,0.863889,0.868889,...,0.866624,0.002093,3,0.884605,0.887073,0.884573,0.886442,0.887927,0.886124,0.001339
3,0.326342,0.00823,1.32577,0.031634,9,{'knn__n_neighbors': 9},0.857265,0.859744,0.857009,0.862051,...,0.859094,0.001842,4,0.874071,0.875716,0.874509,0.875427,0.875844,0.875113,0.0007


Fitting 5 folds for each of 60 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    2.4s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   10.9s


Model:  dtc
Best parameters: {'dtc__max_depth': 15, 'dtc__max_features': 'log2', 'dtc__min_samples_split': 0.1}
Best score: 0.34194871794871795


[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:   16.6s finished


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_dtc__max_depth,param_dtc__max_features,param_dtc__min_samples_split,params,split0_test_score,split1_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,0.21864,0.018664,0.011422,0.003063,,auto,0.1,"{'dtc__max_depth': None, 'dtc__max_features': ...",0.346325,0.332094,...,0.337803,0.00754,4,0.345192,0.336987,0.341848,0.336731,0.330983,0.338348,0.004854
1,0.169747,0.016121,0.010092,0.004075,,auto,0.2,"{'dtc__max_depth': None, 'dtc__max_features': ...",0.295855,0.265983,...,0.280359,0.010824,24,0.294562,0.267521,0.26969,0.28485,0.285748,0.280474,0.010291
2,0.153073,0.019763,0.008814,0.000808,,auto,0.3,"{'dtc__max_depth': None, 'dtc__max_features': ...",0.260598,0.272179,...,0.263675,0.010161,32,0.261154,0.274359,0.265801,0.271859,0.248088,0.264252,0.009309
3,0.16598,0.015153,0.008631,0.00054,,auto,0.4,"{'dtc__max_depth': None, 'dtc__max_features': ...",0.225812,0.257436,...,0.244991,0.013825,39,0.222927,0.259444,0.229893,0.256293,0.251165,0.243944,0.014724
4,0.126633,0.00854,0.010792,0.005042,,auto,0.5,"{'dtc__max_depth': None, 'dtc__max_features': ...",0.21953,0.220855,...,0.220855,0.003241,52,0.21984,0.22344,0.222361,0.214808,0.221902,0.22047,0.003063
5,0.182049,0.013109,0.00818,0.000978,,log2,0.1,"{'dtc__max_depth': None, 'dtc__max_features': ...",0.335214,0.324103,...,0.330564,0.004563,10,0.335417,0.327169,0.336774,0.326207,0.331635,0.33144,0.004241
6,0.181347,0.015184,0.009774,0.002129,,log2,0.2,"{'dtc__max_depth': None, 'dtc__max_features': ...",0.326325,0.298162,...,0.29035,0.020377,21,0.326186,0.302436,0.27266,0.27813,0.282479,0.292378,0.019667
7,0.187275,0.023147,0.007639,0.000768,,log2,0.3,"{'dtc__max_depth': None, 'dtc__max_features': ...",0.238974,0.261325,...,0.262957,0.016756,33,0.238429,0.262735,0.290694,0.268408,0.253707,0.262795,0.01724
8,0.131473,0.009336,0.010913,0.00606,,log2,0.4,"{'dtc__max_depth': None, 'dtc__max_features': ...",0.258803,0.246154,...,0.243607,0.01001,40,0.257842,0.248526,0.232703,0.248173,0.230705,0.24359,0.010325
9,0.135908,0.007579,0.007977,0.001409,,log2,0.5,"{'dtc__max_depth': None, 'dtc__max_features': ...",0.223162,0.21312,...,0.22106,0.004186,51,0.222863,0.215438,0.222361,0.219476,0.222938,0.220615,0.002884


Fitting 5 folds for each of 240 candidates, totalling 1200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   51.8s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  7.8min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed: 13.5min
[Parallel(n_jobs=-1)]: Done 1200 out of 1200 | elapsed: 20.7min finished


Model:  rfc
Best parameters: {'rfc__max_depth': 10, 'rfc__max_features': 'auto', 'rfc__min_samples_split': 0.1, 'rfc__n_estimators': 50}
Best score: 0.353017094017094


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_rfc__max_depth,param_rfc__max_features,param_rfc__min_samples_split,param_rfc__n_estimators,params,split0_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,0.921470,0.037591,0.060038,0.002220,,auto,0.1,10,"{'rfc__max_depth': None, 'rfc__max_features': ...",0.346239,...,0.341752,0.010420,30,0.347404,0.358590,0.323472,0.346635,0.335556,0.342331,0.011918
1,1.726326,0.039939,0.111360,0.002167,,auto,0.1,20,"{'rfc__max_depth': None, 'rfc__max_features': ...",0.352479,...,0.346615,0.004402,21,0.352254,0.348632,0.345139,0.343109,0.347906,0.347408,0.003126
2,4.210841,0.043607,0.277591,0.028295,,auto,0.1,50,"{'rfc__max_depth': None, 'rfc__max_features': ...",0.352436,...,0.348974,0.005262,15,0.353750,0.352585,0.341175,0.355160,0.347062,0.349947,0.005174
3,9.805551,0.250688,0.679581,0.048338,,auto,0.1,100,"{'rfc__max_depth': None, 'rfc__max_features': ...",0.357564,...,0.352667,0.003611,3,0.358077,0.351100,0.352970,0.352767,0.351090,0.353201,0.002565
4,0.842744,0.029966,0.074800,0.008078,,auto,0.2,10,"{'rfc__max_depth': None, 'rfc__max_features': ...",0.295085,...,0.295983,0.014920,80,0.296731,0.292767,0.280438,0.323590,0.290844,0.296874,0.014402
5,1.630039,0.071026,0.129654,0.015283,,auto,0.2,20,"{'rfc__max_depth': None, 'rfc__max_features': ...",0.308974,...,0.300991,0.007084,78,0.309562,0.294017,0.296528,0.302179,0.305342,0.301526,0.005673
6,4.009131,0.093219,0.305905,0.007564,,auto,0.2,50,"{'rfc__max_depth': None, 'rfc__max_features': ...",0.292650,...,0.311470,0.010189,56,0.296165,0.315588,0.311667,0.321122,0.315908,0.312090,0.008511
7,7.935799,0.326674,0.638595,0.046922,,auto,0.2,100,"{'rfc__max_depth': None, 'rfc__max_features': ...",0.318590,...,0.312513,0.010288,54,0.320096,0.300288,0.309968,0.306368,0.325588,0.312462,0.009188
8,0.730974,0.024974,0.081087,0.007313,,auto,0.3,10,"{'rfc__max_depth': None, 'rfc__max_features': ...",0.292350,...,0.278094,0.008025,110,0.288248,0.276827,0.276902,0.269840,0.274530,0.277269,0.006059
9,1.283437,0.035412,0.130952,0.024922,,auto,0.3,20,"{'rfc__max_depth': None, 'rfc__max_features': ...",0.275427,...,0.272590,0.003727,125,0.275556,0.283024,0.264850,0.268889,0.273419,0.273147,0.006171


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   16.3s finished


Model:  abc
Best parameters: {}
Best score: 0.2928034188034188


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,5.975178,1.048595,0.69797,0.123698,{},0.296581,0.296111,0.302436,0.275385,0.293504,0.292803,0.009186,1,0.298868,0.297158,0.306015,0.274573,0.294925,0.294308,0.010544


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  3.7min finished


Model:  gbc
Best parameters: {}
Best score: 0.6032051282051282


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,140.681571,40.219677,0.608904,0.164921,{},0.59906,0.60265,0.60359,0.608333,0.602393,0.603205,0.002987,1,0.609188,0.61063,0.610694,0.610427,0.609776,0.610143,0.000578


In [55]:
# sample_size = 10000

# Model:  lr
# Best parameters: {}
# Best score: 0.40513571428571427

# Model:  knn
# Best parameters: {'knn__n_neighbors': 3}
# Best score: 0.9692571428571428

# Model:  dtc
# Best parameters: {'dtc__max_depth': None, 'dtc__max_features': None, 'dtc__min_samples_split': 0.1}
# Best score: 0.5181714285714286

# Model:  rfc
# Best parameters: {'rfc__max_depth': 15, 'rfc__max_features': 'auto', 'rfc__min_samples_split': 0.1, 'rfc__n_estimators': 100}
# Best score: 0.4909071428571429

# Model:  abc
# Best parameters: {}
# Best score: 0.23792857142857143

# Model:  gbc
# Best parameters: {}
# Best score: 0.8113571428571429