### Contents

- [EDA](#EDA)

### Header

In [1]:
# import libraries

# maths
import numpy as np
import pandas as pd

# visual
#from matplotlib_venn import venn2
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import pydotplus

# modelling
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, Lasso, ElasticNet
from sklearn.model_selection import train_test_split,cross_val_score,GridSearchCV
from sklearn.preprocessing import StandardScaler,PolynomialFeatures,LabelEncoder
from sklearn.metrics import roc_auc_score, roc_curve, confusion_matrix,accuracy_score,r2_score,mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.utils import resample, shuffle
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier,export_graphviz
from sklearn.ensemble import AdaBoostClassifier,RandomForestClassifier, RandomForestRegressor, GradientBoostingClassifier
from sklearn.externals.six import StringIO
from imblearn.over_sampling import SMOTE

# Others
import warnings
warnings.filterwarnings("ignore")
from IPython.display import Image

Using TensorFlow backend.


Couldn't import dot_parser, loading of dot files will not be possible.


In [2]:
# file paths

raw_path = '../../data/0_raw/fitrec/' 
input_path = '../../data/1_input/fitrec/'
clean_path = '../../data/2_clean/fitrec/' 
preprocess_path = '../../data/3_preprocess/fitrec/' 
output_path = '../../data/4_output/fitrec/' 

### Import Data

In [3]:
# import data

#file = 'endomondoHR_proper_summary.csv'
file = 'endomondoHR_proper_dist_spd_summary.csv'

in_path = clean_path + file

df = pd.read_csv(in_path)

### Functions

### Pre-process Data

In [4]:
df.columns

Index(['id', 'userId', 'gender', 'sport', 'time_start', 'time_end', 'time_dur',
       'lat_start', 'lon_start', 'lat_end', 'lon_end', 'alt_avg', 'alt_min',
       'alt_05', 'alt_25', 'alt_75', 'alt_95', 'alt_max', 'hr_avg', 'hr_min',
       'hr_05', 'hr_25', 'hr_75', 'hr_95', 'hr_max', 'hr_outof', 'hr_fatburn',
       'hr_cardio', 'hr_peak', 'spd_avg', 'spd_min', 'spd_05', 'spd_25',
       'spd_75', 'spd_95', 'spd_max', 'impute'],
      dtype='object')

In [5]:
# remove rows with abnormal heartrate

print(len(df))

cond_1 = df['hr_avg'] > 60
#cond_2 = df['hr_min'] > 0
#cond_3 = df['hr_25'] > 0
#cond_4 = df['hr_75'] > 0
#cond_5 = df['hr_max'] > 0

df = df[cond_1]
#df = df[cond_1 & cond_2 & cond_3 & cond_4 & cond_5]

print(len(df))

167783
167256


In [6]:
# drop rows if speed is nan

# print(len(df))
# df.dropna(subset=['spd_avg'],inplace=True)
# print(len(df))

In [7]:
df['sport'].value_counts()

bike                       71658
run                        70477
mountain bike              10713
bike (transport)            7676
indoor cycling              1725
walk                        1246
orienteering                 867
cross-country skiing         788
core stability training      443
fitness walking              292
skate                        260
roller skiing                238
hiking                       237
kayaking                      92
circuit training              89
weight training               74
rowing                        71
gymnastics                    66
soccer                        51
downhill skiing               42
treadmill running             28
snowshoeing                   16
swimming                      14
golf                          12
badminton                     10
horseback riding              10
elliptical                    10
tennis                         8
basketball                     8
aerobics                       7
climbing  

In [8]:
# merge similar sports

df['sport'].replace('treadmill running','run',inplace=True)
df['sport'].replace(['treadmill walking','fitness walking'],'walk',inplace=True)
df['sport'].replace(['bike','mountain bike','bike (transport)','indoor cycling'],'cycle',inplace=True)
df['sport'].replace(['windsurfing','kite surfing'],'surfing',inplace=True)
df['sport'].replace(['cross-country skiing','downhill skiing','roller skiing'],'skiing',inplace=True)
df['sport'].replace('pilates','yoga',inplace=True)

In [9]:
df['sport'].value_counts()

cycle                      91772
run                        70505
walk                        1539
skiing                      1068
orienteering                 867
core stability training      443
skate                        260
hiking                       237
kayaking                      92
circuit training              89
weight training               74
rowing                        71
gymnastics                    66
soccer                        51
snowshoeing                   16
swimming                      14
golf                          12
elliptical                    10
badminton                     10
horseback riding              10
tennis                         8
basketball                     8
aerobics                       7
climbing                       5
table tennis                   4
yoga                           4
snowboarding                   3
stair climing                  3
rugby                          3
surfing                        2
squash    

In [10]:
# select columns for feature selection

# cols = ['sport', 'time_dur', 'alt_avg', 'alt_min', 'alt_25', 'alt_75','alt_max', 'hr_avg', 'hr_min', 'hr_25', 'hr_75', 'hr_max']

#cols = ['sport','hr_avg', 'hr_min', 'hr_25', 'hr_75', 'hr_max']
#cols = ['sport','hr_avg', 'hr_min', 'hr_25', 'hr_75', 'hr_max','spd_avg', 'spd_min', 'spd_25', 'spd_75', 'spd_max']

#cols = ['sport','hr_outof', 'hr_fatburn', 'hr_cardio', 'hr_peak']
cols = ['sport','hr_outof', 'hr_fatburn', 'hr_cardio', 'hr_peak','spd_avg']
#cols = ['sport','hr_outof', 'hr_fatburn', 'hr_cardio', 'hr_peak','spd_avg', 'spd_min', 'spd_25', 'spd_75', 'spd_max']

df_model = df[cols]

In [11]:
# select only sports with minimal rows

count = df_model['sport'].value_counts()

#count_cond = count[count > 1].index
#count_cond = count[count >= 5].index
#count_cond = count[count >= 10].index
count_cond = count[count >= 50].index
#count_cond = count[count >= 100].index
#count_cond = count[count >= 800].index

count_mask = df_model['sport'].isin(count_cond)
df_model = df_model[count_mask]

In [12]:
print(df_model.shape)
df_model.head()

(167134, 6)


Unnamed: 0,sport,hr_outof,hr_fatburn,hr_cardio,hr_peak,spd_avg
0,cycle,0.0,0.013333,0.464444,0.522222,26.152328
1,cycle,0.0,0.02,0.591111,0.388889,27.636272
2,cycle,0.0,0.057778,0.782222,0.16,26.159896
3,cycle,0.0,0.037778,0.637778,0.324444,27.135904
4,cycle,0.0,0.011111,0.08,0.908889,31.241183


In [13]:
df_model['sport'].value_counts()

cycle                      91772
run                        70505
walk                        1539
skiing                      1068
orienteering                 867
core stability training      443
skate                        260
hiking                       237
kayaking                      92
circuit training              89
weight training               74
rowing                        71
gymnastics                    66
soccer                        51
Name: sport, dtype: int64

### Plot df_model

In [14]:
# order = df_model.groupby('sport')['hr_max'].median().sort_values(ascending=False).index

# plt.figure(figsize=(20,15))
# #plt.xlim(0,300)

# sns.boxplot(data=df_model,x='hr_max',y='sport',order=order);

In [15]:
# order = df_model.groupby('sport')['hr_avg'].median().sort_values(ascending=False).index

# plt.figure(figsize=(20,15))
# #plt.xlim(0,300)

# sns.boxplot(data=df_model,x='hr_avg',y='sport',order=order);

In [16]:
# order = df_model.groupby('sport')['hr_min'].median().sort_values(ascending=False).index

# plt.figure(figsize=(20,15))
# #plt.xlim(0,300)

# sns.boxplot(data=df_model,x='hr_min',y='sport',order=order);

### Create Features and Target

In [17]:
df_model.head()

Unnamed: 0,sport,hr_outof,hr_fatburn,hr_cardio,hr_peak,spd_avg
0,cycle,0.0,0.013333,0.464444,0.522222,26.152328
1,cycle,0.0,0.02,0.591111,0.388889,27.636272
2,cycle,0.0,0.057778,0.782222,0.16,26.159896
3,cycle,0.0,0.037778,0.637778,0.324444,27.135904
4,cycle,0.0,0.011111,0.08,0.908889,31.241183


In [18]:
# doing scaling and encoding before create X and y

cols = df_model.columns[1:]

ss = StandardScaler()
df_model[cols] = ss.fit_transform(df_model[cols])

le = LabelEncoder()
df_model['sport'] = le.fit_transform(df_model['sport'])

In [19]:
# create feature and target. next perform train_test_split

X = df_model.drop(columns='sport').values
y = df_model['sport'].values

#le = LabelEncoder()
#y = le.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, test_size=0.3,stratify=y,random_state=3050)

In [20]:
# ss = StandardScaler()
# ss.fit(X_train)

# X_train = ss.transform(X_train)
# X_test = ss.transform(X_test)

In [21]:
df_model.head()

Unnamed: 0,sport,hr_outof,hr_fatburn,hr_cardio,hr_peak,spd_avg
0,2,-0.296727,-0.790978,-0.056392,0.759532,0.939275
1,2,-0.296727,-0.764701,0.386535,0.336749,1.11388
2,2,-0.296727,-0.615799,1.054811,-0.38903,0.940165
3,2,-0.296727,-0.694629,0.549719,0.132403,1.055005
4,2,-0.296727,-0.799737,-1.400714,1.985604,1.538045


### Handle Imbalance Data

In [22]:
# check before upsample/downsample

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(116993, 5)
(50141, 5)
(116993,)
(50141,)


In [23]:
# check before upsample/downsample

unique, counts = np.unique(y_train, return_counts=True)
dict(zip(unique, counts))

{0: 62,
 1: 310,
 2: 64240,
 3: 46,
 4: 166,
 5: 64,
 6: 607,
 7: 50,
 8: 49353,
 9: 182,
 10: 748,
 11: 36,
 12: 1077,
 13: 52}

In [24]:
# sm = SMOTE(random_state=3050)
# X_train, y_train = sm.fit_sample(X_train, y_train.ravel())

In [25]:
# get majority class index and row count

sport_counts = df_model['sport'].value_counts()
print(sport_counts)

major_class_index = sport_counts.index[0]
major_class_count = sport_counts.values[0]
print(major_class_index,major_class_count)

2     91772
8     70505
12     1539
10     1068
6       867
1       443
9       260
4       237
5        92
0        89
13       74
7        71
3        66
11       51
Name: sport, dtype: int64
2 91772


In [26]:
# perform upsampling and downsampling

sample_size = 10000

df_all_sample = pd.DataFrame()

sport_list = df_model['sport'].unique()

for sport in sport_list:
    
    cond = df_model['sport'] == sport
    df_sport = df_model[cond]
    
    # perform downsampling
    if sport_counts[sport] >= sample_size:    
        print('downsampling',sport,sport_counts[sport])
        df_sample = df_sport.sample(sample_size,replace=False,random_state=3050)
        
    # perform upsampling
    # sport_counts[sport] < sample_size: 
    else:
        print('upsampling',sport,sport_counts[sport])
        df_sample = df_sport.sample(sample_size,replace=True,random_state=3050)
        
    df_all_sample = pd.concat([df_all_sample, df_sample], axis=0)
    
X_train = df_all_sample.drop(columns='sport').values
y_train = df_all_sample['sport'].values

downsampling 2 91772
downsampling 8 70505
upsampling 7 71
upsampling 6 867
upsampling 5 92
upsampling 9 260
upsampling 10 1068
upsampling 1 443
upsampling 12 1539
upsampling 4 237
upsampling 13 74
upsampling 11 51
upsampling 0 89
upsampling 3 66


In [27]:
# # perform upsampling for minority classes

# df_all_sample = pd.DataFrame()

# sport_list = df_model['sport'].unique()

# for sport in sport_list:
    
#     if sport != major_class_index:
        
#         cond = df_model['sport'] == sport
#         df_sport = df_model[cond]
#         #print(sport,len(df_sport))
        
#         df_sample = df_sport.sample(major_class_count,replace=True,random_state=3050)
#         df_all_sample = pd.concat([df_all_sample, df_sample], axis=0)
        
# cond = df_model['sport'] == major_class_index
# df_top = df_model[cond]
# df_all_sample = pd.concat([df_all_sample, df_top], axis=0)

# X_train = df_all_sample.drop(columns='sport').values
# y_train = df_all_sample['sport'].values

In [28]:
# check after upsample/downsample

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(140000, 5)
(50141, 5)
(140000,)
(50141,)


In [29]:
# check after upsample/downsample

unique, counts = np.unique(y_train, return_counts=True)
dict(zip(unique, counts))

{0: 10000,
 1: 10000,
 2: 10000,
 3: 10000,
 4: 10000,
 5: 10000,
 6: 10000,
 7: 10000,
 8: 10000,
 9: 10000,
 10: 10000,
 11: 10000,
 12: 10000,
 13: 10000}

### Logistic Regresion Model

In [30]:
# init model

logreg = LogisticRegression()

In [31]:
# perform cross validation

score = cross_val_score(logreg,X,y,cv=5)
print('score:',score.mean(),score)

score: 0.917078649370848 [0.91511381 0.9144481  0.93364447 0.89755864 0.92462823]


In [32]:
# fit model

logreg.fit(X_train, y_train)

# score model
print("train r2:",logreg.score(X_train, y_train))
print("test r2:",logreg.score(X_test, y_test))

train r2: 0.38929285714285716
test r2: 0.6398356634291299


In [33]:
# confusion matrix
print('rows:actual columns:predicted')

y_pred = logreg.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
pd.DataFrame(data=cm, columns=le.classes_, index=le.classes_)

# y_pred = logreg.predict_proba(X_test)
# pd.DataFrame(y_pred,columns=le.classes_) 

rows:actual columns:predicted


Unnamed: 0,circuit training,core stability training,cycle,gymnastics,hiking,kayaking,orienteering,rowing,run,skate,skiing,soccer,walk,weight training
circuit training,7,0,0,4,4,0,1,0,0,0,0,6,1,4
core stability training,6,1,14,5,15,2,8,3,17,9,8,21,17,7
cycle,149,55,20981,111,9,21,257,80,1146,3677,424,275,278,69
gymnastics,1,0,0,18,0,0,0,0,0,0,0,0,0,1
hiking,5,0,0,2,22,1,0,1,0,0,0,2,37,1
kayaking,1,3,0,0,2,0,2,1,3,0,0,4,11,1
orienteering,18,3,1,0,7,1,120,17,34,1,1,46,11,0
rowing,0,1,0,0,0,0,3,2,8,0,4,0,2,1
run,54,203,174,3,21,87,7094,1104,10542,92,736,143,896,3
skate,0,0,37,0,1,1,0,0,7,24,2,0,6,0


### KNN Model

In [34]:
# init model

knn = KNeighborsClassifier()

In [35]:
# perform cross validation

score = cross_val_score(knn,X,y,cv=5)
print('score:',score.mean(),score)

score: 0.940431106694629 [0.93895253 0.93924619 0.94483336 0.93836764 0.94075581]


In [36]:
# fit model

knn.fit(X_train, y_train)

# score model
print("train r2:",knn.score(X_train, y_train))
print("test r2:",knn.score(X_test, y_test))

train r2: 0.9753214285714286
test r2: 0.7954568117907501


In [37]:
# confusion matrix
print('rows:actual columns:predicted')

y_pred = knn.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
pd.DataFrame(data=cm, columns=le.classes_, index=le.classes_)

# y_pred = knn.predict_proba(X_test)
# pd.DataFrame(y_pred,columns=le.classes_)

rows:actual columns:predicted


Unnamed: 0,circuit training,core stability training,cycle,gymnastics,hiking,kayaking,orienteering,rowing,run,skate,skiing,soccer,walk,weight training
circuit training,27,0,0,0,0,0,0,0,0,0,0,0,0,0
core stability training,0,133,0,0,0,0,0,0,0,0,0,0,0,0
cycle,54,852,22715,47,31,24,71,52,540,1337,1620,8,129,52
gymnastics,0,0,0,20,0,0,0,0,0,0,0,0,0,0
hiking,0,0,0,0,71,0,0,0,0,0,0,0,0,0
kayaking,0,0,0,0,0,28,0,0,0,0,0,0,0,0
orienteering,0,0,0,0,0,0,260,0,0,0,0,0,0,0
rowing,0,0,0,0,0,0,0,21,0,0,0,0,0,0
run,28,291,333,2,61,129,1694,206,15724,70,2064,21,514,15
skate,0,0,0,0,0,0,0,0,0,78,0,0,0,0


### DTC Model

In [38]:
# init model

dtc = DecisionTreeClassifier(max_depth=10,random_state=3050)

In [39]:
# perform cross validation

score = cross_val_score(dtc,X,y,cv=5)
print(score.mean(),score)

0.941813230315311 [0.93976012 0.94256656 0.94552145 0.93744016 0.94377786]


In [40]:
# fit model
dtc = dtc.fit(X_train,y_train)

# score model
print("train r2:",dtc.score(X_train, y_train))
print("test r2:",dtc.score(X_test, y_test))

train r2: 0.6870428571428572
test r2: 0.6655032807482898


In [41]:
# confusion matrix
print('rows:actual columns:predicted')

y_pred = dtc.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
pd.DataFrame(data=cm, columns=le.classes_, index=le.classes_)

# y_pred = dtc.predict_proba(X_test)
# pd.DataFrame(y_pred,columns=le.classes_)

rows:actual columns:predicted


Unnamed: 0,circuit training,core stability training,cycle,gymnastics,hiking,kayaking,orienteering,rowing,run,skate,skiing,soccer,walk,weight training
circuit training,16,0,0,0,1,0,0,0,0,0,0,3,0,7
core stability training,6,49,3,3,13,0,4,6,10,3,8,13,6,9
cycle,79,1004,18345,21,58,50,98,194,378,5301,1784,13,52,155
gymnastics,0,0,0,19,0,0,0,0,0,0,0,1,0,0
hiking,5,0,0,2,46,0,0,0,0,0,0,4,14,0
kayaking,1,0,0,0,3,13,0,0,3,0,0,1,7,0
orienteering,10,3,0,0,10,10,147,5,25,1,4,17,26,2
rowing,0,0,0,0,0,0,0,18,1,0,0,0,1,1
run,40,197,187,0,94,508,2474,1708,14145,148,1006,58,424,163
skate,0,1,7,0,1,0,0,2,0,62,4,0,1,0


In [42]:
# # initialize the output file object
# dot_data = StringIO() 

# # my fit DecisionTreeRegressor object here is: dtr1
# # for feature_names i put the columns of my Xr matrix
# export_graphviz(dtc, 
#                 out_file=dot_data,  
#                 filled=True, 
#                 rounded=True,
#                 special_characters=True,
#                 feature_names=df_model[features].columns
#                )  

# graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  
# Image(graph.create_png())

### RFC Model

In [43]:
# init model

rfc = RandomForestClassifier(n_estimators=10,max_depth=10,n_jobs=2, random_state=3050)

In [44]:
# perform cross validation

score = cross_val_score(rfc,X,y,cv=5)
print(score.mean(),score)

0.945020226065475 [0.94305028 0.94552797 0.9489918  0.94126975 0.94626133]


In [45]:
# fit model
rfc = rfc.fit(X_train,y_train)

# score model
print("train r2:",rfc.score(X_train, y_train))
print("test r2:",rfc.score(X_test, y_test))

train r2: 0.81255
test r2: 0.7285654454438484


In [46]:
# confusion matrix
print('rows:actual columns:predicted')

y_pred = rfc.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
pd.DataFrame(data=cm, columns=le.classes_, index=le.classes_)

# y_pred = rfc.predict_proba(X_test)
# pd.DataFrame(y_pred,columns=le.classes_)

rows:actual columns:predicted


Unnamed: 0,circuit training,core stability training,cycle,gymnastics,hiking,kayaking,orienteering,rowing,run,skate,skiing,soccer,walk,weight training
circuit training,24,0,0,0,1,0,0,0,0,0,0,0,0,2
core stability training,4,69,5,0,15,0,7,3,9,2,8,2,5,4
cycle,86,588,20316,18,29,43,63,175,374,3916,1702,8,83,131
gymnastics,0,0,0,20,0,0,0,0,0,0,0,0,0,0
hiking,0,0,0,0,60,0,0,0,0,0,0,0,9,2
kayaking,0,0,0,0,2,24,0,0,2,0,0,0,0,0
orienteering,1,1,1,0,11,4,192,3,21,0,3,14,8,1
rowing,0,0,0,0,0,0,0,20,1,0,0,0,0,0
run,33,141,236,0,79,233,2157,1003,15177,69,1518,61,435,10
skate,0,1,6,0,1,0,0,0,0,63,6,0,1,0


### GridSearch Model

In [47]:
# init models

estimators = {
    'lr': LogisticRegression(),
    'knn': KNeighborsClassifier(n_neighbors=5),
    'dtc': DecisionTreeClassifier(),
    'rfc': RandomForestClassifier(),
    'abc': AdaBoostClassifier(),
    'gbc': GradientBoostingClassifier()
}.items()

In [48]:
# init model parameters

params = {
    'lr': {

    },
    'knn': {
        'knn__n_neighbors': [3,5,7,9]
    },

    'dtc': {
        'dtc__max_features': ['auto', 'log2', None],
        'dtc__max_depth': [None, 5, 10, 15],
        'dtc__min_samples_split': np.linspace(0.1, 0.5, 5)
    },
    'rfc': {
        'rfc__n_estimators': [10, 20, 50, 100],
        'rfc__max_depth': [None, 5, 10, 15],
        'rfc__max_features': ['auto', 'log2', None],
        'rfc__min_samples_split': np.linspace(0.1, 0.5, 5)
    },
    'abc': {

    }, 
    'gbc': {

    }
}

In [49]:
models = []
parameters = []
best_score = []
roc_auc = []

for k,v in estimators:
    
    pipe = Pipeline([
        ('sc', StandardScaler()),
        (k,v)])
    
    gridsearch = GridSearchCV(
        estimator=pipe,
        param_grid=params[k],
        verbose=1,
        cv= 5,
        n_jobs=3,
        return_train_score= True
        #scoring = 'roc_auc'
    )

    gridsearch.fit(X_train, y_train)
    
    model = gridsearch.best_estimator_
    cv_score = gridsearch.cv_results_
    best_params = gridsearch.best_params_

    # predict y
    #y_pred = model.predict(X_test)
    y_pred = model.predict_proba(X_test)
    
    # print results
    print("Model: ", k)
    print("Best parameters:", best_params)
    print("Best score:", gridsearch.best_score_)
    #print("AUC/ROC test:", roc_auc_score(y_test,y_pred))
    display(pd.DataFrame(cv_score, columns = cv_score.keys()))    
    
    # append info to list
    models.append(k)
    best_score.append(gridsearch.best_score_)
    parameters.append(best_params)
    #roc_auc.append(roc_auc_score(y_test,y_pred))

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   5 out of   5 | elapsed:    9.8s finished


Model:  lr
Best parameters: {}
Best score: 0.38899285714285714


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,4.804388,0.352253,0.018034,0.005841,{},0.385821,0.390821,0.388536,0.390143,0.389643,0.388993,0.001752,1,0.390527,0.391545,0.387545,0.389304,0.386241,0.389032,0.00193


Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  20 out of  20 | elapsed:  1.1min finished


Model:  knn
Best parameters: {'knn__n_neighbors': 3}
Best score: 0.9707428571428571


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_knn__n_neighbors,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,0.53425,0.068019,1.99398,0.061279,3,{'knn__n_neighbors': 3},0.9715,0.970393,0.970821,0.971893,...,0.970743,0.00097,1,0.98208,0.982205,0.98217,0.98167,0.981821,0.981989,0.000209
1,0.398262,0.133469,1.822601,0.195014,5,{'knn__n_neighbors': 5},0.962143,0.960714,0.962143,0.962321,...,0.961271,0.001259,2,0.97183,0.971857,0.971679,0.971232,0.97125,0.97157,0.000275
2,0.516618,0.059744,2.186551,0.230543,7,{'knn__n_neighbors': 7},0.951179,0.95125,0.952036,0.9515,...,0.9512,0.000655,3,0.961679,0.962268,0.962562,0.961893,0.962277,0.962136,0.000312
3,0.367499,0.0271,1.716418,0.073453,9,{'knn__n_neighbors': 9},0.94025,0.940429,0.940536,0.940393,...,0.939793,0.001221,4,0.952295,0.952804,0.95258,0.952366,0.952464,0.952502,0.000179


Fitting 5 folds for each of 60 candidates, totalling 300 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:    4.7s
[Parallel(n_jobs=3)]: Done 194 tasks      | elapsed:   22.9s
[Parallel(n_jobs=3)]: Done 300 out of 300 | elapsed:   36.4s finished


Model:  dtc
Best parameters: {'dtc__max_depth': None, 'dtc__max_features': None, 'dtc__min_samples_split': 0.1}
Best score: 0.5005857142857143


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_dtc__max_depth,param_dtc__max_features,param_dtc__min_samples_split,params,split0_test_score,split1_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,0.271813,0.007653,0.015621,0.01397212,,auto,0.1,"{'dtc__max_depth': None, 'dtc__max_features': ...",0.369821,0.441071,...,0.429871,0.032127,8,0.370214,0.442071,0.463759,0.423304,0.448554,0.42958,0.032397
1,0.240569,0.027235,0.015621,1.963736e-06,,auto,0.2,"{'dtc__max_depth': None, 'dtc__max_features': ...",0.357357,0.387821,...,0.346914,0.05529,20,0.357134,0.385232,0.259295,0.417375,0.317429,0.347293,0.054924
2,0.224946,0.03507,0.009373,0.01874552,,auto,0.3,"{'dtc__max_depth': None, 'dtc__max_features': ...",0.186643,0.367571,...,0.259279,0.068211,44,0.18683,0.367375,0.272295,0.187179,0.28075,0.258886,0.067471
3,0.21245,0.012498,0.012498,0.01169036,,auto,0.4,"{'dtc__max_depth': None, 'dtc__max_features': ...",0.212286,0.247893,...,0.232386,0.023385,51,0.211125,0.248509,0.200857,0.263125,0.2385,0.232423,0.023188
4,0.168711,0.018218,0.012496,0.01821728,,auto,0.5,"{'dtc__max_depth': None, 'dtc__max_features': ...",0.141964,0.195786,...,0.160614,0.018476,60,0.143437,0.193134,0.154527,0.160786,0.1495,0.160277,0.017393
5,0.278059,0.015306,0.018745,0.01530506,,log2,0.1,"{'dtc__max_depth': None, 'dtc__max_features': ...",0.438714,0.396679,...,0.400007,0.050128,12,0.440482,0.396902,0.446902,0.30575,0.413714,0.40075,0.050826
6,0.246818,0.030291,0.018745,0.006248427,,log2,0.2,"{'dtc__max_depth': None, 'dtc__max_features': ...",0.3285,0.366893,...,0.332557,0.027321,23,0.328964,0.366036,0.345661,0.337054,0.282429,0.332029,0.027702
7,0.240569,0.045917,0.012495,0.00624733,,log2,0.3,"{'dtc__max_depth': None, 'dtc__max_features': ...",0.320857,0.361643,...,0.294421,0.053478,32,0.320866,0.361679,0.327232,0.218768,0.246634,0.295036,0.053488
8,0.190581,0.030292,0.01562,9.818678e-07,,log2,0.4,"{'dtc__max_depth': None, 'dtc__max_features': ...",0.258286,0.23375,...,0.213164,0.037303,52,0.259759,0.234304,0.149902,0.1965,0.227536,0.2136,0.037692
9,0.193705,0.056409,0.012497,0.01168982,,log2,0.5,"{'dtc__max_depth': None, 'dtc__max_features': ...",0.159429,0.185571,...,0.197421,0.046645,55,0.16075,0.186214,0.140161,0.247741,0.257929,0.198559,0.046767


Fitting 5 folds for each of 240 candidates, totalling 1200 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:  1.5min
[Parallel(n_jobs=3)]: Done 194 tasks      | elapsed:  5.8min
[Parallel(n_jobs=3)]: Done 444 tasks      | elapsed: 15.4min
[Parallel(n_jobs=3)]: Done 794 tasks      | elapsed: 27.8min
[Parallel(n_jobs=3)]: Done 1200 out of 1200 | elapsed: 44.5min finished


Model:  rfc
Best parameters: {'rfc__max_depth': None, 'rfc__max_features': 'log2', 'rfc__min_samples_split': 0.1, 'rfc__n_estimators': 100}
Best score: 0.4700642857142857


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_rfc__max_depth,param_rfc__max_features,param_rfc__min_samples_split,param_rfc__n_estimators,params,split0_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,1.428779,0.063067,0.108509,0.010390,,auto,0.1,10,"{'rfc__max_depth': None, 'rfc__max_features': ...",0.453286,...,0.458443,0.008360,24,0.455098,0.462205,0.454857,0.473804,0.453170,0.459827,0.007647
1,2.428104,0.058880,0.172739,0.008402,,auto,0.1,20,"{'rfc__max_depth': None, 'rfc__max_features': ...",0.464000,...,0.466471,0.005728,11,0.462607,0.475848,0.465857,0.462098,0.469196,0.467121,0.005053
2,5.445433,0.177013,0.423467,0.054295,,auto,0.1,50,"{'rfc__max_depth': None, 'rfc__max_features': ...",0.474071,...,0.467679,0.004351,7,0.475000,0.467732,0.459830,0.467464,0.469786,0.467963,0.004884
3,12.157348,0.107170,0.935863,0.037926,,auto,0.1,100,"{'rfc__max_depth': None, 'rfc__max_features': ...",0.467857,...,0.466150,0.004097,12,0.469420,0.468045,0.468929,0.465643,0.462759,0.466959,0.002469
4,1.073727,0.034918,0.100929,0.013009,,auto,0.2,10,"{'rfc__max_depth': None, 'rfc__max_features': ...",0.404071,...,0.413179,0.012985,75,0.404268,0.391786,0.424991,0.425304,0.421723,0.413614,0.013384
5,1.967173,0.087086,0.192186,0.011624,,auto,0.2,20,"{'rfc__max_depth': None, 'rfc__max_features': ...",0.419500,...,0.414129,0.004676,69,0.417973,0.407670,0.413598,0.415018,0.421330,0.415118,0.004573
6,4.610359,0.315278,0.473747,0.096143,,auto,0.2,50,"{'rfc__max_depth': None, 'rfc__max_features': ...",0.442821,...,0.430979,0.007643,56,0.440518,0.425821,0.421857,0.434188,0.433795,0.431236,0.006613
7,8.744178,0.702249,0.781730,0.049692,,auto,0.2,100,"{'rfc__max_depth': None, 'rfc__max_features': ...",0.433286,...,0.432886,0.003309,52,0.430982,0.428455,0.436268,0.429964,0.439955,0.433125,0.004311
8,0.796910,0.035497,0.092042,0.011078,,auto,0.3,10,"{'rfc__max_depth': None, 'rfc__max_features': ...",0.345321,...,0.338393,0.032019,129,0.347027,0.286527,0.353321,0.329295,0.383277,0.339889,0.031855
9,1.415121,0.084696,0.157635,0.008090,,auto,0.3,20,"{'rfc__max_depth': None, 'rfc__max_features': ...",0.310714,...,0.363050,0.033877,118,0.313205,0.404179,0.358179,0.394009,0.345429,0.363000,0.033072


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   5 out of   5 | elapsed:   33.5s finished


Model:  abc
Best parameters: {}
Best score: 0.2904785714285714


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,9.921075,0.580439,1.305944,0.187353,{},0.307321,0.33175,0.282571,0.25725,0.2735,0.290479,0.026237,1,0.309259,0.329286,0.28133,0.258848,0.272759,0.290296,0.025521


Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   5 out of   5 | elapsed:  9.9min finished


Model:  gbc
Best parameters: {}
Best score: 0.8011428571428572


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,300.83637,46.343662,1.099741,0.299766,{},0.800536,0.802464,0.802143,0.802464,0.798107,0.801143,0.001678,1,0.804795,0.805036,0.808018,0.806196,0.806643,0.806137,0.001167
