In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.impute import KNNImputer
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from feature_selector import FeatureSelector #credits to Will Koehrsen
#his class is on github https://github.com/WillKoehrsen/feature-selector/blob/master/Feature%20Selector%20Usage.ipynb

In [4]:
#Read the dataset
data_full = pd.read_csv("CE802_Ass_2019_Data.csv")
data_class = data_full["Class"]
data_full = data_full.drop(columns = ["Class"],axis = 1)


In [None]:
fs = FeatureSelector(data = data_full, labels = data_class)

In [109]:
#dictionary with the parameters to loop for
parameters_features = {'missing_threshold': np.arange(0.35,0.45,0.05),    
                       'correlation_threshold': np.arange(0.35,0.75,0.05),
                       'task': 'classification',    
                       'eval_metric': 'auc', 
                       'cumulative_importance': np.arange(0.65,1,0.05)}



In [110]:
#the following for with search for the best score based on a svm classifier
#and with the best score, it will find the best parameters to do feature selection
score = 0.0000000
for idx,x in enumerate(parameters_features['missing_threshold']):
    for idy,y in enumerate(parameters_features['correlation_threshold']):
        for idz,z in enumerate(parameters_features['cumulative_importance']):
            fs.identify_all(selection_params = {'missing_threshold': parameters_features['missing_threshold'][idx], 
                                                'correlation_threshold': parameters_features['correlation_threshold'][idy], 
                                                'task': 'classification', 'eval_metric': 'auc', 
                                                 'cumulative_importance': parameters_features['cumulative_importance'][idz]})
            train_removed_all_once = fs.remove(methods = 'all', keep_one_hot = True)
            
            data_features= train_removed_all_once.to_numpy()
            
            imputer = KNNImputer(n_neighbors=2, weights="uniform")
            data_features = imputer.fit_transform(data_features)
            data_feat = pd.DataFrame(data_features, index=range(data_features.shape[0]),
                                      columns=range(data_features.shape[1]))

            #Lets first normalize the features for K-NN and SVM
            scaler = StandardScaler()
            scaler.fit(data_feat)
            data_feat = scaler.transform(data_feat)

            #Splitting of data to see model accuracy after cross validation and gridsearch
            data_feat_train, data_feat_test, data_class_train, data_class_test = train_test_split(data_feat,data_class,test_size=0.25,stratify=data_class,random_state=1234)
            
            clf_svm = svm.SVC()
            param_grid = {'C': np.logspace(-1, 3, 9),  
                          'gamma': np.logspace(-7, -0, 8)}

            svm_gridsearch = GridSearchCV(clf_svm,param_grid,n_jobs=-1, cv = 10)
            svm_gridsearch.fit(data_feat_train,data_class_train)
            if svm_gridsearch.best_score_ >= score:
                score = svm_gridsearch.best_score_
                parameters_values = [parameters_features['missing_threshold'][idx],parameters_features['correlation_threshold'][idy],parameters_features['cumulative_importance'][idz]]
            #print("Best parameters: " + str(svm_gridsearch.best_params_))
            #print("Best score : " + str(svm_gridsearch.best_score_))

0 features with greater than 0.35 missing values.

0 features with a single unique value.

4 features with a correlation magnitude greater than 0.35.

Training Gradient Boosting Model

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[16]	valid_0's auc: 0.668831	valid_0's binary_logloss: 0.640707
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 0.744228	valid_0's binary_logloss: 0.676268
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[4]	valid_0's auc: 0.764069	valid_0's binary_logloss: 0.657408
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[30]	valid_0's auc: 0.61039	valid_0's binary_logloss: 0.672997
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[177]	valid_0's auc: 0.775613	valid_0's binary_logloss: 0.586718
Training unt

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[23]	valid_0's auc: 0.626263	valid_0's binary_logloss: 0.660929

0 features with zero importance after one-hot encoding.

12 features required for cumulative importance of 0.80 after one hot encoding.
8 features do not contribute to cumulative importance of 0.80.

12 total features out of 20 identified for removal after one-hot encoding.

['missing', 'single_unique', 'collinear', 'zero_importance', 'low_importance'] methods have been run

Removed 12 features.
0 features with greater than 0.35 missing values.

0 features with a single unique value.

4 features with a correlation magnitude greater than 0.35.

Training Gradient Boosting Model

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[81]	valid_0's auc: 0.761183	valid_0's binary_logloss: 0.565086
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration 

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[10]	valid_0's auc: 0.662338	valid_0's binary_logloss: 0.652834
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[109]	valid_0's auc: 0.752525	valid_0's binary_logloss: 0.584608
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[119]	valid_0's auc: 0.816017	valid_0's binary_logloss: 0.541659
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[22]	valid_0's auc: 0.587302	valid_0's binary_logloss: 0.683151

0 features with zero importance after one-hot encoding.

9 features required for cumulative importance of 0.65 after one hot encoding.
11 features do not contribute to cumulative importance of 0.65.

13 total features out of 20 identified for removal after one-hot encoding.

['missing', 'single_unique', 'collinear', 'zero_importance', 'low_importance'

Early stopping, best iteration is:
[150]	valid_0's auc: 0.712121	valid_0's binary_logloss: 0.619626
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[54]	valid_0's auc: 0.670274	valid_0's binary_logloss: 0.660278
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[180]	valid_0's auc: 0.735931	valid_0's binary_logloss: 0.608117
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[61]	valid_0's auc: 0.702742	valid_0's binary_logloss: 0.618625
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 0.606782	valid_0's binary_logloss: 0.680599
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[13]	valid_0's auc: 0.71645	valid_0's binary_logloss: 0.651486
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:

0 features with greater than 0.35 missing values.

0 features with a single unique value.

3 features with a correlation magnitude greater than 0.45.

Training Gradient Boosting Model

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[63]	valid_0's auc: 0.694084	valid_0's binary_logloss: 0.630458
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[111]	valid_0's auc: 0.743867	valid_0's binary_logloss: 0.615003
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[32]	valid_0's auc: 0.704185	valid_0's binary_logloss: 0.634622
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[203]	valid_0's auc: 0.841991	valid_0's binary_logloss: 0.507523
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 0.728355	valid_0's binary_logloss: 0.676846
Training 

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[27]	valid_0's auc: 0.66811	valid_0's binary_logloss: 0.639554

0 features with zero importance after one-hot encoding.

14 features required for cumulative importance of 0.85 after one hot encoding.
6 features do not contribute to cumulative importance of 0.85.

8 total features out of 20 identified for removal after one-hot encoding.

['missing', 'single_unique', 'collinear', 'zero_importance', 'low_importance'] methods have been run

Removed 8 features.
0 features with greater than 0.35 missing values.

0 features with a single unique value.

3 features with a correlation magnitude greater than 0.45.

Training Gradient Boosting Model

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[17]	valid_0's auc: 0.585859	valid_0's binary_logloss: 0.681016
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[2]	valid_0's auc: 0.72619	valid_0's binary_logloss: 0.666479
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[37]	valid_0's auc: 0.798701	valid_0's binary_logloss: 0.575175
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[46]	valid_0's auc: 0.750361	valid_0's binary_logloss: 0.621882
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[52]	valid_0's auc: 0.784993	valid_0's binary_logloss: 0.562804

0 features with zero importance after one-hot encoding.

10 features required for cumulative importance of 0.70 after one hot encoding.
10 features do not contribute to cumulative importance of 0.70.

12 total features out of 20 identified for removal after one-hot encoding.

['missing', 'single_unique', 'collinear', 'zero_importance', 'low_importance'] m

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[8]	valid_0's auc: 0.80303	valid_0's binary_logloss: 0.625106
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[25]	valid_0's auc: 0.821789	valid_0's binary_logloss: 0.585599
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[11]	valid_0's auc: 0.729437	valid_0's binary_logloss: 0.635521
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[69]	valid_0's auc: 0.699134	valid_0's binary_logloss: 0.636866
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[6]	valid_0's auc: 0.576479	valid_0's binary_logloss: 0.674581
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[21]	valid_0's auc: 0.739177	valid_0's binary_logloss: 0.621754
Training until validation scores don'

Early stopping, best iteration is:
[11]	valid_0's auc: 0.601732	valid_0's binary_logloss: 0.669452
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[25]	valid_0's auc: 0.626984	valid_0's binary_logloss: 0.661115
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 0.694805	valid_0's binary_logloss: 0.678116
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[23]	valid_0's auc: 0.718975	valid_0's binary_logloss: 0.618463
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[65]	valid_0's auc: 0.751804	valid_0's binary_logloss: 0.587241
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[24]	valid_0's auc: 0.647908	valid_0's binary_logloss: 0.654662
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:


0 features with greater than 0.35 missing values.

0 features with a single unique value.

2 features with a correlation magnitude greater than 0.55.

Training Gradient Boosting Model

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[11]	valid_0's auc: 0.643939	valid_0's binary_logloss: 0.658812
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[49]	valid_0's auc: 0.743867	valid_0's binary_logloss: 0.587728
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[9]	valid_0's auc: 0.736291	valid_0's binary_logloss: 0.641092
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[91]	valid_0's auc: 0.68759	valid_0's binary_logloss: 0.651648
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[4]	valid_0's auc: 0.709235	valid_0's binary_logloss: 0.661241
Training unti

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[41]	valid_0's auc: 0.713564	valid_0's binary_logloss: 0.606528

0 features with zero importance after one-hot encoding.

11 features required for cumulative importance of 0.75 after one hot encoding.
9 features do not contribute to cumulative importance of 0.75.

10 total features out of 20 identified for removal after one-hot encoding.

['missing', 'single_unique', 'collinear', 'zero_importance', 'low_importance'] methods have been run

Removed 10 features.
0 features with greater than 0.35 missing values.

0 features with a single unique value.

1 features with a correlation magnitude greater than 0.60.

Training Gradient Boosting Model

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[11]	valid_0's auc: 0.751082	valid_0's binary_logloss: 0.631021
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration 

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[22]	valid_0's auc: 0.6829	valid_0's binary_logloss: 0.646807
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[130]	valid_0's auc: 0.77417	valid_0's binary_logloss: 0.563264
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[124]	valid_0's auc: 0.775613	valid_0's binary_logloss: 0.560415
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[6]	valid_0's auc: 0.76912	valid_0's binary_logloss: 0.63898

0 features with zero importance after one-hot encoding.

16 features required for cumulative importance of 0.95 after one hot encoding.
4 features do not contribute to cumulative importance of 0.95.

5 total features out of 20 identified for removal after one-hot encoding.

['missing', 'single_unique', 'collinear', 'zero_importance', 'low_importance'] metho

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[36]	valid_0's auc: 0.695527	valid_0's binary_logloss: 0.638516
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[33]	valid_0's auc: 0.794372	valid_0's binary_logloss: 0.5907
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[124]	valid_0's auc: 0.753247	valid_0's binary_logloss: 0.57574
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[38]	valid_0's auc: 0.75469	valid_0's binary_logloss: 0.594414
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[6]	valid_0's auc: 0.694805	valid_0's binary_logloss: 0.660243
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[2]	valid_0's auc: 0.667749	valid_0's binary_logloss: 0.673981
Training until validation scores don't 

[29]	valid_0's auc: 0.65368	valid_0's binary_logloss: 0.652442
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[3]	valid_0's auc: 0.656926	valid_0's binary_logloss: 0.671627
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[15]	valid_0's auc: 0.639971	valid_0's binary_logloss: 0.650896
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[73]	valid_0's auc: 0.725108	valid_0's binary_logloss: 0.600024
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[71]	valid_0's auc: 0.697691	valid_0's binary_logloss: 0.606293
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[125]	valid_0's auc: 0.743867	valid_0's binary_logloss: 0.591873
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[27]	valid_0's auc: 0.683983	valid_

0 features with greater than 0.35 missing values.

0 features with a single unique value.

1 features with a correlation magnitude greater than 0.70.

Training Gradient Boosting Model

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[44]	valid_0's auc: 0.827561	valid_0's binary_logloss: 0.550099
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[2]	valid_0's auc: 0.686147	valid_0's binary_logloss: 0.670559
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[9]	valid_0's auc: 0.7886	valid_0's binary_logloss: 0.631839
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[2]	valid_0's auc: 0.589466	valid_0's binary_logloss: 0.681312
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[95]	valid_0's auc: 0.710678	valid_0's binary_logloss: 0.614788
Training until 

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[2]	valid_0's auc: 0.760101	valid_0's binary_logloss: 0.67013

0 features with zero importance after one-hot encoding.

9 features required for cumulative importance of 0.65 after one hot encoding.
11 features do not contribute to cumulative importance of 0.65.

14 total features out of 20 identified for removal after one-hot encoding.

['missing', 'single_unique', 'collinear', 'zero_importance', 'low_importance'] methods have been run

Removed 14 features.
0 features with greater than 0.40 missing values.

0 features with a single unique value.

4 features with a correlation magnitude greater than 0.35.

Training Gradient Boosting Model

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[21]	valid_0's auc: 0.629149	valid_0's binary_logloss: 0.654647
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[36]	valid_0's auc: 0.700577	valid_0's binary_logloss: 0.617654
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[45]	valid_0's auc: 0.727273	valid_0's binary_logloss: 0.600001
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[108]	valid_0's auc: 0.680375	valid_0's binary_logloss: 0.658089
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[42]	valid_0's auc: 0.65873	valid_0's binary_logloss: 0.656719

0 features with zero importance after one-hot encoding.

14 features required for cumulative importance of 0.85 after one hot encoding.
6 features do not contribute to cumulative importance of 0.85.

10 total features out of 20 identified for removal after one-hot encoding.

['missing', 'single_unique', 'collinear', 'zero_importance', 'low_importance'] 

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[194]	valid_0's auc: 0.805195	valid_0's binary_logloss: 0.543706
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[36]	valid_0's auc: 0.732323	valid_0's binary_logloss: 0.604468
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[48]	valid_0's auc: 0.701299	valid_0's binary_logloss: 0.623339
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[45]	valid_0's auc: 0.691919	valid_0's binary_logloss: 0.625919
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[245]	valid_0's auc: 0.742424	valid_0's binary_logloss: 0.600879
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[107]	valid_0's auc: 0.728716	valid_0's binary_logloss: 0.610007
Training until validation score

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[47]	valid_0's auc: 0.732323	valid_0's binary_logloss: 0.605914
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[9]	valid_0's auc: 0.702381	valid_0's binary_logloss: 0.648931
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[45]	valid_0's auc: 0.743867	valid_0's binary_logloss: 0.607448
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[24]	valid_0's auc: 0.708514	valid_0's binary_logloss: 0.624433
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[315]	valid_0's auc: 0.810245	valid_0's binary_logloss: 0.567042
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[104]	valid_0's auc: 0.704906	valid_0's binary_logloss: 0.640261
Training until validation scores 

0 features with greater than 0.40 missing values.

0 features with a single unique value.

3 features with a correlation magnitude greater than 0.45.

Training Gradient Boosting Model

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[42]	valid_0's auc: 0.772727	valid_0's binary_logloss: 0.586922
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[42]	valid_0's auc: 0.670274	valid_0's binary_logloss: 0.632684
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[24]	valid_0's auc: 0.645743	valid_0's binary_logloss: 0.65886
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[158]	valid_0's auc: 0.735931	valid_0's binary_logloss: 0.62001
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[61]	valid_0's auc: 0.660173	valid_0's binary_logloss: 0.658381
Training un

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[85]	valid_0's auc: 0.733766	valid_0's binary_logloss: 0.61481

0 features with zero importance after one-hot encoding.

15 features required for cumulative importance of 0.90 after one hot encoding.
5 features do not contribute to cumulative importance of 0.90.

8 total features out of 20 identified for removal after one-hot encoding.

['missing', 'single_unique', 'collinear', 'zero_importance', 'low_importance'] methods have been run

Removed 8 features.
0 features with greater than 0.40 missing values.

0 features with a single unique value.

3 features with a correlation magnitude greater than 0.45.

Training Gradient Boosting Model

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[22]	valid_0's auc: 0.721501	valid_0's binary_logloss: 0.635762
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[106]	valid_0's auc: 0.674603	valid_0's binary_logloss: 0.668951
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[62]	valid_0's auc: 0.733766	valid_0's binary_logloss: 0.606338
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[120]	valid_0's auc: 0.772727	valid_0's binary_logloss: 0.560758
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[57]	valid_0's auc: 0.761183	valid_0's binary_logloss: 0.588024

0 features with zero importance after one-hot encoding.

11 features required for cumulative importance of 0.75 after one hot encoding.
9 features do not contribute to cumulative importance of 0.75.

11 total features out of 20 identified for removal after one-hot encoding.

['missing', 'single_unique', 'collinear', 'zero_importance', 'low_importance'

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 0.656566	valid_0's binary_logloss: 0.6763
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[37]	valid_0's auc: 0.641414	valid_0's binary_logloss: 0.659942
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[94]	valid_0's auc: 0.723665	valid_0's binary_logloss: 0.628968
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[14]	valid_0's auc: 0.642136	valid_0's binary_logloss: 0.662394
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[168]	valid_0's auc: 0.807359	valid_0's binary_logloss: 0.540307
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[81]	valid_0's auc: 0.727273	valid_0's binary_logloss: 0.607325
Training until validation scores don

Early stopping, best iteration is:
[172]	valid_0's auc: 0.823232	valid_0's binary_logloss: 0.531989
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[49]	valid_0's auc: 0.656566	valid_0's binary_logloss: 0.652281
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[54]	valid_0's auc: 0.684704	valid_0's binary_logloss: 0.629227
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[91]	valid_0's auc: 0.695527	valid_0's binary_logloss: 0.640806
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[6]	valid_0's auc: 0.667749	valid_0's binary_logloss: 0.662759
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[88]	valid_0's auc: 0.799423	valid_0's binary_logloss: 0.558356
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:

0 features with greater than 0.40 missing values.

0 features with a single unique value.

1 features with a correlation magnitude greater than 0.60.

Training Gradient Boosting Model

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[208]	valid_0's auc: 0.748918	valid_0's binary_logloss: 0.611495
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[91]	valid_0's auc: 0.714286	valid_0's binary_logloss: 0.616054
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[141]	valid_0's auc: 0.746753	valid_0's binary_logloss: 0.598267
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[11]	valid_0's auc: 0.621212	valid_0's binary_logloss: 0.663036
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[43]	valid_0's auc: 0.739538	valid_0's binary_logloss: 0.605185
Training

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[86]	valid_0's auc: 0.878066	valid_0's binary_logloss: 0.475333

0 features with zero importance after one-hot encoding.

13 features required for cumulative importance of 0.80 after one hot encoding.
7 features do not contribute to cumulative importance of 0.80.

8 total features out of 20 identified for removal after one-hot encoding.

['missing', 'single_unique', 'collinear', 'zero_importance', 'low_importance'] methods have been run

Removed 8 features.
0 features with greater than 0.40 missing values.

0 features with a single unique value.

1 features with a correlation magnitude greater than 0.60.

Training Gradient Boosting Model

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[99]	valid_0's auc: 0.715007	valid_0's binary_logloss: 0.615241
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 0.676407	valid_0's binary_logloss: 0.6785
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[41]	valid_0's auc: 0.744589	valid_0's binary_logloss: 0.595516
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[91]	valid_0's auc: 0.712843	valid_0's binary_logloss: 0.619392
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[33]	valid_0's auc: 0.759019	valid_0's binary_logloss: 0.610051

0 features with zero importance after one-hot encoding.

9 features required for cumulative importance of 0.65 after one hot encoding.
11 features do not contribute to cumulative importance of 0.65.

11 total features out of 20 identified for removal after one-hot encoding.

['missing', 'single_unique', 'collinear', 'zero_importance', 'low_importance'] met

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[14]	valid_0's auc: 0.58658	valid_0's binary_logloss: 0.672363
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[9]	valid_0's auc: 0.662338	valid_0's binary_logloss: 0.661021
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[54]	valid_0's auc: 0.633478	valid_0's binary_logloss: 0.670373
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[56]	valid_0's auc: 0.696248	valid_0's binary_logloss: 0.630249
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[70]	valid_0's auc: 0.685426	valid_0's binary_logloss: 0.633986
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[64]	valid_0's auc: 0.650794	valid_0's binary_logloss: 0.66862
Training until validation scores don'

[133]	valid_0's auc: 0.737374	valid_0's binary_logloss: 0.607482
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[148]	valid_0's auc: 0.720779	valid_0's binary_logloss: 0.614443
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[172]	valid_0's auc: 0.708514	valid_0's binary_logloss: 0.664844
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[40]	valid_0's auc: 0.668831	valid_0's binary_logloss: 0.647164
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[23]	valid_0's auc: 0.660534	valid_0's binary_logloss: 0.645032
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[69]	valid_0's auc: 0.72583	valid_0's binary_logloss: 0.607971
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[17]	valid_0's auc: 0.623377	val

0 features with greater than 0.40 missing values.

0 features with a single unique value.

1 features with a correlation magnitude greater than 0.70.

Training Gradient Boosting Model

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[2]	valid_0's auc: 0.771284	valid_0's binary_logloss: 0.66727
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[152]	valid_0's auc: 0.785714	valid_0's binary_logloss: 0.554398
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[57]	valid_0's auc: 0.727273	valid_0's binary_logloss: 0.601438
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[198]	valid_0's auc: 0.782828	valid_0's binary_logloss: 0.596628
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[86]	valid_0's auc: 0.690476	valid_0's binary_logloss: 0.620568
Training u

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[60]	valid_0's auc: 0.720058	valid_0's binary_logloss: 0.624779

0 features with zero importance after one-hot encoding.

10 features required for cumulative importance of 0.70 after one hot encoding.
10 features do not contribute to cumulative importance of 0.70.

13 total features out of 20 identified for removal after one-hot encoding.

['missing', 'single_unique', 'collinear', 'zero_importance', 'low_importance'] methods have been run

Removed 13 features.
0 features with greater than 0.45 missing values.

0 features with a single unique value.

4 features with a correlation magnitude greater than 0.35.

Training Gradient Boosting Model

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[26]	valid_0's auc: 0.692641	valid_0's binary_logloss: 0.630461
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[5]	valid_0's auc: 0.743867	valid_0's binary_logloss: 0.651533
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[19]	valid_0's auc: 0.703102	valid_0's binary_logloss: 0.629809
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[16]	valid_0's auc: 0.668831	valid_0's binary_logloss: 0.651728
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[8]	valid_0's auc: 0.750722	valid_0's binary_logloss: 0.636156

0 features with zero importance after one-hot encoding.

15 features required for cumulative importance of 0.90 after one hot encoding.
5 features do not contribute to cumulative importance of 0.90.

8 total features out of 20 identified for removal after one-hot encoding.

['missing', 'single_unique', 'collinear', 'zero_importance', 'low_importance'] met

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[123]	valid_0's auc: 0.73088	valid_0's binary_logloss: 0.619957
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[135]	valid_0's auc: 0.792208	valid_0's binary_logloss: 0.547252
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[4]	valid_0's auc: 0.771645	valid_0's binary_logloss: 0.655949
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[34]	valid_0's auc: 0.710678	valid_0's binary_logloss: 0.619038
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[110]	valid_0's auc: 0.753247	valid_0's binary_logloss: 0.578684
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[30]	valid_0's auc: 0.713564	valid_0's binary_logloss: 0.628315
Training until validation scores 

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[28]	valid_0's auc: 0.67316	valid_0's binary_logloss: 0.637903
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[19]	valid_0's auc: 0.79798	valid_0's binary_logloss: 0.605409
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[47]	valid_0's auc: 0.758297	valid_0's binary_logloss: 0.594272
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[128]	valid_0's auc: 0.762626	valid_0's binary_logloss: 0.580907
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[179]	valid_0's auc: 0.834055	valid_0's binary_logloss: 0.502531
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[40]	valid_0's auc: 0.756854	valid_0's binary_logloss: 0.591627
Training until validation scores d

0 features with greater than 0.45 missing values.

0 features with a single unique value.

3 features with a correlation magnitude greater than 0.45.

Training Gradient Boosting Model

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[57]	valid_0's auc: 0.676768	valid_0's binary_logloss: 0.635855
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[48]	valid_0's auc: 0.722222	valid_0's binary_logloss: 0.60608
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[96]	valid_0's auc: 0.726551	valid_0's binary_logloss: 0.60181
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[182]	valid_0's auc: 0.727994	valid_0's binary_logloss: 0.615831
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 0.59127	valid_0's binary_logloss: 0.682053
Training unti

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[209]	valid_0's auc: 0.777778	valid_0's binary_logloss: 0.586793

0 features with zero importance after one-hot encoding.

17 features required for cumulative importance of 0.95 after one hot encoding.
3 features do not contribute to cumulative importance of 0.95.

6 total features out of 20 identified for removal after one-hot encoding.

['missing', 'single_unique', 'collinear', 'zero_importance', 'low_importance'] methods have been run

Removed 6 features.
0 features with greater than 0.45 missing values.

0 features with a single unique value.

3 features with a correlation magnitude greater than 0.50.

Training Gradient Boosting Model

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[3]	valid_0's auc: 0.73557	valid_0's binary_logloss: 0.660836
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[3]	valid_0's auc: 0.541486	valid_0's binary_logloss: 0.684282
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[11]	valid_0's auc: 0.623377	valid_0's binary_logloss: 0.660494
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[34]	valid_0's auc: 0.618326	valid_0's binary_logloss: 0.664868
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[93]	valid_0's auc: 0.778499	valid_0's binary_logloss: 0.553945

0 features with zero importance after one-hot encoding.

12 features required for cumulative importance of 0.80 after one hot encoding.
8 features do not contribute to cumulative importance of 0.80.

11 total features out of 20 identified for removal after one-hot encoding.

['missing', 'single_unique', 'collinear', 'zero_importance', 'low_importance'] m

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[22]	valid_0's auc: 0.790043	valid_0's binary_logloss: 0.611342
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[136]	valid_0's auc: 0.672439	valid_0's binary_logloss: 0.659946
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[41]	valid_0's auc: 0.74026	valid_0's binary_logloss: 0.611093
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[89]	valid_0's auc: 0.702742	valid_0's binary_logloss: 0.622126
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[105]	valid_0's auc: 0.685426	valid_0's binary_logloss: 0.629464
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[39]	valid_0's auc: 0.70202	valid_0's binary_logloss: 0.621585
Training until validation scores d

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[21]	valid_0's auc: 0.665224	valid_0's binary_logloss: 0.654258
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[94]	valid_0's auc: 0.689755	valid_0's binary_logloss: 0.632502
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[4]	valid_0's auc: 0.700577	valid_0's binary_logloss: 0.661527
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[43]	valid_0's auc: 0.727994	valid_0's binary_logloss: 0.611497
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[96]	valid_0's auc: 0.722944	valid_0's binary_logloss: 0.618136
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[86]	valid_0's auc: 0.76912	valid_0's binary_logloss: 0.567585
Training until validation scores don

0 features with greater than 0.45 missing values.

0 features with a single unique value.

1 features with a correlation magnitude greater than 0.60.

Training Gradient Boosting Model

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[2]	valid_0's auc: 0.674603	valid_0's binary_logloss: 0.676834
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[14]	valid_0's auc: 0.655483	valid_0's binary_logloss: 0.649571
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[22]	valid_0's auc: 0.74531	valid_0's binary_logloss: 0.596984
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[123]	valid_0's auc: 0.731602	valid_0's binary_logloss: 0.611538
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[76]	valid_0's auc: 0.736652	valid_0's binary_logloss: 0.607084
Training un

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[186]	valid_0's auc: 0.793651	valid_0's binary_logloss: 0.56

0 features with zero importance after one-hot encoding.

14 features required for cumulative importance of 0.85 after one hot encoding.
6 features do not contribute to cumulative importance of 0.85.

7 total features out of 20 identified for removal after one-hot encoding.

['missing', 'single_unique', 'collinear', 'zero_importance', 'low_importance'] methods have been run

Removed 7 features.
0 features with greater than 0.45 missing values.

0 features with a single unique value.

1 features with a correlation magnitude greater than 0.60.

Training Gradient Boosting Model

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[160]	valid_0's auc: 0.751804	valid_0's binary_logloss: 0.605608
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:


Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 0.515873	valid_0's binary_logloss: 0.685652
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[111]	valid_0's auc: 0.735209	valid_0's binary_logloss: 0.620855
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[35]	valid_0's auc: 0.704906	valid_0's binary_logloss: 0.618142
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[25]	valid_0's auc: 0.697691	valid_0's binary_logloss: 0.633292

0 features with zero importance after one-hot encoding.

10 features required for cumulative importance of 0.70 after one hot encoding.
10 features do not contribute to cumulative importance of 0.70.

11 total features out of 20 identified for removal after one-hot encoding.

['missing', 'single_unique', 'collinear', 'zero_importance', 'low_importance']

[123]	valid_0's auc: 0.720779	valid_0's binary_logloss: 0.615895
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[12]	valid_0's auc: 0.580808	valid_0's binary_logloss: 0.679801
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 0.69228	valid_0's binary_logloss: 0.676813
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[19]	valid_0's auc: 0.701299	valid_0's binary_logloss: 0.631326
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[9]	valid_0's auc: 0.606782	valid_0's binary_logloss: 0.676429
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[30]	valid_0's auc: 0.734488	valid_0's binary_logloss: 0.614024
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[16]	valid_0's auc: 0.750361	valid_0

Early stopping, best iteration is:
[124]	valid_0's auc: 0.755411	valid_0's binary_logloss: 0.581206
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[71]	valid_0's auc: 0.726551	valid_0's binary_logloss: 0.621037
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[142]	valid_0's auc: 0.771284	valid_0's binary_logloss: 0.569505
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[27]	valid_0's auc: 0.694805	valid_0's binary_logloss: 0.633353
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[3]	valid_0's auc: 0.695887	valid_0's binary_logloss: 0.669964
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[14]	valid_0's auc: 0.784993	valid_0's binary_logloss: 0.618134
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is

0 features with greater than 0.45 missing values.

0 features with a single unique value.

1 features with a correlation magnitude greater than 0.70.

Training Gradient Boosting Model

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[10]	valid_0's auc: 0.576479	valid_0's binary_logloss: 0.675798
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[37]	valid_0's auc: 0.790043	valid_0's binary_logloss: 0.583154
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[42]	valid_0's auc: 0.684704	valid_0's binary_logloss: 0.628785
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[82]	valid_0's auc: 0.707792	valid_0's binary_logloss: 0.623362
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[218]	valid_0's auc: 0.781385	valid_0's binary_logloss: 0.570993
Training 

In [111]:
#"0.7356330014224751{'C': 3.1622776601683795, 'gamma': 0.1}"
print(score)
print(parameters_values)
#[0.39999999999999997, 0.7, 0.7500000000000001]

0.738122332859175
[0.39999999999999997, 0.7, 0.7500000000000001]


In [14]:
fs = FeatureSelector(data = data_full, labels = data_class)
fs.identify_all(selection_params = {'missing_threshold': 0.39999999999999997,    
                                    'correlation_threshold': 0.7, 
                                    'task': 'classification',    
                                    'eval_metric': 'auc', 
                                    'cumulative_importance': 0.7500000000000001})


train_removed_all_once = fs.remove(methods = 'all', keep_one_hot = True)
print(train_removed_all_once)           
data_features= train_removed_all_once.to_numpy()
          
imputer = KNNImputer(n_neighbors=2, weights="uniform")
data_features = imputer.fit_transform(data_features)
data_feat = pd.DataFrame(data_features, index=range(data_features.shape[0]),
                                      columns=range(data_features.shape[1])) 
#Lets first normalize the features for K-NN and SVM
scaler = StandardScaler()
scaler.fit(data_feat)
data_feat = scaler.transform(data_feat)

#Splitting of data to see model accuracy after cross validation and gridsearch
data_feat_train, data_feat_test, data_class_train, data_class_test = train_test_split(data_feat,data_class,test_size=0.25,stratify=data_class,random_state=1234)
            
clf_svm = svm.SVC()
param_grid = {'C': np.logspace(-1, 3, 9),  
              'gamma': np.logspace(-7, -0, 8)}

svm_gridsearch = GridSearchCV(clf_svm,param_grid,n_jobs=-1, cv = 10)
svm_gridsearch.fit(data_feat_train,data_class_train)
print("Best parameters: " + str(svm_gridsearch.best_params_))
print("Best score : " + str(svm_gridsearch.best_score_))
#Now lets used cross validation in the whole data set, but with the best parameters by gridsearch
svm_model = svm.SVC(C = svm_gridsearch.best_params_['C'],gamma=svm_gridsearch.best_params_['gamma'])

#Now lets used cross validation in the whole data set, but with the best parameters by gridsearch
score_svm = cross_val_score(svm_model,data_feat_train,data_class_train,cv=10,n_jobs=-1)
print('Average accuracy:', np.mean(score_svm))

#Now lets compute the confussion matrix by splitting the data into trainning and testing
svm_model.fit(data_feat_train,data_class_train)
svm_pred = svm_model.predict(data_feat_test)
print(confusion_matrix(data_class_test, svm_pred))
print(classification_report(data_class_test, svm_pred))

0 features with greater than 0.40 missing values.

0 features with a single unique value.

1 features with a correlation magnitude greater than 0.70.

Training Gradient Boosting Model

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[14]	valid_0's auc: 0.70202	valid_0's binary_logloss: 0.636678
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[40]	valid_0's auc: 0.722944	valid_0's binary_logloss: 0.617109
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[119]	valid_0's auc: 0.723665	valid_0's binary_logloss: 0.614158
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 0.345599	valid_0's binary_logloss: 0.692999
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[222]	valid_0's auc: 0.779221	valid_0's binary_logloss: 0.564797
Training u

In [8]:
knn_gridcv = KNeighborsClassifier()
#create a dictionary with the number of neighbors to try
param_gridsearch = {'n_neighbors': np.arange(1,80)}

knn_gridsearch = GridSearchCV(knn_gridcv,param_gridsearch,cv=10)
knn_gridsearch.fit(data_feat_train,data_class_train)
print("Best parameters: " + str(knn_gridsearch.best_params_))
print("Best score: "+ str(knn_gridsearch.best_score_))

Best parameters: {'n_neighbors': 14}
Best score: 0.6561877667140825


In [9]:
#Now lets used cross validation in the whole data set, but with the best parameters by gridsearch
knn_model = KNeighborsClassifier(n_neighbors = knn_gridsearch.best_params_['n_neighbors'])

#Now lets used cross validation in the whole data set, but with the best parameters by gridsearch
score_knn = cross_val_score(knn_model,data_feat_train,data_class_train,cv=10,n_jobs=-1)
print('Average accuracy:', np.mean(score_knn))

#Now lets compute the confussion matrix by splitting the data into trainning and testing
knn_model.fit(data_feat_train,data_class_train)
knn_pred = knn_model.predict(data_feat_test)
print(confusion_matrix(data_class_test, knn_pred))
print(classification_report(data_class_test, knn_pred))

Average accuracy: 0.6561877667140825
[[61  9]
 [33 22]]
              precision    recall  f1-score   support

       False       0.65      0.87      0.74        70
        True       0.71      0.40      0.51        55

    accuracy                           0.66       125
   macro avg       0.68      0.64      0.63       125
weighted avg       0.68      0.66      0.64       125



In [10]:
#Decision tree using grid search
clf_tree = tree.DecisionTreeClassifier(criterion = 'entropy')
param_grid = {'max_depth': np.arange(4,21),'min_samples_split': np.arange(4,21),'min_samples_leaf': np.arange(4,21)}
tree_gridcv = GridSearchCV(clf_tree,param_grid,cv=10 ,n_jobs=-1)
tree_gridcv.fit(data_feat_train,data_class_train)

print("Best parameters: " + str(tree_gridcv.best_params_))
print("Best score: " + str(tree_gridcv.best_score_))

Best parameters: {'max_depth': 6, 'min_samples_leaf': 6, 'min_samples_split': 18}
Best score: 0.6104551920341394


In [11]:
#Now with these parameters, lets perform cross validation
clf_tree_prunned = tree.DecisionTreeClassifier(criterion = 'entropy',
                                               max_depth= tree_gridcv.best_params_['max_depth'],
                                               min_samples_leaf= tree_gridcv.best_params_['min_samples_leaf'],
                                               min_samples_split=tree_gridcv.best_params_['min_samples_split'] )

#Now lets used cross validation in the whole data set, but with the best parameters by gridsearch
score_tree = cross_val_score(clf_tree_prunned,data_feat_train,data_class_train,cv=10,n_jobs=-1)
print('Average accuracy:', np.mean(score_tree))

#Now lets compute the confussion matrix by splitting the data into trainning and testing
clf_tree_prunned.fit(data_feat_train,data_class_train)
tree_pred = clf_tree_prunned.predict(data_feat_test)
print(confusion_matrix(data_class_test, tree_pred))
print(classification_report(data_class_test, tree_pred))

Average accuracy: 0.6104551920341394
[[36 34]
 [34 21]]
              precision    recall  f1-score   support

       False       0.51      0.51      0.51        70
        True       0.38      0.38      0.38        55

    accuracy                           0.46       125
   macro avg       0.45      0.45      0.45       125
weighted avg       0.46      0.46      0.46       125



XGBoost