In [18]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import OrdinalEncoder
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,confusion_matrix, precision_score, recall_score,f1_score
from sklearn.tree import DecisionTreeClassifier  
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import svm
from sklearn.model_selection import RepeatedStratifiedKFold, GridSearchCV
import os
import shap

In [19]:
svm_model = svm.SVC(random_state=0,gamma='auto')
rf_model=RandomForestClassifier(random_state=0)
dt_model=DecisionTreeClassifier(random_state=0,criterion='entropy',max_depth = 7,min_samples_leaf=30) 
lr_model= LogisticRegression(random_state=0, max_iter=300)
mlp_model =MLPClassifier(random_state=0,activation='relu', solver='sgd',learning_rate='adaptive')
xgb_model = XGBClassifier(random_state=0 ,learning_rate=0.05, max_depth=7,eval_metric='mlogloss',use_label_encoder =False)
gmb_model= GradientBoostingClassifier(random_state=0,n_estimators=20,learning_rate=0.75,max_features=4,max_depth=5)

model_params = {
    'svm': {
        'model': svm_model,
        'params' : {
            'C': [10],
            'kernel': ['rbf']
        }  
     },
    'rf': {
        'model': rf_model,
        'params' : {
            'n_estimators': [5]
        }
    },
     'dt': {
        'model': dt_model,
        'params' : {}
    },
    'lr' : {
        'model':lr_model,
        'params': {
            'C': [5]
        }
    },
    'mlp' : {
        'model':mlp_model,
        'params': {}
    },
    'xg_boost' : {
        'model':xgb_model,
        'params': {}
    },
    'gbm' : {
        'model':gmb_model,
        'params': {}
    }
}

# Dataset 1

In [20]:
#Read data
proccessed_data_path =os.path.join(os.path.pardir,'data','processed')
x_train_path = os.path.join(proccessed_data_path,'1-train-x.csv')
x_test_path = os.path.join(proccessed_data_path,'1-test-x.csv')
y_train_path = os.path.join(proccessed_data_path,'1-train-y.csv')
y_test_path = os.path.join(proccessed_data_path,'1-test-y.csv')

dfx = pd.read_csv(x_train_path)
dfxt = pd.read_csv(x_test_path)
dfy = pd.read_csv(y_train_path)
dfyt = pd.read_csv(y_test_path)

x_train = dfx.drop(columns=['Unnamed: 0'],axis = 'columns')
x_test = dfxt.drop(columns=['Unnamed: 0'],axis = 'columns')
y_train = dfy.drop(columns=['Unnamed: 0'],axis = 'columns').values.flatten()
y_test = dfyt.drop(columns=['Unnamed: 0'],axis = 'columns').values.flatten()

scores = []
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=0)
for model_name, mp in model_params.items():
    clf =  GridSearchCV(mp['model'], mp['params'], cv=cv, return_train_score=False)
    clf.fit(x_train,y_train)
    conf_matrix =confusion_matrix(y_test,clf.predict(x_test))
    scores.append({
        'model': model_name,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_,
        'precision':precision_score(y_test,clf.predict(x_test)),
        'recall':recall_score(y_test,clf.predict(x_test)),
        'f1_score':f1_score(y_test,clf.predict(x_test)),
        'true positives':conf_matrix[0][0],
        'true negatives':conf_matrix[1][1],
        'false postives':conf_matrix[0][1],
        'false negatives':conf_matrix[1][0]
    })
    
df = pd.DataFrame(scores,columns=['model','best_score','precision','recall','f1_score','true positives','true negatives','false postives','false negatives','best_params'])
print(df)

Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
Stochastic Optimizer

      model  best_score  precision    recall  f1_score  true positives  \
0       svm    0.904847   0.631016  0.648352  0.639566            1024   
1        rf    0.914159   0.584541  0.664835  0.622108            1007   
2        dt    0.857500   0.709184  0.763736  0.735450            1036   
3        lr    0.801487   0.361404  0.565934  0.441113             911   
4       mlp    0.860953   0.523438  0.736264  0.611872             971   
5  xg_boost    0.930834   0.770053  0.791209  0.780488            1050   
6       gbm    0.910512   0.610837  0.681319  0.644156            1014   

   true negatives  false postives  false negatives                 best_params  
0             118              69               64  {'C': 10, 'kernel': 'rbf'}  
1             121              86               61         {'n_estimators': 5}  
2             139              57               43                          {}  
3             103             182               79                    {'C': 5}  
4 

# Dataset 2

In [21]:
#Read data
proccessed_data_path =os.path.join(os.path.pardir,'data','processed')
x_train_path = os.path.join(proccessed_data_path,'2-train-x.csv')
x_test_path = os.path.join(proccessed_data_path,'2-test-x.csv')
y_train_path = os.path.join(proccessed_data_path,'2-train-y.csv')
y_test_path = os.path.join(proccessed_data_path,'2-test-y.csv')

dfx = pd.read_csv(x_train_path)
dfxt = pd.read_csv(x_test_path)
dfy = pd.read_csv(y_train_path)
dfyt = pd.read_csv(y_test_path)

x_train = dfx.drop(columns=['Unnamed: 0'],axis = 'columns')
x_test = dfxt.drop(columns=['Unnamed: 0'],axis = 'columns')
y_train = dfy.drop(columns=['Unnamed: 0'],axis = 'columns').values.flatten()
y_test = dfyt.drop(columns=['Unnamed: 0'],axis = 'columns').values.flatten()

scores = []
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=0)
for model_name, mp in model_params.items():
    clf =  GridSearchCV(mp['model'], mp['params'], cv=cv, return_train_score=False)
    clf.fit(x_train,y_train)
    conf_matrix =confusion_matrix(y_test,clf.predict(x_test))
    scores.append({
        'model': model_name,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_,
        'precision':precision_score(y_test,clf.predict(x_test)),
        'recall':recall_score(y_test,clf.predict(x_test)),
        'f1_score':f1_score(y_test,clf.predict(x_test)),
        'true positives':conf_matrix[0][0],
        'true negatives':conf_matrix[1][1],
        'false postives':conf_matrix[0][1],
        'false negatives':conf_matrix[1][0]
    })
    
df = pd.DataFrame(scores,columns=['model','best_score','precision','recall','f1_score','true positives','true negatives','false postives','false negatives','best_params'])
print(df)

Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
Stochastic Optimizer

      model  best_score  precision    recall  f1_score  true positives  \
0       svm    0.903385   0.628415  0.628415  0.628415            1024   
1        rf    0.906055   0.576923  0.737705  0.647482             993   
2        dt    0.825000   0.602410  0.819672  0.694444             993   
3        lr    0.794596   0.372240  0.644809  0.472000             893   
4       mlp    0.858724   0.556000  0.759563  0.642032             981   
5  xg_boost    0.924805   0.777778  0.803279  0.790323            1050   
6       gbm    0.900716   0.623810  0.715847  0.666667            1013   

   true negatives  false postives  false negatives                 best_params  
0             115              68               68  {'C': 10, 'kernel': 'rbf'}  
1             135              99               48         {'n_estimators': 5}  
2             150              99               33                          {}  
3             118             199               65                    {'C': 5}  
4 

# Dataset 3

In [22]:
#Read data
proccessed_data_path =os.path.join(os.path.pardir,'data','processed')
x_train_path = os.path.join(proccessed_data_path,'3-train-x.csv')
x_test_path = os.path.join(proccessed_data_path,'3-test-x.csv')
y_train_path = os.path.join(proccessed_data_path,'3-train-y.csv')
y_test_path = os.path.join(proccessed_data_path,'3-test-y.csv')

dfx = pd.read_csv(x_train_path)
dfxt = pd.read_csv(x_test_path)
dfy = pd.read_csv(y_train_path)
dfyt = pd.read_csv(y_test_path)

x_train = dfx.drop(columns=['Unnamed: 0'],axis = 'columns')
x_test = dfxt.drop(columns=['Unnamed: 0'],axis = 'columns')
y_train = dfy.drop(columns=['Unnamed: 0'],axis = 'columns').values.flatten()
y_test = dfyt.drop(columns=['Unnamed: 0'],axis = 'columns').values.flatten()

scores = []
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=0)
for model_name, mp in model_params.items():
    clf =  GridSearchCV(mp['model'], mp['params'], cv=cv, return_train_score=False)
    clf.fit(x_train,y_train)
    conf_matrix =confusion_matrix(y_test,clf.predict(x_test))
    scores.append({
        'model': model_name,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_,
        'precision':precision_score(y_test,clf.predict(x_test)),
        'recall':recall_score(y_test,clf.predict(x_test)),
        'f1_score':f1_score(y_test,clf.predict(x_test)),
        'true positives':conf_matrix[0][0],
        'true negatives':conf_matrix[1][1],
        'false postives':conf_matrix[0][1],
        'false negatives':conf_matrix[1][0]
    })
    
df = pd.DataFrame(scores,columns=['model','best_score','precision','recall','f1_score','true positives','true negatives','false postives','false negatives','best_params'])
print(df)

Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
Stochastic Optimizer

      model  best_score  precision    recall  f1_score  true positives  \
0       svm    0.911662   0.631868  0.614973  0.623306            1021   
1        rf    0.915368   0.614679  0.716578  0.661728            1004   
2        dt    0.864404   0.606695  0.775401  0.680751             994   
3        lr    0.796545   0.358306  0.588235  0.445344             891   
4       mlp    0.867395   0.546559  0.721925  0.622120             976   
5  xg_boost    0.935258   0.813953  0.748663  0.779944            1056   
6       gbm    0.912768   0.668317  0.721925  0.694087            1021   

   true negatives  false postives  false negatives                 best_params  
0             115              67               72  {'C': 10, 'kernel': 'rbf'}  
1             134              84               53         {'n_estimators': 5}  
2             145              94               42                          {}  
3             110             197               77                    {'C': 5}  
4 

# Dataset 4

In [23]:
#Read data
proccessed_data_path =os.path.join(os.path.pardir,'data','processed')
x_train_path = os.path.join(proccessed_data_path,'4-train-x.csv')
x_test_path = os.path.join(proccessed_data_path,'4-test-x.csv')
y_train_path = os.path.join(proccessed_data_path,'4-train-y.csv')
y_test_path = os.path.join(proccessed_data_path,'4-test-y.csv')

dfx = pd.read_csv(x_train_path)
dfxt = pd.read_csv(x_test_path)
dfy = pd.read_csv(y_train_path)
dfyt = pd.read_csv(y_test_path)

x_train = dfx.drop(columns=['Unnamed: 0'],axis = 'columns')
x_test = dfxt.drop(columns=['Unnamed: 0'],axis = 'columns')
y_train = dfy.drop(columns=['Unnamed: 0'],axis = 'columns').values.flatten()
y_test = dfyt.drop(columns=['Unnamed: 0'],axis = 'columns').values.flatten()

scores = []
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=0)
for model_name, mp in model_params.items():
    clf =  GridSearchCV(mp['model'], mp['params'], cv=cv, return_train_score=False)
    clf.fit(x_train,y_train)
    conf_matrix =confusion_matrix(y_test,clf.predict(x_test))
    scores.append({
        'model': model_name,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_,
        'precision':precision_score(y_test,clf.predict(x_test)),
        'recall':recall_score(y_test,clf.predict(x_test)),
        'f1_score':f1_score(y_test,clf.predict(x_test)),
        'true positives':conf_matrix[0][0],
        'true negatives':conf_matrix[1][1],
        'false postives':conf_matrix[0][1],
        'false negatives':conf_matrix[1][0]
    })
    
df = pd.DataFrame(scores,columns=['model','best_score','precision','recall','f1_score','true positives','true negatives','false postives','false negatives','best_params'])
print(df)

Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
Stochastic Optimizer

      model  best_score  precision    recall  f1_score  true positives  \
0       svm    0.910029   0.594444  0.622093  0.607955            1030   
1        rf    0.908527   0.604762  0.738372  0.664921            1020   
2        dt    0.858574   0.646512  0.808140  0.718346            1027   
3        lr    0.798284   0.320872  0.598837  0.417850             885   
4       mlp    0.869620   0.508197  0.720930  0.596154             983   
5  xg_boost    0.925004   0.823171  0.784884  0.803571            1074   
6       gbm    0.904014   0.587379  0.703488  0.640212            1018   

   true negatives  false postives  false negatives                 best_params  
0             107              73               65  {'C': 10, 'kernel': 'rbf'}  
1             127              83               45         {'n_estimators': 5}  
2             139              76               33                          {}  
3             103             218               69                    {'C': 5}  
4 

# Dataset 5

In [24]:
#Read data
proccessed_data_path =os.path.join(os.path.pardir,'data','processed')
x_train_path = os.path.join(proccessed_data_path,'5-train-x.csv')
x_test_path = os.path.join(proccessed_data_path,'5-test-x.csv')
y_train_path = os.path.join(proccessed_data_path,'5-train-y.csv')
y_test_path = os.path.join(proccessed_data_path,'5-test-y.csv')

dfx = pd.read_csv(x_train_path)
dfxt = pd.read_csv(x_test_path)
dfy = pd.read_csv(y_train_path)
dfyt = pd.read_csv(y_test_path)

x_train = dfx.drop(columns=['Unnamed: 0'],axis = 'columns')
x_test = dfxt.drop(columns=['Unnamed: 0'],axis = 'columns')
y_train = dfy.drop(columns=['Unnamed: 0'],axis = 'columns').values.flatten()
y_test = dfyt.drop(columns=['Unnamed: 0'],axis = 'columns').values.flatten()

scores = []
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=0)
for model_name, mp in model_params.items():
    clf =  GridSearchCV(mp['model'], mp['params'], cv=cv, return_train_score=False)
    clf.fit(x_train,y_train)
    conf_matrix =confusion_matrix(y_test,clf.predict(x_test))
    scores.append({
        'model': model_name,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_,
        'precision':precision_score(y_test,clf.predict(x_test)),
        'recall':recall_score(y_test,clf.predict(x_test)),
        'f1_score':f1_score(y_test,clf.predict(x_test)),
        'true positives':conf_matrix[0][0],
        'true negatives':conf_matrix[1][1],
        'false postives':conf_matrix[0][1],
        'false negatives':conf_matrix[1][0]
    })
    
df = pd.DataFrame(scores,columns=['model','best_score','precision','recall','f1_score','true positives','true negatives','false postives','false negatives','best_params'])
print(df)

Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
Stochastic Optimizer

      model  best_score  precision    recall  f1_score  true positives  \
0       svm    0.904174   0.591837  0.610526  0.601036            1005   
1        rf    0.908774   0.560669  0.705263  0.624709             980   
2        dt    0.843658   0.663551  0.747368  0.702970            1013   
3        lr    0.784185   0.345679  0.589474  0.435798             873   
4       mlp    0.859045   0.553785  0.731579  0.630385             973   
5  xg_boost    0.918064   0.786458  0.794737  0.790576            1044   
6       gbm    0.904039   0.612069  0.747368  0.672986             995   

   true negatives  false postives  false negatives                 best_params  
0             116              80               74  {'C': 10, 'kernel': 'rbf'}  
1             134             105               56         {'n_estimators': 5}  
2             142              72               48                          {}  
3             112             212               78                    {'C': 5}  
4 

# Dataset 6

In [25]:
#Read data
proccessed_data_path =os.path.join(os.path.pardir,'data','processed')
x_train_path = os.path.join(proccessed_data_path,'6-train-x.csv')
x_test_path = os.path.join(proccessed_data_path,'6-test-x.csv')
y_train_path = os.path.join(proccessed_data_path,'6-train-y.csv')
y_test_path = os.path.join(proccessed_data_path,'6-test-y.csv')

dfx = pd.read_csv(x_train_path)
dfxt = pd.read_csv(x_test_path)
dfy = pd.read_csv(y_train_path)
dfyt = pd.read_csv(y_test_path)

x_train = dfx.drop(columns=['Unnamed: 0'],axis = 'columns')
x_test = dfxt.drop(columns=['Unnamed: 0'],axis = 'columns')
y_train = dfy.drop(columns=['Unnamed: 0'],axis = 'columns').values.flatten()
y_test = dfyt.drop(columns=['Unnamed: 0'],axis = 'columns').values.flatten()

scores = []
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=0)
for model_name, mp in model_params.items():
    clf =  GridSearchCV(mp['model'], mp['params'], cv=cv, return_train_score=False)
    clf.fit(x_train,y_train)
    conf_matrix =confusion_matrix(y_test,clf.predict(x_test))
    scores.append({
        'model': model_name,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_,
        'precision':precision_score(y_test,clf.predict(x_test)),
        'recall':recall_score(y_test,clf.predict(x_test)),
        'f1_score':f1_score(y_test,clf.predict(x_test)),
        'true positives':conf_matrix[0][0],
        'true negatives':conf_matrix[1][1],
        'false postives':conf_matrix[0][1],
        'false negatives':conf_matrix[1][0]
    })
    
df = pd.DataFrame(scores,columns=['model','best_score','precision','recall','f1_score','true positives','true negatives','false postives','false negatives','best_params'])
print(df)

Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
Stochastic Optimizer

      model  best_score  precision    recall  f1_score  true positives  \
0       svm    0.905404   0.566372  0.699454  0.625917             994   
1        rf    0.907682   0.545833  0.715847  0.619385             983   
2        dt    0.848503   0.713542  0.748634  0.730667            1037   
3        lr    0.807422   0.327103  0.573770  0.416667             876   
4       mlp    0.857747   0.507634  0.726776  0.597753             963   
5  xg_boost    0.924740   0.746269  0.819672  0.781250            1041   
6       gbm    0.902344   0.594340  0.688525  0.637975            1006   

   true negatives  false postives  false negatives                 best_params  
0             128              98               55  {'C': 10, 'kernel': 'rbf'}  
1             131             109               52         {'n_estimators': 5}  
2             137              55               46                          {}  
3             105             216               78                    {'C': 5}  
4 

# Dataset 7

In [26]:
#Read data
proccessed_data_path =os.path.join(os.path.pardir,'data','processed')
x_train_path = os.path.join(proccessed_data_path,'7-train-x.csv')
x_test_path = os.path.join(proccessed_data_path,'7-test-x.csv')
y_train_path = os.path.join(proccessed_data_path,'7-train-y.csv')
y_test_path = os.path.join(proccessed_data_path,'7-test-y.csv')

dfx = pd.read_csv(x_train_path)
dfxt = pd.read_csv(x_test_path)
dfy = pd.read_csv(y_train_path)
dfyt = pd.read_csv(y_test_path)

x_train = dfx.drop(columns=['Unnamed: 0'],axis = 'columns')
x_test = dfxt.drop(columns=['Unnamed: 0'],axis = 'columns')
y_train = dfy.drop(columns=['Unnamed: 0'],axis = 'columns').values.flatten()
y_test = dfyt.drop(columns=['Unnamed: 0'],axis = 'columns').values.flatten()

scores = []
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=0)
for model_name, mp in model_params.items():
    clf =  GridSearchCV(mp['model'], mp['params'], cv=cv, return_train_score=False)
    clf.fit(x_train,y_train)
    conf_matrix =confusion_matrix(y_test,clf.predict(x_test))
    scores.append({
        'model': model_name,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_,
        'precision':precision_score(y_test,clf.predict(x_test)),
        'recall':recall_score(y_test,clf.predict(x_test)),
        'f1_score':f1_score(y_test,clf.predict(x_test)),
        'true positives':conf_matrix[0][0],
        'true negatives':conf_matrix[1][1],
        'false postives':conf_matrix[0][1],
        'false negatives':conf_matrix[1][0]
    })
    
df = pd.DataFrame(scores,columns=['model','best_score','precision','recall','f1_score','true positives','true negatives','false postives','false negatives','best_params'])
print(df)

Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
Stochastic Optimizer

      model  best_score  precision    recall  f1_score  true positives  \
0       svm    0.903822   0.555000  0.620112  0.585752            1007   
1        rf    0.906625   0.633663  0.715084  0.671916            1022   
2        dt    0.832291   0.758824  0.720670  0.739255            1055   
3        lr    0.800079   0.366771  0.653631  0.469880             894   
4       mlp    0.857262   0.541667  0.726257  0.620525             986   
5  xg_boost    0.924493   0.859551  0.854749  0.857143            1071   
6       gbm    0.906628   0.601810  0.743017  0.665000            1008   

   true negatives  false postives  false negatives                 best_params  
0             111              89               68  {'C': 10, 'kernel': 'rbf'}  
1             128              74               51         {'n_estimators': 5}  
2             129              41               50                          {}  
3             117             202               62                    {'C': 5}  
4 

# Dataset 8

In [27]:
#Read data
proccessed_data_path =os.path.join(os.path.pardir,'data','processed')
x_train_path = os.path.join(proccessed_data_path,'8-train-x.csv')
x_test_path = os.path.join(proccessed_data_path,'8-test-x.csv')
y_train_path = os.path.join(proccessed_data_path,'8-train-y.csv')
y_test_path = os.path.join(proccessed_data_path,'8-test-y.csv')

dfx = pd.read_csv(x_train_path)
dfxt = pd.read_csv(x_test_path)
dfy = pd.read_csv(y_train_path)
dfyt = pd.read_csv(y_test_path)

x_train = dfx.drop(columns=['Unnamed: 0'],axis = 'columns')
x_test = dfxt.drop(columns=['Unnamed: 0'],axis = 'columns')
y_train = dfy.drop(columns=['Unnamed: 0'],axis = 'columns').values.flatten()
y_test = dfyt.drop(columns=['Unnamed: 0'],axis = 'columns').values.flatten()

scores = []
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=0)
for model_name, mp in model_params.items():
    clf =  GridSearchCV(mp['model'], mp['params'], cv=cv, return_train_score=False)
    clf.fit(x_train,y_train)
    conf_matrix =confusion_matrix(y_test,clf.predict(x_test))
    scores.append({
        'model': model_name,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_,
        'precision':precision_score(y_test,clf.predict(x_test)),
        'recall':recall_score(y_test,clf.predict(x_test)),
        'f1_score':f1_score(y_test,clf.predict(x_test)),
        'true positives':conf_matrix[0][0],
        'true negatives':conf_matrix[1][1],
        'false postives':conf_matrix[0][1],
        'false negatives':conf_matrix[1][0]
    })
    
df = pd.DataFrame(scores,columns=['model','best_score','precision','recall','f1_score','true positives','true negatives','false postives','false negatives','best_params'])
print(df)

Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
Stochastic Optimizer

      model  best_score  precision    recall  f1_score  true positives  \
0       svm    0.920929   0.633721  0.592391  0.612360            1028   
1        rf    0.917220   0.562212  0.663043  0.608479             996   
2        dt    0.855719   0.626050  0.809783  0.706161            1002   
3        lr    0.803659   0.376543  0.663043  0.480315             889   
4       mlp    0.877977   0.535565  0.695652  0.605201             980   
5  xg_boost    0.933619   0.811429  0.771739  0.791086            1058   
6       gbm    0.916766   0.603015  0.652174  0.626632            1012   

   true negatives  false postives  false negatives                 best_params  
0             109              63               75  {'C': 10, 'kernel': 'rbf'}  
1             122              95               62         {'n_estimators': 5}  
2             149              89               35                          {}  
3             122             202               62                    {'C': 5}  
4 

# Dataset 9

In [28]:
#Read data
proccessed_data_path =os.path.join(os.path.pardir,'data','processed')
x_train_path = os.path.join(proccessed_data_path,'9-train-x.csv')
x_test_path = os.path.join(proccessed_data_path,'9-test-x.csv')
y_train_path = os.path.join(proccessed_data_path,'9-train-y.csv')
y_test_path = os.path.join(proccessed_data_path,'9-test-y.csv')

dfx = pd.read_csv(x_train_path)
dfxt = pd.read_csv(x_test_path)
dfy = pd.read_csv(y_train_path)
dfyt = pd.read_csv(y_test_path)

x_train = dfx.drop(columns=['Unnamed: 0'],axis = 'columns')
x_test = dfxt.drop(columns=['Unnamed: 0'],axis = 'columns')
y_train = dfy.drop(columns=['Unnamed: 0'],axis = 'columns').values.flatten()
y_test = dfyt.drop(columns=['Unnamed: 0'],axis = 'columns').values.flatten()

scores = []
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=0)
for model_name, mp in model_params.items():
    clf =  GridSearchCV(mp['model'], mp['params'], cv=cv, return_train_score=False)
    clf.fit(x_train,y_train)
    conf_matrix =confusion_matrix(y_test,clf.predict(x_test))
    scores.append({
        'model': model_name,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_,
        'precision':precision_score(y_test,clf.predict(x_test)),
        'recall':recall_score(y_test,clf.predict(x_test)),
        'f1_score':f1_score(y_test,clf.predict(x_test)),
        'true positives':conf_matrix[0][0],
        'true negatives':conf_matrix[1][1],
        'false postives':conf_matrix[0][1],
        'false negatives':conf_matrix[1][0]
    })
    
df = pd.DataFrame(scores,columns=['model','best_score','precision','recall','f1_score','true positives','true negatives','false postives','false negatives','best_params'])
print(df)

Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
Stochastic Optimizer

      model  best_score  precision    recall  f1_score  true positives  \
0       svm    0.887795   0.594059  0.736196  0.657534            1030   
1        rf    0.893373   0.561905  0.723926  0.632708            1020   
2        dt    0.818373   0.708571  0.760736  0.733728            1061   
3        lr    0.771194   0.335277  0.705521  0.454545             884   
4       mlp    0.836614   0.503731  0.828221  0.626450             979   
5  xg_boost    0.914567   0.760870  0.858896  0.806916            1068   
6       gbm    0.887402   0.506494  0.717791  0.593909             998   

   true negatives  false postives  false negatives                 best_params  
0             120              82               43  {'C': 10, 'kernel': 'rbf'}  
1             118              92               45         {'n_estimators': 5}  
2             124              51               39                          {}  
3             115             228               48                    {'C': 5}  
4 

# Dataset 10

In [29]:
#Read data
proccessed_data_path =os.path.join(os.path.pardir,'data','processed')
x_train_path = os.path.join(proccessed_data_path,'10-train-x.csv')
x_test_path = os.path.join(proccessed_data_path,'10-test-x.csv')
y_train_path = os.path.join(proccessed_data_path,'10-train-y.csv')
y_test_path = os.path.join(proccessed_data_path,'10-test-y.csv')

dfx = pd.read_csv(x_train_path)
dfxt = pd.read_csv(x_test_path)
dfy = pd.read_csv(y_train_path)
dfyt = pd.read_csv(y_test_path)

x_train = dfx.drop(columns=['Unnamed: 0'],axis = 'columns')
x_test = dfxt.drop(columns=['Unnamed: 0'],axis = 'columns')
y_train = dfy.drop(columns=['Unnamed: 0'],axis = 'columns').values.flatten()
y_test = dfyt.drop(columns=['Unnamed: 0'],axis = 'columns').values.flatten()

scores = []
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=0)
for model_name, mp in model_params.items():
    clf =  GridSearchCV(mp['model'], mp['params'], cv=cv, return_train_score=False)
    clf.fit(x_train,y_train)
    conf_matrix =confusion_matrix(y_test,clf.predict(x_test))
    scores.append({
        'model': model_name,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_,
        'precision':precision_score(y_test,clf.predict(x_test)),
        'recall':recall_score(y_test,clf.predict(x_test)),
        'f1_score':f1_score(y_test,clf.predict(x_test)),
        'true positives':conf_matrix[0][0],
        'true negatives':conf_matrix[1][1],
        'false postives':conf_matrix[0][1],
        'false negatives':conf_matrix[1][0]
    })
    
df = pd.DataFrame(scores,columns=['model','best_score','precision','recall','f1_score','true positives','true negatives','false postives','false negatives','best_params'])
print(df)

Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.
Stochastic Optimizer

      model  best_score  precision    recall  f1_score  true positives  \
0       svm    0.908723   0.548223  0.627907  0.585366            1014   
1        rf    0.911338   0.570681  0.633721  0.600551            1021   
2        dt    0.858050   0.791411  0.750000  0.770149            1069   
3        lr    0.785671   0.370000  0.645349  0.470339             914   
4       mlp    0.864788   0.532751  0.709302  0.608479             996   
5  xg_boost    0.926507   0.829114  0.761628  0.793939            1076   
6       gbm    0.909179   0.572917  0.639535  0.604396            1021   

   true negatives  false postives  false negatives                 best_params  
0             108              89               64  {'C': 10, 'kernel': 'rbf'}  
1             109              82               63         {'n_estimators': 5}  
2             129              34               43                          {}  
3             111             189               61                    {'C': 5}  
4 