# Load Library

In [5]:
# Load libraries
import pandas as pd
import numpy as np
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from numpy import mean
from numpy import std
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.feature_selection import RFE
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score

# Load and Split data

# Load dataset
#dataset = read_csv("D:/Academy/thesis/machin learning part/data/inputML_Q11.csv")
dataset = read_csv("D:/Academy/thesis/machin learning part/data/inputML_Q5_with_CGGA.csv")
####dataset = read_csv("D:/Academy/thesis/machin learning part/data/inputML_random_with_CGGA.csv")
dataset.set_index('name', inplace=True)
dataset['stage'].replace({'brain lower grade glioma':False, 'glioblastoma multiforme':True}, inplace=True)
dataset[dataset.select_dtypes(['object']).columns] = dataset.select_dtypes(['object']).apply(lambda x: x.astype('category'))
print(dataset.shape)

# Split dataset
X = dataset.drop('stage', axis=1)
y = dataset['stage']
x_main, x_test, y_main, y_test = train_test_split(X, y, test_size=0.20, random_state=1, stratify=y)
x_train, x_val, y_train, y_val = train_test_split(x_main, y_main, test_size=0.20, random_state=1, stratify=y_main)
# summarize
print('Train', x_train.shape, y_train.shape)
print('Test', x_test.shape, y_test.shape)
print('Validation', x_val.shape, y_val.shape)

from collections import Counter
print(Counter(y_train))
print(Counter(y_test))
print(Counter(y_val))

# Apply SMOTE

#oversample = SMOTE()
#x_train, y_train = oversample.fit_resample(x_train, y_train)
#print(Counter(y_train))

# Hyperparameter Tunning

# Fitting some Models

# Fitting Model
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import roc_auc_score
from sklearn.inspection import permutation_importance
from scipy.stats import loguniform
from sklearn.feature_selection import RFE
from imblearn.ensemble import BalancedBaggingClassifier
from sklearn.model_selection import RandomizedSearchCV

# KNN
from sklearn.neighbors import KNeighborsClassifier 
knn = KNeighborsClassifier()
param_knn = {'n_neighbors' : [5,7,9,11,13,15],
               'weights' : ['uniform','distance'],
               'metric' : ['minkowski','euclidean','manhattan']}

knn = RandomizedSearchCV(estimator=knn, param_distributions=param_knn,
                              n_iter=10, scoring='roc_auc', cv=5,
                              refit=True, n_jobs=-1)
knn = knn.fit(x_train, y_train)
print(knn.best_params_)
print(knn.best_estimator_)


# Decision Tree
from sklearn.tree import DecisionTreeClassifier 
dtc = DecisionTreeClassifier(criterion="gini", random_state=42, max_depth=3, min_samples_leaf=5) 
dtc = dtc.fit(x_train, y_train) 

# SVM
from sklearn.svm import SVC
svm = SVC()
param_svm = {'C': [0.1, 1, 10, 100, 1000], 
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf', 'linear']}

svm = RandomizedSearchCV(estimator=svm, param_distributions=param_svm,
                              n_iter=10, scoring='roc_auc', cv=5,
                              refit=True, n_jobs=-1)
svm = svm.fit(x_train, y_train)

print(svm.best_params_)
print(svm.best_estimator_)

# Linear SVM
lsvm = LinearSVC(dual=False, random_state=13)
lsvm = lsvm.fit(x_train, y_train) 


# Logestic regression
LR = LogisticRegression(solver='liblinear')
LR = LR.fit(x_train, y_train)

#RFE
rfe = RandomForestClassifier()
param_rf = {'n_estimators': [25, 50, 100, 150],
    'max_features': ['sqrt', 'log2', None],
    'max_depth': [3, 6, 9],
    'max_leaf_nodes': [3, 6, 9]}
rfe = RandomizedSearchCV(estimator=rfe, param_distributions=param_rf,
                              n_iter=10, scoring='roc_auc', cv=5,
                              refit=True, n_jobs=-1)
rfe = rfe.fit(x_train, y_train)  
print(rfe.best_params_)
print(rfe.best_estimator_)

# XGB
xgb = XGBClassifier(tree_method="hist")
xgb = xgb.fit(x_train, y_train) 

# Bagging
classifier = BalancedBaggingClassifier(random_state=42)
classifier.fit(x_train, y_train)


# Compare different Classifiers

from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score
models = []
models.append(('LR', LR))
models.append(('KNN', knn))
models.append(('DTC', dtc))
models.append(('SVM', svm))
models.append(('LSVM', lsvm))
models.append(('RFE', rfe))
models.append(('XGB', xgb))
models.append(('Bagging', classifier))


print(models)

# evaluate each model in turn
results = []
names = []
for name, model in models:
	kfold = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
	cv_results = cross_val_score(model, x_train, y_train, cv=kfold, scoring='f1')#f1 roc_auc
	results.append(cv_results)
	names.append(name)
	print('%s: %f (%f)' % (name, cv_results.mean()*100, cv_results.std()))


# Model Evaluation on Test  set

Y_knn = knn.predict(x_test) 
Y_dtc = dtc.predict(x_test) 
Y_svm = svm.predict(x_test) 
Y_lr = LR.predict(x_test)
Y_lsvm = lsvm.predict(x_test) 
Y_rfe = rfe.predict(x_test) 
Y_xgb = xgb.predict(x_test) 

# Evaluate predictions 
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score


print('Accuracy  For Test set')
print('Accuracy  KNN: %.2f' % (f1_score(y_test, Y_knn)*100))
print('Accuracy  DT: %.2f' % (f1_score(y_test, Y_dtc)*100))
print('Accuracy  SVM: %.2f' % (f1_score(y_test, Y_svm)*100))
print('Accuracy  LR: %.2f' % (f1_score(y_test, Y_lr)*100))
print('Accuracy  LSVM: %.2f' % (f1_score(y_test, Y_lsvm)*100))
print('Accuracy  RFE: %.2f' % (f1_score(y_test, Y_rfe)*100))
print('Accuracy  XGB: %.2f' % (f1_score(y_test, Y_xgb)*100))

print('F1 score For Test set')
print('F1 score KNN: %.2f' % (f1_score(y_test, Y_knn)*100))
print('F1 score DT: %.2f' % (f1_score(y_test, Y_dtc)*100))
print('F1 score SVM: %.2f' % (f1_score(y_test, Y_svm)*100))
print('F1 score LR: %.2f' % (f1_score(y_test, Y_lr)*100))
print('F1 score LSVM: %.2f' % (f1_score(y_test, Y_lsvm)*100))
print('F1 score RFE: %.2f' % (f1_score(y_test, Y_rfe)*100))
print('F1 Score XGB: %.2f' % (f1_score(y_test, Y_xgb)*100))

print('Precision score For Test set')
print('Precision KNN: %.2f' % (precision_score(y_test, Y_knn)*100))
print('Precision DT: %.2f' % (precision_score(y_test, Y_dtc)*100))
print('Precision SVM: %.2f' % (precision_score(y_test, Y_svm)*100))
print('Precision LR: %.2f' % (precision_score(y_test, Y_lr)*100))
print('Precision LSVM: %.2f' % (precision_score(y_test, Y_lsvm)*100))
print('Precision RFE: %.2f' % (precision_score(y_test, Y_rfe)*100))
print('Precision XGB: %.2f' % (precision_score(y_test, Y_xgb)*100))


print('Recall score For Test set')
print('Recall KNN: %.2f' % (recall_score(y_test, Y_knn)*100))
print('Recall DT: %.2f' % (recall_score(y_test, Y_dtc)*100))
print('Recall SVM: %.2f' % (recall_score(y_test, Y_svm)*100))
print('Recall LR: %.2f' % (recall_score(y_test, Y_lr)*100))
print('Recall LSVM: %.2f' % (recall_score(y_test, Y_lsvm)*100))
print('Recall RFE: %.2f' % (recall_score(y_test, Y_rfe)*100))
print('Recall XGB: %.2f' % (recall_score(y_test, Y_xgb)*100))

#print(confusion_matrix(y_test, Y_rfe))
#print(classification_report(y_test, Y_knn))
#accuracy_score
# roc_auc_score


# Model Evaluation on Validation set

Y_knn_val = knn.predict(x_val) 
Y_dtc_val = dtc.predict(x_val) 
Y_svm_val = svm.predict(x_val) 
Y_lr_val = LR.predict(x_val) 
Y_lsvm_val = lsvm.predict(x_val) 
Y_rfe_val = rfe.predict(x_val)
Y_xgb_val = xgb.predict(x_val) 


# Evaluate predictions 
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

print('Accuracy  For Validation set')
print('Accuracy  KNN: %.2f' % (f1_score(y_val, Y_knn_val)*100))
print('Accuracy  DT: %.2f' % (f1_score(y_val, Y_dtc_val)*100))
print('Accuracy  SVM: %.2f' % (f1_score(y_val, Y_svm_val)*100))
print('Accuracy  LR: %.2f' % (f1_score(y_val, Y_lr_val)*100))
print('Accuracy  LSVM: %.2f' % (f1_score(y_val, Y_lsvm_val)*100))
print('Accuracy  RFE: %.2f' % (f1_score(y_val, Y_rfe_val)*100))
print('Accuracy  XGB: %.2f' % (f1_score(y_val, Y_xgb_val)*100))

print('F1 score For Validation set')
print('F1 score KNN: %.2f' % (f1_score(y_val, Y_knn_val)*100))
print('F1 score DT: %.2f' % (f1_score(y_val, Y_dtc_val)*100))
print('F1 score SVM: %.2f' % (f1_score(y_val, Y_svm_val)*100))
print('F1 score LR: %.2f' % (f1_score(y_val, Y_lr_val)*100))
print('F1 score LSVM: %.2f' % (f1_score(y_val, Y_lsvm_val)*100))
print('F1 score RFE: %.2f' % (f1_score(y_val, Y_rfe_val)*100))
print('F1 Score XGB: %.2f' % (f1_score(y_val, Y_xgb_val)*100))

print('Precision For Validation set')
print('Precision KNN: %.2f' % (precision_score(y_val, Y_knn_val)*100))
print('Precision DT: %.2f' % (precision_score(y_val, Y_dtc_val)*100))
print('Precision SVM: %.2f' % (precision_score(y_val, Y_svm_val)*100))
print('Precision LR: %.2f' % (precision_score(y_val, Y_lr_val)*100))
print('Precision LSVM: %.2f' % (precision_score(y_val, Y_lsvm_val)*100))
print('Precision RFE: %.2f' % (precision_score(y_val, Y_rfe_val)*100))
print('Precision XGB: %.2f' % (precision_score(y_val, Y_xgb_val)*100))


print('Recall For Validation set')
print('Recall KNN: %.2f' % (recall_score(y_val, Y_knn_val)*100))
print('Recall DT: %.2f' % (recall_score(y_val, Y_dtc_val)*100))
print('Recall SVM: %.2f' % (recall_score(y_val, Y_svm_val)*100))
print('Recall LR: %.2f' % (recall_score(y_val, Y_lr_val)*100))
print('Recall LSVM: %.2f' % (recall_score(y_val, Y_knn_val)*100))
print('Recall RFE: %.2f' % (recall_score(y_val, Y_knn_val)*100))
print('Recall XGB: %.2f' % (recall_score(y_val, Y_knn_val)*100))

#print(confusion_matrix(y_val, Y_rfe_val))
#print(classification_report(y_test, Y_svm))



(1027, 5)
Train (656, 4) (656,)
Test (206, 4) (206,)
Validation (165, 4) (165,)
Counter({False: 339, True: 317})
Counter({False: 106, True: 100})
Counter({False: 85, True: 80})
{'weights': 'distance', 'n_neighbors': 13, 'metric': 'manhattan'}
KNeighborsClassifier(metric='manhattan', n_neighbors=13, weights='distance')
{'kernel': 'rbf', 'gamma': 0.0001, 'C': 100}
SVC(C=100, gamma=0.0001)
{'n_estimators': 150, 'max_leaf_nodes': 9, 'max_features': None, 'max_depth': 3}
RandomForestClassifier(max_depth=3, max_features=None, max_leaf_nodes=9,
                       n_estimators=150)
[('LR', LogisticRegression(solver='liblinear')), ('KNN', RandomizedSearchCV(cv=5, estimator=KNeighborsClassifier(), n_jobs=-1,
                   param_distributions={'metric': ['minkowski', 'euclidean',
                                                   'manhattan'],
                                        'n_neighbors': [5, 7, 9, 11, 13, 15],
                                        'weights': ['uniform', 'dist