In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

In [2]:
#Read the dataset
data_full = pd.read_csv("CE802_Ass_2019_Data.csv")

#Replace missing values with mean

data_full.fillna(data_full['F20'].mean(),inplace = True)

#Lets separate the features and target
data_f20mean_class = data_full["Class"]
data_f20mean_feat = data_full.drop(columns = ["Class"],axis = 1)

In [3]:
#Lets first normalize the features for K-NN and SVM
scaler = StandardScaler()
scaler.fit(data_f20mean_feat)
data_f20mean_feat = scaler.transform(data_f20mean_feat)

#Splitting of data to see model accuracy after cross validation and gridsearch
data_feat_train, data_feat_test, data_class_train, data_class_test = train_test_split(data_f20mean_feat,data_f20mean_class,test_size=0.25,stratify=data_f20mean_class,random_state=1234)

Pruned Decision Tree

In [4]:
#Decision tree using grid search
clf_tree = tree.DecisionTreeClassifier(criterion = 'entropy',random_state=1234)
param_grid = {'max_depth': np.arange(4,21),'min_samples_split': np.arange(4,21),'min_samples_leaf': np.arange(4,21)}
tree_gridcv = GridSearchCV(clf_tree,param_grid,cv=10 ,n_jobs=-1)
tree_gridcv.fit(data_feat_train,data_class_train)

print("Best parameters: " + str(tree_gridcv.best_params_))
print("Best score: " + str(tree_gridcv.best_score_))

Best parameters: {'max_depth': 10, 'min_samples_leaf': 6, 'min_samples_split': 4}
Best score: 0.6159317211948792


In [5]:
#Now with these parameters, lets perform cross validation
clf_tree_prunned = tree.DecisionTreeClassifier(criterion = 'entropy',random_state=1234,
                                               max_depth= tree_gridcv.best_params_['max_depth'],
                                               min_samples_leaf= tree_gridcv.best_params_['min_samples_leaf'],
                                               min_samples_split=tree_gridcv.best_params_['min_samples_split'] )

#Now lets used cross validation in the whole data set, but with the best parameters by gridsearch
score_tree = cross_val_score(clf_tree_prunned,data_feat_train,data_class_train,cv=10,n_jobs=-1)
print('Average accuracy:', np.mean(score_tree))

#Now lets compute the confussion matrix by splitting the data into trainning and testing
clf_tree_prunned.fit(data_feat_train,data_class_train)
tree_pred = clf_tree_prunned.predict(data_feat_test)
print(confusion_matrix(data_class_test, tree_pred))
print(classification_report(data_class_test, tree_pred))

Average accuracy: 0.6159317211948792
[[37 33]
 [22 33]]
              precision    recall  f1-score   support

       False       0.63      0.53      0.57        70
        True       0.50      0.60      0.55        55

    accuracy                           0.56       125
   macro avg       0.56      0.56      0.56       125
weighted avg       0.57      0.56      0.56       125



K-NN

In [11]:
knn_gridcv = KNeighborsClassifier()
#create a dictionary with the number of neighbors to try
param_gridsearch = {'n_neighbors': np.arange(1,80),'weights':['uniform','distance']}

knn_gridsearch = GridSearchCV(knn_gridcv,param_gridsearch,cv=10)
knn_gridsearch.fit(data_feat_train,data_class_train)
print("Best parameters: " + str(knn_gridsearch.best_params_))
print("Best score: "+ str(knn_gridsearch.best_score_))

Best parameters: {'n_neighbors': 22, 'weights': 'distance'}
Best score: 0.6136557610241822


In [12]:
#Now lets used cross validation in the whole data set, but with the best parameters by gridsearch
knn_model = KNeighborsClassifier(n_neighbors = knn_gridsearch.best_params_['n_neighbors'],
                                 weights=knn_gridsearch.best_params_['weights'])

#Now lets used cross validation in the whole data set, but with the best parameters by gridsearch
score_knn = cross_val_score(knn_model,data_feat_train,data_class_train,cv=10,n_jobs=-1)
print('Average accuracy:', np.mean(score_knn))

#Now lets compute the confussion matrix by splitting the data into trainning and testing
knn_model.fit(data_feat_train,data_class_train)
knn_pred = knn_model.predict(data_feat_test)
print(confusion_matrix(data_class_test, knn_pred))
print(classification_report(data_class_test, knn_pred))

Average accuracy: 0.6136557610241822
[[62  8]
 [36 19]]
              precision    recall  f1-score   support

       False       0.63      0.89      0.74        70
        True       0.70      0.35      0.46        55

    accuracy                           0.65       125
   macro avg       0.67      0.62      0.60       125
weighted avg       0.66      0.65      0.62       125



Support Vector Machine

In [13]:
clf_svm = svm.SVC()
param_grid = {'C': np.logspace(-1, 3, 9),  
              'gamma': np.logspace(-7, -0, 8)}

svm_gridsearch = GridSearchCV(clf_svm,param_grid,n_jobs=-1, cv = 10)
svm_gridsearch.fit(data_feat_train,data_class_train)

print("Best parameters: " + str(svm_gridsearch.best_params_))
print("Best score : " + str(svm_gridsearch.best_score_))

Best parameters: {'C': 1000.0, 'gamma': 0.01}
Best score : 0.6484352773826458


In [14]:
#Now lets used cross validation in the whole data set, but with the best parameters by gridsearch
svm_model = svm.SVC(C = svm_gridsearch.best_params_['C'],gamma=svm_gridsearch.best_params_['gamma'])

#Now lets used cross validation in the whole data set, but with the best parameters by gridsearch
score_svm = cross_val_score(svm_model,data_feat_train,data_class_train,cv=10,n_jobs=-1)
print('Average accuracy:', np.mean(score_svm))

#Now lets compute the confussion matrix by splitting the data into trainning and testing
svm_model.fit(data_feat_train,data_class_train)
svm_pred = svm_model.predict(data_feat_test)
print(confusion_matrix(data_class_test, svm_pred))
print(classification_report(data_class_test, svm_pred))

Average accuracy: 0.6484352773826458
[[49 21]
 [23 32]]
              precision    recall  f1-score   support

       False       0.68      0.70      0.69        70
        True       0.60      0.58      0.59        55

    accuracy                           0.65       125
   macro avg       0.64      0.64      0.64       125
weighted avg       0.65      0.65      0.65       125



Random Forest

In [11]:
rf = RandomForestClassifier(criterion='entropy',random_state=1234)
param_grid = {'n_estimators':[400,450,500,550,600],'max_depth': np.arange(4,20)}
#'max_depth': np.arange(4,19),'min_samples_split': np.arange(4,19),'min_samples_leaf': np.arange(4,25)}

rf = GridSearchCV(rf, param_grid,cv=10,n_jobs=-1)
rf.fit(data_feat_train,data_class_train)

print("Best parameters: "+ str(rf.best_params_))
print("Best score: " + str(rf.best_score_))

Best parameters: {'max_depth': 11, 'n_estimators': 450}
Best score: 0.6425320056899004


In [12]:
#Now lets used cross validation in the whole data set, but with the best parameters by gridsearch
rf_model = rf = RandomForestClassifier(criterion='entropy',n_estimators= rf.best_params_['n_estimators'],
                                      max_depth=rf.best_params_['max_depth'],random_state=1234)


#Now lets used cross validation in the whole data set, but with the best parameters by gridsearch
score_rf = cross_val_score(rf_model,data_feat_train,data_class_train,cv=10,n_jobs=-1)
print('Average accuracy:', np.mean(score_rf))
print(score_rf.std())
#Now lets compute the confussion matrix by splitting the data into trainning and testing
rf_model.fit(data_feat_train,data_class_train)
rf_pred = rf_model.predict(data_feat_test)
print(confusion_matrix(data_class_test, rf_pred))
print(classification_report(data_class_test, rf_pred))

Average accuracy: 0.6425320056899004
0.0775915513341942
[[52 18]
 [29 26]]
              precision    recall  f1-score   support

       False       0.64      0.74      0.69        70
        True       0.59      0.47      0.53        55

    accuracy                           0.62       125
   macro avg       0.62      0.61      0.61       125
weighted avg       0.62      0.62      0.62       125

