In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.impute import KNNImputer
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

In [2]:
#Read the dataset
data_full = pd.read_csv("CE802_Ass_2019_Data.csv")

#Lets drop F20 feature
data_nof20 = pd.read_csv("CE802_Ass_2019_Data.csv")
data_nof20.drop(columns=['F20'], inplace = True)

#Lets separate the features and target
data_nof20_class = data_nof20["Class"]
data_nof20_feat = data_nof20.drop(columns = ["Class"],axis = 1)

#Lets first normalize the features for K-NN and SVM
scaler = StandardScaler()
scaler.fit(data_nof20_feat)
data_nof20_feat = scaler.transform(data_nof20_feat)

#Splitting of data to see model accuracy after cross validation and gridsearch
data_feat_train, data_feat_test, data_class_train, data_class_test = train_test_split(data_nof20_feat,data_nof20_class,test_size=0.25,stratify=data_nof20_class,random_state=1234)

Pruned Decision Tree

In [3]:
#Decision tree using grid search
clf_tree = tree.DecisionTreeClassifier(criterion = 'entropy',ccp_alpha=0.015,random_state=1234)
param_grid = {'max_depth': np.arange(4,21),'min_samples_split': np.arange(4,21),'min_samples_leaf': np.arange(4,21),
              'max_features': ['sqrt','auto','log2']}
tree_gridcv = GridSearchCV(clf_tree,param_grid,cv=10 ,n_jobs=-1)
tree_gridcv.fit(data_feat_train,data_class_train)

print("Best parameters: " + str(tree_gridcv.best_params_))
print("Best score: " + str(tree_gridcv.best_score_))

Best parameters: {'max_depth': 7, 'max_features': 'sqrt', 'min_samples_leaf': 5, 'min_samples_split': 11}
Best score: 0.6134423897581792


In [4]:
#Now with these parameters, lets perform cross validation
clf_tree_prunned = tree.DecisionTreeClassifier(criterion = 'entropy',ccp_alpha=0.015,
                                               max_depth= tree_gridcv.best_params_['max_depth'],
                                               min_samples_leaf= tree_gridcv.best_params_['min_samples_leaf'],
                                               min_samples_split=tree_gridcv.best_params_['min_samples_split'],
                                               max_features=tree_gridcv.best_params_['max_features'],random_state=1234)

#Now lets used cross validation in the whole data set, but with the best parameters by gridsearch
score_tree = cross_val_score(clf_tree_prunned,data_feat_train,data_class_train,cv=10,n_jobs=-1)
print('Average accuracy:', np.mean(score_tree))
print(score_tree.std())
#Now lets compute the confussion matrix by splitting the data into trainning and testing
clf_tree_prunned.fit(data_feat_train,data_class_train)
tree_pred = clf_tree_prunned.predict(data_feat_test)
print(confusion_matrix(data_class_test, tree_pred))
print(classification_report(data_class_test, tree_pred))

Average accuracy: 0.6134423897581792
0.09027409087775647
[[65  5]
 [47  8]]
              precision    recall  f1-score   support

       False       0.58      0.93      0.71        70
        True       0.62      0.15      0.24        55

    accuracy                           0.58       125
   macro avg       0.60      0.54      0.47       125
weighted avg       0.60      0.58      0.50       125



K-NN

In [5]:
knn_gridcv = KNeighborsClassifier()
#create a dictionary with the number of neighbors to try
param_gridsearch = {'n_neighbors': np.arange(1,80),'weights':['uniform','distance']}

knn_gridsearch = GridSearchCV(knn_gridcv,param_gridsearch,cv=10)
knn_gridsearch.fit(data_feat_train,data_class_train)
print("Best parameters: " + str(knn_gridsearch.best_params_))
print("Best score: "+ str(knn_gridsearch.best_score_))

Best parameters: {'n_neighbors': 42, 'weights': 'distance'}
Best score: 0.6028449502133713


In [6]:
#Now lets used cross validation in the whole data set, but with the best parameters by gridsearch
knn_model = KNeighborsClassifier(n_neighbors = knn_gridsearch.best_params_['n_neighbors'],
                                 weights=knn_gridsearch.best_params_['weights'])

#Now lets used cross validation in the whole data set, but with the best parameters by gridsearch
score_knn = cross_val_score(knn_model,data_feat_train,data_class_train,cv=10,n_jobs=-1)
print('Average accuracy:', np.mean(score_knn))
print(score_knn.std())
#Now lets compute the confussion matrix by splitting the data into trainning and testing
knn_model.fit(data_feat_train,data_class_train)
knn_pred = knn_model.predict(data_feat_test)
print(confusion_matrix(data_class_test, knn_pred))
print(classification_report(data_class_test, knn_pred))

Average accuracy: 0.6028449502133713
0.03967940702390046
[[63  7]
 [43 12]]
              precision    recall  f1-score   support

       False       0.59      0.90      0.72        70
        True       0.63      0.22      0.32        55

    accuracy                           0.60       125
   macro avg       0.61      0.56      0.52       125
weighted avg       0.61      0.60      0.54       125



Support Vector Machine

In [7]:
clf_svm = svm.SVC()
param_grid = {'C': np.logspace(-1, 3, 9),  
              'gamma': np.logspace(-7, -0, 8)}

svm_gridsearch = GridSearchCV(clf_svm,param_grid,n_jobs=-1, cv = 10)
svm_gridsearch.fit(data_feat_train,data_class_train)

print("Best parameters: " + str(svm_gridsearch.best_params_))
print("Best score : " + str(svm_gridsearch.best_score_))

Best parameters: {'C': 1000.0, 'gamma': 0.01}
Best score : 0.6694879089615932


In [10]:
#Now lets used cross validation in the whole data set, but with the best parameters by gridsearch
svm_model = svm.SVC(C = svm_gridsearch.best_params_['C'],gamma=svm_gridsearch.best_params_['gamma'])

#Now lets used cross validation in the whole data set, but with the best parameters by gridsearch
score_svm = cross_val_score(svm_model,data_feat_train,data_class_train,cv=10,n_jobs=-1)
print('Average accuracy:', np.mean(score_svm))
print(score_svm.std())
#Now lets compute the confussion matrix by splitting the data into trainning and testing
svm_model.fit(data_feat_train,data_class_train)
svm_pred = svm_model.predict(data_feat_test)
print(confusion_matrix(data_class_test, svm_pred))
print(classification_report(data_class_test, svm_pred))

Average accuracy: 0.6694879089615932
0.09547714481581449
[[49 21]
 [22 33]]
              precision    recall  f1-score   support

       False       0.69      0.70      0.70        70
        True       0.61      0.60      0.61        55

    accuracy                           0.66       125
   macro avg       0.65      0.65      0.65       125
weighted avg       0.66      0.66      0.66       125



Random Forest

In [11]:
rf = RandomForestClassifier(criterion='entropy',random_state=1234)
param_grid = {'n_estimators':[400,450,500,550,600],'max_depth': np.arange(4,20)}
#'max_depth': np.arange(4,19),'min_samples_split': np.arange(4,19),'min_samples_leaf': np.arange(4,25)}

rf = GridSearchCV(rf, param_grid,cv=10,n_jobs=-1)
rf.fit(data_feat_train,data_class_train)

print("Best parameters: "+ str(rf.best_params_))
print("Best score: " + str(rf.best_score_))

Best parameters: {'max_depth': 4, 'n_estimators': 550}
Best score: 0.5893314366998578


In [12]:
#Now lets used cross validation in the whole data set, but with the best parameters by gridsearch
rf_model = rf = RandomForestClassifier(criterion='entropy',n_estimators= rf.best_params_['n_estimators'],
                                      max_depth=rf.best_params_['max_depth'],random_state=1234)


#Now lets used cross validation in the whole data set, but with the best parameters by gridsearch
score_rf = cross_val_score(rf_model,data_feat_train,data_class_train,cv=10,n_jobs=-1)
print('Average accuracy:', np.mean(score_rf))
print(score_rf.std())
#Now lets compute the confussion matrix by splitting the data into trainning and testing
rf_model.fit(data_feat_train,data_class_train)
rf_pred = rf_model.predict(data_feat_test)
print(confusion_matrix(data_class_test, rf_pred))
print(classification_report(data_class_test, rf_pred))

Average accuracy: 0.5893314366998578
0.052208130872285936
[[59 11]
 [43 12]]
              precision    recall  f1-score   support

       False       0.58      0.84      0.69        70
        True       0.52      0.22      0.31        55

    accuracy                           0.57       125
   macro avg       0.55      0.53      0.50       125
weighted avg       0.55      0.57      0.52       125

