# Balancing classes

Classes in our dataset are not balanced. Here, we will try to address that problem.

In [0]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
import pandas as pd

df = pd.read_csv('/content/gdrive/My Drive/ip_files/data/data_without_outliers.csv')
print(df.shape)

(16997, 22251)


## Naive

First we will try naive approach. Out of 17000ish rows, only 66 are of the class2, so we will delete them.

In [0]:
from sklearn.model_selection import train_test_split

df = df.loc[df['class'] != 'class2']
print('new shape: {}'.format(df.shape))

y = df['class']
X = df.loc[:, df.columns != 'class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=27)

# free some of the much needed memory
del X
del y
del df

import gc
gc.collect()


new shape: (16931, 22251)


17

Now we will build some of the best models from before.

### KNN

In [0]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, recall_score
import numpy as np

knn = KNeighborsClassifier(n_neighbors=6, weights='distance')
knn.fit(X_train, y_train)

y_pred = knn.predict(X_test)

print('accuracy train: {}'.format(np.round(knn.score(X_train, y_train), 4)))
print('accuracy test: {}'.format(np.round(accuracy_score(y_test, y_pred), 4)))
print('recall score: {}'.format(np.round(recall_score(y_test, y_pred, average=None), 4)))
print('f1_score: {}'.format(np.round(f1_score(y_test, y_pred, average=None), 4)))
print('confusion matrix: \n{}'.format(confusion_matrix(y_test, y_pred)))


accuracy train: 1.0
accuracy test: 0.8902
recall score: [0.8436 0.0755 0.8143 0.9668 0.9878 0.7401]
f1_score: [0.8913 0.1319 0.8268 0.9304 0.9553 0.8371]
confusion matrix: 
[[ 205    0    1    0   35    2]
 [   0   12  122   25    0    0]
 [   0    9 1048  229    0    1]
 [   0    2   72 2239    2    1]
 [   2    0    5    3  887    1]
 [  10    0    0    1   35  131]]


Accuracy on test set is higher, but this classifier struggles with class3.

### Decision trees

In [0]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, recall_score
import numpy as np


dtc = DecisionTreeClassifier(criterion='entropy')
dtc.fit(X_train, y_train)

y_pred = dtc.predict(X_test)

print('accuracy train: {}'.format(np.round(dtc.score(X_train, y_train), 4)))
print('accuracy test: {}'.format(np.round(accuracy_score(y_test, y_pred), 4)))
print('recall score: {}'.format(np.round(recall_score(y_test, y_pred, average=None), 4)))
print('f1_score: {}'.format(np.round(f1_score(y_test, y_pred, average=None), 4)))
print('confusion matrix: \n{}'.format(confusion_matrix(y_test, y_pred)))
 

accuracy train: 1.0
accuracy test: 0.8346
recall score: [0.7737 0.2264 0.7498 0.8998 0.9321 0.7345]
f1_score: [0.7753 0.2051 0.7586 0.8992 0.9321 0.745 ]
confusion matrix: 
[[ 188    1    0    2   33   19]
 [   0   36  100   23    0    0]
 [   0  117  965  205    0    0]
 [   0   38  190 2084    1    3]
 [  35    0    1    5  837   20]
 [  19    0    1    0   27  130]]


In [0]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, recall_score
import numpy as np


dtc = DecisionTreeClassifier(criterion='gini')
dtc.fit(X_train, y_train)

y_pred = dtc.predict(X_test)

print('accuracy train: {}'.format(np.round(dtc.score(X_train, y_train), 4)))
print('accuracy test: {}'.format(np.round(accuracy_score(y_test, y_pred), 4)))
print('recall score: {}'.format(np.round(recall_score(y_test, y_pred, average=None), 4)))
print('f1_score: {}'.format(np.round(f1_score(y_test, y_pred, average=None), 4)))
print('confusion matrix: \n{}'.format(confusion_matrix(y_test, y_pred)))
 

accuracy train: 1.0
accuracy test: 0.8309
recall score: [0.7654 0.2327 0.7327 0.9093 0.9232 0.678 ]
f1_score: [0.7686 0.2176 0.7568 0.8981 0.9186 0.6877]
confusion matrix: 
[[ 186    1    0    0   42   14]
 [   0   37   90   31    0    1]
 [   0  111  943  231    0    2]
 [   2   31  172 2106    4    1]
 [  30    1    0    4  829   34]
 [  23    0    0    2   32  120]]


In [0]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, recall_score
import numpy as np


dtc = DecisionTreeClassifier(criterion='entropy', class_weight='balanced')
dtc.fit(X_train, y_train)

y_pred = dtc.predict(X_test)

print('accuracy train: {}'.format(np.round(dtc.score(X_train, y_train), 4)))
print('accuracy test: {}'.format(np.round(accuracy_score(y_test, y_pred), 4)))
print('recall score: {}'.format(np.round(recall_score(y_test, y_pred, average=None), 4)))
print('f1_score: {}'.format(np.round(f1_score(y_test, y_pred, average=None), 4)))
print('confusion matrix: \n{}'.format(confusion_matrix(y_test, y_pred)))
 

accuracy train: 1.0
accuracy test: 0.8285
recall score: [0.8354 0.2893 0.7405 0.8873 0.9232 0.6949]
f1_score: [0.8185 0.2706 0.7492 0.8854 0.9226 0.7214]
confusion matrix: 
[[ 203    0    0    2   30    8]
 [   0   46   86   27    0    0]
 [   0   97  953  236    0    1]
 [   0   38  218 2055    3    2]
 [  34    0    0    5  829   30]
 [  16    0    0    1   37  123]]


In [0]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, recall_score
import numpy as np


dtc = DecisionTreeClassifier(criterion='gini', class_weight='balanced')
dtc.fit(X_train, y_train)

y_pred = dtc.predict(X_test)

print('accuracy train: {}'.format(np.round(dtc.score(X_train, y_train), 4)))
print('accuracy test: {}'.format(np.round(accuracy_score(y_test, y_pred), 4)))
print('recall score: {}'.format(np.round(recall_score(y_test, y_pred, average=None), 4)))
print('f1_score: {}'.format(np.round(f1_score(y_test, y_pred, average=None), 4)))
print('confusion matrix: \n{}'.format(confusion_matrix(y_test, y_pred)))
 

accuracy train: 1.0
accuracy test: 0.8268
recall score: [0.7984 0.283  0.7382 0.8834 0.9287 0.7401]
f1_score: [0.7854 0.2663 0.7448 0.8832 0.9277 0.7572]
confusion matrix: 
[[ 194    0    0    1   38   10]
 [   1   45   87   25    1    0]
 [   0   94  950  241    2    0]
 [   3   40  225 2046    2    0]
 [  32    0    0    4  834   28]
 [  21    0    2    0   23  131]]


These aren't much better. Gini without class weights is the best model out of all decision trees.

### SVM

In [0]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, recall_score
import numpy as np

clf = SVC(C=300, kernel='rbf', gamma='scale')
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print('accuracy train: {}'.format(np.round(clf.score(X_train, y_train), 4)))
print('accuracy test: {}'.format(np.round(accuracy_score(y_test, y_pred), 4)))
print('recall score: {}'.format(np.round(recall_score(y_test, y_pred, average=None), 4)))
print('f1_score: {}'.format(np.round(f1_score(y_test, y_pred, average=None), 4)))
print('confusion matrix: \n{}'.format(confusion_matrix(y_test, y_pred)))


accuracy train: 0.9905
accuracy test: 0.963
recall score: [0.9794 0.7736 0.9176 0.9883 0.9955 0.9435]
f1_score: [0.9774 0.8311 0.9366 0.9728 0.9917 0.9653]
confusion matrix: 
[[ 238    0    0    0    5    0]
 [   0  123   30    6    0    0]
 [   0   11 1181   95    0    0]
 [   0    3   23 2289    0    1]
 [   2    0    1    0  894    1]
 [   4    0    0    0    6  167]]


In [0]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, recall_score
import numpy as np

clf = SVC(C=300, kernel='rbf', gamma='scale', class_weight='balanced')
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print('accuracy train: {}'.format(np.round(clf.score(X_train, y_train), 4)))
print('accuracy test: {}'.format(np.round(accuracy_score(y_test, y_pred), 4)))
print('recall score: {}'.format(np.round(recall_score(y_test, y_pred, average=None), 4)))
print('f1_score: {}'.format(np.round(f1_score(y_test, y_pred, average=None), 4)))
print('confusion matrix: \n{}'.format(confusion_matrix(y_test, y_pred)))


accuracy train: 0.9921
accuracy test: 0.9665
recall score: [0.9794 0.7987 0.9549 0.9737 0.9944 0.9492]
f1_score: [0.9734 0.8089 0.9468 0.9777 0.9917 0.9683]
confusion matrix: 
[[ 238    0    0    0    5    0]
 [   0  127   28    4    0    0]
 [   1   19 1229   38    0    0]
 [   0    9   51 2255    0    1]
 [   3    0    1    0  893    1]
 [   4    0    0    0    5  168]]


In [0]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, recall_score
import numpy as np

clf = SVC(C=300, kernel='poly', gamma='scale', degree=1, class_weight=None)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print('accuracy train: {}'.format(np.round(clf.score(X_train, y_train), 4)))
print('accuracy test: {}'.format(np.round(accuracy_score(y_test, y_pred), 4)))
print('recall score: {}'.format(np.round(recall_score(y_test, y_pred, average=None), 4)))
print('f1_score: {}'.format(np.round(f1_score(y_test, y_pred, average=None), 4)))
print('confusion matrix: \n{}'.format(confusion_matrix(y_test, y_pred)))

accuracy train: 0.9805
accuracy test: 0.9579
recall score: [0.9712 0.6855 0.9029 0.9918 0.9955 0.9492]
f1_score: [0.9752 0.7786 0.9266 0.97   0.9906 0.9683]
confusion matrix: 
[[ 236    0    0    0    7    0]
 [   0  109   40   10    0    0]
 [   0   11 1162  113    0    1]
 [   0    1   18 2297    0    0]
 [   2    0    1    0  894    1]
 [   3    0    0    0    6  168]]


In [0]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, recall_score
import numpy as np

clf = SVC(C=300, kernel='poly', gamma='scale', degree=1, class_weight='balanced')
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print('accuracy train: {}'.format(np.round(clf.score(X_train, y_train), 4)))
print('accuracy test: {}'.format(np.round(accuracy_score(y_test, y_pred), 4)))
print('recall score: {}'.format(np.round(recall_score(y_test, y_pred, average=None), 4)))
print('f1_score: {}'.format(np.round(f1_score(y_test, y_pred, average=None), 4)))
print('confusion matrix: \n{}'.format(confusion_matrix(y_test, y_pred)))

accuracy train: 0.9823
accuracy test: 0.9531
recall score: [0.9835 0.6918 0.8982 0.9814 0.9955 0.9605]
f1_score: [0.9755 0.694  0.9211 0.9681 0.9944 0.9742]
confusion matrix: 
[[ 239    0    0    0    4    0]
 [   0  110   38   11    0    0]
 [   0   34 1156   96    0    1]
 [   0   14   28 2273    0    1]
 [   3    0    1    0  894    0]
 [   5    0    0    0    2  170]]


### Neural networks

In [0]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, recall_score
import numpy as np

n = len(X_train.loc[0, :])
clf = MLPClassifier(solver='adam', activation='relu', hidden_layer_sizes=(n // 16, n // 64, n // 128))
clf.fit(X_train, y_train)
                    
y_pred = clf.predict(X_test)

print('accuracy train: {}'.format(np.round(clf.score(X_train, y_train), 4)))
print('accuracy test: {}'.format(np.round(accuracy_score(y_test, y_pred), 4)))
print('recall score: {}'.format(np.round(recall_score(y_test, y_pred, average=None), 4)))
print('f1_score: {}'.format(np.round(f1_score(y_test, y_pred, average=None), 4)))
print('confusion matrix: \n{}'.format(confusion_matrix(y_test, y_pred)))

accuracy train: 0.9964
accuracy test: 0.9504
recall score: [1.     0.4591 0.9332 0.9732 0.9944 0.9266]
f1_score: [0.9739 0.5935 0.9242 0.9639 0.9944 0.9563]
confusion matrix: 
[[ 243    0    0    0    0    0]
 [   0   73   50   36    0    0]
 [   0   14 1201   71    0    1]
 [   0    0   61 2254    0    1]
 [   5    0    0    0  893    0]
 [   8    0    0    0    5  164]]


### Voting

In [0]:
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier

from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, recall_score
import numpy as np

svc1 = SVC(C=100, kernel='poly', degree=1, gamma='scale')
svc2 = SVC(C=100, kernel='rbf', degree=1, gamma='scale', class_weight='balanced')
svc3 = SVC(C=300, kernel='rbf', gamma='scale', class_weight='balanced')
svc4 = SVC(C=300, kernel='rbf', gamma='scale')

vclf = VotingClassifier(estimators=[('SVC1', svc1), ('SVC2', svc2), ('SVC3', svc3), ('SVC4', svc4)], voting='hard')
vclf.fit(X_train, y_train)

y_pred = vclf.predict(X_test)

print('accuracy train: {}'.format(np.round(vclf.score(X_train, y_train), 4)))
print('accuracy test: {}'.format(np.round(accuracy_score(y_test, y_pred), 4)))
print('recall score: {}'.format(np.round(recall_score(y_test, y_pred, average=None), 4)))
print('f1_score: {}'.format(np.round(f1_score(y_test, y_pred, average=None), 4)))
print('confusion matrix: \n{}'.format(confusion_matrix(y_test, y_pred)))

accuracy train: 0.9909
accuracy test: 0.9657
recall score: [0.9835 0.805  0.9308 0.9853 0.9944 0.9379]
f1_score: [0.9775 0.8232 0.9444 0.976  0.9911 0.9623]
confusion matrix: 
[[ 239    0    0    0    4    0]
 [   0  128   25    6    0    0]
 [   0   17 1198   72    0    0]
 [   0    7   26 2282    0    1]
 [   3    0    1    0  893    1]
 [   4    0    0    0    7  166]]


## Sampling

Here we will, using sklearn.utils.resample add some of the patterns of class2 into our data, and build models on that.

In [0]:
from sklearn.utils import resample
from sklearn.model_selection import train_test_split

import gc

X = df.loc[:, df.columns != 'class']
y = df['class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=27)

del df
del y
gc.collect()

X = pd.concat([X_train, y_train], axis=1)

gc.collect()

class2 = X.loc[X['class'] == 'class2']
rest_of_the_data = X.loc[X['class'] != 'class2']

del X
gc.collect()

# class2 = X[X.class == 'class2']
# rest_of_the_data = X[X.class != 'class2']

class2_upsampled = resample(class2, random_state=27, n_samples=400, replace=True)
upsampled = pd.concat([rest_of_the_data, class2_upsampled])

gc.collect()


0

In [0]:
from collections import Counter

classes = upsampled['class']

print('classes count:')
print(sorted(Counter(classes).items()))

classes count:
[('class1', 551), ('class2', 400), ('class3', 418), ('class4', 2844), ('class5', 5514), ('class6', 2172), ('class7', 348)]


In [0]:
X_train = upsampled.loc[:, upsampled.columns != 'class']
y_train = upsampled['class']

del upsampled


### SVM

In [0]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, recall_score
import numpy as np

clf = SVC(C=300, kernel='rbf', gamma='scale')
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print('accuracy train: {}'.format(np.round(clf.score(X_train, y_train), 4)))
print('accuracy test: {}'.format(np.round(accuracy_score(y_test, y_pred), 4)))
print('recall score: {}'.format(np.round(recall_score(y_test, y_pred, average=None), 4)))
print('f1_score: {}'.format(np.round(f1_score(y_test, y_pred, average=None), 4)))
print('confusion matrix: \n{}'.format(confusion_matrix(y_test, y_pred)))


accuracy train: 0.9935
accuracy test: 0.9592
recall score: [0.9835 0.25   0.7421 0.9263 0.985  0.9966 0.8922]
f1_score: [0.9814 0.3478 0.8027 0.9383 0.9713 0.9877 0.9401]
confusion matrix: 
[[ 238    0    0    0    0    4    0]
 [   0    4    0    8    4    0    0]
 [   0    1  118   25   15    0    0]
 [   0    1   12 1194   82    0    0]
 [   0    1    5   29 2304    0    0]
 [   2    0    0    0    0  885    1]
 [   3    0    0    0    0   15  149]]


In [0]:
from joblib import dump
dump(clf, '/content/gdrive/My Drive/ip_files/models/svm_sampled_class2_400_rbf_C300_gamaScale.pkl')

['/content/gdrive/My Drive/ip_files/models/svm_sampled_class2_400_rbf_C300_gamaScale.pkl']

In [0]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, recall_score
import numpy as np

clf = SVC(C=300, kernel='poly', gamma='scale', degree=1, class_weight=None)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print('accuracy train: {}'.format(np.round(clf.score(X_train, y_train), 4)))
print('accuracy test: {}'.format(np.round(accuracy_score(y_test, y_pred), 4)))
print('recall score: {}'.format(np.round(recall_score(y_test, y_pred, average=None), 4)))
print('f1_score: {}'.format(np.round(f1_score(y_test, y_pred, average=None), 4)))
print('confusion matrix: \n{}'.format(confusion_matrix(y_test, y_pred)))

accuracy train: 0.9801
accuracy test: 0.9529
recall score: [0.9711 0.4375 0.6792 0.9061 0.9876 0.9944 0.8922]
f1_score: [0.9792 0.4    0.7633 0.9296 0.9669 0.9844 0.9401]
confusion matrix: 
[[ 235    0    0    0    0    7    0]
 [   0    7    0    4    5    0    0]
 [   0    4  108   29   18    0    0]
 [   0    6   10 1168  105    0    0]
 [   0    2    6   21 2310    0    0]
 [   2    0    0    1    1  883    1]
 [   1    0    0    1    0   16  149]]


In [0]:
from joblib import dump
dump(clf, '/content/gdrive/My Drive/ip_files/models/svm_sampled_class2_400_poly_C300_gamaScale_degree1.pkl')

['/content/gdrive/My Drive/ip_files/models/svm_sampled_class2_400_poly_C300_gamaScale_degree1.pkl']

### Voting

In [0]:
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier

from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, recall_score
import numpy as np

svc1 = SVC(C=100, kernel='poly', degree=1, gamma='scale')
svc2 = SVC(C=300, kernel='poly', degree=1, gamma='scale')
svc3 = SVC(C=100, kernel='rbf', gamma='scale')
svc4 = SVC(C=300, kernel='rbf', gamma='scale')

vclf = VotingClassifier(estimators=[('SVC1', svc1), ('SVC2', svc2), ('SVC3', svc3), ('SVC4', svc4)], voting='hard')
vclf.fit(X_train, y_train)

y_pred = vclf.predict(X_test)

print('accuracy train: {}'.format(np.round(vclf.score(X_train, y_train), 4)))
print('accuracy test: {}'.format(np.round(accuracy_score(y_test, y_pred), 4)))
print('recall score: {}'.format(np.round(recall_score(y_test, y_pred, average=None), 4)))
print('f1_score: {}'.format(np.round(f1_score(y_test, y_pred, average=None), 4)))
print('confusion matrix: \n{}'.format(confusion_matrix(y_test, y_pred)))


from joblib import dump
dump(vclf, '/content/gdrive/My Drive/ip_files/models/voting_sampled_class2_400_svm_examples.pkl')
print()

accuracy train: 0.9814
accuracy test: 0.9569
recall score: [0.9711 0.4375 0.7547 0.9092 0.9906 0.9944 0.8743]
f1_score: [0.9771 0.4667 0.8136 0.9357 0.9697 0.9833 0.9299]
confusion matrix: 
[[ 235    0    0    0    0    7    0]
 [   0    7    0    4    5    0    0]
 [   0    3  120   22   14    0    0]
 [   0    3   12 1172  102    0    0]
 [   0    1    4   17 2317    0    0]
 [   2    0    0    0    2  883    1]
 [   2    0    0    1    0   18  146]]



In [0]:
from joblib import dump
dump(vclf, '/content/gdrive/My Drive/ip_files/models/voting_sampled_class2_400_svm_examples.pkl')

['/content/gdrive/My Drive/ip_files/models/voting_sampled_class2_400_svm_examples.pkl']

## Balance

Here we will change our data in that way that every class has same number of patterns. We will choose 2500. That number is selected because data frame with 17500 patterns will fit our RAM memory.

In [0]:
from sklearn.utils import resample
from sklearn.model_selection import train_test_split

import gc

X = df.loc[:, df.columns != 'class']
y = df['class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=27)

del df
del y
gc.collect()

X = pd.concat([X_train, y_train], axis=1)

gc.collect()

class1 = X.loc[X['class'] == 'class1']
class2 = X.loc[X['class'] == 'class2']
class3 = X.loc[X['class'] == 'class3']
class4 = X.loc[X['class'] == 'class4']
class5 = X.loc[X['class'] == 'class5']
class6 = X.loc[X['class'] == 'class6']
class7 = X.loc[X['class'] == 'class7']

del X
gc.collect()

class1_sampled = resample(class1, random_state=27, n_samples=2500, replace=True)
del class1
class2_sampled = resample(class2, random_state=27, n_samples=2500, replace=True)
del class2
class3_sampled = resample(class3, random_state=27, n_samples=2500, replace=True)
del class3
class4_sampled = resample(class4, random_state=27, n_samples=2500, replace=True)
del class4
class5_sampled = resample(class5, random_state=27, n_samples=2500, replace=True)
del class5
class6_sampled = resample(class6, random_state=27, n_samples=2500, replace=True)
del class6
class7_sampled = resample(class7, random_state=27, n_samples=2500, replace=True)
del class7

gc.collect()

balanced = pd.concat([class1_sampled, class2_sampled,
                       class3_sampled, class4_sampled,
                       class5_sampled, class6_sampled,
                       class7_sampled])

del class1_sampled
del class2_sampled
del class3_sampled
del class4_sampled
del class5_sampled
del class6_sampled
del class7_sampled

gc.collect()


0

In [0]:
from collections import Counter

classes = balanced['class']

print('classes count:')
print(sorted(Counter(classes).items()))

classes count:
[('class1', 2500), ('class2', 2500), ('class3', 2500), ('class4', 2500), ('class5', 2500), ('class6', 2500), ('class7', 2500)]


### Decision trees

In [0]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, recall_score
import numpy as np


dtc = DecisionTreeClassifier(criterion='gini')
dtc.fit(X_train, y_train)

y_pred = dtc.predict(X_test)

print('accuracy train: {}'.format(np.round(dtc.score(X_train, y_train), 4)))
print('accuracy test: {}'.format(np.round(accuracy_score(y_test, y_pred), 4)))
print('recall score: {}'.format(np.round(recall_score(y_test, y_pred, average=None), 4)))
print('f1_score: {}'.format(np.round(f1_score(y_test, y_pred, average=None), 4)))
print('confusion matrix: \n{}'.format(confusion_matrix(y_test, y_pred)))
 

accuracy train: 1.0
accuracy test: 0.8271
recall score: [0.7851 0.     0.1698 0.7401 0.8961 0.9381 0.7066]
f1_score: [0.79   0.     0.1731 0.7471 0.8912 0.9302 0.7262]
confusion matrix: 
[[ 190    0    0    0    3   37   12]
 [   0    0    2   11    3    0    0]
 [   0    0   27  104   28    0    0]
 [   0   14   90  954  230    0    1]
 [   0    3   33  194 2096    4    9]
 [  31    0    0    2    4  833   18]
 [  18    0    1    0    1   29  118]]


In [0]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, recall_score
import numpy as np


dtc = DecisionTreeClassifier(criterion='entropy')
dtc.fit(X_train, y_train)

y_pred = dtc.predict(X_test)

print('accuracy train: {}'.format(np.round(dtc.score(X_train, y_train), 4)))
print('accuracy test: {}'.format(np.round(accuracy_score(y_test, y_pred), 4)))
print('recall score: {}'.format(np.round(recall_score(y_test, y_pred, average=None), 4)))
print('f1_score: {}'.format(np.round(f1_score(y_test, y_pred, average=None), 4)))
print('confusion matrix: \n{}'.format(confusion_matrix(y_test, y_pred)))
 

accuracy train: 1.0
accuracy test: 0.838
recall score: [0.7851 0.     0.1698 0.7541 0.9162 0.9347 0.6707]
f1_score: [0.7755 0.     0.1714 0.7723 0.9079 0.93   0.6892]
confusion matrix: 
[[ 190    0    0    0    0   32   20]
 [   0    0    5    8    3    0    0]
 [   0    7   27   93   32    0    0]
 [   2   23   90  972  202    0    0]
 [   3    1   33  155 2143    3    1]
 [  32    0    1    0    0  830   25]
 [  21    0    0    0    2   32  112]]


### SVM

In [0]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, recall_score
import numpy as np

clf = SVC(C=300, kernel='rbf', gamma='scale')
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print('accuracy train: {}'.format(np.round(clf.score(X_train, y_train), 4)))
print('accuracy test: {}'.format(np.round(accuracy_score(y_test, y_pred), 4)))
print('recall score: {}'.format(np.round(recall_score(y_test, y_pred, average=None), 4)))
print('f1_score: {}'.format(np.round(f1_score(y_test, y_pred, average=None), 4)))
print('confusion matrix: \n{}'.format(confusion_matrix(y_test, y_pred)))


accuracy train: 0.993
accuracy test: 0.9588
recall score: [0.9835 0.1875 0.7484 0.9255 0.9846 0.9966 0.8922]
f1_score: [0.9814 0.3    0.8095 0.9361 0.9711 0.9877 0.9401]
confusion matrix: 
[[ 238    0    0    0    0    4    0]
 [   0    3    0    9    4    0    0]
 [   0    0  119   26   14    0    0]
 [   0    1   12 1193   83    0    0]
 [   0    0    4   32 2303    0    0]
 [   2    0    0    0    0  885    1]
 [   3    0    0    0    0   15  149]]


In [0]:
from joblib import dump
dump(clf, '/content/gdrive/My Drive/ip_files/models/svm_balanced_rbf_C300_gamaScale.pkl')

['/content/gdrive/My Drive/ip_files/models/svm_balanced_rbf_C300_gamaScale.pkl']

In [0]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, recall_score
import numpy as np

clf = SVC(C=300, kernel='poly', gamma='scale', degree=1, class_weight=None)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print('accuracy train: {}'.format(np.round(clf.score(X_train, y_train), 4)))
print('accuracy test: {}'.format(np.round(accuracy_score(y_test, y_pred), 4)))
print('recall score: {}'.format(np.round(recall_score(y_test, y_pred, average=None), 4)))
print('f1_score: {}'.format(np.round(f1_score(y_test, y_pred, average=None), 4)))
print('confusion matrix: \n{}'.format(confusion_matrix(y_test, y_pred)))

accuracy train: 0.9787
accuracy test: 0.9547
recall score: [0.9752 0.5    0.6918 0.9085 0.988  0.9944 0.8982]
f1_score: [0.9813 0.5714 0.7692 0.9308 0.9671 0.9855 0.9434]
confusion matrix: 
[[ 236    0    0    0    0    6    0]
 [   0    8    0    3    5    0    0]
 [   0    2  110   29   18    0    0]
 [   0    2   11 1171  105    0    0]
 [   0    0    6   22 2311    0    0]
 [   2    0    0    1    1  883    1]
 [   1    0    0    1    0   15  150]]


In [0]:
from joblib import dump
dump(clf, '/content/gdrive/My Drive/ip_files/models/svm_balanced_poly_C300_gamaScale_degree1.pkl')

['/content/gdrive/My Drive/ip_files/models/svm_balaned_poly_C300_gamaScale_degree1.pkl']

### Gradient boosting

In [0]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, recall_score
import numpy as np

gbc = GradientBoostingClassifier(loss='deviance', n_estimators=500)
gbc.fit(X_train, y_train)

y_pred = gbc.predict(X_test)

print('accuracy train: {}'.format(np.round(gbc.score(X_train, y_train), 4)))
print('accuracy test: {}'.format(np.round(accuracy_score(y_test, y_pred), 4)))
print('recall score: {}'.format(np.round(recall_score(y_test, y_pred, average=None), 4)))
print('f1_score: {}'.format(np.round(f1_score(y_test, y_pred, average=None), 4)))
print('confusion matrix: \n{}'.format(confusion_matrix(y_test, y_pred)))


accuracy train: 0.9998
accuracy test: 0.9237
recall score: [0.9298 0.     0.283  0.8914 0.9714 0.9955 0.8144]
f1_score: [0.9554 0.     0.4036 0.8842 0.956  0.9736 0.8977]
confusion matrix: 
[[ 225    2    0    0    1   14    0]
 [   0    0    2    8    6    0    0]
 [   0    2   45   95   17    0    0]
 [   2    9   12 1149  117    0    0]
 [   0    5    4   58 2272    0    0]
 [   2    0    1    0    1  884    0]
 [   0    1    0    0    0   30  136]]
