In [7]:
import pandas as pd
import numpy as np

# Load the feature table from its parquet file; every later cell reads
# its features and the 'label' column from this frame.
features_path = '../features/dev_features_dataset.parquet'
df = pd.read_parquet(features_path)

In [8]:
from sklearn.svm import SVC
from sklearn.model_selection import (cross_val_predict, KFold, cross_val_score)
from sklearn.metrics import (
    confusion_matrix, auc, roc_curve, plot_confusion_matrix, classification_report, accuracy_score)
import seaborn as sns
from sklearn.model_selection import GridSearchCV


def get_estimators(model, params, x, y):
    """Run an exhaustive 5-fold grid search for `model` and return it fitted.

    Args:
        model: Unfitted estimator to tune.
        params: Parameter grid handed to GridSearchCV as `param_grid`.
        x: Training features.
        y: Training targets.

    Returns:
        The fitted GridSearchCV object (not the bare estimator).
    """
    # n_jobs=-1 lets the search use every available core.
    search = GridSearchCV(estimator=model, param_grid=params, n_jobs=-1, cv=5)
    return search.fit(x, y)


def plot_confusion_matrix(m):
    """Draw the confusion matrix `m` as an annotated seaborn heatmap.

    Note: this definition rebinds the name `plot_confusion_matrix`, which
    the import block of this cell also pulls in from `sklearn.metrics`.

    Args:
        m: Matrix of values to plot (rows are labelled as true labels,
            columns as predicted labels).
    """
    heatmap_ax = sns.heatmap(m, annot=True, cmap="YlGnBu")
    heatmap_ax.set(
        xlabel='Predicted labels',
        ylabel='True labels',
        title='Confusion Matrix',
    )


def norm_confusion_matrix(y, y_pred):
    """Return the confusion matrix with each row normalised to sum to 1.

    Row i holds, for true class i, the fraction of its samples assigned to
    each predicted class.

    Args:
        y: True labels.
        y_pred: Predicted labels.

    Returns:
        A float array of shape (n_classes, n_classes).
    """
    # Let scikit-learn do the normalisation: a class that appears only in
    # `y_pred` has a zero row sum, and dividing by it manually would yield
    # NaNs and a runtime warning. normalize='true' reports 0 for such rows.
    return confusion_matrix(y, y_pred, normalize='true')


def svm(params: dict, x: np.ndarray, y: np.ndarray):
    """Grid-search an SVC over `params` and return the fitted search.

    Delegates to `get_estimators` so the SVC search uses exactly the same
    GridSearchCV configuration (all cores, explicit 5-fold CV) as every
    other model instead of maintaining a second copy of that setup.

    Args:
        params: Parameter grid for the SVC.
        x: Training features.
        y: Training targets.

    Returns:
        The fitted GridSearchCV object.
    """
    return get_estimators(model=SVC(), params=params, x=x, y=y)

In [5]:
from sklearn.model_selection import (train_test_split, KFold)

# Separate features from the target by column name rather than by position,
# so the 'label' column can never end up among the features, whatever the
# column order of the parquet file is.
x = df.drop(columns='label')
y = df.loc[:, 'label']

# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=0)

In [10]:
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier as DTC

# Hyper-parameter grids, one per candidate model. Each is a one-element
# list of dicts, a form GridSearchCV accepts for `param_grid`.
param_svm = [dict(
    C=[1, 10, 100, 1000],
    gamma=[1e-2, 1e-3, 1e-4, 1e-5],
    kernel=['rbf'],
)]

param_knn = [dict(
    n_neighbors=[3, 5, 7, 9, 11, 15],
    algorithm=['ball_tree', 'kd_tree', 'brute'],
)]

param_tree = [dict(
    criterion=['gini', 'entropy'],
    splitter=['best', 'random'],
    max_depth=np.arange(1, 7),
)]

# Tune each model on the training split only; the test split stays unseen.
result_svm = get_estimators(SVC(), param_svm, X_train, y_train)
result_knn = get_estimators(KNN(), param_knn, X_train, y_train)
# The decision-tree search is currently disabled (param_tree is unused):
# result_tree = get_estimators(model=DTC(), params=param_tree, x=X_train, y=y_train)

In [11]:
from pprint import pprint

# Collect the winning hyper-parameters of each grid search. `best_params_`
# is the combination with the best mean cross-validated score, whereas
# `cv_results_['params'][0]` is merely the first candidate that was tried.
best_params = {
    'SVM': result_svm.best_params_,
    'KNN': result_knn.best_params_
}

pprint(best_params)

{'KNN': {'algorithm': 'ball_tree', 'n_neighbors': 3},
 'SVM': {'C': 1, 'gamma': 0.01, 'kernel': 'rbf'}}


In [12]:
from pprint import pprint

# Print the held-out classification report of each tuned model, KNN first
# and then SVM.
# NOTE(review): the recorded output shows 1.00 on every metric for both
# models; confirm there is no leakage between features/splits and the label.
for fitted_search in (result_knn, result_svm):
    test_predictions = fitted_search.best_estimator_.predict(X_test)
    print(classification_report(y_test, test_predictions))



              precision    recall  f1-score   support

        ABSZ       1.00      1.00      1.00        33
        CPSZ       1.00      1.00      1.00      1676
        FNSZ       1.00      1.00      1.00      9521
        GNSZ       1.00      1.00      1.00      5851
        MYSZ       1.00      1.00      1.00         6
        SPSZ       1.00      1.00      1.00        36
        TCSZ       1.00      1.00      1.00      2421
        TNSZ       1.00      1.00      1.00       143

    accuracy                           1.00     19687
   macro avg       1.00      1.00      1.00     19687
weighted avg       1.00      1.00      1.00     19687

              precision    recall  f1-score   support

        ABSZ       1.00      1.00      1.00        33
        CPSZ       1.00      1.00      1.00      1676
        FNSZ       1.00      1.00      1.00      9521
        GNSZ       1.00      1.00      1.00      5851
        MYSZ       1.00      1.00      1.00         6
        SPSZ       1.00 