## Libreries

In [1]:
import pandas as pd
import pickle
import numpy as np

## Data

In [3]:
%%time
features_train = pd.read_excel("../../feature-engineering/features/beto_pt_features_train_task_C1.xlsx", index_col="id")
features_val = pd.read_excel("../../feature-engineering/features/beto_pt_features_val_task_C1.xlsx", index_col="id")
features_test = pd.read_excel("../../feature-engineering/features/beto_pt_features_test_task_C1.xlsx", index_col="id")

CPU times: total: 2min 30s
Wall time: 2min 30s


In [4]:
A_train = pd.read_excel("../../data/train_task_C1.xlsx", index_col="id")
A_val = pd.read_excel("../../data/val_task_C1.xlsx", index_col="id")
A_test = pd.read_excel("../../data/test_task_C1.xlsx", index_col="id")

In [5]:
X_train, y_train = features_train, A_train["label"]
X_train = X_train.loc[y_train.index]

X_test, y_test = features_test, A_test["label"]
X_test = X_test.loc[y_test.index]

X_val, y_val = features_val, A_val["label"]
X_val = X_val.loc[y_val.index]

In [6]:
from sklearn.svm import SVC

In [7]:
%%time
svc = SVC(
    probability=True, 
    random_state=2022
)
svc.fit(X_train, y_train)

CPU times: total: 1min 11s
Wall time: 1min 11s


SVC(probability=True, random_state=2022)

In [8]:
y_pred = svc.predict(X_val)

In [9]:
from sklearn.metrics import classification_report

In [10]:
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.94      0.98      0.96      2510
           1       0.84      0.56      0.67       388

    accuracy                           0.93      2898
   macro avg       0.89      0.77      0.82      2898
weighted avg       0.92      0.93      0.92      2898



In [11]:
from sklearn.model_selection import GridSearchCV

In [12]:
parameters = {
    'kernel': ['linear', 'rbf'], 
    "class_weight": ["balanced", None],
    "gamma": ["scale", "auto"],
    'C':[1, 10, 15],
}

In [13]:
clf = GridSearchCV(svc, parameters, scoring="f1_macro", cv=5, verbose=1)

In [14]:
%%time
clf.fit(X_train, y_train)

Fitting 5 folds for each of 24 candidates, totalling 120 fits
CPU times: total: 2h 27min 13s
Wall time: 2h 27min 15s


GridSearchCV(cv=5, estimator=SVC(probability=True, random_state=2022),
             param_grid={'C': [1, 10, 15], 'class_weight': ['balanced', None],
                         'gamma': ['scale', 'auto'],
                         'kernel': ['linear', 'rbf']},
             scoring='f1_macro', verbose=1)

In [15]:
best_params = {}
for k in parameters:
    best_params[k] = vars(clf.best_estimator_)[k]
    print(f"{k}:", best_params[k])

kernel: rbf
class_weight: None
gamma: scale
C: 15


In [16]:
y_pred = clf.predict(X_train)
report = classification_report(y_train, y_pred, output_dict=True)
train_report = pd.DataFrame(report)
train_report

Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.970106,0.932729,0.965827,0.951417,0.965126
recall,0.991117,0.801299,0.965827,0.896208,0.965827
f1-score,0.980499,0.862033,0.965827,0.921266,0.964715
support,10019.0,1540.0,0.965827,11559.0,11559.0


In [17]:
y_pred = clf.predict(X_val)
report = classification_report(y_val, y_pred, output_dict=True)
val_report = pd.DataFrame(report)
val_report

Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.95814,0.880503,0.94962,0.919321,0.947745
recall,0.984861,0.721649,0.94962,0.853255,0.94962
f1-score,0.971316,0.793201,0.94962,0.882259,0.947469
support,2510.0,388.0,0.94962,2898.0,2898.0


In [18]:
y_pred = clf.predict(X_test)
report = classification_report(y_test, y_pred, output_dict=True)
test_report = pd.DataFrame(report)
test_report

Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.917939,0.607843,0.847858,0.762891,0.855645
recall,0.889094,0.683824,0.847858,0.786459,0.847858
f1-score,0.903286,0.643599,0.847858,0.773443,0.851119
support,541.0,136.0,0.847858,677.0,677.0


## Save pickle

In [19]:
pickle.dump(clf, open("results/si.pickle", "wb"))

In [20]:
pickle.dump(best_params, open("results/best_params_svm_beto_pt_task_C1.pickle", "wb"))

In [21]:
pickle.dump(train_report, open("results/train_report_svm_beto_pt_task_C1.pickle", "wb"))

In [22]:
pickle.dump(val_report, open("results/val_report_svm_beto_pt_task_C1.pickle", "wb"))

In [23]:
pickle.dump(test_report, open("results/test_report_svm_beto_pt_task_C1.pickle", "wb"))