## Libraries

In [1]:
import pandas as pd
import pickle
import numpy as np

## Data

In [2]:
# Precomputed feature tables for task Q, one workbook per split; the "id"
# column becomes the row index so features can be matched to labels by id.
features_train = pd.read_excel(
    "../../feature-engineering/features/beto_pt_features_train_task_Q.xlsx",
    index_col="id",
)
features_val = pd.read_excel(
    "../../feature-engineering/features/beto_pt_features_val_task_Q.xlsx",
    index_col="id",
)
features_test = pd.read_excel(
    "../../feature-engineering/features/beto_pt_features_test_task_Q.xlsx",
    index_col="id",
)

In [3]:
# Task Q data for each split, indexed by "id"; the "label" column read
# further down is the classification target.
Q_train = pd.read_excel(
    "../../data/train_task_Q.xlsx",
    index_col="id",
)
Q_val = pd.read_excel(
    "../../data/val_task_Q.xlsx",
    index_col="id",
)
Q_test = pd.read_excel(
    "../../data/test_task_Q.xlsx",
    index_col="id",
)

In [4]:
# For every split, take the target column first and then select the feature
# rows by the target's index, so X and y share the same ids in the same order.
y_train = Q_train["label"]
X_train = features_train.loc[y_train.index]

y_test = Q_test["label"]
X_test = features_test.loc[y_test.index]

y_val = Q_val["label"]
X_val = features_val.loc[y_val.index]

In [5]:
from sklearn.svm import SVC

In [6]:
%%time
svc = SVC(
    probability=True, 
    random_state=2022
)
svc.fit(X_train, y_train)

CPU times: total: 125 ms
Wall time: 119 ms


SVC(probability=True, random_state=2022)

In [7]:
# Baseline predictions on the validation split.
y_pred = svc.predict(X=X_val)

In [8]:
from sklearn.metrics import classification_report

In [9]:
# The baseline never predicts some classes, which makes their precision
# undefined. zero_division=0 reports those scores as 0 (the same value the
# default "warn" setting yields) without raising an UndefinedMetricWarning
# for every affected metric.
print(classification_report(y_val, y_pred, zero_division=0))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         4
           1       0.80      1.00      0.89        12
           2       0.65      1.00      0.78        20
           3       1.00      0.88      0.94        17
           4       1.00      0.20      0.33         5
           5       1.00      0.43      0.60         7

    accuracy                           0.78        65
   macro avg       0.74      0.59      0.59        65
weighted avg       0.79      0.78      0.74        65



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [10]:
from sklearn.model_selection import GridSearchCV

In [11]:
# Hyper-parameter grid explored by the grid search: every combination of
# kernel, class weighting, gamma heuristic and regularisation strength C.
parameters = dict(
    kernel=["linear", "rbf", "sigmoid", "poly"],
    class_weight=["balanced", None],
    gamma=["scale", "auto"],
    C=[1, 10, 100, 1000],
)

In [12]:
# Exhaustive search over `parameters`, scored by macro-averaged F1 with
# 5-fold cross-validation; verbose=1 prints the number of fits.
clf = GridSearchCV(
    estimator=svc,
    param_grid=parameters,
    scoring="f1_macro",
    cv=5,
    verbose=1,
)

In [13]:
%%time
clf.fit(X_train, y_train)

Fitting 5 folds for each of 64 candidates, totalling 320 fits
CPU times: total: 25.8 s
Wall time: 26 s


GridSearchCV(cv=5, estimator=SVC(probability=True, random_state=2022),
             param_grid={'C': [1, 10, 100, 1000],
                         'class_weight': ['balanced', None],
                         'gamma': ['scale', 'auto'],
                         'kernel': ['linear', 'rbf', 'sigmoid', 'poly']},
             scoring='f1_macro', verbose=1)

In [14]:
# Collect the winning hyper-parameters through the public `best_params_`
# attribute of the fitted search instead of reading the estimator's
# __dict__ via vars(). Iterating over `parameters` keeps the original
# key order for both the dict and the printed lines.
best_params = {k: clf.best_params_[k] for k in parameters}
for k, value in best_params.items():
    print(f"{k}:", value)

kernel: rbf
class_weight: balanced
gamma: auto
C: 100


In [15]:
# Score the tuned model on the training split and keep the per-class
# metrics as a DataFrame (metrics as rows, classes/averages as columns).
y_pred = clf.predict(X=X_train)
report = classification_report(y_true=y_train, y_pred=y_pred, output_dict=True)
train_report = pd.DataFrame(data=report)
train_report

Unnamed: 0,0,1,2,3,4,5,accuracy,macro avg,weighted avg
precision,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
recall,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
f1-score,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
support,13.0,48.0,78.0,64.0,17.0,27.0,1.0,247.0,247.0


In [16]:
# Score the tuned model on the validation split and keep the per-class
# metrics as a DataFrame (metrics as rows, classes/averages as columns).
y_pred = clf.predict(X=X_val)
report = classification_report(y_true=y_val, y_pred=y_pred, output_dict=True)
val_report = pd.DataFrame(data=report)
val_report

Unnamed: 0,0,1,2,3,4,5,accuracy,macro avg,weighted avg
precision,1.0,0.785714,0.826087,1.0,0.5,1.0,0.846154,0.851967,0.868466
recall,0.25,0.916667,0.95,0.882353,0.6,0.857143,0.846154,0.742694,0.846154
f1-score,0.4,0.846154,0.883721,0.9375,0.545455,0.923077,0.846154,0.755984,0.839301
support,4.0,12.0,20.0,17.0,5.0,7.0,0.846154,65.0,65.0


In [17]:
# Score the tuned model on the test split and keep the per-class
# metrics as a DataFrame (metrics as rows, classes/averages as columns).
y_pred = clf.predict(X=X_test)
report = classification_report(y_true=y_test, y_pred=y_pred, output_dict=True)
test_report = pd.DataFrame(data=report)
test_report

Unnamed: 0,0,1,2,3,4,5,accuracy,macro avg,weighted avg
precision,0.55814,0.4,0.380282,1.0,1.0,0.83871,0.727941,0.696188,0.833397
recall,0.888889,0.571429,0.84375,0.794521,0.5,0.448276,0.727941,0.674477,0.727941
f1-score,0.685714,0.470588,0.524272,0.885496,0.666667,0.58427,0.727941,0.636168,0.746649
support,27.0,7.0,32.0,146.0,2.0,58.0,0.727941,272.0,272.0


## Save pickle

In [18]:
# Persist the fitted grid search. The context manager closes (and therefore
# flushes) the file handle even if pickling raises, instead of leaving an
# unclosed handle behind.
with open("results/clf_svm_beto_pt_task_Q.pickle", "wb") as fh:
    pickle.dump(clf, fh)

In [19]:
# Persist the selected hyper-parameters. The context manager closes (and
# therefore flushes) the file handle even if pickling raises.
with open("results/best_params_svm_beto_pt_task_Q.pickle", "wb") as fh:
    pickle.dump(best_params, fh)

In [20]:
# Persist the training-split report. The context manager closes (and
# therefore flushes) the file handle even if pickling raises.
with open("results/train_report_svm_beto_pt_task_Q.pickle", "wb") as fh:
    pickle.dump(train_report, fh)

In [21]:
# Persist the validation-split report. The context manager closes (and
# therefore flushes) the file handle even if pickling raises.
with open("results/val_report_svm_beto_pt_task_Q.pickle", "wb") as fh:
    pickle.dump(val_report, fh)

In [22]:
# Persist the test-split report. The context manager closes (and
# therefore flushes) the file handle even if pickling raises.
with open("results/test_report_svm_beto_pt_task_Q.pickle", "wb") as fh:
    pickle.dump(test_report, fh)