## Libreries

In [25]:
import pandas as pd
import pickle
import numpy as np

## Data

In [2]:
lf_features_train = pd.read_excel("../../feature-engineering/features/lf_features_train_task_Q.xlsx", index_col="id")
lf_features_val = pd.read_excel("../../feature-engineering/features/lf_features_val_task_Q.xlsx", index_col="id")
lf_features_test = pd.read_excel("../../feature-engineering/features/lf_features_test_task_Q.xlsx", index_col="id")

In [3]:
Q_train = pd.read_excel("../../data/train_task_Q.xlsx", index_col="id")
Q_val = pd.read_excel("../../data/val_task_Q.xlsx", index_col="id")
Q_test = pd.read_excel("../../data/test_task_Q.xlsx", index_col="id")

In [4]:
X_train, y_train = lf_features_train, Q_train["label"]
X_train = X_train.loc[y_train.index]

X_test, y_test = lf_features_test, Q_test["label"]
X_test = X_test.loc[y_test.index]

X_val, y_val = lf_features_val, Q_val["label"]
X_val = X_val.loc[y_val.index]

In [6]:
from sklearn.svm import SVC

In [7]:
%%time
svc = SVC(
    probability=True, 
    random_state=2022
)
svc.fit(X_train, y_train)

Wall time: 535 ms


SVC(probability=True, random_state=2022)

In [8]:
y_pred = svc.predict(X_val)

Feature names unseen at fit time:
- lemma<&>        
- lemma<&>                                                                                                                       
- lemma<&>16
- lemma<&>268
- lemma<&>27
- ...
Feature names seen at fit time, yet now missing:
- lemma<&>   
- lemma<&>      
- lemma<&>          
- lemma<&>                            
- lemma<&>"
- ...



ValueError: X has 868 features, but SVC is expecting 1351 features as input.

In [19]:
from sklearn.metrics import classification_report

In [33]:
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.67      0.80         6
           1       0.92      1.00      0.96        23
           2       0.97      0.82      0.89        44
           3       0.98      1.00      0.99        42
           4       0.93      1.00      0.97        14
           5       0.76      1.00      0.86        16

    accuracy                           0.93       145
   macro avg       0.93      0.91      0.91       145
weighted avg       0.94      0.93      0.93       145



In [34]:
from sklearn.model_selection import GridSearchCV

In [74]:
parameters = {
    'kernel': ['linear', 'rbf', "sigmoid", "poly"], 
    "class_weight": ["balanced", None],
    "gamma": ["scale", "auto"],
    'C':[1, 10, 100, 1000],
}

In [75]:
clf = GridSearchCV(svc, parameters, scoring="f1_macro", cv=5, verbose=1)

In [76]:
%%time
clf.fit(X_train, y_train)

Fitting 5 folds for each of 64 candidates, totalling 320 fits
Wall time: 41.7 s


GridSearchCV(cv=5,
             estimator=SVC(class_weight='balanced', gamma='auto',
                           probability=True, random_state=2022),
             param_grid={'C': [1, 10, 100, 1000],
                         'class_weight': ['balanced', None],
                         'gamma': ['scale', 'auto'],
                         'kernel': ['linear', 'rbf', 'sigmoid', 'poly']},
             scoring='f1_macro', verbose=1)

In [77]:
clf.best_estimator_

SVC(C=10, probability=True, random_state=2022)

In [78]:
y_pred = clf.predict(X_val)
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.80      0.67      0.73         6
           1       0.91      0.91      0.91        23
           2       0.95      0.93      0.94        44
           3       0.98      1.00      0.99        42
           4       1.00      1.00      1.00        14
           5       0.94      1.00      0.97        16

    accuracy                           0.95       145
   macro avg       0.93      0.92      0.92       145
weighted avg       0.95      0.95      0.95       145



In [79]:
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.65      0.96      0.78        27
           1       0.67      0.86      0.75         7
           2       0.46      0.84      0.59        32
           3       1.00      0.89      0.94       146
           4       1.00      0.50      0.67         2
           5       0.85      0.48      0.62        58

    accuracy                           0.80       272
   macro avg       0.77      0.76      0.72       272
weighted avg       0.86      0.80      0.81       272

