In [8]:
import pickle
import numpy as np
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn import linear_model
from sklearn.metrics import f1_score

In [9]:
# Read the pickled feature matrix and targets back from the output directory.
# NOTE: pickle.load can execute arbitrary code, so only load files you produced yourself.
with open('output/x.pkl', 'rb') as x_file:
    x = pickle.load(x_file)
with open('output/y.pkl', 'rb') as y_file:
    y = pickle.load(y_file)

In [12]:
# Evaluate a linear SVM (hinge-loss SGD) with 5-fold cross validation.
# A single seed drives both the fold shuffling and the SGD sample shuffling,
# so re-running this cell on a fresh kernel reproduces the same scores.
CV_SEED = 42
kf = KFold(5, shuffle=True, random_state=CV_SEED)
cv_svcsgd_f1 = []  # one validation F1 score per fold

for train_idx, val_idx in kf.split(x, y):
    # Select the rows of this fold (assumes x and y support integer-array
    # indexing, e.g. numpy arrays -- TODO confirm against the pickled data)
    x_train, y_train = x[train_idx], y[train_idx]
    x_val, y_val = x[val_idx], y[val_idx]

    # Fit the scaler on the training fold only and reuse its statistics for
    # the validation fold, so no validation information leaks into training
    scaler = StandardScaler()
    x_train_scale = scaler.fit_transform(x_train)
    x_val_scale = scaler.transform(x_val)

    # SGD Hinge: random_state fixes the per-epoch shuffling of the training
    # samples, which is otherwise different on every run
    sgd_hinge = linear_model.SGDClassifier(
        max_iter=1000,
        alpha=20,
        loss='hinge',
        class_weight='balanced',
        random_state=CV_SEED,
    ).fit(x_train_scale, y_train)

    y_pred = sgd_hinge.predict(x_val_scale)
    cv_svcsgd_f1.append(f1_score(y_val, y_pred, average='binary'))

# Report mean and standard deviation of the F1 score across the folds
print(f'SVM Hinge Val f1: {np.mean(cv_svcsgd_f1):.3f} +- {np.std(cv_svcsgd_f1):.3f}')

SVM Hinge Val f1: 0.907 +- 0.001
