In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import GridSearchCV, learning_curve
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.svm import LinearSVC, SVC
from transform_data import get_train_test_ds

# Single seed handed to get_train_test_ds and to every SVC built in this notebook
random_state = 42

In [2]:
# Load the 'income' dataset; the helper hands back six objects, unpacked below.
# Presumably: full features/labels plus their train and test partitions -- see transform_data.
dataset_splits = get_train_test_ds('income', 'data/adult.data', random_state)
# To run the notebook on the 'bank' dataset instead, swap the line above for:
# dataset_splits = get_train_test_ds('bank', 'data/bank.csv', random_state)
X, X_train, X_test, y, y_train, y_test = dataset_splits

# Displayed as the cell result: (train shape, test shape)
(X_train.shape, X_test.shape)

((26048, 58), (6513, 58))

In [3]:
# Scorer wrapping f1_score (default settings) for use as a `scoring` argument
f1_scorer = make_scorer(f1_score)

# Polynomial-kernel SVM with balanced class weights and a fixed seed
clf = SVC(kernel='poly', class_weight='balanced', random_state=random_state)

# Two-step pipeline: min-max scaling followed by the SVM.
# The step name 'svm' is what the 'svm__*' keys in the parameter grid refer to.
pipe = Pipeline(steps=[
  ('scaler', MinMaxScaler()),
  ('svm', clf),
])

In [None]:
# Candidate values for the SVM step of the pipeline (4 x 4 = 16 combinations)
param_grid = {
    'svm__C': [1, 2, 4, 10],
    'svm__degree': [2, 3, 4, 5],
}

# Exhaustive search with 5-fold cross-validation, ranked by F1, on all available cores
grid_search = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring=f1_scorer,
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Pipeline carrying the best-scoring parameter combination
best_model = grid_search.best_estimator_

In [None]:
# Report the winning grid-search combination
print("Best parameters:", grid_search.best_params_)

# F1 on the data the model was fitted on
print("Train F1")
y_train_pred = best_model.predict(X_train)
train_f1_score = f1_score(y_true=y_train, y_pred=y_train_pred)
print(train_f1_score)

# F1 on the held-out test split
print("Test F1")
y_test_pred = best_model.predict(X_test)
test_f1_score = f1_score(y_true=y_test, y_pred=y_test_pred)
print(test_f1_score)

income
```
Best parameters: {'svm__C': 10, 'svm__degree': 3}
Train F1
0.7041920636895136
Test F1
0.6671630677587491
```

bank
```
Best parameters: {'svm__C': 10, 'svm__degree': 4}
Train F1
0.6355340273503358
Test F1
0.6228406909788867
```

In [None]:
# Fractions of the available training data to fit on: 10% through 100% in ten even steps
train_sizes = np.linspace(start=0.1, stop=1.0, num=10)

##### TMP
# Hard-coded estimator, presumably so this cell does not need the grid-search result.
# NOTE(review): degree=4 matches the best parameters recorded for 'bank'; the recorded
# best for 'income' (the dataset loaded above) is degree=3 -- confirm which is intended.
clf = SVC(kernel='poly', class_weight='balanced', random_state=random_state, C=10, degree=4)
pipe = Pipeline(steps=[
  ('scaler', MinMaxScaler()),
  ('svm', clf),
])

# 5-fold cross-validated F1 at each training size, computed on all available cores
train_sizes_abs, train_scores, val_scores = learning_curve(
    estimator=pipe,
    X=X_train,
    y=y_train,
    train_sizes=train_sizes,
    cv=5,
    scoring=f1_scorer,
    n_jobs=-1,
)

# Collapse the per-fold axis: one mean and one standard deviation per training size
train_scores_mean = train_scores.mean(axis=1)
train_scores_std = train_scores.std(axis=1)
val_scores_mean = val_scores.mean(axis=1)
val_scores_std = val_scores.std(axis=1)

# One line plus a +/- one-standard-deviation band per curve
curves = (
    (train_scores_mean, train_scores_std, 'Training Score', 'blue'),
    (val_scores_mean, val_scores_std, 'Validation Score', 'red'),
)
for curve_mean, curve_std, curve_label, curve_color in curves:
    plt.plot(train_sizes_abs, curve_mean, label=curve_label, color=curve_color)
    plt.fill_between(train_sizes_abs, curve_mean - curve_std,
                     curve_mean + curve_std, alpha=0.1, color=curve_color)

plt.xlabel('Training Examples')
plt.ylabel('F1 Score')
plt.title('SVM Learning Curve')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# Hyperparameter to sweep; every other hyperparameter stays at its value in best_params
plot_param = 'svm__degree'

##### temporary here to skip grid search #####
# `param_grid` is only defined once the grid-search cell has been run in this kernel.
# Fall back to a copy of the same candidate values so this cell also works when that
# (slow) cell is skipped, instead of failing with a NameError.
fallback_param_grid = {
    'svm__C': [1, 2, 4, 10],
    'svm__degree': [2, 3, 4, 5],
}
try:
    param_list = param_grid[plot_param]
except NameError:
    param_list = fallback_param_grid[plot_param]
print(param_list)
train_f1_score_list = []
test_f1_score_list = []

best_params = {
    'svm__C': 10, 'svm__degree': 3
}
##### temporary here to skip grid search #####

# SVC takes bare keyword names, so drop the pipeline step prefix.
# This does not depend on the swept value, so it is done once, outside the loop.
svc_param_name = plot_param.removeprefix('svm__')
base_svc_params = {key.removeprefix('svm__'): value for key, value in best_params.items()}

for param in param_list:
    # Best parameters with only the swept hyperparameter overridden
    params = {**base_svc_params, svc_param_name: param}

    clf = SVC(
      kernel='poly', 
      class_weight='balanced',
      random_state=random_state,
      **params,
    )
    pipe = Pipeline([
      ('scaler', MinMaxScaler()), 
      ('svm', clf),
    ])

    pipe.fit(X_train, y_train)

    y_train_pred = pipe.predict(X_train)
    train_f1_score_list.append(f1_score(y_train, y_train_pred))

    y_test_pred = pipe.predict(X_test)
    test_f1_score_list.append(f1_score(y_test, y_test_pred))

# One row per swept value; reset_index exposes the values as an 'index' column for tick labels
plot_df = pd.DataFrame({"train_f1": train_f1_score_list, "test_f1": test_f1_score_list}, index=param_list)
plot_df = plot_df.reset_index()

# Plot against evenly spaced positions and label the ticks with the actual parameter
# values, so unevenly spaced candidates (e.g. C = 1, 2, 4, 10) are still evenly spread
default_x_ticks = range(plot_df.shape[0])
plt.plot(default_x_ticks, plot_df['train_f1'], label='train_f1')
plt.plot(default_x_ticks, plot_df['test_f1'], label='test_f1')
plt.xticks(default_x_ticks, plot_df['index'])

plt.xlabel(plot_param)
plt.ylabel('F1 score')
plt.title(f"F1 score vs {plot_param}")
plt.grid()
plt.legend()
plt.show()

In [4]:
%%time

##### temporary here to skip grid search #####
best_params = {
    'svm__C': 10, 'svm__degree': 3
}
##### temporary here to skip grid search #####

# SVC takes bare keyword names, so drop the pipeline step prefix.
# Build a new dict rather than renaming keys on an alias of best_params, so that
# best_params keeps its 'svm__'-prefixed keys after this cell has run.
params = {key.removeprefix('svm__'): value for key, value in best_params.items()}
clf = SVC(
  kernel='poly', 
  class_weight='balanced',
  random_state=random_state,
  **params,
)
pipe = Pipeline([
  ('scaler', MinMaxScaler()), 
  ('svm', clf),
])
# Fit on the training split; the surrounding %%time reports how long training takes
pipe.fit(X_train, y_train)

CPU times: user 17.4 s, sys: 129 ms, total: 17.5 s
Wall time: 17.5 s


In [5]:
%%time

# Predict labels for the test split with the current `pipe`; %%time above measures inference time
y_test_pred = pipe.predict(X_test)

CPU times: user 1.71 s, sys: 230 µs, total: 1.72 s
Wall time: 1.71 s
