In [10]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import SelectFromModel, SelectKBest, SelectPercentile
from sklearn.tree import DecisionTreeClassifier
import pandas as pd
import numpy as np

In [3]:
# Shared random seed passed to every seeded splitter in this notebook.
SEED = 12345

In [4]:
# Read the train and test CSV files (in that order) into raw DataFrames.
train_raw, test_raw = map(pd.read_csv, ("data/sms_train.csv", "data/sms_test.csv"))

In [5]:
# Class balance of the training labels.
train_raw.loc[:, "label"].value_counts()

0    3959
1     613
Name: label, dtype: int64

In [6]:
# Targets come straight from the raw training frame.
y_train = train_raw["label"]

# Bag-of-words counts stored as float64. The vocabulary is learned on the
# training messages only and then reused to encode the test messages.
vectorizer = CountVectorizer(dtype=np.float64)
X_train = vectorizer.fit_transform(train_raw["message"])
X_test = vectorizer.transform(test_raw["message"])

In [7]:
# Baseline: 10-fold stratified cross-validation of a decision tree.
# `skf` is reused by the LinearSVC sweep further down, so it keeps its name.
skf = StratifiedKFold(n_splits=10, random_state=SEED, shuffle=True)
scores = []
for train_idx, val_idx in skf.split(X_train, y_train):
    # Seed the tree too: scikit-learn permutes candidate features at every
    # split, so an unseeded tree gives slightly different scores per run.
    clf = DecisionTreeClassifier(random_state=SEED)
    # split() yields positional indices, so select labels with .iloc rather
    # than relying on the Series having a default 0..n-1 index.
    clf.fit(X_train[train_idx], y_train.iloc[train_idx])
    scores.append(clf.score(X_train[val_idx], y_train.iloc[val_idx]))

In [8]:
# Mean of the per-fold validation scores collected in `scores`.
sum(scores) / len(scores)

0.9623804382100849

In [9]:
from sklearn.svm import LinearSVC

# Sweep the regularisation strength C of an L1-penalised linear SVM.
# For each C, record the mean validation score over the `skf` folds and the
# mean number of features SelectFromModel keeps based on the fitted model.
cs = [10, 1, 0.1, 0.01]
res = []
for c in cs:
    print(c)
    # Use a dedicated name so the decision-tree `scores` list from the
    # earlier cell is not silently overwritten.
    svc_scores = []
    n_selected = []
    for train_idx, val_idx in skf.split(X_train, y_train):
        clf = LinearSVC(C=c, penalty='l1', dual=False, max_iter=1_000_000)
        # split() yields positional indices, hence .iloc on the label Series.
        clf.fit(X_train[train_idx], y_train.iloc[train_idx])
        selector = SelectFromModel(clf, prefit=True)
        # Count the kept features from the boolean support mask instead of
        # materialising a reduced copy of the full matrix on every fold.
        n_selected.append(int(selector.get_support().sum()))
        svc_scores.append(clf.score(X_train[val_idx], y_train.iloc[val_idx]))

    res.append({
        'c': c,
        'score': sum(svc_scores) / len(svc_scores),
        'n_selected': sum(n_selected) / len(n_selected),
    })

10
1
0.1
0.01


In [10]:
# One dict per C value: mean fold score and mean number of selected features.
res

[{'c': 10, 'score': 0.9787827391474684, 'n_selected': 418.1},
 {'c': 1, 'score': 0.9805328084240299, 'n_selected': 357.1},
 {'c': 0.1, 'score': 0.9761593074254918, 'n_selected': 126.8},
 {'c': 0.01, 'score': 0.9341633780206969, 'n_selected': 24.8}]

## Optuna

In [7]:
import optuna
from optuna_utils import Objective

In [11]:
# In-memory Optuna study that maximises the value returned by the objective.
single_objective = optuna.create_study(direction="maximize")

# NOTE(review): `Objective` comes from the project-local `optuna_utils` module;
# the exact meaning of `mode`, `scaling_factor` and `use_scaler` is not visible
# in this notebook, so confirm against that module before changing them.
single_objective.optimize(
    Objective(
        X_train,
        y_train,
        feature_selectors=[SelectFromModel, SelectKBest, SelectPercentile],
        mode='single',
        scaling_factor=0.01,
        cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED),
        use_scaler=False,
    ),
    n_trials=1000,          # stop after this many trials ...
    timeout=12*60,          # ... or after 12 minutes (value is in seconds)
    gc_after_trial=True,
    show_progress_bar=True,
    n_jobs=2,
)

[32m[I 2023-05-23 22:21:29,276][0m A new study created in memory with name: no-name-1a000198-6f43-4d9a-8547-ab331e27c578[0m

Progress bar is experimental (supported from v1.2.0). The interface can change in the future.



  0%|          | 0/1000 [00:00<?, ?it/s]

[32m[I 2023-05-23 22:21:30,444][0m Trial 0 finished with value: 0.8993877033206434 and parameters: {'classifier': 'XGB', 'xgb_booster': 'gbtree', 'xgb_max_depth': 10, 'xgb_n_estimators': 14, 'xgb_subsample': 0.7699294496600398, 'feature_selector': 'SelectFromModel'}. Best is trial 0 with value: 0.8993877033206434.[0m
[32m[I 2023-05-23 22:21:30,865][0m Trial 1 finished with value: 0.8819885437006286 and parameters: {'classifier': 'LGBM', 'lgbm_boosting_type': 'gbdt', 'lgbm_max_depth': 7, 'lgbm_n_estimators': 27, 'lgbm_subsample': 0.8480064728092374, 'feature_selector': 'SelectFromModel'}. Best is trial 0 with value: 0.8993877033206434.[0m
[32m[I 2023-05-23 22:21:32,035][0m Trial 2 finished with value: 0.2647 and parameters: {'classifier': 'SVC', 'svc_kernel': 'sigmoid', 'svc_C': 0.0003945208824180399, 'feature_selector_svm': 'SelectPercentile', 'percentile': 31}. Best is trial 0 with value: 0.8993877033206434.[0m
[32m[I 2023-05-23 22:21:32,683][0m Trial 3 finished with value:

In [12]:
# Plot the objective value of each trial of the study over the optimisation run.
optuna.visualization.plot_optimization_history(single_objective)