In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import SelectFromModel, SelectKBest, SelectPercentile
from sklearn.tree import DecisionTreeClassifier
import pandas as pd
import numpy as np

In [2]:
SEED=12345

In [3]:
train_raw = pd.read_csv("data/sms_train.csv")
test_raw = pd.read_csv("data/sms_test.csv")

In [4]:
train_raw["label"].value_counts()

0    3959
1     613
Name: label, dtype: int64

In [5]:
vectorizer = CountVectorizer(dtype=np.float64)
X_train = vectorizer.fit_transform(train_raw["message"])
y_train = train_raw["label"]
X_test = vectorizer.transform(test_raw["message"])

In [None]:
skf = StratifiedKFold(n_splits=10, random_state=SEED, shuffle=True)
scores = []
for train_idx, val_idx in skf.split(X_train, y_train):
    clf = DecisionTreeClassifier()
    clf = clf.fit(X_train[train_idx], y_train[train_idx])
    scores += [clf.score(X_train[val_idx], y_train[val_idx])]

In [None]:
sum(scores) / len(scores)

In [None]:
from sklearn.svm import LinearSVC
cs = [10, 1, 0.1, 0.01]
res = []
for c in cs:
    print(c)
    scores = []
    n_selected = []
    for train_idx, val_idx in skf.split(X_train, y_train):
        clf = LinearSVC(C=c, penalty='l1', dual=False, max_iter=1_000_000)
        clf = clf.fit(X_train[train_idx], y_train[train_idx])
        selector = SelectFromModel(clf, prefit=True)
        n_selected += [selector.transform(X_train).shape[1]]
        scores += [clf.score(X_train[val_idx], y_train[val_idx])]

    res += [{
        'c': c,
        'score': sum(scores) / len(scores),
        'n_selected': sum(n_selected) / len(n_selected)
    }]

In [None]:
res

## Naive Bayes

In [6]:
from sklearn.naive_bayes import BernoulliNB
from optuna_utils import spam_scorer
from sklearn.pipeline import Pipeline
skf = StratifiedKFold(n_splits=10, random_state=SEED, shuffle=True)
scores = []
for train_idx, val_idx in skf.split(X_train, y_train):
    clf = Pipeline([('model', BernoulliNB())])
    clf = clf.fit(X_train[train_idx], y_train[train_idx])
    y_hat = clf.predict(X_train[val_idx])
    scores += [spam_scorer(clf, X_train[val_idx], y_train[val_idx])]

In [7]:
scores

[0.12493981753014016,
 0.1529212447051157,
 0.1695196721311475,
 0.15312622950819665,
 0.19284720980294756,
 0.1682570458685213,
 0.13420753435999344,
 0.17645376717999672,
 0.13294490809736714,
 0.11006382196815023]

## Optuna

In [6]:
import optuna
from optuna_utils import Objective, spam_scorer

In [7]:
single_objective = optuna.create_study(direction="maximize")
single_objective.optimize(
    Objective(X_train, y_train, feature_selectors=[SelectFromModel, SelectKBest, SelectPercentile], mode='single',
              single_scorer=spam_scorer, cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED), use_scaler=False),
    n_trials=10_000,
    timeout=20*60,
    gc_after_trial=True,
    show_progress_bar=True,
    n_jobs=1
)

[32m[I 2023-05-26 20:41:51,977][0m A new study created in memory with name: no-name-0567eade-592b-4bd4-853d-263f9c5abeed[0m
  self._init_valid()


  0%|          | 0/10000 [00:00<?, ?it/s]

[32m[I 2023-05-26 20:41:57,681][0m Trial 0 finished with value: 0.48095516966743795 and parameters: {'classifier': 'XGB', 'xgb_booster': 'gbtree', 'xgb_max_depth': 8, 'xgb_n_estimators': 31, 'xgb_subsample': 0.9538649892582399, 'feature_selector': 'SelectPercentile', 'percentile': 56}. Best is trial 0 with value: 0.48095516966743795.[0m
[32m[I 2023-05-26 20:41:58,158][0m Trial 1 finished with value: 0.8645991471360223 and parameters: {'classifier': 'XGB', 'xgb_booster': 'dart', 'xgb_max_depth': 8, 'xgb_n_estimators': 19, 'xgb_subsample': 0.6252272355359757, 'feature_selector': 'SelectPercentile', 'percentile': 7}. Best is trial 1 with value: 0.8645991471360223.[0m
[32m[I 2023-05-26 20:41:59,934][0m Trial 2 finished with value: 0.9199593002172384 and parameters: {'classifier': 'LGBM', 'lgbm_boosting_type': 'dart', 'lgbm_max_depth': 13, 'lgbm_n_estimators': 121, 'lgbm_subsample': 0.8503993447149194, 'feature_selector': 'SelectFromModel'}. Best is trial 2 with value: 0.91995930021

ValueError: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "E:\Studies\DataScience-1sem\AdvancedMachineLearning\Projects\venv\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "E:\Studies\DataScience-1sem\AdvancedMachineLearning\Projects\venv\lib\site-packages\sklearn\pipeline.py", line 401, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "E:\Studies\DataScience-1sem\AdvancedMachineLearning\Projects\venv\lib\site-packages\sklearn\pipeline.py", line 359, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "E:\Studies\DataScience-1sem\AdvancedMachineLearning\Projects\venv\lib\site-packages\joblib\memory.py", line 349, in __call__
    return self.func(*args, **kwargs)
  File "E:\Studies\DataScience-1sem\AdvancedMachineLearning\Projects\venv\lib\site-packages\sklearn\pipeline.py", line 893, in _fit_transform_one
    res = transformer.fit_transform(X, y, **fit_params)
  File "E:\Studies\DataScience-1sem\AdvancedMachineLearning\Projects\venv\lib\site-packages\sklearn\utils\_set_output.py", line 140, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
  File "E:\Studies\DataScience-1sem\AdvancedMachineLearning\Projects\venv\lib\site-packages\sklearn\base.py", line 881, in fit_transform
    return self.fit(X, y, **fit_params).transform(X)
  File "E:\Studies\DataScience-1sem\AdvancedMachineLearning\Projects\venv\lib\site-packages\sklearn\utils\_set_output.py", line 140, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
  File "E:\Studies\DataScience-1sem\AdvancedMachineLearning\Projects\venv\lib\site-packages\sklearn\feature_selection\_base.py", line 90, in transform
    return self._transform(X)
  File "E:\Studies\DataScience-1sem\AdvancedMachineLearning\Projects\venv\lib\site-packages\sklearn\feature_selection\_base.py", line 94, in _transform
    mask = self.get_support()
  File "E:\Studies\DataScience-1sem\AdvancedMachineLearning\Projects\venv\lib\site-packages\sklearn\feature_selection\_base.py", line 53, in get_support
    mask = self._get_support_mask()
  File "E:\Studies\DataScience-1sem\AdvancedMachineLearning\Projects\venv\lib\site-packages\sklearn\feature_selection\_from_model.py", line 289, in _get_support_mask
    scores = _get_feature_importances(
  File "E:\Studies\DataScience-1sem\AdvancedMachineLearning\Projects\venv\lib\site-packages\sklearn\feature_selection\_base.py", line 228, in _get_feature_importances
    importances = np.linalg.norm(importances, axis=0, ord=norm_order)
  File "<__array_function__ internals>", line 200, in norm
  File "E:\Studies\DataScience-1sem\AdvancedMachineLearning\Projects\venv\lib\site-packages\numpy\linalg\linalg.py", line 2538, in norm
    return add.reduce(abs(x), axis=axis, keepdims=keepdims)
  File "E:\Studies\DataScience-1sem\AdvancedMachineLearning\Projects\venv\lib\site-packages\scipy\sparse\_data.py", line 36, in __abs__
    return self._with_data(abs(self._deduped_data()))
  File "E:\Studies\DataScience-1sem\AdvancedMachineLearning\Projects\venv\lib\site-packages\scipy\sparse\_data.py", line 32, in _deduped_data
    self.sum_duplicates()
  File "E:\Studies\DataScience-1sem\AdvancedMachineLearning\Projects\venv\lib\site-packages\scipy\sparse\_compressed.py", line 1118, in sum_duplicates
    self.sort_indices()
  File "E:\Studies\DataScience-1sem\AdvancedMachineLearning\Projects\venv\lib\site-packages\scipy\sparse\_compressed.py", line 1164, in sort_indices
    _sparsetools.csr_sort_indices(len(self.indptr) - 1, self.indptr,
ValueError: WRITEBACKIFCOPY base is read-only


LinearSVC params:
 Trial 109 finished with value: 0.9421226548518368 and parameters: {'classifier': 'L1_SVC', 'l1_svc_C': 0.05064493683369135, 'feature_selector': 'SelectFromModel'}. Best is trial 109 with value: 0.9421226548518368.

In [9]:
optuna.visualization.plot_optimization_history(single_objective)

In [7]:
from optuna_utils import SpamObjective

In [8]:
single_objective_nb = optuna.create_study(direction="maximize")
single_objective_nb.optimize(
    SpamObjective(X_train, y_train, mode='single', cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)),
    n_trials=10_000,
    timeout=20*60,
    gc_after_trial=True,
    show_progress_bar=True,
    n_jobs=1
)

[32m[I 2023-05-26 21:37:44,141][0m A new study created in memory with name: no-name-dc9e4290-7839-4d9a-975f-729a61e14b81[0m
  self._init_valid()


  0%|          | 0/10000 [00:00<?, ?it/s]

[32m[I 2023-05-26 21:37:49,546][0m Trial 0 finished with value: 0.7805649296508139 and parameters: {'classifier': 'Complement', 'nb_alpha': 1.3892037644691392e-05, 'svc_C': 3.2476545299764963}. Best is trial 0 with value: 0.7805649296508139.[0m
[32m[I 2023-05-26 21:37:49,821][0m Trial 1 finished with value: 0.7611306200418094 and parameters: {'classifier': 'Multi', 'nb_alpha': 5.651748454720223, 'svc_C': 5.712750299552787}. Best is trial 0 with value: 0.7805649296508139.[0m
[32m[I 2023-05-26 21:37:50,121][0m Trial 2 finished with value: 0.7811995558922609 and parameters: {'classifier': 'Complement', 'nb_alpha': 7.376232900495803e-08, 'svc_C': 9.412343835356754}. Best is trial 2 with value: 0.7811995558922609.[0m
[32m[I 2023-05-26 21:37:50,327][0m Trial 3 finished with value: 0.7855512934694081 and parameters: {'classifier': 'Bern', 'nb_alpha': 5.082306072013617e-10, 'svc_C': 1.387575690610146}. Best is trial 3 with value: 0.7855512934694081.[0m
[32m[I 2023-05-26 21:37:50,5

Best naive bayes classifier: Trial 685 finished with value: 0.7932094719980451 and parameters: {'classifier': 'Bern', 'nb_alpha': 0.0016900367347166736, 'svc_C': 8.063260152518698}.
