# Hyperparameter Fine-Tuning

This notebook performs hyperparameter fine-tuning for the following models:

- **Decision Tree**
- **Gaussian Naive Bayes**
- **Logistic Regression (L1 penalty)**
- **Logistic Regression (L2 penalty)**
- **Multi-Layer Perceptron (MLP)**

Hyperparameter search will be performed using **Grid Search**. Although it is computationally expensive, it allows exploring a wide range of reasonable hyperparameter values.  

We will first process the raw data or load a precomputed TF-IDF encoded dataset if it already exists, and then proceed with hyperparameter optimization.


In [1]:
from tfidf_extractor import load_and_vectorize, write_to_file, is_valid_csv
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
import os
import pandas as pd
import joblib

In [2]:
PROCESSED = "../data/processed/"
RAW = "../data/raw/"
RANDOM_STATE = 1
REFIT_METRIC = "f1_macro"
N_JOBS = 4
VERBOSE = 2

## Preprocessing and TF-IDF encoding

In [3]:
raw_files = [
    f"{RAW}train.csv",
    f"{RAW}test.csv",
    f"{RAW}validation.csv"
]

processed_files = [
    f"{PROCESSED}train_features.csv",
    f"{PROCESSED}test_features.csv",
    f"{PROCESSED}validation_features.csv"
]

In [None]:
# Check if processed files are valid; if not, process raw data
need_preprocessing = not all(is_valid_csv(file) for file in processed_files)

if need_preprocessing:
    print("Processing raw data and extracting TF-IDF features...")
    
    # Check if raw files are valid
    if not all(is_valid_csv(file) for file in raw_files):
        print("ERROR: Raw data files appear to be Git LFS pointers.")
        print("Please run: git lfs pull")
        print("Or download the actual data files manually.")
        raise FileNotFoundError("Raw data files are not available. Please pull from Git LFS.")
    
    X_train, y_train, X_test, y_test, X_val, y_val, vectorizer = load_and_vectorize(
        train_path=raw_files[0],
        test_path=raw_files[1],
        validation_path=raw_files[2]
    )

    # Save processed data
    write_to_file(processed_files[0], X_train, y_train)
    write_to_file(processed_files[1], X_test, y_test)
    write_to_file(processed_files[2], X_val, y_val)
    
    print("TF-IDF feature extraction completed. Files saved.")
else:
    print("Processed files already exist and are valid. Skipping preprocessing.")

Processed files already exist and are valid. Skipping preprocessing.


In [None]:
# Load processed data
try:
    df_train = pd.read_csv(processed_files[0])
    
    # Verify the data was loaded correctly
    print(f"Training set shape: {df_train.shape}")
    print(f"\nTraining set columns: {df_train.columns.tolist()[:5]}... (showing first 5)")
    print(f"Training set has 'Label' column: {'Label' in df_train.columns}")
    
    if 'Label' not in df_train.columns:
        raise ValueError("'Label' column not found in processed data. Please regenerate the processed files.")
        
except Exception as e:
    print(f"Error loading processed data: {e}")
    print("\nIf you see Git LFS pointer errors, you need to:")
    print("1. Install Git LFS: brew install git-lfs (on macOS) or visit https://git-lfs.github.com/")
    print("2. Initialize Git LFS: git lfs install")
    print("3. Pull the actual files: git lfs pull")
    print("\nAlternatively, run the preprocessing cell above to regenerate the processed files.")
    raise

Training set shape: (50587, 5001)
Test set shape: (6324, 5001)
Validation set shape: (6324, 5001)

Training set columns: ['0', '1', '2', '3', '4']... (showing first 5)
Training set has 'Label' column: True


### K-fold cross validation

In [6]:
# K-fold cross validation on the training set
X = df_train.drop(columns=['Label'])
y = df_train['Label']

n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=RANDOM_STATE)

In [7]:
X.shape

(50587, 5000)

In [8]:
scoring = {
    'accuracy': 'accuracy',
    'precision_macro': 'precision_macro',
    'recall_macro': 'recall_macro',
    'f1_macro': 'f1_macro'
}

## Hyperparemter tuning

### Decision Tree
To extract the best performance out of the decision tree model we are going to tune the following hyperparameters:
* `min_sample_split` - minimum number of observations to split a node.
* `max_depth` - max depth of the tree

In [9]:
# Optimized grid: removed max_depth=None (very slow with 5000 features)
# Added max_features to speed up splits (only evaluates sqrt of features per split)
param_grid = {
    'max_depth': [10, 20, 30, 50, 100, 1000, 5000],  # Removed None and very deep trees
    'min_samples_split': [2, 5, 10, 20, 50],
    'max_features': ['sqrt', 'log2']  # Limits features evaluated per split (much faster!)
}

grid = GridSearchCV(
    DecisionTreeClassifier(random_state=RANDOM_STATE),
    param_grid=param_grid,
    scoring=scoring,
    refit=REFIT_METRIC,
    cv=kf,
    return_train_score=False,
    n_jobs=N_JOBS,
    verbose=VERBOSE
)

print("Starting Decision Tree grid search (optimized for speed)...")
grid.fit(X, y)

Starting Decision Tree grid search (optimized for speed)...
Fitting 5 folds for each of 70 candidates, totalling 350 fits
[CV] END max_depth=10, max_features=sqrt, min_samples_split=2; total time=   5.0s
[CV] END max_depth=10, max_features=sqrt, min_samples_split=2; total time=   5.1s
[CV] END max_depth=10, max_features=sqrt, min_samples_split=2; total time=   5.1s
[CV] END max_depth=10, max_features=sqrt, min_samples_split=2; total time=   5.1s
[CV] END max_depth=10, max_features=sqrt, min_samples_split=2; total time=   0.9s
[CV] END max_depth=10, max_features=sqrt, min_samples_split=5; total time=   1.1s
[CV] END max_depth=10, max_features=sqrt, min_samples_split=5; total time=   1.1s
[CV] END max_depth=10, max_features=sqrt, min_samples_split=5; total time=   1.1s
[CV] END max_depth=10, max_features=sqrt, min_samples_split=5; total time=   1.1s
[CV] END max_depth=10, max_features=sqrt, min_samples_split=5; total time=   1.0s
[CV] END max_depth=10, max_features=sqrt, min_samples_spli

0,1,2
,estimator,DecisionTreeC...andom_state=1)
,param_grid,"{'max_depth': [10, 20, ...], 'max_features': ['sqrt', 'log2'], 'min_samples_split': [2, 5, ...]}"
,scoring,"{'accuracy': 'accuracy', 'f1_macro': 'f1_macro', 'precision_macro': 'precision_macro', 'recall_macro': 'recall_macro'}"
,n_jobs,4
,refit,'f1_macro'
,cv,KFold(n_split... shuffle=True)
,verbose,2
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,50
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,random_state,1
,max_leaf_nodes,
,min_impurity_decrease,0.0


### Saving the model

In [10]:
# Save the best model
os.makedirs("../models", exist_ok=True)
best = grid.best_estimator_
joblib.dump(best, "../models/decision_tree.pkl")

['../models/decision_tree.pkl']

### Gaussian Naive Bayes
Gaussian Naive Bayes hyperparameters to tune:
* `var_smoothing` - portion of the largest variance of all features that is added to variances for calculation stability


In [11]:
param_grid_nb = {
    'var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4]
}

grid_nb = GridSearchCV(
    GaussianNB(),
    param_grid=param_grid_nb,
    scoring=scoring,
    refit=REFIT_METRIC,
    cv=kf,
    return_train_score=False,
    n_jobs=N_JOBS,
    verbose=VERBOSE
)

grid_nb.fit(X, y)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV] END ................................var_smoothing=1e-09; total time=   4.2s
[CV] END ................................var_smoothing=1e-09; total time=   4.3s
[CV] END ................................var_smoothing=1e-09; total time=   4.3s
[CV] END ................................var_smoothing=1e-09; total time=   4.3s
[CV] END ................................var_smoothing=1e-09; total time=   3.0s
[CV] END ................................var_smoothing=1e-08; total time=   3.2s
[CV] END ................................var_smoothing=1e-08; total time=   3.2s
[CV] END ................................var_smoothing=1e-08; total time=   3.2s
[CV] END ................................var_smoothing=1e-08; total time=   2.3s
[CV] END ................................var_smoothing=1e-07; total time=   2.7s
[CV] END ................................var_smoothing=1e-08; total time=   2.7s
[CV] END ................................var_smoo

0,1,2
,estimator,GaussianNB()
,param_grid,"{'var_smoothing': [1e-09, 1e-08, ...]}"
,scoring,"{'accuracy': 'accuracy', 'f1_macro': 'f1_macro', 'precision_macro': 'precision_macro', 'recall_macro': 'recall_macro'}"
,n_jobs,4
,refit,'f1_macro'
,cv,KFold(n_split... shuffle=True)
,verbose,2
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,priors,
,var_smoothing,1e-08


### Saving the model

In [12]:
# Save the best model
os.makedirs("../models", exist_ok=True)
best = grid_nb.best_estimator_
joblib.dump(best, "../models/gaussian_nb.pkl")

['../models/gaussian_nb.pkl']

### Logistic Regression with L2 Regularization
Logistic Regression L2 hyperparameters to tune:
* `C` - inverse of regularization strength (smaller values specify stronger regularization)


In [13]:
param_grid_lr_l2 = {
    'C': [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]
}

grid_lr_l2 = GridSearchCV(
    LogisticRegression(penalty='l2', max_iter=1000, random_state=RANDOM_STATE, solver='lbfgs'),
    param_grid=param_grid_lr_l2,
    scoring=scoring,
    refit=REFIT_METRIC,
    cv=kf,
    return_train_score=False,
    n_jobs=N_JOBS,
    verbose=VERBOSE
)

grid_lr_l2.fit(X, y)


Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV] END ............................................C=0.001; total time=   2.9s
[CV] END ............................................C=0.001; total time=   3.0s
[CV] END ............................................C=0.001; total time=   3.0s
[CV] END ............................................C=0.001; total time=   3.1s
[CV] END ............................................C=0.001; total time=   2.0s
[CV] END .............................................C=0.01; total time=   2.8s
[CV] END .............................................C=0.01; total time=   2.9s
[CV] END .............................................C=0.01; total time=   3.0s
[CV] END .............................................C=0.01; total time=   2.5s
[CV] END .............................................C=0.01; total time=   2.3s
[CV] END ..............................................C=0.1; total time=   2.9s
[CV] END ........................................

0,1,2
,estimator,LogisticRegre...andom_state=1)
,param_grid,"{'C': [0.001, 0.01, ...]}"
,scoring,"{'accuracy': 'accuracy', 'f1_macro': 'f1_macro', 'precision_macro': 'precision_macro', 'recall_macro': 'recall_macro'}"
,n_jobs,4
,refit,'f1_macro'
,cv,KFold(n_split... shuffle=True)
,verbose,2
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,10.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,1
,solver,'lbfgs'
,max_iter,1000


### Saving the model

In [14]:
# Save the best model
os.makedirs("../models", exist_ok=True)
best = grid_lr_l2.best_estimator_
joblib.dump(best, "../models/lr_l2.pkl")

['../models/lr_l2.pkl']

### Logistic Regression with L1 Regularization
Logistic Regression L1 hyperparameters to tune:
* `C` - inverse of regularization strength (smaller values specify stronger regularization)


In [15]:
param_grid_lr_l1 = {
    'C': [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]
}

grid_lr_l1 = GridSearchCV(
    LogisticRegression(penalty='l1', max_iter=1000, random_state=RANDOM_STATE, solver='liblinear'),
    param_grid=param_grid_lr_l1,
    scoring=scoring,
    refit=REFIT_METRIC,
    cv=kf,
    return_train_score=False,
    n_jobs=N_JOBS,
    verbose=VERBOSE
)

grid_lr_l1.fit(X, y)


Fitting 5 folds for each of 6 candidates, totalling 30 fits


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


[CV] END ............................................C=0.001; total time=   3.1s
[CV] END ............................................C=0.001; total time=   3.2s
[CV] END ............................................C=0.001; total time=   3.2s
[CV] END ............................................C=0.001; total time=   3.2s


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


[CV] END ............................................C=0.001; total time=   2.2s
[CV] END .............................................C=0.01; total time=   2.3s
[CV] END .............................................C=0.01; total time=   2.4s
[CV] END .............................................C=0.01; total time=   2.4s
[CV] END .............................................C=0.01; total time=   2.2s
[CV] END .............................................C=0.01; total time=   2.3s
[CV] END ..............................................C=0.1; total time=   2.5s
[CV] END ..............................................C=0.1; total time=   2.5s
[CV] END ..............................................C=0.1; total time=   2.3s
[CV] END ..............................................C=0.1; total time=   2.2s
[CV] END ..............................................C=0.1; total time=   2.3s
[CV] END ..............................................C=1.0; total time=   2.8s
[CV] END ...................

0,1,2
,estimator,LogisticRegre...r='liblinear')
,param_grid,"{'C': [0.001, 0.01, ...]}"
,scoring,"{'accuracy': 'accuracy', 'f1_macro': 'f1_macro', 'precision_macro': 'precision_macro', 'recall_macro': 'recall_macro'}"
,n_jobs,4
,refit,'f1_macro'
,cv,KFold(n_split... shuffle=True)
,verbose,2
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,penalty,'l1'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,1
,solver,'liblinear'
,max_iter,1000


### Saving the model

In [16]:
# Save the best model
os.makedirs("../models", exist_ok=True)
best = grid_lr_l1.best_estimator_
joblib.dump(best, "../models/lr_l1.pkl")

['../models/lr_l1.pkl']

### Multi-Layer Perceptron (MLP)
MLP hyperparameters to tune:
* `hidden_layer_sizes` - number of neurons in each hidden layer
* `alpha` - L2 penalty (regularization) parameter
* `learning_rate_init` - initial learning rate


In [18]:
param_grid_mlp = {
    'hidden_layer_sizes': [(32,), (64,), (128,), (64, 32), (128, 64)],
    'alpha': [0.0001, 0.001, 0.01, 0.1],
    'learning_rate_init': [0.001, 0.01, 0.1]
}

grid_mlp = GridSearchCV(
    MLPClassifier(max_iter=500, random_state=RANDOM_STATE, early_stopping=True, validation_fraction=0.1),
    param_grid=param_grid_mlp,
    scoring=scoring,
    refit=REFIT_METRIC,
    cv=kf,
    return_train_score=False,
    n_jobs=N_JOBS,
    verbose=VERBOSE
)

grid_mlp.fit(X, y)

Fitting 5 folds for each of 60 candidates, totalling 300 fits
[CV] END alpha=0.0001, hidden_layer_sizes=(32,), learning_rate_init=0.001; total time=  12.9s
[CV] END alpha=0.0001, hidden_layer_sizes=(32,), learning_rate_init=0.001; total time=  13.5s
[CV] END alpha=0.0001, hidden_layer_sizes=(32,), learning_rate_init=0.001; total time=  14.0s
[CV] END alpha=0.0001, hidden_layer_sizes=(32,), learning_rate_init=0.001; total time=  14.6s
[CV] END alpha=0.0001, hidden_layer_sizes=(32,), learning_rate_init=0.001; total time=  12.8s
[CV] END alpha=0.0001, hidden_layer_sizes=(32,), learning_rate_init=0.01; total time=  11.9s
[CV] END alpha=0.0001, hidden_layer_sizes=(32,), learning_rate_init=0.01; total time=  12.5s
[CV] END alpha=0.0001, hidden_layer_sizes=(32,), learning_rate_init=0.01; total time=  15.1s
[CV] END alpha=0.0001, hidden_layer_sizes=(32,), learning_rate_init=0.01; total time=  11.6s
[CV] END alpha=0.0001, hidden_layer_sizes=(32,), learning_rate_init=0.1; total time=  11.5s
[CV]

0,1,2
,estimator,MLPClassifier...andom_state=1)
,param_grid,"{'alpha': [0.0001, 0.001, ...], 'hidden_layer_sizes': [(32,), (64,), ...], 'learning_rate_init': [0.001, 0.01, ...]}"
,scoring,"{'accuracy': 'accuracy', 'f1_macro': 'f1_macro', 'precision_macro': 'precision_macro', 'recall_macro': 'recall_macro'}"
,n_jobs,4
,refit,'f1_macro'
,cv,KFold(n_split... shuffle=True)
,verbose,2
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,hidden_layer_sizes,"(128, ...)"
,activation,'relu'
,solver,'adam'
,alpha,0.01
,batch_size,'auto'
,learning_rate,'constant'
,learning_rate_init,0.001
,power_t,0.5
,max_iter,500
,shuffle,True


### Saving the model

In [19]:
# Save the best model
os.makedirs("../models", exist_ok=True)
best = grid_mlp.best_estimator_
joblib.dump(best, "../models/mlp.pkl")


['../models/mlp.pkl']