In [35]:
import json
import torch
import numpy as np

import sys
sys.path.append('../')

import optuna

from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
import lightgbm as lgb
import src.custom_utils as custom_utils

torch.set_default_device('cpu')

In [36]:
# # importing data
# with open('../data/combine_true/X_train.json', 'r') as json_file:
#     X_train = json.load(json_file)
   
# with open('../data/combine_true/y_train.json', 'r') as json_file:
#     y_train = json.load(json_file)
   
# with open('../data/combine_true/X_test.json', 'r') as json_file:
#     X_test = json.load(json_file) 

# with open('../data/combine_true/y_test.json', 'r') as json_file:
#     y_test = json.load(json_file)
    
# # converting to appropriate format (and device)
# X_train = torch.Tensor(X_train).float()
# y_train = torch.Tensor(y_train).long()

# X_test = torch.Tensor(X_test).float()
# y_test = torch.Tensor(y_test).long()

# # getting shapes
# num_features = X_train.shape[1]
# num_classes = 2

In [37]:
# importing data
with open('../data/combine_false_full/X_train.json', 'r') as json_file:
    X_train = json.load(json_file)
   
with open('../data/combine_false_full/y_train.json', 'r') as json_file:
    y_train = json.load(json_file)
   
# with open('../data/combine_false/X_test.json', 'r') as json_file:
#     X_test = json.load(json_file) 

# with open('../data/combine_false_full/y_test.json', 'r') as json_file:
#     y_test = json.load(json_file)
    
# converting to appropriate format (and device)
X_train = torch.Tensor(X_train).float()
y_train = torch.Tensor(y_train).long()

# X_test = torch.Tensor(X_test).float()
# y_test = torch.Tensor(y_test).long()

# getting shapes
num_features = X_train.shape[1]
num_classes = 2

In [38]:
# set device
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [39]:
N_TRIALS = 1

## Random Forest

In [22]:
def objective_RF(trial):
    n_folds = 5
    avg_score = 0
    skf = StratifiedKFold(n_splits=n_folds)
    
    for i, (train_idx, val_idx) in enumerate(skf.split(X_train, y_train)):
        # Selecting fold train and validation
        X_train_fold = X_train[train_idx]
        y_train_fold = y_train[train_idx]
        X_valid_fold = X_train[val_idx]
        y_valid_fold = y_train[val_idx]
        
        # Hyperparameters for Random Forest
        n_estimators = trial.suggest_int('n_estimators', 10, 100)
        max_depth = trial.suggest_int('max_depth', 5, 100)
        min_samples_split = trial.suggest_float('min_samples_split', 0.1, 1.0)
        min_samples_leaf = trial.suggest_float('min_samples_leaf', 0.1, 0.5)
        
        # Create and train Random Forest model
        model = RandomForestClassifier(
            n_estimators=n_estimators,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            random_state=42  # You can adjust this for reproducibility
        )
        model.fit(X_train_fold, y_train_fold)
        
        # Evaluate performance
        y_pred = model.predict(X_valid_fold)
        score = f1_score(y_valid_fold, y_pred, average='binary')  # Assuming binary classification
        
        avg_score += score / n_folds
    
    return avg_score

In [23]:
study = optuna.create_study(direction='maximize')
study.optimize(objective_RF, n_trials=N_TRIALS)

[I 2023-12-02 21:15:08,770] A new study created in memory with name: no-name-62da9d1b-0ab4-42b3-bb91-7ea9c34a2285
[I 2023-12-02 21:15:39,111] Trial 0 finished with value: 0.0 and parameters: {'n_estimators': 58, 'max_depth': 18, 'min_samples_split': 0.22137575729059794, 'min_samples_leaf': 0.17831503384491945}. Best is trial 0 with value: 0.0.


## XGBoost

In [24]:
def objective_XGB(trial):
    n_folds = 5
    avg_score = 0
    skf = StratifiedKFold(n_splits=n_folds)
    
    for i, (train_idx, val_idx) in enumerate(skf.split(X_train, y_train)):
        # Selecting fold train and validation
        X_train_fold = X_train[train_idx]
        y_train_fold = y_train[train_idx]
        X_valid_fold = X_train[val_idx]
        y_valid_fold = y_train[val_idx]
        
        # Hyperparameters for XGBoost
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 10, 100),
            'max_depth': trial.suggest_int('max_depth', 3, 50),
            'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1, log=True),
            'subsample': trial.suggest_float('subsample', 0.1, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 1.0),
            'gamma': trial.suggest_float('gamma', 0.0, 1.0),
            'min_child_weight': trial.suggest_float('min_child_weight', 1, 10),
            'objective': 'binary:logistic',
            'eval_metric': 'logloss',
            'random_state': 42  # You can adjust this for reproducibility
        }
        
        # Create and train XGBoost model
        model = xgb.XGBClassifier(**params)
        model.fit(X_train_fold, y_train_fold, eval_set=[(X_valid_fold, y_valid_fold)], verbose=False, **{'early_stopping_rounds': 10})
        
        # Evaluate performance
        y_pred = model.predict(X_valid_fold)
        score = f1_score(y_valid_fold, y_pred, average='binary')  # Assuming binary classification
        
        avg_score += score / n_folds
    
    return avg_score

In [25]:
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

In [26]:
study = optuna.create_study(direction='maximize')
study.optimize(objective_XGB, n_trials=N_TRIALS)

[I 2023-12-02 21:15:39,259] A new study created in memory with name: no-name-e8031a49-0c9f-4707-a9d7-50eb03b40194
[I 2023-12-02 21:18:28,765] Trial 0 finished with value: 0.11004403111898922 and parameters: {'n_estimators': 58, 'max_depth': 55, 'learning_rate': 0.01822354886884994, 'subsample': 0.49001009793614614, 'colsample_bytree': 0.7757923583948666, 'gamma': 0.14104952479183452, 'min_child_weight': 4.599690775752921}. Best is trial 0 with value: 0.11004403111898922.


## LightGBM

In [None]:
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score

def objective_LGBM(trial):
    n_folds = 5
    avg_score = 0
    skf = StratifiedKFold(n_splits=n_folds)
    
    for i, (train_idx, val_idx) in enumerate(skf.split(X_train, y_train)):
        # Selecting fold train and validation
        X_train_fold = X_train[train_idx]
        y_train_fold = y_train[train_idx]
        X_valid_fold = X_train[val_idx]
        y_valid_fold = y_train[val_idx]
        
        # Hyperparameters for LightGBM
        params = {
            'objective': 'binary',
            'metric': 'binary_logloss',
            'boosting_type': 'gbdt',
            'num_leaves': trial.suggest_int('num_leaves', 10, 100),
            'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1, log=True),
            'feature_fraction': trial.suggest_float('feature_fraction', 0.1, 1.0),
            'bagging_fraction': trial.suggest_float('bagging_fraction', 0.1, 1.0),
            'bagging_freq': trial.suggest_int('bagging_freq', 1, 10),
            'min_child_samples': trial.suggest_int('min_child_samples', 1, 20),
            'random_state': 42,  # You can adjust this for reproducibility
        }
        
        # Create and train LightGBM model
        d_train = lgb.Dataset(X_train_fold, label=y_train_fold)
        d_valid = lgb.Dataset(X_valid_fold, label=y_valid_fold, reference=d_train)
        
        model = lgb.train(
            params,
            d_train,
            valid_sets=[d_valid],
            num_boost_round=1000,  # You can adjust the number of boosting rounds
            early_stopping_rounds=10,  # Early stopping rounds
            verbose_eval=False
        )
        
        # Evaluate performance
        y_pred = model.predict(X_valid_fold, num_iteration=model.best_iteration)
        y_pred_binary = [1 if pred > 0.5 else 0 for pred in y_pred]
        
        score = f1_score(y_valid_fold, y_pred_binary, average='binary')  # Assuming binary classification
        
        avg_score += score / n_folds
    
    return avg_score


In [None]:
study = optuna.create_study(direction='maximize')
study.optimize(objective_LGBM, n_trials=N_TRIALS)

[I 2023-12-02 20:29:35,008] A new study created in memory with name: no-name-239fe368-7d07-47e4-a9b7-eda9c6183171
[W 2023-12-02 20:29:35,077] Trial 0 failed with parameters: {'num_leaves': 64, 'learning_rate': 0.0027627360244080273, 'feature_fraction': 0.1257642049203703, 'bagging_fraction': 0.2868202336170464, 'bagging_freq': 1, 'min_child_samples': 7} because of the following error: TypeError("train() got an unexpected keyword argument 'early_stopping_rounds'").
Traceback (most recent call last):
  File "/home/nicolas/Documents/projects/extractive-summarization/env/lib/python3.11/site-packages/optuna/study/_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "/tmp/ipykernel_27121/1810493956.py", line 35, in objective_LGBM
    model = lgb.train(
            ^^^^^^^^^^
TypeError: train() got an unexpected keyword argument 'early_stopping_rounds'
[W 2023-12-02 20:29:35,079] Trial 0 failed with value None.


TypeError: train() got an unexpected keyword argument 'early_stopping_rounds'

In [48]:
def objective_LogisticRegression(trial):
    n_folds = 5
    avg_score = 0
    skf = StratifiedKFold(n_splits=n_folds)
    
    for i, (train_idx, val_idx) in enumerate(skf.split(X_train, y_train)):
        # Selecting fold train and validation
        X_train_fold = X_train[train_idx]
        y_train_fold = y_train[train_idx]
        X_valid_fold = X_train[val_idx]
        y_valid_fold = y_train[val_idx]
        
        # Hyperparameters for Logistic Regression with Elastic Net
        params = {
            'C': trial.suggest_float('C', 0.001, 10.0),
            'penalty': 'elasticnet',
            'solver': 'saga',  # 'saga' is the appropriate solver for elastic net
            'l1_ratio': trial.suggest_float('l1_ratio', 0.0, 1.0),
            'max_iter': trial.suggest_int('max_iter', 50, 500),
            'random_state': 42  # You can adjust this for reproducibility
        }
        
        # Create and train Logistic Regression model with Elastic Net
        model = LogisticRegression(**params)
        model.fit(X_train_fold, y_train_fold)
        
        # Evaluate performance
        y_pred = model.predict(X_valid_fold)
        score = f1_score(y_valid_fold, y_pred, average='binary')  # Assuming binary classification
        
        avg_score += score / n_folds
    
    return avg_score

In [49]:
study = optuna.create_study(direction='maximize')
study.optimize(objective_LogisticRegression, n_trials=N_TRIALS)

[I 2023-12-02 21:32:37,321] A new study created in memory with name: no-name-b36386ba-764d-4a91-95d7-7a5c49d0a29b
[I 2023-12-02 21:33:25,508] Trial 0 finished with value: 0.4635860658566684 and parameters: {'C': 4.029977224386709, 'l1_ratio': 0.8558355080811554, 'max_iter': 94}. Best is trial 0 with value: 0.4635860658566684.


## Naive Bayes

In [46]:
def objective_GaussianNB(trial):
    n_folds = 5
    avg_score = 0
    skf = StratifiedKFold(n_splits=n_folds)
    
    for i, (train_idx, val_idx) in enumerate(skf.split(X_train, y_train)):
        # Selecting fold train and validation
        X_train_fold = X_train[train_idx]
        y_train_fold = y_train[train_idx]
        X_valid_fold = X_train[val_idx]
        y_valid_fold = y_train[val_idx]
        
        # Create and train Gaussian Naive Bayes model
        model = GaussianNB()
        model.fit(X_train_fold, y_train_fold)
        
        # Evaluate performance
        y_pred = model.predict(X_valid_fold)
        score = f1_score(y_valid_fold, y_pred, average='binary')  # Assuming binary classification
        
        avg_score += score / n_folds
    
    return avg_score

In [47]:
study = optuna.create_study(direction='maximize')
study.optimize(objective_GaussianNB, n_trials=1)

[I 2023-12-02 21:32:21,190] A new study created in memory with name: no-name-47554e60-2b7b-45fa-b4c7-bde2f4bc777b
[I 2023-12-02 21:32:22,524] Trial 0 finished with value: 0.5263261241469139 and parameters: {}. Best is trial 0 with value: 0.5263261241469139.


## SVM

In [31]:
# def objective_SVM(trial):
#     n_folds = 5
#     avg_score = 0
#     skf = StratifiedKFold(n_splits=n_folds)
    
#     for i, (train_idx, val_idx) in enumerate(skf.split(X_train, y_train)):
#         # Selecting fold train and validation
#         X_train_fold = X_train[train_idx]
#         y_train_fold = y_train[train_idx]
#         X_valid_fold = X_train[val_idx]
#         y_valid_fold = y_train[val_idx]
        
#         # Hyperparameters for SVM
#         params = {
#             'C': trial.suggest_float('C', 0.1, 10.0),
#             'kernel': trial.suggest_categorical('kernel', ['linear', 'rbf', 'poly']),
#             'gamma': trial.suggest_float('gamma', 0.1, 1.0, log=True),
#             'random_state': 42  # You can adjust this for reproducibility
#         }
        
#         # Create and train SVM model
#         model = SVC(**params)
#         model.fit(X_train_fold, y_train_fold)
        
#         # Evaluate performance
#         y_pred = model.predict(X_valid_fold)
#         score = f1_score(y_valid_fold, y_pred, average='binary')  # Assuming binary classification
        
#         avg_score += score / n_folds
    
#     return avg_score

In [32]:
# study = optuna.create_study(direction='maximize')
# study.optimize(objective_SVM, n_trials=100)

## KNN

In [33]:
def objective_KNN(trial):
    n_folds = 5
    avg_score = 0
    skf = StratifiedKFold(n_splits=n_folds)
    
    for i, (train_idx, val_idx) in enumerate(skf.split(X_train, y_train)):
        # Selecting fold train and validation
        X_train_fold = X_train[train_idx]
        y_train_fold = y_train[train_idx]
        X_valid_fold = X_train[val_idx]
        y_valid_fold = y_train[val_idx]
        
        # Hyperparameters for KNN
        params = {
            'n_neighbors': trial.suggest_int('n_neighbors', 1, 20),
            'weights': trial.suggest_categorical('weights', ['uniform', 'distance']),
            'p': trial.suggest_int('p', 1, 2),  # 1 for Manhattan distance, 2 for Euclidean distance
        }
        
        # Create and train KNN model
        model = KNeighborsClassifier(**params)
        model.fit(X_train_fold, y_train_fold)
        
        # Evaluate performance
        y_pred = model.predict(X_valid_fold)
        score = f1_score(y_valid_fold, y_pred, average='binary')  # Assuming binary classification
        
        avg_score += score / n_folds
    
    return avg_score

In [34]:
study = optuna.create_study(direction='maximize')
study.optimize(objective_KNN, n_trials=N_TRIALS)

[I 2023-12-02 21:19:33,258] A new study created in memory with name: no-name-0a510f3d-08e8-46b4-bedb-c8b1cd33fcff
[I 2023-12-02 21:20:01,050] Trial 0 finished with value: 0.4429024662302684 and parameters: {'n_neighbors': 17, 'weights': 'uniform', 'p': 2}. Best is trial 0 with value: 0.4429024662302684.
