In [2]:
import json
import torch
import custom_utils
import numpy as np

import optuna

from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
import lightgbm as lgb

torch.set_default_device('cpu')

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# importing data
with open('data/X_train.json', 'r') as json_file:
    X_train = json.load(json_file)
   
with open('data/y_train.json', 'r') as json_file:
    y_train = json.load(json_file)
   
with open('data/X_test.json', 'r') as json_file:
    X_test = json.load(json_file) 

with open('data/y_test.json', 'r') as json_file:
    y_test = json.load(json_file)
    
# converting to appropriate format (and device)
X_train = torch.Tensor(X_train).float()
y_train = torch.Tensor(y_train).long()

X_test = torch.Tensor(X_test).float()
y_test = torch.Tensor(y_test).long()

# getting shapes
num_features = X_train.shape[1]
num_classes = 2
# edge_dim = edge_attr.shape[1]

In [4]:
# set device
device = 'cuda' if torch.cuda.is_available() else 'cpu'

## Random Forest

In [None]:
def objective_RF(trial):
    n_folds = 5
    avg_score = 0
    skf = StratifiedKFold(n_splits=n_folds)
    
    for i, (train_idx, val_idx) in enumerate(skf.split(X_train, y_train)):
        # Selecting fold train and validation
        X_train_fold = X_train[train_idx]
        y_train_fold = y_train[train_idx]
        X_valid_fold = X_train[val_idx]
        y_valid_fold = y_train[val_idx]
        
        # Hyperparameters for Random Forest
        n_estimators = trial.suggest_int('n_estimators', 10, 100)
        max_depth = trial.suggest_int('max_depth', 5, 30)
        min_samples_split = trial.suggest_float('min_samples_split', 0.1, 1.0)
        min_samples_leaf = trial.suggest_float('min_samples_leaf', 0.1, 0.5)
        
        # Create and train Random Forest model
        model = RandomForestClassifier(
            n_estimators=n_estimators,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            random_state=42  # You can adjust this for reproducibility
        )
        model.fit(X_train_fold, y_train_fold)
        
        # Evaluate performance
        y_pred = model.predict(X_valid_fold)
        score = f1_score(y_valid_fold, y_pred, average='binary')  # Assuming binary classification
        
        avg_score += score / n_folds
    
    return avg_score

In [None]:
study = optuna.create_study(direction='maximize')
study.optimize(objective_RF, n_trials=100)

[I 2023-12-02 19:28:15,343] A new study created in memory with name: no-name-dc4edafe-09ae-4101-bd12-4a3eb14b8173
[W 2023-12-02 19:29:22,408] Trial 0 failed with parameters: {'n_estimators': 37, 'max_depth': 20, 'min_samples_split': 0.30510790372075314, 'min_samples_leaf': 0.16862571394463444, 'max_features': None, 'bootstrap': False, 'criterion': 'gini'} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/home/nicolas/Documents/projects/extractive-summarization/env/lib/python3.11/site-packages/optuna/study/_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "/tmp/ipykernel_27121/3180067052.py", line 33, in objective
    model.fit(X_train_fold, y_train_fold)
  File "/home/nicolas/Documents/projects/extractive-summarization/env/lib/python3.11/site-packages/sklearn/base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^

KeyboardInterrupt: 

## XGBoost

In [None]:
def objective_XGB(trial):
    n_folds = 5
    avg_score = 0
    skf = StratifiedKFold(n_splits=n_folds)
    
    for i, (train_idx, val_idx) in enumerate(skf.split(X_train, y_train)):
        # Selecting fold train and validation
        X_train_fold = X_train[train_idx]
        y_train_fold = y_train[train_idx]
        X_valid_fold = X_train[val_idx]
        y_valid_fold = y_train[val_idx]
        
        # Hyperparameters for XGBoost
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 10, 100),
            'max_depth': trial.suggest_int('max_depth', 3, 80),
            'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1, log=True),
            'subsample': trial.suggest_float('subsample', 0.1, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 1.0),
            'gamma': trial.suggest_float('gamma', 0.0, 1.0),
            'min_child_weight': trial.suggest_float('min_child_weight', 1, 10),
            'objective': 'binary:logistic',
            'eval_metric': 'logloss',
            'random_state': 42  # You can adjust this for reproducibility
        }
        
        # Create and train XGBoost model
        model = xgb.XGBClassifier(**params)
        model.fit(X_train_fold, y_train_fold, eval_set=[(X_valid_fold, y_valid_fold)], verbose=False, **{'early_stopping_rounds': 10})
        
        # Evaluate performance
        y_pred = model.predict(X_valid_fold)
        score = f1_score(y_valid_fold, y_pred, average='binary')  # Assuming binary classification
        
        avg_score += score / n_folds
    
    return avg_score

In [None]:
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

In [None]:
study = optuna.create_study(direction='maximize')
study.optimize(objective_XGB, n_trials=100)

[I 2023-12-02 20:28:28,471] A new study created in memory with name: no-name-ea2c7139-9739-40a9-8bf5-dfa6848c8894
[W 2023-12-02 20:29:31,566] Trial 0 failed with parameters: {'n_estimators': 23, 'max_depth': 34, 'learning_rate': 0.0640928466508549, 'subsample': 0.5847614924730823, 'colsample_bytree': 0.20144257488842598, 'gamma': 0.43150780044748693, 'min_child_weight': 4.6316787721365795} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/home/nicolas/Documents/projects/extractive-summarization/env/lib/python3.11/site-packages/optuna/study/_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "/tmp/ipykernel_27121/824403819.py", line 29, in objective_XGB
    model.fit(X_train_fold, y_train_fold, eval_set=[(X_valid_fold, y_valid_fold)], verbose=False, **{'early_stopping_rounds': 10})
  File "/home/nicolas/Documents/projects/extractive-summarization/env/lib/python3.11/site-packages/

KeyboardInterrupt: 

## LightGBM

In [None]:
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score

def objective_LGBM(trial):
    n_folds = 5
    avg_score = 0
    skf = StratifiedKFold(n_splits=n_folds)
    
    for i, (train_idx, val_idx) in enumerate(skf.split(X_train, y_train)):
        # Selecting fold train and validation
        X_train_fold = X_train[train_idx]
        y_train_fold = y_train[train_idx]
        X_valid_fold = X_train[val_idx]
        y_valid_fold = y_train[val_idx]
        
        # Hyperparameters for LightGBM
        params = {
            'objective': 'binary',
            'metric': 'binary_logloss',
            'boosting_type': 'gbdt',
            'num_leaves': trial.suggest_int('num_leaves', 10, 100),
            'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1, log=True),
            'feature_fraction': trial.suggest_float('feature_fraction', 0.1, 1.0),
            'bagging_fraction': trial.suggest_float('bagging_fraction', 0.1, 1.0),
            'bagging_freq': trial.suggest_int('bagging_freq', 1, 10),
            'min_child_samples': trial.suggest_int('min_child_samples', 1, 20),
            'random_state': 42,  # You can adjust this for reproducibility
        }
        
        # Create and train LightGBM model
        d_train = lgb.Dataset(X_train_fold, label=y_train_fold)
        d_valid = lgb.Dataset(X_valid_fold, label=y_valid_fold, reference=d_train)
        
        model = lgb.train(
            params,
            d_train,
            valid_sets=[d_valid],
            num_boost_round=1000,  # You can adjust the number of boosting rounds
            early_stopping_rounds=10,  # Early stopping rounds
            verbose_eval=False
        )
        
        # Evaluate performance
        y_pred = model.predict(X_valid_fold, num_iteration=model.best_iteration)
        y_pred_binary = [1 if pred > 0.5 else 0 for pred in y_pred]
        
        score = f1_score(y_valid_fold, y_pred_binary, average='binary')  # Assuming binary classification
        
        avg_score += score / n_folds
    
    return avg_score


In [None]:
study = optuna.create_study(direction='maximize')
study.optimize(objective_LGBM, n_trials=100)

[I 2023-12-02 20:29:35,008] A new study created in memory with name: no-name-239fe368-7d07-47e4-a9b7-eda9c6183171
[W 2023-12-02 20:29:35,077] Trial 0 failed with parameters: {'num_leaves': 64, 'learning_rate': 0.0027627360244080273, 'feature_fraction': 0.1257642049203703, 'bagging_fraction': 0.2868202336170464, 'bagging_freq': 1, 'min_child_samples': 7} because of the following error: TypeError("train() got an unexpected keyword argument 'early_stopping_rounds'").
Traceback (most recent call last):
  File "/home/nicolas/Documents/projects/extractive-summarization/env/lib/python3.11/site-packages/optuna/study/_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "/tmp/ipykernel_27121/1810493956.py", line 35, in objective_LGBM
    model = lgb.train(
            ^^^^^^^^^^
TypeError: train() got an unexpected keyword argument 'early_stopping_rounds'
[W 2023-12-02 20:29:35,079] Trial 0 failed with value None.


TypeError: train() got an unexpected keyword argument 'early_stopping_rounds'

In [None]:
def objective_LogisticRegression(trial):
    n_folds = 5
    avg_score = 0
    skf = StratifiedKFold(n_splits=n_folds)
    
    for i, (train_idx, val_idx) in enumerate(skf.split(X_train, y_train)):
        # Selecting fold train and validation
        X_train_fold = X_train[train_idx]
        y_train_fold = y_train[train_idx]
        X_valid_fold = X_train[val_idx]
        y_valid_fold = y_train[val_idx]
        
        # Hyperparameters for Logistic Regression with Elastic Net
        params = {
            'C': trial.suggest_float('C', 0.001, 10.0),
            'penalty': 'elasticnet',
            'solver': 'saga',  # 'saga' is the appropriate solver for elastic net
            'l1_ratio': trial.suggest_float('l1_ratio', 0.0, 1.0),
            'max_iter': trial.suggest_int('max_iter', 50, 500),
            'random_state': 42  # You can adjust this for reproducibility
        }
        
        # Create and train Logistic Regression model with Elastic Net
        model = LogisticRegression(**params)
        model.fit(X_train_fold, y_train_fold)
        
        # Evaluate performance
        y_pred = model.predict(X_valid_fold)
        score = f1_score(y_valid_fold, y_pred, average='binary')  # Assuming binary classification
        
        avg_score += score / n_folds
    
    return avg_score


In [None]:
study = optuna.create_study(direction='maximize')
study.optimize(objective_LogisticRegression, n_trials=100)

[I 2023-12-02 20:31:36,385] A new study created in memory with name: no-name-b243c6cc-38bf-47e8-854a-4a92e3e3b713
[I 2023-12-02 20:32:30,952] Trial 0 finished with value: 0.4996319763979803 and parameters: {'C': 8.120712416092816, 'l1_ratio': 0.8476294877524637, 'max_iter': 101}. Best is trial 0 with value: 0.4996319763979803.
[I 2023-12-02 20:33:01,602] Trial 1 finished with value: 0.3486029437283562 and parameters: {'C': 0.03501949344044405, 'l1_ratio': 0.5145918244025874, 'max_iter': 319}. Best is trial 0 with value: 0.4996319763979803.
[I 2023-12-02 20:34:02,102] Trial 2 finished with value: 0.4995666266061958 and parameters: {'C': 7.07888009267358, 'l1_ratio': 0.8010864882548108, 'max_iter': 203}. Best is trial 0 with value: 0.4996319763979803.
[I 2023-12-02 20:34:44,592] Trial 3 finished with value: 0.4970186077927708 and parameters: {'C': 2.5840796030938376, 'l1_ratio': 0.3609682692048404, 'max_iter': 416}. Best is trial 0 with value: 0.4996319763979803.
[I 2023-12-02 20:35:48,1

KeyboardInterrupt: 

## Naive Bayes

In [None]:
def objective_GaussianNB(trial):
    n_folds = 5
    avg_score = 0
    skf = StratifiedKFold(n_splits=n_folds)
    
    for i, (train_idx, val_idx) in enumerate(skf.split(X_train, y_train)):
        # Selecting fold train and validation
        X_train_fold = X_train[train_idx]
        y_train_fold = y_train[train_idx]
        X_valid_fold = X_train[val_idx]
        y_valid_fold = y_train[val_idx]
        
        # Create and train Gaussian Naive Bayes model
        model = GaussianNB()
        model.fit(X_train_fold, y_train_fold)
        
        # Evaluate performance
        y_pred = model.predict(X_valid_fold)
        score = f1_score(y_valid_fold, y_pred, average='binary')  # Assuming binary classification
        
        avg_score += score / n_folds
    
    return avg_score

In [None]:
study = optuna.create_study(direction='maximize')
study.optimize(objective_GaussianNB, n_trials=1)

[I 2023-12-02 20:41:45,729] A new study created in memory with name: no-name-5bc6c8d3-a93c-4051-8fdd-65e037bbc709
[I 2023-12-02 20:41:46,897] Trial 0 finished with value: 0.5512340509047009 and parameters: {}. Best is trial 0 with value: 0.5512340509047009.


## SVM

In [None]:
# def objective_SVM(trial):
#     n_folds = 5
#     avg_score = 0
#     skf = StratifiedKFold(n_splits=n_folds)
    
#     for i, (train_idx, val_idx) in enumerate(skf.split(X_train, y_train)):
#         # Selecting fold train and validation
#         X_train_fold = X_train[train_idx]
#         y_train_fold = y_train[train_idx]
#         X_valid_fold = X_train[val_idx]
#         y_valid_fold = y_train[val_idx]
        
#         # Hyperparameters for SVM
#         params = {
#             'C': trial.suggest_float('C', 0.1, 10.0),
#             'kernel': trial.suggest_categorical('kernel', ['linear', 'rbf', 'poly']),
#             'gamma': trial.suggest_float('gamma', 0.1, 1.0, log=True),
#             'random_state': 42  # You can adjust this for reproducibility
#         }
        
#         # Create and train SVM model
#         model = SVC(**params)
#         model.fit(X_train_fold, y_train_fold)
        
#         # Evaluate performance
#         y_pred = model.predict(X_valid_fold)
#         score = f1_score(y_valid_fold, y_pred, average='binary')  # Assuming binary classification
        
#         avg_score += score / n_folds
    
#     return avg_score

In [84]:
# study = optuna.create_study(direction='maximize')
# study.optimize(objective_SVM, n_trials=100)

[I 2023-12-02 20:43:26,897] A new study created in memory with name: no-name-ee213695-d95a-4d2c-ba4c-481a1ce9919f


## KNN

In [5]:
def objective_KNN(trial):
    n_folds = 5
    avg_score = 0
    skf = StratifiedKFold(n_splits=n_folds)
    
    for i, (train_idx, val_idx) in enumerate(skf.split(X_train, y_train)):
        # Selecting fold train and validation
        X_train_fold = X_train[train_idx]
        y_train_fold = y_train[train_idx]
        X_valid_fold = X_train[val_idx]
        y_valid_fold = y_train[val_idx]
        
        # Hyperparameters for KNN
        params = {
            'n_neighbors': trial.suggest_int('n_neighbors', 1, 20),
            'weights': trial.suggest_categorical('weights', ['uniform', 'distance']),
            'p': trial.suggest_int('p', 1, 2),  # 1 for Manhattan distance, 2 for Euclidean distance
        }
        
        # Create and train KNN model
        model = KNeighborsClassifier(**params)
        model.fit(X_train_fold, y_train_fold)
        
        # Evaluate performance
        y_pred = model.predict(X_valid_fold)
        score = f1_score(y_valid_fold, y_pred, average='binary')  # Assuming binary classification
        
        avg_score += score / n_folds
    
    return avg_score


In [6]:
study = optuna.create_study(direction='maximize')
study.optimize(objective_KNN, n_trials=100)

[I 2023-12-02 20:53:09,600] A new study created in memory with name: no-name-39ed0243-0c08-4cc3-8757-dffe30958894
[I 2023-12-02 20:53:43,127] Trial 0 finished with value: 0.32015148253715203 and parameters: {'n_neighbors': 2, 'weights': 'uniform', 'p': 2}. Best is trial 0 with value: 0.32015148253715203.
