In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm.notebook import tqdm
import json
import re
import shap
import lightgbm as lgb
import itertools
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score,make_scorer

from utils.utils import *
from utils.pre_processing import *
from utils.nn import *
from utils.nn_utils import *
from utils.ml_utils import *

In [5]:
# Load the data through the project helper `load_data` and write the result
# to 'final_df.csv' in the working directory.
# NOTE(review): presumably `load_data` returns a pandas DataFrame (it exposes
# `.to_csv`) — confirm against utils.
df = load_data(
    main_df='data/df_tags',
    reviews_df='data/final_reviews',
)
df.to_csv('final_df.csv')

In [2]:
# Ask the project helper `create_dataset` for all three sampling variants.
# The training cell iterates `datasets.items()`, so a mapping keyed by
# sampling-technique name is expected here.
datasets = create_dataset(
    oversampling=True,
    undersampling=True,
    no_sampling=True,
)

In [3]:
# NOTE(review): this whole cell is disabled — nothing below executes. It holds a
# reduced hyperparameter grid (including an XGBoost entry); the active `models`
# definition lives in the following cell.
# models = {
#     'LogisticRegression': (LogisticRegression(), {
#         'C': [1],
#         'penalty': ['l2'],
#         'solver': ['liblinear']
#     }),
#     'RandomForest': (RandomForestClassifier(), {
#         'n_estimators': [200],
#         'max_depth': [20],
#         'min_samples_split': [5]
#     }),
#     'XGBoost': (XGBClassifier(use_label_encoder=False, eval_metric='logloss'), {
#         'n_estimators': [100, 200],
#         'max_depth': [3, 6],
#         'learning_rate': [0.01, 0.1]
#     }),
#     'LightGBM': (LGBMClassifier(), {
#         'n_estimators': [100, 200],
#         'max_depth': [-1, 10],
#         'learning_rate': [0.01, 0.1],
#         'num_leaves': [25,50,100],
#     }),
# }

In [4]:
# Hyperparameter search space, one grid per model family.
_logreg_grid = {
    'C': [0.1, 1, 10, 100],
    'penalty': ['l2'],
    'solver': ['liblinear', 'saga'],
    'max_iter': [100, 200, 500],
}

_random_forest_grid = {
    'n_estimators': [100, 200, 500],
    'max_depth': [5, 10, 20, 50],
    'min_samples_split': [5, 10],
    'min_samples_leaf': [2, 4, 6],
}

# XGBoost is currently disabled; its grid is kept here for reference.
# _xgboost_grid = {
#     'n_estimators': [100, 200, 500],
#     'max_depth': [3, 6, 10],
#     'learning_rate': [0.001, 0.01, 0.1],
#     'subsample': [0.6, 0.8, 1.0],
#     'colsample_bytree': [0.6, 0.8, 1.0],
# }

_lightgbm_grid = {
    'n_estimators': [200, 500, 750],
    'max_depth': [5, 10, 20],
    'learning_rate': [0.001, 0.01, 0.1],
    'num_leaves': [10, 20, 50],
}

# Maps model name -> (unfitted estimator, parameter grid). The training cell
# reads the estimator at index 0 and the grid at index 1 of each tuple.
models = {
    'LogisticRegression': (LogisticRegression(), _logreg_grid),
    'RandomForest': (RandomForestClassifier(), _random_forest_grid),
    # 'XGBoost': (XGBClassifier(use_label_encoder=False, eval_metric='logloss'), _xgboost_grid),
    'LightGBM': (LGBMClassifier(), _lightgbm_grid),
}

In [5]:
# Per-sampling-technique results, keyed by dataset name; filled in by the loop below.
best_models_dict = {}

for dataset_name, dataset in datasets.items():
    if dataset_name != 'no_sampling':
        # Resampled datasets use the search grids unchanged.
        new_models = models
    else:
        # Unbalanced case: add a class-weight entry to every model's grid.
        new_models = {}
        for model_key, (estimator, param_grid) in models.items():
            # LogisticRegression / RandomForest get 'balanced'; every other
            # model gets the dataset's scale_pos_weight truncated to an int.
            balancing_value = (
                'balanced'
                if model_key in ('LogisticRegression', 'RandomForest')
                else int(dataset.get('scale_pos_weight'))
            )
            # `weight_variable` (project-level mapping) supplies the name of
            # the weighting hyperparameter for this model.
            new_models[model_key] = (
                estimator,
                {**param_grid, weight_variable.get(model_key): [balancing_value]},
            )

    x_train = dataset['X_train']
    y_train = dataset['y_train']
    x_val = dataset['X_val']
    y_val = dataset['y_val']

    dataset_results = run_ml_models(new_models, x_train, y_train, x_val, y_val, n_jobs=-1)
    best_models_dict[dataset_name] = dataset_results

    # One summary row per model evaluated on this dataset.
    rows = [
        {
            'Model': model_name,
            'Sampling Technique': dataset_name,
            'Hyperparameters': str(model_info['params']),
            'Test_Score': model_info['test_score'],
            'Val_Score': model_info['val_score'],
        }
        for model_name, model_info in dataset_results.items()
    ]

    # Persist this dataset's results to its own CSV file.
    df = pd.DataFrame(rows)
    df.to_csv(f'ml_{dataset_name}_model_results.csv')

LogisticRegression: Best F1 Score = 0.8659892041207524, Best Params = {'C': 100, 'max_iter': 100, 'penalty': 'l2', 'solver': 'liblinear'}
LogisticRegression on validation set: F1 Score = 0.12669683257918551
RandomForest: Best F1 Score = 0.972917829259161, Best Params = {'max_depth': 50, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 500}
RandomForest on validation set: F1 Score = 0.02857142857142857
[LightGBM] [Info] Number of positive: 2705, number of negative: 2705
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007191 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 75579
[LightGBM] [Info] Number of data points in the train set: 5410, number of used features: 304
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
LightGBM: Best F1 Score = 0.9835435981159314, Best Params = {'learning_rate': 0.1, 'max_depth': 10, 'n_estimators': 200, 'num_leaves': 50}
Ligh



LogisticRegression: Best F1 Score = 0.11148047687971552, Best Params = {'C': 1, 'class_weight': 'balanced', 'max_iter': 500, 'penalty': 'l2', 'solver': 'saga'}
LogisticRegression on validation set: F1 Score = 0.14937759336099585
RandomForest: Best F1 Score = 0.10283151107373142, Best Params = {'class_weight': 'balanced', 'max_depth': 5, 'min_samples_leaf': 6, 'min_samples_split': 10, 'n_estimators': 500}
RandomForest on validation set: F1 Score = 0.07547169811320754
[LightGBM] [Info] Number of positive: 150, number of negative: 2705
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009132 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 69557
[LightGBM] [Info] Number of data points in the train set: 2855, number of used features: 310
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.052539 -> initscore=-2.892222
[LightGBM] [Info] Start training from score -2.892222
LightGBM: Best F1 Score = 0.192748807

In [6]:
# Flatten best_models_dict (sampling technique -> model -> info) into one
# summary record per (sampling technique, model) pair.
rows = [
    {
        'Model': model_name,
        'Sampling Technique': sampling_name,
        'Hyperparameters': str(model_info['params']),
        'Test_Score': model_info['test_score'],
        'Val_Score': model_info['val_score'],
    }
    for sampling_name, per_model_results in best_models_dict.items()
    for model_name, model_info in per_model_results.items()
]

In [7]:
# Write the combined results table (all sampling techniques) to a single CSV.
df = pd.DataFrame(rows)
df.to_csv('ml_model_results.csv')