In [5]:
import os

import nltk
import numpy as np
import pandas as pd
import scipy.stats as stats
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import mean_squared_error, mean_absolute_error, roc_auc_score
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from xgboost import XGBClassifier

In [3]:
# Enable the scikit-learn-intelex extension (the cell output confirms
# "Extension for Scikit-learn* enabled").
# NOTE(review): the extension's documentation asks for patch_sklearn() to run
# before the scikit-learn imports it should affect. The execution counts
# (In [3] here vs. In [5] on the import cell) suggest that was the case when
# this was last run, but the cell order in the file is the opposite — confirm
# the order on a fresh "Restart & Run All".
from sklearnex import patch_sklearn
patch_sklearn()

Collecting scikit-learn-intelex
  Downloading scikit_learn_intelex-2025.5.0-py311-none-manylinux_2_28_x86_64.whl.metadata (11 kB)
Collecting daal==2025.5.0 (from scikit-learn-intelex)
  Downloading daal-2025.5.0-py2.py3-none-manylinux_2_28_x86_64.whl.metadata (1.1 kB)
Downloading scikit_learn_intelex-2025.5.0-py311-none-manylinux_2_28_x86_64.whl (4.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.3/4.3 MB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading daal-2025.5.0-py2.py3-none-manylinux_2_28_x86_64.whl (110.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.3/110.3 MB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: daal, scikit-learn-intelex
Successfully installed daal-2025.5.0 scikit-learn-intelex-2025.5.0


Extension for Scikit-learn* enabled (https://github.com/uxlfoundation/scikit-learn-intelex)


In [6]:
# Base estimator built with the library's default hyperparameters; it is the
# estimator handed to RandomizedSearchCV and its class name is used for the
# results file name at the end of the notebook.
clf = XGBClassifier()

# Discrete search space for the randomized hyperparameter search.
# Every key is passed as a keyword argument to the estimator; every list holds
# the candidate values that may be sampled for that argument.
params = dict(
    n_estimators=[100, 200, 300, 500],
    learning_rate=[0.05, 0.10, 0.20],
    max_depth=[6, 8, 10],
    min_child_weight=[1, 3, 5],
    gamma=[0.0, 0.1, 0.3],
    subsample=[0.6, 0.8, 1.0],
    colsample_bytree=[0.4, 0.6, 0.8],
    reg_alpha=[0, 0.1, 1],
    reg_lambda=[1, 1.5, 2],
)

In [None]:
# Size of the full grid: the product of the number of candidate values of
# every hyperparameter in `params`.
# (Stored under a descriptive name instead of `iter`, which would shadow the
# Python builtin for the rest of the notebook.)
total_combinations = 1
for candidate_values in params.values():
    total_combinations *= len(candidate_values)

# Evaluate 20% of the grid. Integer arithmetic avoids float rounding, and the
# lower bound of 1 keeps the search valid for very small grids, where the
# truncated value would otherwise be 0.
n_iter = max(1, total_combinations * 20 // 100)

print(f'Número de combinações possíveis: {total_combinations}')
print(f'Número de combinações a serem testadas: {n_iter}')

Número de combinações possíveis: 50
Número de combinações a serem testadas: 10


In [None]:
# Locations: where result files are written and where the two CSV inputs live.
save_path = '/code/RQ1/results'
dir_train, dir_test = '/dataset/train.csv', '/dataset/test.csv'

# Read both files and stack them into one frame; the cross-validation cells
# below create their own train/test partitions from this combined frame.
df_train, df_test = (pd.read_csv(csv_path) for csv_path in (dir_train, dir_test))
df = pd.concat([df_train, df_test])

# Combined "<categoria>_<rating>" label, used later as the stratification key.
# The concatenation keeps each file's own row labels, so index values may
# repeat; the later cells index positionally with .iloc.
df = df.assign(
    categoria_rating=df['categoria'].astype(str) + "_" + df['rating'].astype(str)
)

In [None]:
# Target vector: the rating column.
y = df['rating']

# Feature matrix: every column except the category, the raw text, the target
# and the stratification helper column.
non_feature_columns = ['categoria', 'text', 'rating', 'categoria_rating']
X = df.drop(columns=non_feature_columns)

In [None]:
# 5-fold stratified cross-validation, stratified on the combined
# "<categoria>_<rating>" label so every fold keeps the joint distribution of
# category and rating.
# shuffle=True is required for random_state to be accepted: scikit-learn
# raises a ValueError when random_state is set while shuffle=False. Shuffling
# also mixes the rows of the two concatenated source files across folds.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Materialise the (train_idx, test_idx) pairs once so the hyperparameter search
# and the per-fold evaluation below use exactly the same partitions.
splits = list(kfold.split(X, df['categoria_rating']))

In [None]:
# Settings of the randomized search, gathered in one place:
#   - candidates are sampled from `params`, `n_iter` of them;
#   - each candidate is scored with negative RMSE on the precomputed `splits`;
#   - the sampling of candidates is seeded for repeatability.
search_settings = dict(
    param_distributions=params,
    n_iter=n_iter,
    scoring='neg_root_mean_squared_error',
    cv=splits,
    random_state=42,
    verbose=3,
)

random_search = RandomizedSearchCV(clf, **search_settings)

In [None]:
# Run the randomized hyperparameter search on the full feature matrix; the
# folds come from the precomputed `splits` passed as `cv`.
# NOTE(review): the metrics cell treats the ratings as the labels 1..5, while
# recent XGBoost releases appear to expect class labels numbered from 0 —
# confirm that this fit runs without a label error in the target environment.
random_search.fit(X, y)

Fitting 5 folds for each of 432 candidates, totalling 2160 fits


In [None]:
# Hyperparameter combination selected by the search; reused below to re-train
# one model per fold.
best_params = random_search.best_params_
print(f'Melhores parâmetros: {best_params}')

In [None]:
def metrics(y_pred, y_test):
    """Compute the evaluation metrics for one fold.

    Note that this function is named ``metrics`` and therefore rebinds the
    name imported by ``from sklearn import metrics``. Inside the function the
    name refers to the function itself, so ``classification_report`` is
    called through its own import rather than as ``metrics.classification_report``.

    Args:
        y_pred: Predicted ratings for the fold.
        y_test: True ratings for the fold.

    Returns:
        A tuple ``(report, f1_macro, cm, mae, rmse, auc)`` where
        ``report`` is the classification report as a dict,
        ``cm`` is the confusion matrix over the labels 1..5, and
        ``f1_macro``, ``mae``, ``rmse`` and ``auc`` are rounded to 4 decimals.
        ``auc`` is computed on binarised hard predictions: ratings 4 and 5
        form the positive class, every other rating the negative class.
    """
    rating_labels = [1, 2, 3, 4, 5]
    positive_ratings = (4, 5)

    report = classification_report(
        y_test,
        y_pred,
        target_names=[str(label) for label in rating_labels],
        output_dict=True,
    )
    cm = confusion_matrix(y_test, y_pred, labels=rating_labels)
    f1_macro = report['macro avg']['f1-score']

    # Error metrics treat the ratings as numbers.
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    # Binary view of the task: "high rating" (4 or 5) versus the rest.
    y_true_bin = np.array([1 if rating in positive_ratings else 0 for rating in y_test])
    y_pred_bin = np.array([1 if rating in positive_ratings else 0 for rating in y_pred])
    auc = roc_auc_score(y_true_bin, y_pred_bin)

    return report, round(f1_macro, 4), cm, round(mae, 4), round(rmse, 4), round(auc, 4)

In [None]:
# Collects one tuple of metrics per cross-validation fold.
all_metrics = []

In [None]:
# Re-train and evaluate one model per fold using the best hyperparameters.
# The list is (re)created here so that re-running this cell does not append a
# second copy of every fold's metrics.
all_metrics = []
for i, (train_idx, test_idx) in enumerate(splits):
    # `splits` holds positional indices, hence .iloc.
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    # `clf` is an estimator *instance*, so it cannot be called like a class.
    # Build a fresh, unfitted estimator of the same class instead.
    model = type(clf)(**best_params)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    print(f"\n***FOLD {i + 1}***")
    all_metrics.append(metrics(y_pred, y_test))

In [None]:
# Print every metric collected for each fold, one fold after another.
for i, fold_metrics in enumerate(all_metrics):
    report, f1_macro, cm, mae, rmse, auc = fold_metrics

    print(f"\n***FOLD {i + 1}***")
    print(f"\nClassification Report:\n{report}")
    print(f"\nF1 Macro: {f1_macro}")
    print(f"\nConfusion Matrix:\n{cm}")
    print(f"\nMAE: {mae}")
    print(f"\nRMSE: {rmse}")
    print(f"\nAUC: {auc}")

In [None]:
# One row per fold with MAE, RMSE and AUC; the STD_* columns are placeholders
# (0.0) for the per-fold rows and are only meaningful in summary rows.
# The frame is built in a single call from all rows: growing an empty,
# object-dtype frame row by row with .loc leaves the column dtypes dependent
# on the pandas version, whereas this yields numeric columns directly.
result_columns = ['MAE', 'STD_MAE', 'RMSE', 'STD_RMSE', 'AUC', 'STD_AUC']
results = pd.DataFrame(
    [
        [mae, 0.0, rmse, 0.0, auc, 0.0]
        for (_report, _f1_macro, _cm, mae, rmse, auc) in all_metrics
    ],
    columns=result_columns,
)

In [None]:
# Mean and sample standard deviation of each metric across the folds.
mean_mae, std_mae = results['MAE'].mean(), results['MAE'].std()
mean_rmse, std_rmse = results['RMSE'].mean(), results['RMSE'].std()
mean_auc, std_auc = results['AUC'].mean(), results['AUC'].std()

# Summary record keyed like the columns of `results`.
# NOTE(review): no later cell in this notebook reads `mean`; if the mean/std
# row was meant to be saved with the results, it is currently not written.
mean = {
    'MAE': mean_mae,
    'STD_MAE': std_mae,
    'RMSE': mean_rmse,
    'STD_RMSE': std_rmse,
    'AUC': mean_auc,
    'STD_AUC': std_auc,
}

In [None]:
# Standard error of the mean and 95% Student-t confidence interval for each
# metric across the folds. (The following cell computes the same quantities
# again before printing them.)
n_folds = len(results)
t_dof = n_folds - 1  # degrees of freedom of the t distribution

std_err_mae = stats.sem(results['MAE'])
conf_int_mae = stats.t.interval(0.95, t_dof, loc=mean_mae, scale=std_err_mae)

std_err_rmse = stats.sem(results['RMSE'])
conf_int_rmse = stats.t.interval(0.95, t_dof, loc=mean_rmse, scale=std_err_rmse)

std_err_auc = stats.sem(results['AUC'])
conf_int_auc = stats.t.interval(0.95, t_dof, loc=mean_auc, scale=std_err_auc)

In [None]:
def _t_interval_95(center, std_err, n_obs):
    """Return the 95% Student-t interval around `center` for `n_obs` observations."""
    return stats.t.interval(0.95, n_obs - 1, loc=center, scale=std_err)


# Standard error of the mean of each metric across the folds.
std_err_mae = stats.sem(results['MAE'])
std_err_rmse = stats.sem(results['RMSE'])
std_err_auc = stats.sem(results['AUC'])

# Matching 95% confidence intervals, centred on the fold means.
conf_int_mae = _t_interval_95(mean_mae, std_err_mae, len(results['MAE']))
conf_int_rmse = _t_interval_95(mean_rmse, std_err_rmse, len(results['RMSE']))
conf_int_auc = _t_interval_95(mean_auc, std_err_auc, len(results['AUC']))

# Report: mean ± standard error, followed by the interval.
print(f'MAE: {mean_mae:.4f} ± {std_err_mae:.4f} 95% IC: {conf_int_mae}')
print(f'RMSE: {mean_rmse:.4f} ± {std_err_rmse:.4f} 95% IC: {conf_int_rmse}')
print(f'AUC: {mean_auc:.4f} ± {std_err_auc:.4f} 95% IC: {conf_int_auc}')

In [None]:
# Summary record: each metric column holds its 95% confidence interval (a
# tuple) and each STD_* column holds the standard error of the mean.
conf_int = {
    'MAE': conf_int_mae,
    'STD_MAE': std_err_mae,
    'RMSE': conf_int_rmse,
    'STD_RMSE': std_err_rmse,
    'AUC': conf_int_auc,
    'STD_AUC': std_err_auc,
}

# Append the record as the last row. `results` is rebound, so running this
# cell again adds the row a second time.
summary_row = pd.DataFrame([conf_int])
results = pd.concat([results, summary_row], ignore_index=True)

In [None]:
# Write the per-fold metrics plus the summary row to
# "<save_path>/<estimator class name>_results.csv".
# The output directory is created first: to_csv does not create missing
# directories, and failing here would lose the results of the whole search.
os.makedirs(save_path, exist_ok=True)
results.to_csv(f'{save_path}/{clf.__class__.__name__}_results.csv')