## Libraries

In [1]:
import pandas as pd
import pickle
import numpy as np

## Data

In [2]:
# Load the BETO (pt) feature sheets for the train / val / test splits of task Q,
# using each sheet's "id" column as the row index.
beto_pt_features_train, beto_pt_features_val, beto_pt_features_test = (
    pd.read_excel(xlsx_path, index_col="id")
    for xlsx_path in (
        "../../feature-engineering/features/beto_pt_features_train_task_Q.xlsx",
        "../../feature-engineering/features/beto_pt_features_val_task_Q.xlsx",
        "../../feature-engineering/features/beto_pt_features_test_task_Q.xlsx",
    )
)

In [3]:
# Load the "lf" feature sheets for the train / val / test splits of task Q,
# using each sheet's "id" column as the row index.
lf_features_train, lf_features_val, lf_features_test = (
    pd.read_excel(xlsx_path, index_col="id")
    for xlsx_path in (
        "../../feature-engineering/features/lf_features_train_task_Q.xlsx",
        "../../feature-engineering/features/lf_features_val_task_Q.xlsx",
        "../../feature-engineering/features/lf_features_test_task_Q.xlsx",
    )
)

In [4]:
# Load the task-Q data sheets (these provide the "label" column used as the target),
# using each sheet's "id" column as the row index.
Q_train, Q_val, Q_test = (
    pd.read_excel(xlsx_path, index_col="id")
    for xlsx_path in (
        "../../data/train_task_Q.xlsx",
        "../../data/val_task_Q.xlsx",
        "../../data/test_task_Q.xlsx",
    )
)

In [5]:
# Column labels "beto<&>0" ... "beto<&>767" for the BETO feature frames.
# The count (768) has to equal the number of columns of every frame these
# labels are assigned to, otherwise the assignment raises a length mismatch.
cols_beto = [
    f"beto<&>{dim}"
    for dim in range(768)
]

In [6]:
# Give all three BETO frames the same "beto<&>i" column labels (in place).
for beto_frame in (beto_pt_features_train, beto_pt_features_val, beto_pt_features_test):
    beto_frame.columns = cols_beto

In [7]:
# Build one feature matrix per split: the lf columns first, then the BETO columns.
# Column-wise concat aligns the two frames on their shared "id" index.
X_train, X_val, X_test = (
    pd.concat([lf_part, beto_part], axis="columns")
    for lf_part, beto_part in (
        (lf_features_train, beto_pt_features_train),
        (lf_features_val, beto_pt_features_val),
        (lf_features_test, beto_pt_features_test),
    )
)

In [8]:
# Targets: the "label" column of each split's task-Q sheet.
y_train, y_test, y_val = Q_train["label"], Q_test["label"], Q_val["label"]

# Select the feature rows by the ids of the matching target vector so that
# row i of X_* corresponds to element i of y_* (raises KeyError on a missing id).
X_train = X_train.loc[y_train.index]
X_test = X_test.loc[y_test.index]
X_val = X_val.loc[y_val.index]

In [9]:
from xgboost import XGBClassifier
from sklearn.metrics import classification_report

In [10]:
# Replace the column labels with positional integers on EVERY split.
# NOTE(review): presumably this exists because XGBoost rejects feature names
# containing "<" (the BETO labels are "beto<&>i") — confirm against the installed version.
# Relabelling only the training matrix would leave val/test with feature names that
# differ from the fitted model's, which only works while predict() skips
# feature-name validation; identical labels keep all three matrices consistent.
n_features = X_train.shape[1]
assert X_val.shape[1] == n_features and X_test.shape[1] == n_features, \
    "train / val / test feature matrices must have the same number of columns"
for split_features in (X_train, X_val, X_test):
    split_features.columns = range(n_features)

In [11]:
%%time
xgb = XGBClassifier(max_depth=4, n_estimators=100, learning_rate=0.001, seed=2022)
xgb.fit(X_train, y_train)

CPU times: total: 27.3 s
Wall time: 2.51 s


XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.001, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=4, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=100,
              n_jobs=0, num_parallel_tree=1, objective='multi:softprob',
              predictor='auto', random_state=2022, reg_alpha=0, ...)

In [12]:
# Baseline performance on the validation split.
# zero_division=0 reports 0.0 for classes that are never predicted (the same value the
# default produces) without emitting an undefined-metric warning per average.
y_pred = xgb.predict(X_val)
print(classification_report(y_val, y_pred, zero_division=0))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         4
           1       0.88      0.58      0.70        12
           2       0.73      0.95      0.83        20
           3       1.00      1.00      1.00        17
           4       1.00      0.80      0.89         5
           5       0.50      0.71      0.59         7

    accuracy                           0.80        65
   macro avg       0.68      0.67      0.67        65
weighted avg       0.78      0.80      0.78        65



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [13]:
# Baseline performance on the test split (plain-text per-class report).
y_pred = xgb.predict(X_test)
print(
    classification_report(y_true=y_test, y_pred=y_pred)
)

              precision    recall  f1-score   support

           0       0.21      0.11      0.15        27
           1       0.08      0.14      0.10         7
           2       0.42      0.50      0.46        32
           3       0.83      0.92      0.88       146
           4       0.08      0.50      0.14         2
           5       0.73      0.41      0.53        58

    accuracy                           0.66       272
   macro avg       0.39      0.43      0.38       272
weighted avg       0.68      0.66      0.65       272



In [14]:
from sklearn.model_selection import GridSearchCV

In [16]:
# Hyper-parameter grid for the search: 3 x 3 x 3 = 27 candidate combinations.
parameters = dict(
    max_depth=[4, 6, 8],
    n_estimators=[150, 125, 100],
    learning_rate=[0.1, 0.3, 0.5],
)

In [17]:
# Exhaustive search over `parameters`, starting from the baseline estimator's settings,
# scored by macro-averaged F1 with 5-fold cross-validation.
clf = GridSearchCV(
    estimator=xgb,
    param_grid=parameters,
    scoring="f1_macro",
    cv=5,
    verbose=1,
)

In [18]:
%%time
clf.fit(X_train, y_train)

Fitting 5 folds for each of 27 candidates, totalling 135 fits
CPU times: total: 34min 49s
Wall time: 3min 3s


GridSearchCV(cv=5,
             estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     callbacks=None, colsample_bylevel=1,
                                     colsample_bynode=1, colsample_bytree=1,
                                     early_stopping_rounds=None,
                                     enable_categorical=False, eval_metric=None,
                                     gamma=0, gpu_id=-1,
                                     grow_policy='depthwise',
                                     importance_type=None,
                                     interaction_constraints='',
                                     learning_rate=0.001, max_bin=256,
                                     max_cat_to_onehot=4, max_delta_step=0,
                                     max_depth=4, max_leaves=0,
                                     min_child_weight=1, missing=nan,
                                     monotone_constraints='()',
                         

In [19]:
# Collect the winning value of every searched hyper-parameter.
# GridSearchCV exposes these through its public `best_params_` attribute, so there is
# no need to reach into the best estimator's instance __dict__.
best_params = {k: clf.best_params_[k] for k in parameters}
for k, v in best_params.items():
    print(f"{k}:", v)

max_depth: 8
n_estimators: 100
learning_rate: 0.3


In [20]:
# Per-class metrics of the tuned model on the training split, as a DataFrame
# (one column per class / average, one row per metric).
y_pred = clf.predict(X_train)
report = classification_report(y_true=y_train, y_pred=y_pred, output_dict=True)
train_report = pd.DataFrame(data=report)
train_report

Unnamed: 0,0,1,2,3,4,5,accuracy,macro avg,weighted avg
precision,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
recall,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
f1-score,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
support,13.0,48.0,78.0,64.0,17.0,27.0,1.0,247.0,247.0


In [21]:
# Per-class metrics of the tuned model on the validation split, as a DataFrame.
# zero_division=0 reports 0.0 for classes that are never predicted (the same value the
# default produces) without emitting an undefined-metric warning per average.
y_pred = clf.predict(X_val)
report = classification_report(y_val, y_pred, output_dict=True, zero_division=0)
val_report = pd.DataFrame(report)
val_report

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,0,1,2,3,4,5,accuracy,macro avg,weighted avg
precision,0.0,0.785714,0.869565,1.0,1.0,0.666667,0.861538,0.720324,0.82287
recall,0.0,0.916667,1.0,1.0,0.4,0.857143,0.861538,0.695635,0.861538
f1-score,0.0,0.846154,0.930233,1.0,0.571429,0.75,0.861538,0.682969,0.828702
support,4.0,12.0,20.0,17.0,5.0,7.0,0.861538,65.0,65.0


In [22]:
# Per-class metrics of the tuned model on the test split, as a DataFrame
# (one column per class / average, one row per metric).
y_pred = clf.predict(X_test)
report = classification_report(y_true=y_test, y_pred=y_pred, output_dict=True)
test_report = pd.DataFrame(data=report)
test_report

Unnamed: 0,0,1,2,3,4,5,accuracy,macro avg,weighted avg
precision,0.25,0.307692,0.482759,0.832298,0.666667,0.827586,0.713235,0.561167,0.717651
recall,0.074074,0.571429,0.875,0.917808,1.0,0.413793,0.713235,0.642017,0.713235
f1-score,0.114286,0.4,0.622222,0.872964,0.8,0.551724,0.713235,0.560199,0.686947
support,27.0,7.0,32.0,146.0,2.0,58.0,0.713235,272.0,272.0


In [23]:
# Persist the fitted grid-search object; the context manager guarantees the
# file handle is flushed and closed even if pickling fails.
with open("results/clf_xgboost_mix_task_Q.pickle", "wb") as fh:
    pickle.dump(clf, fh)

In [24]:
# Persist the selected hyper-parameters; the context manager guarantees the
# file handle is flushed and closed even if pickling fails.
with open("results/best_params_xgboost_mix_task_Q.pickle", "wb") as fh:
    pickle.dump(best_params, fh)

In [25]:
# Persist the training-split report; the context manager guarantees the
# file handle is flushed and closed even if pickling fails.
with open("results/train_report_xgboost_mix_task_Q.pickle", "wb") as fh:
    pickle.dump(train_report, fh)

In [26]:
# Persist the validation-split report; the context manager guarantees the
# file handle is flushed and closed even if pickling fails.
with open("results/val_report_xgboost_mix_task_Q.pickle", "wb") as fh:
    pickle.dump(val_report, fh)

In [27]:
# Persist the test-split report; the context manager guarantees the
# file handle is flushed and closed even if pickling fails.
with open("results/test_report_xgboost_mix_task_Q.pickle", "wb") as fh:
    pickle.dump(test_report, fh)