## Libraries

In [1]:
import pandas as pd
import pickle
import numpy as np

## Data

In [2]:
%%time
# Load the pre-computed feature tables for the three splits (train, val, test),
# each indexed by the "id" column. Files are read in that order.
features_train, features_val, features_test = (
    pd.read_excel(workbook, index_col="id")
    for workbook in (
        "../../feature-engineering/features/mf_features_train_task_C1.xlsx",
        "../../feature-engineering/features/mf_features_val_task_C1.xlsx",
        "../../feature-engineering/features/mf_features_test_task_C1.xlsx",
    )
)

Wall time: 4min 7s


In [3]:
%%time
# Load the task workbooks for the three splits (train, val, test), indexed by
# "id"; the "label" column of these frames is used as the target further down.
A_train, A_val, A_test = (
    pd.read_excel(workbook, index_col="id")
    for workbook in (
        "../../data/train_task_C1.xlsx",
        "../../data/val_task_C1.xlsx",
        "../../data/test_task_C1.xlsx",
    )
)

Wall time: 1.84 s


In [4]:
# Targets: the "label" column of each task workbook.
y_train = A_train["label"]
y_val = A_val["label"]
y_test = A_test["label"]

# Features: rows of each feature table selected (and ordered) by the ids of
# the matching target series, so X and y are aligned row by row.
X_train = features_train.loc[y_train.index]
X_val = features_val.loc[y_val.index]
X_test = features_test.loc[y_test.index]

In [5]:
# Distinct feature groups: the part of each column label before the "<&>" separator.
{column.split("<&>")[0] for column in features_train.columns}

{'contextual', 'linguistic', 'semantic', 'traditional'}

In [6]:
!pip install xgboost



In [7]:
from xgboost import XGBClassifier
from sklearn.metrics import classification_report

In [11]:
# Align the validation/test columns to the training layout BY NAME before the
# names are discarded. The three frames come from separate workbooks, so a
# purely positional rename would silently feed misordered features to the
# model if their column order ever differed; a column missing from val/test
# raises a KeyError here instead. Re-running this cell is harmless: once all
# frames carry the same integer labels the selection is a no-op.
X_val = X_val.loc[:, X_train.columns]
X_test = X_test.loc[:, X_train.columns]

# Replace the "<&>"-delimited column labels with positional integers.
# NOTE(review): presumably needed because XGBoost rejects feature names that
# contain "<", "[" or "]" — confirm against the installed version.
X_train.columns = range(X_train.shape[1])
X_val.columns = range(X_val.shape[1])
X_test.columns = range(X_test.shape[1])

In [12]:
%%time
# Baseline classifier with hand-picked hyper-parameters and a fixed seed;
# the same instance is later handed to the grid search as its base estimator.
xgb = XGBClassifier(
    max_depth=4,
    n_estimators=100,
    learning_rate=0.001,
    seed=2022,
)
xgb.fit(X_train, y_train)

Wall time: 7.81 s


XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, feature_types=None, gamma=0, gpu_id=-1,
              grow_policy='depthwise', importance_type=None,
              interaction_constraints='', learning_rate=0.001, max_bin=256,
              max_cat_threshold=64, max_cat_to_onehot=4, max_delta_step=0,
              max_depth=4, max_leaves=0, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=0,
              num_parallel_tree=1, predictor='auto', random_state=2022, ...)

In [13]:
# Baseline model on the validation split: plain-text per-class metrics.
y_pred = xgb.predict(X_val)
print(classification_report(y_true=y_val, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.94      0.98      0.96      2510
           1       0.81      0.63      0.71       388

    accuracy                           0.93      2898
   macro avg       0.88      0.80      0.83      2898
weighted avg       0.93      0.93      0.93      2898



In [14]:
# Baseline model on the test split: plain-text per-class metrics.
y_pred = xgb.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.92      0.98      0.95       541
           1       0.89      0.64      0.74       136

    accuracy                           0.91       677
   macro avg       0.90      0.81      0.85       677
weighted avg       0.91      0.91      0.91       677



In [15]:
from sklearn.model_selection import GridSearchCV

In [16]:
# Hyper-parameter search space: 3 values per parameter, 27 combinations.
# Key order matters: later cells iterate over this dict when reporting.
parameters = dict(
    max_depth=[6, 8, 10],
    n_estimators=[125, 100, 75],
    learning_rate=[0.03, 0.1, 0.3],
)

In [17]:
# Exhaustive search over `parameters`, scored by macro-averaged F1 with
# 5-fold cross-validation; `xgb` serves as the template estimator.
clf = GridSearchCV(
    estimator=xgb,
    param_grid=parameters,
    scoring="f1_macro",
    cv=5,
    verbose=1,
)

In [18]:
%%time
# Run the grid search on the training split (the slow step of this notebook).
clf.fit(X_train, y=y_train)

Fitting 5 folds for each of 27 candidates, totalling 135 fits
Wall time: 26min 13s


GridSearchCV(cv=5,
             estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     callbacks=None, colsample_bylevel=1,
                                     colsample_bynode=1, colsample_bytree=1,
                                     early_stopping_rounds=None,
                                     enable_categorical=False, eval_metric=None,
                                     feature_types=None, gamma=0, gpu_id=-1,
                                     grow_policy='depthwise',
                                     importance_type=None,
                                     interaction_constraints='',
                                     learning_rate=0.001, max_b...6,
                                     max_cat_threshold=64, max_cat_to_onehot=4,
                                     max_delta_step=0, max_depth=4,
                                     max_leaves=0, min_child_weight=1,
                                     missing=nan, monotone_con

In [19]:
# Collect the winning hyper-parameters through GridSearchCV's public
# `best_params_` mapping instead of reaching into the estimator's instance
# __dict__ with vars(), which depends on how the estimator stores its
# parameters internally.
# Iterating over `parameters` keeps the grid's declaration order in both the
# resulting dict and the printed lines.
best_params = {k: clf.best_params_[k] for k in parameters}
for k, value in best_params.items():
    print(f"{k}:", value)

max_depth: 8
n_estimators: 125
learning_rate: 0.1


In [20]:
# Tuned model on the training split: classification metrics as a DataFrame
# (one column per class / average, one row per metric).
y_pred = clf.predict(X_train)
report = classification_report(y_true=y_train, y_pred=y_pred, output_dict=True)
train_report = pd.DataFrame(data=report)
train_report

Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.988891,0.966825,0.986071,0.977858,0.985951
recall,0.995109,0.927273,0.986071,0.961191,0.986071
f1-score,0.99199,0.946636,0.986071,0.969313,0.985948
support,10019.0,1540.0,0.986071,11559.0,11559.0


In [21]:
# Tuned model on the validation split: classification metrics as a DataFrame
# (one column per class / average, one row per metric).
y_pred = clf.predict(X_val)
report = classification_report(y_true=y_val, y_pred=y_pred, output_dict=True)
val_report = pd.DataFrame(data=report)
val_report

Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.966862,0.90991,0.960317,0.938386,0.959237
recall,0.988048,0.780928,0.960317,0.884488,0.960317
f1-score,0.97734,0.840499,0.960317,0.90892,0.959019
support,2510.0,388.0,0.960317,2898.0,2898.0


In [22]:
# Tuned model on the test split: classification metrics as a DataFrame
# (one column per class / average, one row per metric).
y_pred = clf.predict(X_test)
report = classification_report(y_true=y_test, y_pred=y_pred, output_dict=True)
test_report = pd.DataFrame(data=report)
test_report

Unnamed: 0,0,1,accuracy,macro avg,weighted avg
precision,0.927509,0.697842,0.880355,0.812676,0.881372
recall,0.922366,0.713235,0.880355,0.817801,0.880355
f1-score,0.92493,0.705455,0.880355,0.815193,0.880841
support,541.0,136.0,0.880355,677.0,677.0


## Save pickle

In [31]:
# Show the estimator selected by the grid search; this is the object that is
# pickled in the next cell.
clf.best_estimator_

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, feature_types=None, gamma=0, gpu_id=-1,
              grow_policy='depthwise', importance_type=None,
              interaction_constraints='', learning_rate=0.1, max_bin=256,
              max_cat_threshold=64, max_cat_to_onehot=4, max_delta_step=0,
              max_depth=8, max_leaves=0, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=125, n_jobs=0,
              num_parallel_tree=1, predictor='auto', random_state=2022, ...)

In [32]:
# Persist the estimator selected by the grid search. The context manager
# closes the file handle (flushing the data) even if pickling fails, instead
# of leaving that to garbage collection.
with open("results/clf_xgboost_mf_task_C1.pickle", "wb") as fh:
    pickle.dump(clf.best_estimator_, fh)

In [24]:
# Persist the selected hyper-parameters. The context manager closes the file
# handle (flushing the data) even if pickling fails, instead of leaving that
# to garbage collection.
with open("results/best_params_xgboost_mf_task_C1.pickle", "wb") as fh:
    pickle.dump(best_params, fh)

In [25]:
# Persist the training-split metrics table. The context manager closes the
# file handle (flushing the data) even if pickling fails, instead of leaving
# that to garbage collection.
with open("results/train_report_xgboost_mf_task_C1.pickle", "wb") as fh:
    pickle.dump(train_report, fh)

In [26]:
# Persist the validation-split metrics table. The context manager closes the
# file handle (flushing the data) even if pickling fails, instead of leaving
# that to garbage collection.
with open("results/val_report_xgboost_mf_task_C1.pickle", "wb") as fh:
    pickle.dump(val_report, fh)

In [27]:
# Persist the test-split metrics table. The context manager closes the file
# handle (flushing the data) even if pickling fails, instead of leaving that
# to garbage collection.
with open("results/test_report_xgboost_mf_task_C1.pickle", "wb") as fh:
    pickle.dump(test_report, fh)