# Baseline Models

This notebook contains the first round of experiments for the USM Datathon. We report the 5-fold cross-validation metrics (used to estimate how well each model would generalize on the competition leaderboard) and the final test score obtained after training each model on the whole training set.

The evaluation Score used during the competition was a Custom Implementation of the F1-Score defined as:

$$F_{1\,custom} = 0.5 \cdot (F_{1\,Hate} + Macro \, F_{1\,communities})$$

In [1]:
# Enable IPython's autoreload extension in mode 2: all modules are reloaded
# before each cell executes, so edits to the imported `utils` helpers take
# effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from utils import import_data, validation_train, full_train
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.neural_network import MLPClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from loguru import logger
import pandas as pd
import numpy as np
import time


### Validation Schema

Each model is trained and evaluated following a 5-fold cross-validation scheme.

In [3]:
# Single seed reused by every stochastic estimator and by the CV helper.
RANDOM_STATE = 42

# Target columns handed to validation_train / full_train.
# NOTE(review): presumably "Odio" is the overall hate label and the remaining
# four are the per-community labels used in the custom F1 -- confirm in utils.
LABELS = [
    "Odio",
    "Mujeres",
    "Comunidad LGBTQ+",
    "Comunidades Migrantes",
    "Pueblos Originarios",
]

# Training frame, test frame and stopword collection provided by utils.
df_train, df_test, stopwords = import_data()

# --- Simple baselines -------------------------------------------------------
nb = MultinomialNB()
lr = LogisticRegression(max_iter=1000, random_state=RANDOM_STATE)

# --- Bagged tree ensembles (500 trees each, all cores) ----------------------
et = ExtraTreesClassifier(n_jobs=-1, random_state=RANDOM_STATE, n_estimators=500)
rf = RandomForestClassifier(n_jobs=-1, random_state=RANDOM_STATE, n_estimators=500)

# --- Gradient-boosting libraries (500 boosting rounds each) -----------------
cb = CatBoostClassifier(
    thread_count=-1,
    verbose=False,
    random_state=RANDOM_STATE,
    n_estimators=500,
)
lgb = LGBMClassifier(n_jobs=-1, random_state=RANDOM_STATE, n_estimators=500)
xgb = XGBClassifier(n_jobs=-1, random_state=RANDOM_STATE, n_estimators=500)

# --- Small feed-forward network ---------------------------------------------
mlp = MLPClassifier(
    alpha=0.1,
    random_state=RANDOM_STATE,
    solver="adam",
    activation="relu",
    hidden_layer_sizes=(64, 32),
)

# Display name -> estimator. Insertion order is the order in which the models
# are trained and the row order of the result tables.
models_dict = dict(
    [
        ("Naive Bayes", nb),
        ("Logistic Regression", lr),
        ("Extra Trees", et),
        ("Random Forest", rf),
        ("Catboost", cb),
        ("LightGBM", lgb),
        ("XGBoost", xgb),
        ("MLP", mlp),
    ]
)


[32m2023-09-15 13:16:28.289[0m | [1mINFO    [0m | [36mutils.utilities[0m:[36mimport_data[0m:[36m12[0m - [1mTraining and Test data succesfully loaded...[0m
[32m2023-09-15 13:16:28.290[0m | [1mINFO    [0m | [36mutils.utilities[0m:[36mimport_data[0m:[36m13[0m - [1mTrain Shape: (2256, 9), Test Shape: (2291, 9)[0m


In [4]:
# Cross-validate every baseline and collect one row of metrics per model.
# validation_train (from utils) only receives df_train, so every score it
# returns is a cross-validation score, never a score on df_test.
validation_results = []
for name, model in models_dict.items():
    # perf_counter is the clock meant for measuring elapsed durations; unlike
    # time.time() it does not follow wall-clock adjustments.
    start_time = time.perf_counter()
    dict_results = {}
    dict_results["model_name"] = name
    # The returned dict is expected to provide at least `mean_train_score`
    # and `mean_val_score`, which are read below.
    dict_results.update(
        validation_train(df_train, model, LABELS, stopwords, random_state=RANDOM_STATE)
    )
    # Elapsed time of the whole CV run for this model, in minutes.
    dict_results["cv_training_time (min)"] = np.round(
        (time.perf_counter() - start_time) / 60, 2
    )
    logger.info(f"{name} Results:")
    logger.info(f"Mean Training Score: {dict_results['mean_train_score']}")
    # Labelled "Validation" (not "Test") so it is not confused with the
    # held-out test score reported by the full-train step.
    logger.info(f"Mean Validation Score: {dict_results['mean_val_score']}")
    validation_results.append(dict_results)

# One row per model; the bare last expression renders the table in the notebook.
validation_df = pd.DataFrame(validation_results)
validation_df


[32m2023-09-15 13:16:28.656[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m12[0m - [1mNaive Bayes Results:[0m
[32m2023-09-15 13:16:28.657[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m13[0m - [1mMean Training Score: 0.969342010645185[0m
[32m2023-09-15 13:16:28.657[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m14[0m - [1mMean Test Score: 0.7284914745443303[0m
[32m2023-09-15 13:16:30.666[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m12[0m - [1mLogistic Regression Results:[0m
[32m2023-09-15 13:16:30.667[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m13[0m - [1mMean Training Score: 0.9767475099969705[0m
[32m2023-09-15 13:16:30.667[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m14[0m - [1mMean Test Score: 0.7302634181239027[0m
[32m2023-09-15 13:16:44.988[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m12[0m - [1mExtra Trees Results

Unnamed: 0,model_name,mean_train_score,sd_train_score,mean_val_score,sd_val_score,mean_precision_train_score,sd_precision_train_score,mean_precision_val_score,sd_precision_val_score,mean_recall_train_score,sd_recall_train_score,mean_recall_val_score,sd_recall_val_score,cv_training_time (min)
0,Naive Bayes,0.969342,0.001314,0.728491,0.023792,0.979976,0.001648,0.830191,0.024515,0.949345,0.002773,0.619482,0.032544,0.01
1,Logistic Regression,0.976748,0.001081,0.730263,0.012259,0.995945,0.00076,0.886027,0.008727,0.945608,0.003457,0.603291,0.020781,0.03
2,Extra Trees,0.99979,0.000171,0.777197,0.025783,1.0,0.0,0.876066,0.009494,0.999672,0.000268,0.710161,0.038386,0.24
3,Random Forest,0.99979,0.000171,0.747509,0.015967,0.999962,7.6e-05,0.875421,0.016485,0.99971,0.000247,0.650014,0.024626,0.2
4,Catboost,0.84293,0.003843,0.767565,0.012398,0.928142,0.002028,0.885169,0.012828,0.763974,0.006187,0.685882,0.015977,1.12
5,LightGBM,0.770482,0.005972,0.676694,0.010509,0.90176,0.008869,0.770532,0.047771,0.664634,0.010542,0.561061,0.011954,0.06
6,XGBoost,0.891976,0.00337,0.752562,0.01531,0.955648,0.002724,0.847096,0.012692,0.82729,0.00945,0.68237,0.021205,0.11
7,MLP,0.999634,0.000208,0.751827,0.017548,1.0,0.0,0.863398,0.019239,0.999403,0.000373,0.657323,0.025072,1.12


### Full Train and Predictions

Each model is retrained on the whole training set and then evaluated on the test set.

In [5]:
# Fit every baseline through full_train (from utils), which receives both
# df_train and df_test, and collect one row of test metrics per model.
training_results = []
for name, model in models_dict.items():
    # perf_counter is the clock meant for measuring elapsed durations; unlike
    # time.time() it does not follow wall-clock adjustments.
    start_time = time.perf_counter()
    dict_results = {}
    dict_results["model_name"] = name
    # The returned dict is expected to provide at least `test_score`,
    # which is read below.
    dict_results.update(full_train(df_train, df_test, model, LABELS, stopwords))
    # Elapsed time of the full_train call for this model, in minutes.
    dict_results["full_training_time (min)"] = np.round(
        (time.perf_counter() - start_time) / 60, 2
    )
    logger.info(f"{name} Results:")
    logger.info(f"Test Score: {dict_results['test_score']}")
    training_results.append(dict_results)

# One row per model; the bare last expression renders the table in the notebook.
training_df = pd.DataFrame(training_results)
training_df


[32m2023-09-15 13:19:22.006[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m10[0m - [1mNaive Bayes Results:[0m
[32m2023-09-15 13:19:22.007[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m11[0m - [1mTest Score: 0.7455443267553221[0m
[32m2023-09-15 13:19:22.600[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m10[0m - [1mLogistic Regression Results:[0m
[32m2023-09-15 13:19:22.600[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m11[0m - [1mTest Score: 0.7669777805756477[0m
[32m2023-09-15 13:19:25.538[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m10[0m - [1mExtra Trees Results:[0m
[32m2023-09-15 13:19:25.539[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m11[0m - [1mTest Score: 0.8032498173030194[0m
[32m2023-09-15 13:19:27.920[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m10[0m - [1mRandom Forest Results:[0m
[32m2023-09-15 13:19:27.920

Unnamed: 0,model_name,test_score,test_precision,test_recall,full_training_time (min)
0,Naive Bayes,0.745544,0.751765,0.731662,0.0
1,Logistic Regression,0.766978,0.797755,0.731777,0.01
2,Extra Trees,0.80325,0.768195,0.848739,0.05
3,Random Forest,0.799074,0.78934,0.81081,0.04
4,Catboost,0.786534,0.784202,0.802355,0.24
5,LightGBM,0.699077,0.711117,0.678284,0.02
6,XGBoost,0.782395,0.759743,0.809113,0.02
7,MLP,0.768206,0.75868,0.80517,0.27
