In [1]:
import pandas as pd
import numpy as np

# Seed handed to the models defined further down.
RANDOM_STATE = 42

# Stop-word list consumed by the CountVectorizer steps below.
# NOTE(review): no explicit encoding is passed to np.loadtxt; confirm that
# accented stop words are read correctly on the target platform.
stopwords = np.loadtxt("public_data/stopwords.txt", dtype=str)

# Train / test tweets; the first CSV column is used as the index.
df_train = pd.read_csv("./public_data/tweets_train.csv", index_col=0)
df_test = pd.read_csv("./Datathon-2022-full/Datos/tweets_test.csv", index_col=0)

# Quick sanity check: (rows, columns) of both frames.
(df_train.shape, df_test.shape)


((2256, 9), (2291, 9))

# Definición de Modelos

In [2]:
from sklearn.ensemble import ExtraTreesClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

# Target columns. The first one is scored on its own by the competition
# metric; the remaining ones are scored together.
LABELS = [
    "Odio",
    "Mujeres",
    "Comunidad LGBTQ+",
    "Comunidades Migrantes",
    "Pueblos Originarios",
]

# Settings shared by the three tree-based ensembles.
TREE_ENSEMBLE_PARAMS = {"n_estimators": 500, "random_state": RANDOM_STATE}

# Base learners that are later combined in the stacking ensemble.
et = ExtraTreesClassifier(**TREE_ENSEMBLE_PARAMS)
cb = CatBoostClassifier(verbose=False, **TREE_ENSEMBLE_PARAMS)
xgb = XGBClassifier(**TREE_ENSEMBLE_PARAMS)
lr = LogisticRegression(random_state=RANDOM_STATE)
mlp = MLPClassifier(
    random_state=RANDOM_STATE,
    solver="adam",
    activation="relu",
    hidden_layer_sizes=(64, 32),
    alpha=0.1,
)


# Definición de la Métrica de la Competencia

In [3]:
from sklearn.metrics import f1_score


def f1_custom(y, y_pred):
    """Competition metric: average of two F1 scores.

    The first column is scored with a plain (binary) F1; the remaining
    columns are scored jointly with macro-averaged F1. Both parts weigh 0.5.

    Args:
        y: 2-D array of true 0/1 labels, indexable as ``y[:, k]``.
        y_pred: 2-D array of predicted labels with the same column layout.

    Returns:
        ``0.5 * F1(first column) + 0.5 * macro-F1(other columns)``.
    """
    hate_f1 = f1_score(y[:, 0], y_pred[:, 0])
    communities_f1 = f1_score(y[:, 1:], y_pred[:, 1:], average="macro")
    return 0.5 * hate_f1 + 0.5 * communities_f1


# Definición del Stacking

In [4]:
from sklearn.ensemble import StackingClassifier

# (name, estimator) pairs of the base learners fed into the stack.
estimators = [("et", et), ("cb", cb), ("xgb", xgb), ("lr", lr), ("mlp", mlp)]

# Stacking ensemble: a logistic-regression meta-learner on top of the base
# learners, using 3-fold internal cross-validation (cv=3).
# The meta-learner is seeded with RANDOM_STATE (instead of a literal 42) so
# that changing the notebook-wide seed affects every component consistently.
hate_stack = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(random_state=RANDOM_STATE),
    cv=3,
)


# Validation Schema

In [5]:
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.multioutput import MultiOutputClassifier


def train(df, target_labels, n_splits=5):
    """Cross-validate the bag-of-words + stacking pipeline with shuffled K-fold.

    For every fold a fresh pipeline (``CountVectorizer`` followed by a
    ``MultiOutputClassifier`` wrapping the module-level ``hate_stack``) is
    fitted on the training part and scored with ``f1_custom`` on both the
    training and the validation part. Per-fold scores are printed as they
    become available.

    Relies on the module-level names ``stopwords``, ``hate_stack``,
    ``f1_custom`` and ``RANDOM_STATE``.

    Args:
        df: DataFrame holding a ``"text"`` column plus the columns listed in
            ``target_labels``.
        target_labels: Names of the label columns. Order matters because
            ``f1_custom`` scores the first column separately from the rest.
        n_splits: Number of cross-validation folds (defaults to 5, the value
            that used to be hard-coded).

    Returns:
        Tuple ``(train_score, score)``: two lists with one ``f1_custom`` value
        per fold, computed on the training and validation part respectively.
    """
    # Seeded with the notebook-wide constant rather than a literal 42.
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=RANDOM_STATE)
    score = []
    train_score = []
    for fold, (train_idx, val_idx) in enumerate(kf.split(df[target_labels]), start=1):
        # Slice each part once and derive features and labels from it.
        df_fold_train = df.iloc[train_idx]
        df_fold_val = df.iloc[val_idx]

        X_train = df_fold_train["text"]
        X_val = df_fold_val["text"]

        # Labels are binarised: any non-zero value becomes 1.
        y_train = df_fold_train[target_labels].astype(bool).astype(int)
        y_val = df_fold_val[target_labels].astype(bool).astype(int)

        # A new pipeline per fold, so nothing fitted carries over between folds.
        pipe = Pipeline(
            [
                (
                    "featurizer",
                    CountVectorizer(
                        stop_words=list(stopwords),
                        lowercase=True,
                        ngram_range=(1, 1),
                        dtype=np.float32,
                    ),
                ),
                ("clf", MultiOutputClassifier(hate_stack)),
            ]
        )

        pipe.fit(X_train, y_train)
        y_pred_train = pipe.predict(X_train)
        y_pred = pipe.predict(X_val)
        sc_train = f1_custom(y_train.values, y_pred_train)
        sc = f1_custom(y_val.values, y_pred)

        # Report progress while training.
        print(f"Train Score fold {fold}: {sc_train}")
        print(f"Validation Score fold {fold}: {sc}")
        print("--------------------------------------------")

        train_score.append(sc_train)
        score.append(sc)

    return train_score, score


In [6]:
%%time

train_score, validation_score = train(df_train, LABELS)

Train Score fold 1: 0.9992974418134202
Validation Score fold 1: 0.770332130752034
--------------------------------------------
Train Score fold 2: 0.9985402204834405
Validation Score fold 2: 0.8071042427462933
--------------------------------------------
Train Score fold 3: 0.9994154555206065
Validation Score fold 3: 0.8061963612916446
--------------------------------------------
Train Score fold 4: 0.999507874015748
Validation Score fold 4: 0.7943036784996272
--------------------------------------------
Train Score fold 5: 1.0
Validation Score fold 5: 0.8177478834597189
--------------------------------------------
Mean Training Score: 0.9993521983666429
Mean Validation Score: 0.7991368593498637
CPU times: user 1h 2min 22s, sys: 3min 6s, total: 1h 5min 29s
Wall time: 15min 36s


In [7]:
# Average the per-fold competition scores returned by train().
# In the recorded run the training mean (~0.999) is far above the validation
# mean (~0.799), which suggests the pipeline overfits the training folds.
print(f"Mean Training Score: {np.mean(train_score)}")
print(f"Mean Validation Score: {np.mean(validation_score)}")


Mean Training Score: 0.9993521983666429
Mean Validation Score: 0.7991368593498637


## Full Re-Training

In [8]:
# Refit on every training row, using the same vectorizer settings and
# stacked classifier as the cross-validation loop.
X_full = df_train["text"]
y_full = df_train[LABELS].astype(bool).astype(int)  # any non-zero label -> 1

count_vectorizer = CountVectorizer(
    stop_words=list(stopwords),
    lowercase=True,
    ngram_range=(1, 1),
    dtype=np.float32,
)
pipe = Pipeline(
    [
        ("cv", count_vectorizer),
        ("model", MultiOutputClassifier(hate_stack)),
    ]
)
pipe.fit(X_full, y_full)

# Predictions for the test tweets, scored in the next cell.
y_pred = pipe.predict(df_test["text"])


In [9]:
# Binarise the test labels the same way as the training labels (non-zero -> 1)
# and score the fully re-trained pipeline with the competition metric.
ground_truth = df_test[LABELS].astype(bool).astype(int)
print(f"Test Score: {f1_custom(ground_truth.values, y_pred)}")


Test Score: 0.8172866645873844
