# Treinamento de modelos de aprendizagem de máquina em classificação binária e multilabel usando o dataset ToLD-Br

## Imports e configurações iniciais

In [None]:
import os
import re
import string
import unicodedata
import json
import logging

import pandas as pd
import numpy as np

import nltk
from nltk.corpus import stopwords
nltk.download("stopwords")

In [None]:
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, confusion_matrix, classification_report)
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer

In [None]:
# Configure the root logger so INFO-level (and higher) records are emitted.
logging.basicConfig(level=logging.INFO)
# Module-level logger used by the training and evaluation helpers in this file.
logger = logging.getLogger(__name__)

In [None]:
# Single seed reused for NumPy's global RNG, the train/test splits and every
# estimator in this file that takes a ``random_state``.
SEED = 42
np.random.seed(SEED)

## Carregamento dos dados

In [None]:
# Location of the ToLD-Br CSV file, relative to the current working directory.
dataset_path = "../ToLD-BR.csv"

# Load the whole dataset into memory, decoding the file as UTF-8.
df = pd.read_csv(dataset_path, encoding="utf-8")

In [None]:
# Preview the first rows as a quick sanity check of the load.
df.head()

## Pré-processamento de dados

Conversão para minúsculas e remoção de acentos, URLs, menções, hashtags, dígitos, pontuação e (opcionalmente) stopwords

In [None]:
def remove_accentuation(text):
    """Return ``text`` with its diacritics stripped.

    The string is decomposed with Unicode NFKD normalization and every
    combining character produced by that decomposition is dropped, so
    "ação" becomes "acao".
    """
    decomposed = unicodedata.normalize("NFKD", text)
    base_chars = (ch for ch in decomposed if unicodedata.combining(ch) == 0)
    return "".join(base_chars)

In [None]:
# Translation table mapping every ASCII punctuation character to a space.
# Built once here instead of on every call. Punctuation is replaced by
# whitespace (not deleted) so that words separated only by punctuation,
# e.g. "oi,tudo", stay separate tokens instead of being glued into "oitudo".
_PUNCT_TO_SPACE = str.maketrans({ch: " " for ch in string.punctuation})


def clean_text(text, stopwords_set=None):
    """Normalize a single text for bag-of-words vectorization.

    Steps, in order: lowercase, strip accents, drop URLs, @mentions,
    #hashtags and digit runs, turn ASCII punctuation into spaces, collapse
    whitespace and optionally filter stopwords.

    Args:
        text: The raw string, or ``None``.
        stopwords_set: Optional collection of tokens to discard. Tokens are
            compared *after* lowercasing and accent stripping, so entries
            should be lowercase and accent-free to match.

    Returns:
        The cleaned tokens joined by single spaces; ``""`` when ``text`` is
        ``None`` or nothing survives the cleaning.
    """
    if text is None:
        return ""
    text = text.lower()
    text = remove_accentuation(text)
    # URLs, mentions and hashtags must go before punctuation is touched,
    # otherwise their markers ("://", "@", "#") would already be gone.
    text = re.sub(r"http\S+|www\S+|https\S+", " ", text)
    text = re.sub(r"@\w+", " ", text)
    text = re.sub(r"#\w+", " ", text)
    text = re.sub(r"\d+", " ", text)
    text = text.translate(_PUNCT_TO_SPACE)
    # split() with no argument also collapses the runs of spaces left above.
    tokens = text.split()
    if stopwords_set is not None:
        tokens = [w for w in tokens if w not in stopwords_set]
    return " ".join(tokens)

In [None]:
def preprocess(series, stopwords_set=None):
    """Apply ``clean_text`` to every element of a pandas Series.

    Args:
        series: Series holding the raw texts.
        stopwords_set: Optional collection of stopwords. It is lowercased
            and accent-stripped once here, because ``clean_text`` removes
            accents from the text before filtering; without this step
            accented entries such as "não" would never match "nao".

    Returns:
        A Series of cleaned strings with the same index. Missing values
        (``None``/NaN) map to ``""`` instead of the literal token "nan"
        that ``str(NaN)`` would produce.
    """
    if stopwords_set is not None:
        stopwords_set = {remove_accentuation(w.lower()) for w in stopwords_set}

    def _clean(value):
        if pd.isna(value):
            return ""
        return clean_text(str(value), stopwords_set=stopwords_set)

    return series.map(_clean)

## Tratamento e Rotulagem

In [None]:
# NLTK's Portuguese stopword list, held as a set for fast membership tests.
stopwords_pt = set(stopwords.words("portuguese"))
# Cleaned copy of the raw "text" column; this is the column that gets vectorized.
df["text_clean"] = preprocess(df["text"], stopwords_set=stopwords_pt)

In [None]:
# Names of the per-category label columns; both the binary and the multilabel
# targets are derived from these columns.
categories = ["homophobia","obscene","insult","racism","misogyny","xenophobia"]

In [None]:
# TF-IDF over unigrams and bigrams. min_df=5 drops terms seen in fewer than
# 5 documents; max_df=0.9 drops terms present in more than 90% of documents.
vect = TfidfVectorizer(ngram_range=(1,2), min_df=5, max_df=0.9)

### Binária

In [None]:
# Work on a copy so the extra target column is not added to `df`.
df_binary = df.copy()
# A row is labelled toxic (1) when the sum of its category columns is
# positive, otherwise 0.
df_binary["toxic_binary"] = (df_binary[categories].sum(axis=1) > 0).astype(int)

In [None]:
# Cleaned texts (features) and 0/1 targets as NumPy arrays.
X_binary = df_binary["text_clean"].values
y_binary = df_binary["toxic_binary"].values

In [None]:
# 70/30 hold-out split, stratified on the target so both parts keep the
# same toxic/non-toxic ratio; seeded for reproducibility.
X_train_bin, X_test_bin, y_train_bin, y_test_bin = train_test_split(
    X_binary, y_binary, test_size=0.3, stratify=y_binary, random_state=SEED
)

In [None]:
# Fit the vectorizer on the training texts only, then reuse that fitted
# state to transform the test texts (the test set must not influence it).
X_train_bin_tfidf = vect.fit_transform(X_train_bin)
X_test_bin_tfidf  = vect.transform(X_test_bin)

### Multilabel

In [None]:
# Independent copy of the dataset for the multilabel experiment.
df_multil = df.copy()

In [None]:
# Features: the cleaned texts. Targets: one 0/1 indicator column per
# category, set to 1 wherever the category's value is positive.
X_multil = df_multil["text_clean"].values
y_multil = (df_multil[categories] > 0).astype(int).values

In [None]:
# 70/30 hold-out split with the shared seed; no stratification is applied
# to the multilabel targets here.
X_train_ml, X_test_ml, y_train_ml, y_test_ml = train_test_split(
    X_multil, y_multil, test_size=0.3, random_state=SEED
)

In [None]:
# Use a dedicated vectorizer for the multilabel split. Re-fitting the shared
# `vect` here would overwrite the state it learned from the binary training
# split, leaving `vect` out of sync with the binary matrices and the models
# trained on them. The copy is built from the same hyper-parameters, so the
# resulting matrices are the same as a re-fit of `vect` would produce.
vect_ml = TfidfVectorizer(**vect.get_params())
X_train_ml_tfidf = vect_ml.fit_transform(X_train_ml)
X_test_ml_tfidf  = vect_ml.transform(X_test_ml)

## Execução e Avaliação

In [None]:
# Base estimators compared in both experiments, keyed by the name used in
# logs and result files. Those that take `random_state` share SEED;
# LogisticRegression, DecisionTree and RandomForest are built with
# class_weight="balanced", while MultinomialNB and KNN are not given one.
models = {
    "LogisticRegression": LogisticRegression(max_iter=200, random_state=SEED, class_weight="balanced"),
    "NaiveBayes":        MultinomialNB(),
    "DecisionTree":      DecisionTreeClassifier(random_state=SEED, class_weight="balanced"),
    "RandomForest":      RandomForestClassifier(n_estimators=100, random_state=SEED, class_weight="balanced", n_jobs=-1),
    "KNN":               KNeighborsClassifier(n_neighbors=5)
}

In [None]:
# (name, estimator) pairs in the form VotingClassifier expects for `estimators`.
estimators = list(models.items())

In [None]:
def save_results_json(execution_type, results):
    """Write ``results`` to ``./results_<execution_type>.json`` and log it.

    The file is UTF-8 encoded, indented by 4 spaces and keeps non-ASCII
    characters unescaped.
    """
    target = f"./results_{execution_type}.json"
    with open(target, "w", encoding="utf-8") as out_file:
        json.dump(results, out_file, indent=4, ensure_ascii=False)
    logger.info(f"Resultados de classificação {execution_type} salvos em results_{execution_type}.json")

### Binária

In [None]:
def test_results_binary(name, y_test, y_pred, results):
    """Score binary predictions, log the scores and store them in ``results``.

    Args:
        name: Key under which the metrics are stored (the model's label).
        y_test: Ground-truth labels; expected to be 0/1.
        y_pred: Predicted labels aligned with ``y_test``.
        results: Dict updated in place with ``results[name] = metrics``.

    Returns:
        The metrics dict that was stored: accuracy, precision, recall,
        F1 and the confusion matrix as nested lists.
    """
    acc = accuracy_score(y_test, y_pred)
    # zero_division=0 reports 0 instead of warning when a score is undefined.
    prec = precision_score(y_test, y_pred, zero_division=0)
    rec = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)
    # Pin the label order so the matrix is always 2x2; without `labels` it
    # collapses to 1x1 when only one class appears in y_test and y_pred.
    cm = confusion_matrix(y_test, y_pred, labels=[0, 1]).tolist()
    logger.info(f"{name} — acc:{acc:.4f}, prec:{prec:.4f}, rec:{rec:.4f}, f1:{f1:.4f}")
    metrics = {
        "accuracy": acc,
        "precision": prec,
        "recall": rec,
        "f1_score": f1,
        "confusion_matrix": cm,
    }
    results[name] = metrics
    return metrics

In [None]:
def run_models_binary(results, X_train, y_train, X_test, y_test):
    """Train and evaluate every estimator in ``models`` on the binary task.

    Each estimator from the module-level ``models`` dict is fitted in place
    on the training data; its test-set predictions are then scored by
    ``test_results_binary``, which records them in ``results`` under the
    estimator's name.
    """
    for name in models:
        estimator = models[name]
        logger.info(f"[Binária] Treinando modelo: {name}")
        estimator.fit(X_train, y_train)
        predictions = estimator.predict(X_test)
        test_results_binary(name, y_test, predictions, results)

### Multilabel

In [None]:
def test_results_multilabel(name, y_test, y_pred, results):
    """Score multilabel predictions, log the scores and store them.

    Args:
        name: Key under which the metrics are stored (the model's label).
        y_test: 2-D 0/1 indicator array of true labels, one column per label.
        y_pred: Predicted indicator array with the same layout as ``y_test``.
        results: Dict updated in place with ``results[name] = metrics``.

    Returns:
        The metrics dict that was stored: accuracy, macro-averaged
        precision/recall/F1 and one confusion matrix per label column.
    """
    # With 2-D indicator targets scikit-learn computes subset accuracy here:
    # a sample counts only if every one of its labels is predicted correctly.
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, average="macro", zero_division=0)
    rec = recall_score(y_test, y_pred, average="macro", zero_division=0)
    f1 = f1_score(y_test, y_pred, average="macro", zero_division=0)
    # One matrix per label column. The label order is pinned so each matrix
    # is always 2x2, even for a label with no positives in y_test and y_pred,
    # where the default would collapse it to 1x1.
    n_labels = y_test.shape[1]
    cm = [
        confusion_matrix(y_test[:, i], y_pred[:, i], labels=[0, 1]).tolist()
        for i in range(n_labels)
    ]
    logger.info(f"{name} multilabel — acc:{acc:.4f}, prec:{prec:.4f}, rec:{rec:.4f}, f1:{f1:.4f}")
    metrics = {
        "accuracy": acc,
        "precision_macro": prec,
        "recall_macro": rec,
        "f1_macro": f1,
        "confusion_matrix_per_class": cm,
    }
    results[name] = metrics
    return metrics

In [None]:
def run_models_multilabel(results, X_train, y_train, X_test, y_test):
    """Train and evaluate every estimator in ``models`` on the multilabel task.

    Each base estimator from the module-level ``models`` dict is wrapped in
    a ``OneVsRestClassifier`` and fitted on the training data; the wrapper's
    test-set predictions are scored by ``test_results_multilabel``, which
    records them in ``results`` under the base estimator's name.
    """
    for name in models:
        logger.info(f"[Multilabel] Treinando modelo: {name}")
        one_vs_rest = OneVsRestClassifier(models[name])
        one_vs_rest.fit(X_train, y_train)
        predictions = one_vs_rest.predict(X_test)
        test_results_multilabel(name, y_test, predictions, results)

### Ensemble Voting

In [None]:
def ensemble_voting(execution_type, results, X_train, y_train, X_test, y_test):
    """Train a soft-voting ensemble of ``estimators`` and record its metrics.

    Args:
        execution_type: Task label. "Multilabel" (matched case-insensitively)
            selects the multilabel path; any other value is treated as the
            binary task. The value is also used verbatim in the log line.
        results: Dict updated in place under the key "VotingEnsemble".
        X_train, y_train: Training features and targets.
        X_test, y_test: Held-out features and targets used for scoring.
    """
    logger.info(f"[{execution_type}] Treinando Voting Ensemble")
    # Compare case-insensitively: this file passes "Multilabel" here but
    # "multilabel" to save_results_json, and an exact-case check would
    # silently send the lowercase spelling down the binary path.
    is_multilabel = execution_type.lower() == "multilabel"
    voting = VotingClassifier(estimators=estimators, voting="soft", n_jobs=-1)
    if is_multilabel:
        # Wrapped so one voting ensemble is trained per label column.
        voting = OneVsRestClassifier(voting)
    voting.fit(X_train, y_train)
    y_pred_v = voting.predict(X_test)
    if is_multilabel:
        test_results_multilabel("VotingEnsemble", y_test, y_pred_v, results)
    else:
        test_results_binary("VotingEnsemble", y_test, y_pred_v, results)

## Treinamentos

### Binário

In [None]:
# Collects the metrics of every binary model, keyed by model name.
results_bin = {}

# Evaluate each individual model first, then the soft-voting ensemble.
run_models_binary(results_bin, X_train_bin_tfidf, y_train_bin, X_test_bin_tfidf, y_test_bin)
ensemble_voting("Binaria", results_bin, X_train_bin_tfidf, y_train_bin, X_test_bin_tfidf, y_test_bin)

# Persist the collected metrics to ./results_binaria.json.
save_results_json("binaria", results_bin)

### Multilabel

In [None]:
# Collects the metrics of every multilabel model, keyed by model name.
results_ml = {}

# Evaluate each individual model (one-vs-rest), then the soft-voting ensemble.
run_models_multilabel(results_ml, X_train_ml_tfidf, y_train_ml, X_test_ml_tfidf, y_test_ml)
ensemble_voting("Multilabel", results_ml, X_train_ml_tfidf, y_train_ml, X_test_ml_tfidf, y_test_ml)

# Persist the collected metrics to ./results_multilabel.json.
save_results_json("multilabel", results_ml)