In [51]:
!pip install --upgrade gensim



In [52]:
!pip install emoji



In [53]:
# =============================================================================
# MILESTONE 2 - HATEBR TEXT CLASSIFICATION (Simplified)
# Dataset: HateBR[](https://huggingface.co/datasets/franciellevargas/HateBR)
# Baselines:
#   A) TF-IDF (1-2 n-grams) + linguistic features + LogisticRegression
#   B) GloVe Average Embeddings + linguistic features + RandomForestClassifier
# =============================================================================

import pandas as pd
import numpy as np
import re
import spacy
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, f1_score
from sklearn.preprocessing import StandardScaler
from scipy.sparse import hstack, csr_matrix
from gensim.models import KeyedVectors
import emoji
import warnings
warnings.filterwarnings('ignore')

1. CARGA DEL DATASET (como en Proyecto 1)

In [54]:
# ============================================================
# CARGAR DATASET DESDE GITHUB (como en Proyecto 1)
# ============================================================

# URL "raw" del archivo HateBR.csv en GitHub
url = "https://raw.githubusercontent.com/franfgv9/Milestone_1_PLN/refs/heads/main/HateBR.csv"

# Leer el CSV directamente desde GitHub
df = pd.read_csv(url)

# Verificar la carga
print("Dimensiones del dataset:", df.shape)
print("Columnas disponibles:", df.columns.tolist())
df.head()

Dimensiones del dataset: (7000, 8)
Columnas disponibles: ['id', 'comentario', 'anotator1', 'anotator2', 'anotator3', 'label_final', 'links_post', 'account_post']


Unnamed: 0,id,comentario,anotator1,anotator2,anotator3,label_final,links_post,account_post
0,1,Mais um lixo,1,1,1,1,https://www.instagram.com/p/B2uThqdH9xI/,Carla Zambelli
1,2,Essa nao tem vergonha na cara!!,1,1,1,1,https://www.instagram.com/p/B2uThqdH9xI/,Carla Zambelli
2,3,Essa mulher é doente.pilantra!,1,1,1,1,https://www.instagram.com/p/B2uThqdH9xI/,Carla Zambelli
3,4,Comunista safada...,1,1,1,1,https://www.instagram.com/p/B2uThqdH9xI/,Carla Zambelli
4,5,Vagabunda. Comunista. Mentirosa. O povo chilen...,1,1,1,1,https://www.instagram.com/p/B2uThqdH9xI/,Carla Zambelli


In [55]:
# ============================================================
# SELECCIONAR COLUMNAS RELEVANTES
# ============================================================

df = df[["id", "comentario", "label_final"]]

# Verificar el resultado
print("\nColumnas seleccionadas:", df.columns.tolist())
print("Dimensiones del dataset:", df.shape)
df.head()


Columnas seleccionadas: ['id', 'comentario', 'label_final']
Dimensiones del dataset: (7000, 3)


Unnamed: 0,id,comentario,label_final
0,1,Mais um lixo,1
1,2,Essa nao tem vergonha na cara!!,1
2,3,Essa mulher é doente.pilantra!,1
3,4,Comunista safada...,1
4,5,Vagabunda. Comunista. Mentirosa. O povo chilen...,1


In [56]:
# ============================================================
# RENOMBRAR COLUMNAS PARA UNIFORMIDAD
# ============================================================

df = df.rename(columns={
    "comentario": "text",
    "label_final": "label"
})

print("\nColumnas finales:", df.columns.tolist())
df.head()

print("\nInformación del dataset:")
print(df.info())

# Limpiar valores nulos
df.dropna(subset=['text', 'label'], inplace=True)
df = df[df['label'].isin([0, 1])]  # 0: No ofensivo, 1: Ofensivo
df.reset_index(drop=True, inplace=True)

print(f"\nDataset final: {len(df)} ejemplos")
print("Distribución de clases:")
print(df['label'].value_counts())


Columnas finales: ['id', 'text', 'label']

Información del dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7000 entries, 0 to 6999
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      7000 non-null   int64 
 1   text    7000 non-null   object
 2   label   7000 non-null   int64 
dtypes: int64(2), object(1)
memory usage: 164.2+ KB
None

Dataset final: 7000 ejemplos
Distribución de clases:
label
1    3500
0    3500
Name: count, dtype: int64


2. LIMPIEZA GENERAL (para features lingüísticas)

La idea es:

Limpiar ligeramente el texto, pero sin “destrozarlo”, para poder extraer después features lingüísticas (POS tags, entidades, dependencias, etc.).

Guardar esa versión limpia en una nueva columna del DataFrame.

Descargar y cargar spaCy en portugués, que usarás más tarde para sacar esas features.

In [57]:
# ============================================================
# PREPROCESAMIENTO GENERAL (para features lingüísticas)
# ============================================================

def clean_for_features(text):
    text = re.sub(r'http[s]?://\S+', '', text)  # Quitar URLs
    text = re.sub(r'\s+', ' ', text).strip()    # Normalizar espacios
    return text

df['text_features'] = df['text'].apply(clean_for_features)

# Descargar modelo spaCy para portugués si no está instalado
!python -m spacy download pt_core_news_sm

# Cargar modelo spaCy para portugués
print("\nCargando modelo spaCy (pt_core_news_sm)...")
nlp = spacy.load("pt_core_news_sm")

Collecting pt-core-news-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/pt_core_news_sm-3.8.0/pt_core_news_sm-3.8.0-py3-none-any.whl (13.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.0/13.0 MB[0m [31m133.6 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('pt_core_news_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.

Cargando modelo spaCy (pt_core_news_sm)...


Por qué no más limpieza?
→ Porque queremos preservar mayúsculas, signos, emojis para extraer features.
Solo quitamos URLs y espacios extra → no afectan al análisis lingüístico.

Aquí usas expresiones regulares con re.sub para eliminar enlaces.

re.sub(patrón, reemplazo, texto)
→ busca el patrón en texto y lo sustituye por reemplazo.

Desglose del patrón r'http[s]?://\S+':

r'': raw string → para que Python no interprete \ como escapes raros.

'http': busca exactamente la secuencia de letras http.

'[s]?':

[s] significa la letra s.

? significa “0 o 1 vez”.

Es decir, coincide con http:// y también con https://.

'://': los caracteres :// literalmente.

\S+:

\S = “cualquier carácter que no sea espacio en blanco”.

+ = “1 o más veces”.

Patrón r'\s+':

\s = cualquier espacio en blanco (espacio, tabulación, salto de línea, etc.).

+ = uno o más seguidos.

Es decir, detecta “bloques de espacios en blanco”.

re.sub(r'\s+', ' ', text):

Sustituye cualquier bloque de espacios/blancos (por ejemplo, " ", "\n\n", "\t \n") por un único espacio " ".

Esto “normaliza” la separación entre palabras.

.strip():

Elimina espacios al principio y al final del string.

Así no quedan frases empezando o terminando con espacios.

Ejemplo:

Antes: " Olha isso: ridículo \n\n "

Después: "Olha isso: ridículo"

3. LIMPIEZA ESPECÍFICA POR MODELO

In [58]:
# ============================================================
# LIMPIEZA ESPECÍFICA POR MODELO
# ============================================================

# --- Modelo A: TF-IDF (superficial) ---
def clean_for_tfidf(text):
    text = re.sub(r'http[s]?://\S+', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

df['text_clean_tfidf'] = df['text'].apply(clean_for_tfidf)

# No se baja a minúsculas → lowercase=False en TfidfVectorizer
# Se preservan mayúsculas → "IDIOTA" ≠ "idiota" → señal de ofensa

# --- Modelo B: Embeddings (semántico) ---
emoji_map = {'[laughing face]': 'risada', '[angry face]': 'raiva', '[heart]': 'amor'}

def replace_emojis(text):
    for emj, word in emoji_map.items():
        text = text.replace(emj, f' {word} ')
    return text

def clean_for_embeddings(text):
    text = re.sub(r'http[s]?://\S+', '', text)
    text = replace_emojis(text)
    text = text.lower()
    text = re.sub(r'[^a-záéíóúâêôãõç\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Baja a minúsculas → GloVe es case-sensitive
# Reemplaza emojis → "risada" en lugar de cara riendo
# Solo letras y acentos → compatibilidad con GloVe

df['text_clean_emb'] = df['text'].apply(clean_for_embeddings)

4. EXTRACCIÓN DE FEATURES LINGÜÍSTICAS

In [59]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

portuguese_stopwords = stopwords.words('portuguese')

print(f"Loaded {len(portuguese_stopwords)} Portuguese stopwords.")
print("First 10 stopwords:", portuguese_stopwords[:10])

Loaded 207 Portuguese stopwords.
First 10 stopwords: ['a', 'à', 'ao', 'aos', 'aquela', 'aquelas', 'aquele', 'aqueles', 'aquilo', 'as']


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [60]:
offensive_comments = df[df['label'] == 1]['text_clean_emb']
inoffensive_comments = df[df['label'] == 0]['text_clean_emb']

# Tokenize the comments
all_offensive_tokens = [word for comment in offensive_comments for word in comment.split()]
all_inoffensive_tokens = [word for comment in inoffensive_comments for word in comment.split()]

print(f"Total offensive tokens: {len(all_offensive_tokens)}")
print(f"Total inoffensive tokens: {len(all_inoffensive_tokens)}")

filtered_offensive_tokens = [word for word in all_offensive_tokens if word not in portuguese_stopwords]
filtered_inoffensive_tokens = [word for word in all_inoffensive_tokens if word not in portuguese_stopwords]

print(f"Total offensive tokens after stopword removal: {len(filtered_offensive_tokens)}")
print(f"Total inoffensive tokens after stopword removal: {len(filtered_inoffensive_tokens)}")

from collections import Counter

offensive_word_counts = Counter(filtered_offensive_tokens)
inoffensive_word_counts = Counter(filtered_inoffensive_tokens)

print("Top 10 most common offensive words (after stopword removal):")
print(offensive_word_counts.most_common(10))
print("\nTop 10 most common inoffensive words (after stopword removal):")
print(inoffensive_word_counts.most_common(10))

unique_offensive_tokens_scored = {}

# Consider words present in offensive comments
for word, offensive_count in offensive_word_counts.items():
    inoffensive_count = inoffensive_word_counts.get(word, 0)

    # Calculate a score: higher if much more frequent in offensive comments
    # Add 1 to avoid division by zero if inoffensive_count is 0
    # and to smooth the ratio a bit. Higher 'offensive_count' is also weighted
    score = offensive_count / (inoffensive_count + 1) * offensive_count

    # Only consider words that are frequent enough in offensive comments (e.g., > 10 occurrences)
    # and have a significant score
    if offensive_count > 10 and score > 50:
        unique_offensive_tokens_scored[word] = score

# Sort and display top unique offensive tokens
sorted_unique_offensive_tokens = sorted(unique_offensive_tokens_scored.items(), key=lambda item: item[1], reverse=True)

print("\nTop 20 Unique Offensive Tokens (highly frequent in offensive, rare in inoffensive):")
for word, score in sorted_unique_offensive_tokens[:20]:
    print(f"- {word} (Offensive Count: {offensive_word_counts[word]}, Inoffensive Count: {inoffensive_word_counts.get(word, 0)}, Score: {score:.2f})")

unique_offensive_words = set(item[0] for item in sorted_unique_offensive_tokens)
print(f"Extracted {len(unique_offensive_words)} unique offensive words.")
print(f"First 10 unique offensive words: {list(unique_offensive_words)[:10]}")

Total offensive tokens: 53047
Total inoffensive tokens: 42523
Total offensive tokens after stopword removal: 31254
Total inoffensive tokens after stopword removal: 25031
Top 10 most common offensive words (after stopword removal):
[('pra', 357), ('vai', 270), ('brasil', 235), ('cara', 188), ('vc', 178), ('presidente', 163), ('povo', 156), ('vergonha', 155), ('esquerda', 144), ('tá', 138)]

Top 10 most common inoffensive words (after stopword removal):
[('presidente', 358), ('parabéns', 334), ('deus', 290), ('brasil', 266), ('vc', 175), ('pra', 171), ('bolsonaro', 167), ('sempre', 137), ('bem', 134), ('lula', 124)]

Top 20 Unique Offensive Tokens (highly frequent in offensive, rare in inoffensive):
- merda (Offensive Count: 104, Inoffensive Count: 0, Score: 10816.00)
- pirralha (Offensive Count: 134, Inoffensive Count: 1, Score: 8978.00)
- cadeia (Offensive Count: 91, Inoffensive Count: 0, Score: 8281.00)
- lixo (Offensive Count: 135, Inoffensive Count: 2, Score: 6075.00)
- nojo (Offens

In [61]:
def extract_linguistic_features(text):
    doc = nlp(text)
    features = {}

    # Básicas
    features['n_tokens'] = len(doc)
    features['upper_ratio'] = sum(1 for c in text if c.isupper()) / len(text) if text else 0
    features['n_exclam'] = text.count('!')
    features['n_question'] = text.count('?')

    # POS tags (solo adjetivos, sustantivos, verbos)  -> Son categorías especialmente relacionadas con lenguaje ofensivo
    pos_counts = {'ADJ': 0, 'NOUN': 0, 'VERB': 0}
    for token in doc:
        if token.pos_ in pos_counts:
            pos_counts[token.pos_] += 1         # si coincide la categoría gramatical con las que tenemos en pos_counts entonces sumas 1
    total = sum(pos_counts.values())
    for pos in pos_counts:
        features[f'prop_{pos.lower()}'] = pos_counts[pos] / total if total > 0 else 0

    # Emojis
    features['n_emojis'] = len(emoji.emoji_list(text))

    # Nuevas Features Solicitadas:
    # 1. Presencia de hashtags
    features['n_hashtags'] = len(re.findall(r'#\w+', text))

    # 2. Presencia de menciones (@usuario)
    features['n_mentions'] = len(re.findall(r'@\w+', text))

    # 3. Presencia de pronombres de segunda persona
    n_second_person_pronouns = 0
    for token in doc:
        # Check if it's a pronoun and has 'Person=2' morphology
        if token.pos_ == "PRON" and token.morph.get("Person") == ["2"]:
            n_second_person_pronouns += 1
    features['n_second_person_pronouns'] = n_second_person_pronouns

    # 4. Presencia del patrón NOUN + ADJ
    n_noun_adj_pattern = 0
    for i, token in enumerate(doc):
        if token.pos_ == "NOUN" and i + 1 < len(doc) and doc[i + 1].pos_ == "ADJ":
            n_noun_adj_pattern += 1
    features['n_noun_adj_pattern'] = n_noun_adj_pattern

    # 5. Presencia del patrón pronombre 2ª persona + adjetivo
    n_pron_2nd_adj_pattern = 0
    for i, token in enumerate(doc):
        if token.pos_ == "PRON" and token.morph.get("Person") == ["2"] and i + 1 < len(doc) and doc[i + 1].pos_ == "ADJ":
            n_pron_2nd_adj_pattern += 1
    features['n_pron_2nd_adj_pattern'] = n_pron_2nd_adj_pattern

    # New Feature: Count of unique offensive tokens
    n_unique_offensive_tokens = 0
    for token in doc:
        if token.lemma_.lower() in unique_offensive_words:
            n_unique_offensive_tokens += 1
    features['n_unique_offensive_tokens'] = n_unique_offensive_tokens


    return features

print("Extrayendo features lingüísticas...")
feat_list = df['text_features'].apply(extract_linguistic_features)
feat_df = pd.DataFrame(feat_list.tolist())
print(f"Features extraídas: {feat_df.shape[1]}")


Extrayendo features lingüísticas...
Features extraídas: 14


In [62]:
print(feat_df.head())

   n_tokens  upper_ratio  n_exclam  n_question  prop_adj  prop_noun  \
0         3     0.083333         0           0      0.00       1.00   
1         8     0.032258         2           0      0.00       0.75   
2         5     0.033333         1           0      0.00       1.00   
3         3     0.052632         0           0      0.00       0.50   
4        15     0.051282         0           0      0.25       0.50   

   prop_verb  n_emojis  n_hashtags  n_mentions  n_second_person_pronouns  \
0       0.00         0           0           0                         0   
1       0.25         0           0           0                         0   
2       0.00         0           0           0                         0   
3       0.50         0           0           0                         0   
4       0.25         0           0           0                         0   

   n_noun_adj_pattern  n_pron_2nd_adj_pattern  n_unique_offensive_tokens  
0                   0                    

In [63]:
print(df['text'].head(5))

0                                         Mais um lixo
1                      Essa nao tem vergonha na cara!!
2                       Essa mulher é doente.pilantra!
3                                  Comunista safada...
4    Vagabunda. Comunista. Mentirosa. O povo chilen...
Name: text, dtype: object


In [64]:
# n_tokens    Comentarios largos → más contexto
# upper_ratio   MAYÚSCULAS = agresividad
# n_exclam, n_question    Exclamaciones = emoción fuerte
# prop_adj, prop_noun, prop_verb    Insultos suelen tener más adjetivos ("feio" "idiota")   --> IMPORTANTE: Estas proporciones no son respecto al total de tokens, sino respecto al total de ADJ + NOUN + VERB --> Por eso siempre suman 1
# n_emojis       Emojis como risa o cara enojada refuerzan el tono

5. DIVISIÓN DE DATOS

In [65]:
# ============================================================
# DIVISIÓN DE DATOS
# ============================================================

X_text_tfidf = df['text_clean_tfidf']
X_text_emb = df['text_clean_emb']
X_features = feat_df.values
y = df['label'].values

# 1. Separar 30% temporal (val + test)
X_train_tfidf, X_temp_tfidf, \
X_train_emb, X_temp_emb, \
X_feat_train, X_temp_feat, \
y_train, y_temp = train_test_split(
    X_text_tfidf, X_text_emb, X_features, y,
    test_size=0.3, random_state=42, stratify=y
)

# 2. Del 30% temporal → 15% val + 15% test
X_val_tfidf, X_test_tfidf, \
X_val_emb, X_test_emb, \
X_val_feat, X_feat_test, \
y_val, y_test = train_test_split(
    X_temp_tfidf, X_temp_emb, X_temp_feat, y_temp,
    test_size=0.5, random_state=42, stratify=y_temp
)

# Escalar solo con train
scaler = StandardScaler()
X_feat_train_scaled = scaler.fit_transform(X_feat_train)
X_val_feat_scaled = scaler.transform(X_val_feat)
X_feat_test_scaled = scaler.transform(X_feat_test)

# Escalado: necesario para combinar con TF-IDF y embeddings


6. MODELO A: TF-IDF + FEATURES + LogisticRegression

In [66]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score

# ============================================================
# MODELO A: TF-IDF + FEATURES → 3 ALGORITMOS + GRIDSEARCH
# ============================================================
print("\n" + "="*70)
print("TRAINING MODEL A: TF-IDF (1-2 n-grams) + LINGUISTIC FEATURES")
print("GridSearchCV con cv=3 → SOLO sobre TRAIN")
print("Validation set NO se usa en tuning → queda para análisis posterior si quieres")
print("Evaluación FINAL: solo en TEST (15%)")
print("="*70)
# Vectorizador TF-IDF
tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    lowercase=False,
    max_features=10000
)

X_tfidf_train = tfidf.fit_transform(X_train_tfidf)
X_tfidf_val   = tfidf.transform(X_val_tfidf)
X_tfidf_test  = tfidf.transform(X_test_tfidf)

# Combinar con features lingüísticas
X_train_A = hstack([X_tfidf_train, csr_matrix(X_feat_train_scaled)])
X_val_A   = hstack([X_tfidf_val,   csr_matrix(X_val_feat_scaled)])
X_test_A  = hstack([X_tfidf_test,  csr_matrix(X_feat_test_scaled)])

print(f"TF-IDF vocabulary size: {len(tfidf.vocabulary_):,} terms")
print(f"Train shape: {X_train_A.shape} | Val shape: {X_val_A.shape} | Test shape: {X_test_A.shape}")

# Resultados
results_A = {}

# 1. Logistic Regression
print("\nTraining Logistic Regression (GridSearch on C)...")
log_grid_A = GridSearchCV( # Renamed
    LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42),
    param_grid={'C': [0.1, 1.0, 10.0]},
    cv=3,                    # ← 3-fold CROSS-VALIDATION solo en TRAIN
    scoring='f1_macro',
    n_jobs=-1
)
log_grid_A.fit(X_train_A, y_train)
y_pred_log = log_grid_A.predict(X_test_A)
f1_log = f1_score(y_test, y_pred_log, average='macro')
results_A['LogisticRegression'] = (f1_log, log_grid_A.best_params_)
print(f"   Best C: {log_grid_A.best_params_['C']} | F1-macro (TEST): {f1_log:.4f}")

# 2. Random Forest
print("Training Random Forest...")
rf_grid_A = GridSearchCV( # Renamed
    RandomForestClassifier(random_state=42, n_jobs=-1),
    param_grid={
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 20, 30]
    },
    cv=3,
    scoring='f1_macro',
    n_jobs=-1
)
rf_grid_A.fit(X_train_A, y_train)
y_pred_rf = rf_grid_A.predict(X_test_A)
f1_rf = f1_score(y_test, y_pred_rf, average='macro')
results_A['RandomForest'] = (f1_rf, rf_grid_A.best_params_)
print(f"   Best params: {rf_grid_A.best_params_} | F1-macro (TEST): {f1_rf:.4f}")

# 3. Linear SVM
print("Training Linear SVM...")
svm_grid_A = GridSearchCV( # Renamed
    LinearSVC(class_weight='balanced', max_iter=10000, random_state=42),
    param_grid={'C': [0.01, 0.1, 1.0, 10.0]},
    cv=3,
    scoring='f1_macro',
    n_jobs=-1
)
svm_grid_A.fit(X_train_A, y_train)
y_pred_svm = svm_grid_A.predict(X_test_A)
f1_svm = f1_score(y_test, y_pred_svm, average='macro')
results_A['SVM'] = (f1_svm, svm_grid_A.best_params_)
print(f"   Best C: {svm_grid_A.best_params_['C']} | F1-macro (TEST): {f1_svm:.4f}")

# RESUMEN
print("\n" + "="*60)
print("MODEL A SUMMARY (TF-IDF + Linguistic Features)")
print("Tuning: cv=3 sobre TRAIN | Evaluación: TEST (15%)")
print("="*60)
for name, (f1, params) in results_A.items():
    print(f"{name:18} → F1-macro (TEST): {f1:.4f} | Best params: {params}")
print("="*60)

best_A_name = max(results_A, key=lambda k: results_A[k][0])
best_A_f1 = results_A[best_A_name][0]
print(f"WINNER MODEL A → {best_A_name}")
print(f"F1-macro final en TEST: {best_A_f1:.4f}")
print("Validation set reservado para futuros experimentos o ensemble")
print("="*60)


TRAINING MODEL A: TF-IDF (1-2 n-grams) + LINGUISTIC FEATURES
GridSearchCV con cv=3 → SOLO sobre TRAIN
Validation set NO se usa en tuning → queda para análisis posterior si quieres
Evaluación FINAL: solo en TEST (15%)
TF-IDF vocabulary size: 10,000 terms
Train shape: (4900, 10014) | Val shape: (1050, 10014) | Test shape: (1050, 10014)

Training Logistic Regression (GridSearch on C)...
   Best C: 10.0 | F1-macro (TEST): 0.8092
Training Random Forest...
   Best params: {'max_depth': None, 'n_estimators': 200} | F1-macro (TEST): 0.7590
Training Linear SVM...
   Best C: 1.0 | F1-macro (TEST): 0.8063

MODEL A SUMMARY (TF-IDF + Linguistic Features)
Tuning: cv=3 sobre TRAIN | Evaluación: TEST (15%)
LogisticRegression → F1-macro (TEST): 0.8092 | Best params: {'C': 10.0}
RandomForest       → F1-macro (TEST): 0.7590 | Best params: {'max_depth': None, 'n_estimators': 200}
SVM                → F1-macro (TEST): 0.8063 | Best params: {'C': 1.0}
WINNER MODEL A → LogisticRegression
F1-macro final en T

7. MODELO B: GloVe + FEATURES + RandomForest

In [67]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [68]:
# ============================================================
# MODELO B: GLOVE EMBEDDINGS + FEATURES (gensim)
# ============================================================

print("\n" + "="*60)
print("ENTRENANDO MODELO B: GLOVE + FEATURES")
print("="*60)

# Cargar GloVe (descargar desde NILC: http://nilc.icmc.usp.br/nwp/embeddings)
# Ejemplo: glove_s100.txt
glove_path = "/content/drive/MyDrive/proyecto2_PLN/glove_s100.txt"
print(f"Cargando GloVe desde: {glove_path}")
w2v_model = KeyedVectors.load_word2vec_format(glove_path)

def get_avg_embedding(text, model):
    words = text.split()
    valid = [w for w in words if w in model]
    if not valid:
        return np.zeros(model.vector_size)
    return np.mean([model[w] for w in valid], axis=0)

# Representación de frase: promedio de vectores
# Palabras fuera del vocabulario: ignoradas

print("Calculando embeddings promedio para train, val y test...")
X_emb_train = np.array([get_avg_embedding(t, w2v_model) for t in X_train_emb])
X_emb_val   = np.array([get_avg_embedding(t, w2v_model) for t in X_val_emb])
X_emb_test  = np.array([get_avg_embedding(t, w2v_model) for t in X_test_emb])

#Combinar con features lingüísticas escaladas
X_train_B = np.hstack([X_emb_train, X_feat_train_scaled])
X_val_B   = np.hstack([X_emb_val,   X_val_feat_scaled])
X_test_B  = np.hstack([X_emb_test,  X_feat_test_scaled])

print(f"GloVe embedding dimension: {X_emb_train.shape[1]}")
print(f"Train shape: {X_train_B.shape} | Val shape: {X_val_B.shape} | Test shape: {X_test_B.shape}")
# Resultados
results_B = {}

# 1. Logistic Regression
print("\nTraining Logistic Regression (GridSearch on C)...")
log_grid_B = GridSearchCV( # Renamed
    LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42),
    param_grid={'C': [0.1, 1.0, 10.0]},
    cv=3,
    scoring='f1_macro',
    n_jobs=-1
)
log_grid_B.fit(X_train_B, y_train)
y_pred_log = log_grid_B.predict(X_test_B)
f1_log = f1_score(y_test, y_pred_log, average='macro')
results_B['LogisticRegression'] = (f1_log, log_grid_B.best_params_)
print(f"   Best C: {log_grid_B.best_params_['C']} | F1-macro (TEST): {f1_log:.4f}")

# 2. Random Forest
print("Training Random Forest...")
rf_grid_B = GridSearchCV( # Renamed
    RandomForestClassifier(random_state=42, n_jobs=-1),
    param_grid={
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 20, 30]
    },
    cv=3,
    scoring='f1_macro',
    n_jobs=-1
)
rf_grid_B.fit(X_train_B, y_train)
y_pred_rf = rf_grid_B.predict(X_test_B)
f1_rf = f1_score(y_test, y_pred_rf, average='macro')
results_B['RandomForest'] = (f1_rf, rf_grid_B.best_params_)
print(f"   Best params: {rf_grid_B.best_params_} | F1-macro (TEST): {f1_rf:.4f}")

# 3. Linear SVM
print("Training Linear SVM...")
svm_grid_B = GridSearchCV( # Renamed
    LinearSVC(class_weight='balanced', max_iter=10000, random_state=42),
    param_grid={'C': [0.01, 0.1, 1.0, 10.0]},
    cv=3,
    scoring='f1_macro',
    n_jobs=-1
)
svm_grid_B.fit(X_train_B, y_train)
y_pred_svm = svm_grid_B.predict(X_test_B)
f1_svm = f1_score(y_test, y_pred_svm, average='macro')
results_B['SVM'] = (f1_svm, svm_grid_B.best_params_)
print(f"   Best C: {svm_grid_B.best_params_['C']} | F1-macro (TEST): {f1_svm:.4f}")

# RESUMEN FINAL MODELO B
print("\n" + "="*60)
print("MODEL B SUMMARY (GloVe avg + Linguistic Features)")
print("Tuning: cv=3 sobre TRAIN | Evaluación: TEST (15%)")
print("="*60)
for name, (f1, params) in results_B.items():
    print(f"{name:18} → F1-macro (TEST): {f1:.4f} | Best params: {params}")
print("="*60)

best_B_name = max(results_B, key=lambda k: results_B[k][0])
best_B_f1 = results_B[best_B_name][0]
print(f"WINNER MODEL B → {best_B_name}")
print(f"F1-macro final en TEST: {best_B_f1:.4f}")
print("Validation set reservado para futuros análisis o ensemble")
print("="*60)


ENTRENANDO MODELO B: GLOVE + FEATURES
Cargando GloVe desde: /content/drive/MyDrive/proyecto2_PLN/glove_s100.txt
Calculando embeddings promedio para train, val y test...
GloVe embedding dimension: 100
Train shape: (4900, 114) | Val shape: (1050, 114) | Test shape: (1050, 114)

Training Logistic Regression (GridSearch on C)...
   Best C: 1.0 | F1-macro (TEST): 0.8248
Training Random Forest...
   Best params: {'max_depth': None, 'n_estimators': 300} | F1-macro (TEST): 0.8009
Training Linear SVM...
   Best C: 0.1 | F1-macro (TEST): 0.8248

MODEL B SUMMARY (GloVe avg + Linguistic Features)
Tuning: cv=3 sobre TRAIN | Evaluación: TEST (15%)
LogisticRegression → F1-macro (TEST): 0.8248 | Best params: {'C': 1.0}
RandomForest       → F1-macro (TEST): 0.8009 | Best params: {'max_depth': None, 'n_estimators': 300}
SVM                → F1-macro (TEST): 0.8248 | Best params: {'C': 0.1}
WINNER MODEL B → LogisticRegression
F1-macro final en TEST: 0.8248
Validation set reservado para futuros análisis 

8. COMPARACIÓN TEÓRICA: ¿CUÁNDO GANA CADA MODELO?

In [69]:
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
import pandas as pd

print("\n" + "="*80)
print(" RESULTADOS FINALES - 6 MODELOS COMPLETOS")
print(" Métricas: Accuracy | Precision (macro) | Recall (macro) | F1-macro | F1 por clase")
print("="*80)

# Diccionario para guardar TODAS las métricas
full_results = {}

# ============================================================
# Función para evaluar un modelo y extraer TODAS las métricas
# ============================================================

def evaluate_model(name, y_true, y_pred):
    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, average='macro')
    rec = recall_score(y_true, y_pred, average='macro')
    f1_macro = f1_score(y_true, y_pred, average='macro')
    report = classification_report(y_true, y_pred, output_dict=True)
    f1_hate = report['1']['f1-score']
    f1_non_hate = report['0']['f1-score']

    return {
        'Accuracy': acc,
        'Precision_macro': prec,
        'Recall_macro': rec,
        'F1_macro': f1_macro,
        'F1_hate': f1_hate,
        'F1_non_hate': f1_non_hate
    }

# ------------------------------------------------------------------
# Evaluar todos los modelos guardados en results_A y results_B
# ------------------------------------------------------------------

# Necesitamos volver a predecir con los mejores modelos
# (los tenemos en log_grid.best_estimator_, etc.)

# --- MODELO A: TF-IDF ---
print("Evaluando modelos TF-IDF + features...")
# Logistic Regression
y_pred = log_grid_A.best_estimator_.predict(X_test_A)
full_results['TF-IDF + LogisticRegression'] = evaluate_model('TF-IDF + LR', y_test, y_pred)

# Random Forest
y_pred = rf_grid_A.best_estimator_.predict(X_test_A)
full_results['TF-IDF + RandomForest'] = evaluate_model('TF-IDF + RF', y_test, y_pred)

# SVM
y_pred = svm_grid_A.best_estimator_.predict(X_test_A)
full_results['TF-IDF + SVM'] = evaluate_model('TF-IDF + SVM', y_test, y_pred)

# --- MODELO B: GloVe ---
print("Evaluando modelos GloVe + features...")

# LogisticRegression GloVe
best_lr_B = LogisticRegression(**log_grid_B.best_params_, max_iter=1000, class_weight='balanced', random_state=42)
best_lr_B.fit(X_train_B, y_train)
y_pred_lr_glove = best_lr_B.predict(X_test_B)
full_results['GloVe + LogisticRegression'] = evaluate_model('GloVe + LR', y_test, y_pred_lr_glove)

# Random Forest GloVe
best_rf_B = RandomForestClassifier(**rf_grid_B.best_params_, random_state=42, n_jobs=-1)
best_rf_B.fit(X_train_B, y_train)
y_pred_rf_glove = best_rf_B.predict(X_test_B)
full_results['GloVe + RandomForest'] = evaluate_model('GloVe + RF', y_test, y_pred_rf_glove)

# SVM GloVe
best_svm_B = LinearSVC(**svm_grid_B.best_params_, class_weight='balanced', max_iter=10000, random_state=42)
best_svm_B.fit(X_train_B, y_train)
y_pred_svm_glove = best_svm_B.predict(X_test_B)
full_results['GloVe + SVM'] = evaluate_model('GloVe + SVM', y_test, y_pred_svm_glove)

# ------------------------------------------------------------------
# Crear tabla bonita con pandas
# ------------------------------------------------------------------
df_results = pd.DataFrame(full_results).T
df_results = df_results.round(4)
df_results = df_results.sort_values(by='F1_macro', ascending=False)

# Mostrar ranking
print("\n" + " RANKING FINAL DE MODELOS")
print("-" * 90)
print(df_results.to_string())

# ------------------------------------------------------------------
# Ganador absoluto
# ------------------------------------------------------------------
winner_name = df_results.index[0]
winner_row = df_results.iloc[0]

print("\n" + "="*90)
print(f" GANADOR ABSOLUTO: {winner_name}")
print("="*90)
print(f"Accuracy:        {winner_row['Accuracy']:.4f}")
print(f"Precision (macro): {winner_row['Precision_macro']:.4f}")
print(f"Recall (macro):    {winner_row['Recall_macro']:.4f}")
print(f"F1-macro:          {winner_row['F1_macro']:.4f}")
print(f"F1 (odio):         {winner_row['F1_hate']:.4f}")
print(f"F1 (no odio):      {winner_row['F1_non_hate']:.4f}")
print("="*90)

# ------------------------------------------------------------------
# Classification Report del ganador
# ------------------------------------------------------------------
winner_model_name = df_results.index[0]

if 'TF-IDF' in winner_model_name:
    X_test_winner = X_test_A
    if 'LogisticRegression' in winner_model_name:
        y_pred_winner = log_grid_A.best_estimator_.predict(X_test_winner)
    elif 'RandomForest' in winner_model_name:
        y_pred_winner = rf_grid_A.best_estimator_.predict(X_test_winner)
    elif 'SVM' in winner_model_name:
        y_pred_winner = svm_grid_A.best_estimator_.predict(X_test_winner)
else: # GloVe model
    X_test_winner = X_test_B
    if 'LogisticRegression' in winner_model_name:
        y_pred_winner = log_grid_B.best_estimator_.predict(X_test_winner)
    elif 'RandomForest' in winner_model_name:
        y_pred_winner = rf_grid_B.best_estimator_.predict(X_test_winner)
    elif 'SVM' in winner_model_name:
        y_pred_winner = svm_grid_B.best_estimator_.predict(X_test_winner)

print("\nCLASIFICATION REPORT COMPLETO DEL GANADOR:")
print(classification_report(y_test, y_pred_winner, digits=4))


 RESULTADOS FINALES - 6 MODELOS COMPLETOS
 Métricas: Accuracy | Precision (macro) | Recall (macro) | F1-macro | F1 por clase
Evaluando modelos TF-IDF + features...
Evaluando modelos GloVe + features...

 RANKING FINAL DE MODELOS
------------------------------------------------------------------------------------------
                             Accuracy  Precision_macro  Recall_macro  F1_macro  F1_hate  F1_non_hate
GloVe + SVM                    0.8248           0.8248        0.8248    0.8248   0.8238       0.8258
GloVe + LogisticRegression     0.8248           0.8248        0.8248    0.8248   0.8248       0.8248
TF-IDF + LogisticRegression    0.8095           0.8113        0.8095    0.8092   0.8020       0.8165
TF-IDF + SVM                   0.8067           0.8089        0.8067    0.8063   0.7980       0.8146
GloVe + RandomForest           0.8010           0.8014        0.8010    0.8009   0.7969       0.8049
TF-IDF + RandomForest          0.7590           0.7593        0.7590    0



---

