In [25]:
!pip install --upgrade gensim



In [26]:
!pip install emoji



In [27]:
# =============================================================================
# MILESTONE 2 - HATEBR TEXT CLASSIFICATION (Simplified)
# Dataset: HateBR[](https://huggingface.co/datasets/franciellevargas/HateBR)
# Baselines:
#   A) TF-IDF (1-2 n-grams) + linguistic features + LogisticRegression
#   B) GloVe Average Embeddings + linguistic features + RandomForestClassifier
# =============================================================================

import pandas as pd
import numpy as np
import re
import spacy
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, f1_score
from sklearn.preprocessing import StandardScaler
from scipy.sparse import hstack, csr_matrix
from gensim.models import KeyedVectors
import emoji
import warnings
warnings.filterwarnings('ignore')

1. CARGA DEL DATASET (como en Proyecto 1)

In [28]:
# ============================================================
# 1. CARGAR DATASET DESDE GITHUB (como en Proyecto 1)
# ============================================================

# URL "raw" del archivo HateBR.csv en GitHub
url = "https://raw.githubusercontent.com/franfgv9/Milestone_1_PLN/refs/heads/main/HateBR.csv"

# Leer el CSV directamente desde GitHub
df = pd.read_csv(url)

# Verificar la carga
print("Dimensiones del dataset:", df.shape)
print("Columnas disponibles:", df.columns.tolist())
df.head()

Dimensiones del dataset: (7000, 8)
Columnas disponibles: ['id', 'comentario', 'anotator1', 'anotator2', 'anotator3', 'label_final', 'links_post', 'account_post']


Unnamed: 0,id,comentario,anotator1,anotator2,anotator3,label_final,links_post,account_post
0,1,Mais um lixo,1,1,1,1,https://www.instagram.com/p/B2uThqdH9xI/,Carla Zambelli
1,2,Essa nao tem vergonha na cara!!,1,1,1,1,https://www.instagram.com/p/B2uThqdH9xI/,Carla Zambelli
2,3,Essa mulher é doente.pilantra!,1,1,1,1,https://www.instagram.com/p/B2uThqdH9xI/,Carla Zambelli
3,4,Comunista safada...,1,1,1,1,https://www.instagram.com/p/B2uThqdH9xI/,Carla Zambelli
4,5,Vagabunda. Comunista. Mentirosa. O povo chilen...,1,1,1,1,https://www.instagram.com/p/B2uThqdH9xI/,Carla Zambelli


In [29]:
# ============================================================
# 2. SELECCIONAR COLUMNAS RELEVANTES
# ============================================================

df = df[["id", "comentario", "label_final"]]

# Verificar el resultado
print("\nColumnas seleccionadas:", df.columns.tolist())
print("Dimensiones del dataset:", df.shape)
df.head()


Columnas seleccionadas: ['id', 'comentario', 'label_final']
Dimensiones del dataset: (7000, 3)


Unnamed: 0,id,comentario,label_final
0,1,Mais um lixo,1
1,2,Essa nao tem vergonha na cara!!,1
2,3,Essa mulher é doente.pilantra!,1
3,4,Comunista safada...,1
4,5,Vagabunda. Comunista. Mentirosa. O povo chilen...,1


In [30]:
# ============================================================
# 3. RENOMBRAR COLUMNAS PARA UNIFORMIDAD
# ============================================================

df = df.rename(columns={
    "comentario": "text",
    "label_final": "label"
})

print("\nColumnas finales:", df.columns.tolist())
df.head()

print("\nInformación del dataset:")
print(df.info())

# Limpiar valores nulos
df.dropna(subset=['text', 'label'], inplace=True)
df = df[df['label'].isin([0, 1])]  # 0: No ofensivo, 1: Ofensivo
df.reset_index(drop=True, inplace=True)

print(f"\nDataset final: {len(df)} ejemplos")
print("Distribución de clases:")
print(df['label'].value_counts())


Columnas finales: ['id', 'text', 'label']

Información del dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7000 entries, 0 to 6999
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      7000 non-null   int64 
 1   text    7000 non-null   object
 2   label   7000 non-null   int64 
dtypes: int64(2), object(1)
memory usage: 164.2+ KB
None

Dataset final: 7000 ejemplos
Distribución de clases:
label
1    3500
0    3500
Name: count, dtype: int64


2. LIMPIEZA GENERAL (para features lingüísticas)

In [31]:
# ============================================================
# 4. PREPROCESAMIENTO GENERAL (para features lingüísticas)
# ============================================================

def clean_for_features(text):
    text = re.sub(r'http[s]?://\S+', '', text)  # Quitar URLs
    text = re.sub(r'\s+', ' ', text).strip()    # Normalizar espacios
    return text

df['text_features'] = df['text'].apply(clean_for_features)

# Descargar modelo spaCy para portugués si no está instalado
!python -m spacy download pt_core_news_sm

# Cargar modelo spaCy para portugués
print("\nCargando modelo spaCy (pt_core_news_sm)...")
nlp = spacy.load("pt_core_news_sm")

Collecting pt-core-news-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/pt_core_news_sm-3.8.0/pt_core_news_sm-3.8.0-py3-none-any.whl (13.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.0/13.0 MB[0m [31m21.1 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('pt_core_news_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.

Cargando modelo spaCy (pt_core_news_sm)...


Por qué no más limpieza?
→ Porque queremos preservar mayúsculas, signos, emojis para extraer features.
Solo quitamos URLs y espacios extra → no afectan al análisis lingüístico.

3. EXTRACCIÓN DE FEATURES LINGÜÍSTICAS

In [32]:
# ============================================================
# 5. EXTRACCIÓN DE FEATURES LINGÜÍSTICAS (con spaCy)
# ============================================================

def extract_linguistic_features(text):
    doc = nlp(text)
    features = {}

    # Básicas
    features['n_tokens'] = len(doc)
    features['upper_ratio'] = sum(1 for c in text if c.isupper()) / len(text) if text else 0
    features['n_exclam'] = text.count('!')
    features['n_question'] = text.count('?')

    # POS tags (solo adjetivos, sustantivos, verbos)
    pos_counts = {'ADJ': 0, 'NOUN': 0, 'VERB': 0}
    for token in doc:
        if token.pos_ in pos_counts:
            pos_counts[token.pos_] += 1
    total = sum(pos_counts.values())
    for pos in pos_counts:
        features[f'prop_{pos.lower()}'] = pos_counts[pos] / total if total > 0 else 0

    # Emojis
    features['n_emojis'] = len(emoji.emoji_list(text))

    return features

print("Extrayendo features lingüísticas...")
feat_list = df['text_features'].apply(extract_linguistic_features)
feat_df = pd.DataFrame(feat_list.tolist())
print(f"Features extraídas: {feat_df.shape[1]}")


Extrayendo features lingüísticas...
Features extraídas: 8


In [33]:
print(feat_df.head())

   n_tokens  upper_ratio  n_exclam  n_question  prop_adj  prop_noun  \
0         3     0.083333         0           0      0.00       1.00   
1         8     0.032258         2           0      0.00       0.75   
2         5     0.033333         1           0      0.00       1.00   
3         3     0.052632         0           0      0.00       0.50   
4        15     0.051282         0           0      0.25       0.50   

   prop_verb  n_emojis  
0       0.00         0  
1       0.25         0  
2       0.00         0  
3       0.50         0  
4       0.25         0  


In [None]:
# n_tokens    Comentarios largos → más contexto
# upper_ratio   MAYÚSCULAS = agresividad
# n_exclam, n_question    Exclamaciones = emoción fuerte
# prop_adj, prop_noun, prop_verb    Insultos suelen tener más adjetivos ("feio" "idiota")
# n_emojis       Emojis como risa o cara enojada refuerzan el tono

4. LIMPIEZA ESPECÍFICA POR MODELO

In [34]:
# ============================================================
# 6. LIMPIEZA ESPECÍFICA POR MODELO
# ============================================================

# --- Modelo A: TF-IDF (superficial) ---
def clean_for_tfidf(text):
    text = re.sub(r'http[s]?://\S+', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

df['text_clean_tfidf'] = df['text'].apply(clean_for_tfidf)

# No se baja a minúsculas → lowercase=False en TfidfVectorizer
# Se preservan mayúsculas → "IDIOTA" ≠ "idiota" → señal de ofensa

# --- Modelo B: Embeddings (semántico) ---
emoji_map = {'[laughing face]': 'risada', '[angry face]': 'raiva', '[heart]': 'amor'}

def replace_emojis(text):
    for emj, word in emoji_map.items():
        text = text.replace(emj, f' {word} ')
    return text

def clean_for_embeddings(text):
    text = re.sub(r'http[s]?://\S+', '', text)
    text = replace_emojis(text)
    text = text.lower()
    text = re.sub(r'[^a-záéíóúâêôãõç\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Baja a minúsculas → GloVe es case-sensitive
# Reemplaza emojis → "risada" en lugar de cara riendo
# Solo letras y acentos → compatibilidad con GloVe

df['text_clean_emb'] = df['text'].apply(clean_for_embeddings)

5. DIVISIÓN DE DATOS

In [35]:
# ============================================================
# 7. PREPARAR DATOS PARA MODELOS
# ============================================================

X_text_tfidf = df['text_clean_tfidf']
X_text_emb = df['text_clean_emb']
X_features = feat_df.values
y = df['label'].values

# División train/test
X_text_tfidf_train, X_text_tfidf_test, \
X_text_emb_train, X_text_emb_test, \
X_feat_train, X_feat_test, \
y_train, y_test = train_test_split(
    X_text_tfidf, X_text_emb, X_features, y,
    test_size=0.2, random_state=42)

# Escalar features
scaler = StandardScaler()
X_feat_train_scaled = scaler.fit_transform(X_feat_train)
X_feat_test_scaled = scaler.transform(X_feat_test)

# Escalado: necesario para combinar con TF-IDF y embeddings

6. MODELO A: TF-IDF + FEATURES + LogisticRegression

In [36]:
# ============================================================
# 8. MODELO A: TF-IDF + FEATURES (sklearn)
# ============================================================

print("\n" + "="*60)
print("ENTRENANDO MODELO A: TF-IDF + FEATURES")
print("="*60)

tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    lowercase=False,      # Mantener mayúsculas
    max_features=5000
)

# 1-2 grams: "muito feio" ≠ "muito" + "feio"
# max_features=5000: evita overfitting
# lowercase=False: preserva mayúsculas

X_tfidf_train = tfidf.fit_transform(X_text_tfidf_train)
X_tfidf_test = tfidf.transform(X_text_tfidf_test)

X_train_A = hstack([X_tfidf_train, csr_matrix(X_feat_train_scaled)])
X_test_A = hstack([X_tfidf_test, csr_matrix(X_feat_test_scaled)])

# Combinación: matriz dispersa (TF-IDF) + densa (features)

clf_A = LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42)
clf_A.fit(X_train_A, y_train)
y_pred_A = clf_A.predict(X_test_A)

# class_weight='balanced': ajusta por desbalance
# max_iter=1000: converge seguro

print("\n=== Resultados Modelo A (TF-IDF + Features) ===")
print(classification_report(y_test, y_pred_A, digits=4))
f1_A = f1_score(y_test, y_pred_A, average='macro')


ENTRENANDO MODELO A: TF-IDF + FEATURES

=== Resultados Modelo A (TF-IDF + Features) ===
              precision    recall  f1-score   support

           0     0.7918    0.8113    0.8014       689
           1     0.8127    0.7932    0.8028       711

    accuracy                         0.8021      1400
   macro avg     0.8022    0.8023    0.8021      1400
weighted avg     0.8024    0.8021    0.8022      1400



7. MODELO B: GloVe + FEATURES + RandomForest

In [40]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [41]:
# ============================================================
# 9. MODELO B: GLOVE EMBEDDINGS + FEATURES (gensim)
# ============================================================

print("\n" + "="*60)
print("ENTRENANDO MODELO B: GLOVE + FEATURES")
print("="*60)

# Cargar GloVe (descargar desde NILC: http://nilc.icmc.usp.br/nwp/embeddings)
# Ejemplo: glove_s100.txt
glove_path = "/content/drive/MyDrive/proyecto2_PLN/glove_s100.txt"
print(f"Cargando GloVe desde: {glove_path}")
w2v_model = KeyedVectors.load_word2vec_format(glove_path)

def get_avg_embedding(text, model):
    words = text.split()
    valid = [w for w in words if w in model]
    if not valid:
        return np.zeros(model.vector_size)
    return np.mean([model[w] for w in valid], axis=0)

# Representación de frase: promedio de vectores
# Palabras fuera del vocabulario: ignoradas

print("Calculando embeddings promedio...")
X_emb_train = np.array([get_avg_embedding(t, w2v_model) for t in X_text_emb_train])
X_emb_test = np.array([get_avg_embedding(t, w2v_model) for t in X_text_emb_test])

X_train_B = np.hstack([X_emb_train, X_feat_train_scaled])
X_test_B = np.hstack([X_emb_test, X_feat_test_scaled])

# 100 (GloVe) + 8 (features) = 108 dimensiones

clf_B = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)        # No necesita escalado, Captura interacciones no lineales
clf_B.fit(X_train_B, y_train)
y_pred_B = clf_B.predict(X_test_B)



print("\n=== Resultados Modelo B (GloVe + Features) ===")
print(classification_report(y_test, y_pred_B, digits=4))
f1_B = f1_score(y_test, y_pred_B, average='macro')

# F1-macro: promedio sin ponderar → justo con clases desbalanceadas
# digits=4: precisión para el informe


ENTRENANDO MODELO B: GLOVE + FEATURES
Cargando GloVe desde: /content/drive/MyDrive/proyecto2_PLN/glove_s100.txt
Calculando embeddings promedio...

=== Resultados Modelo B (GloVe + Features) ===
              precision    recall  f1-score   support

           0     0.7901    0.8084    0.7991       689
           1     0.8101    0.7918    0.8009       711

    accuracy                         0.8000      1400
   macro avg     0.8001    0.8001    0.8000      1400
weighted avg     0.8002    0.8000    0.8000      1400



8. COMPARACIÓN TEÓRICA: ¿CUÁNDO GANA CADA MODELO?

In [42]:
# ============================================================
# 10. COMPARACIÓN FINAL
# ============================================================

print("\n" + "="*60)
print("RESUMEN FINAL")
print("="*60)
print(f"Modelo A (TF-IDF + Features)     → F1-macro: {f1_A:.4f}")
print(f"Modelo B (GloVe + Features)      → F1-macro: {f1_B:.4f}")
print("="*60)

if f1_A > f1_B:
    print("GANADOR: Modelo A (TF-IDF) – mejor para patrones ofensivos explícitos")
else:
    print("GANADOR: Modelo B (GloVe) – mejor para tono y semántica")

# Guardar resultados
results = {
    'f1_A': f1_A,
    'f1_B': f1_B,
    'y_test': y_test.tolist(),
    'pred_A': y_pred_A.tolist(),
    'pred_B': y_pred_B.tolist()
}
import json
with open('milestone2_results.json', 'w') as f:
    json.dump(results, f)

print("\nResultados guardados en 'milestone2_results.json'")


RESUMEN FINAL
Modelo A (TF-IDF + Features)     → F1-macro: 0.8021
Modelo B (GloVe + Features)      → F1-macro: 0.8000
GANADOR: Modelo A (TF-IDF) – mejor para patrones ofensivos explícitos

Resultados guardados en 'milestone2_results.json'
Listo para el mini-report (máx 3 páginas).
