# 02 - Mejorando el modelo (Feature Engineering + Ajustes)

En este notebook vamos a:

- Ajustar TF-IDF
- Probar n-grams
- Manejar desbalance con class_weight
- Comparar métricas

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score

# Load the dataset: a headerless, tab-separated file read into the columns
# "label" and "text".
df = pd.read_csv(
    "../data/SMSSpamCollection",
    sep="\t",
    header=None,
    names=["label", "text"]
)

# Encode the target as integers (ham -> 0, spam -> 1). Series.map turns any
# value missing from the mapping into NaN, so fail loudly here instead of
# letting a NaN label surface later as a confusing split/fit error.
df["label"] = df["label"].map({"ham": 0, "spam": 1})
if df["label"].isna().any():
    raise ValueError("Unexpected label values: only 'ham' and 'spam' are supported")

# A bare `df.head()` is only rendered when it is the last expression of a
# cell; print it explicitly so the preview is not silently discarded.
print(df.head())

# Hold out 20% of the messages for evaluation. The split is stratified on the
# label and uses a fixed seed so it is reproducible.
X, y = df["text"], df["label"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Baseline: TF-IDF with default settings feeding a logistic regression.
baseline_model = Pipeline(
    steps=[
        ("tfidf", TfidfVectorizer()),
        ("clf", LogisticRegression(max_iter=200)),
    ]
)

# Pipeline.fit returns the fitted pipeline, so training and prediction chain.
baseline_pred = baseline_model.fit(X_train, y_train).predict(X_test)

print(
    "Baseline Model",
    classification_report(y_test, baseline_pred, target_names=["ham", "spam"]),
    sep="\n",
)


## ¿Qué podemos mejorar?

TF-IDF tiene parámetros importantes:

- stop_words → eliminar palabras comunes
- ngram_range → incluir combinaciones de palabras
- min_df → eliminar palabras muy raras
- max_df → eliminar palabras demasiado frecuentes
- class_weight → (parámetro de LogisticRegression, no de TF-IDF) ajusta la importancia de cada clase en el entrenamiento

In [12]:
# Tuned vectorizer: lowercase the text, drop English stop words, use unigrams
# and bigrams, and filter terms with min_df=2 / max_df=0.95.
improved_tfidf = TfidfVectorizer(
    lowercase=True,
    stop_words="english",
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.95,
)
# class_weight="balanced" is the setting used here to handle class imbalance.
improved_clf = LogisticRegression(max_iter=300, class_weight="balanced")

improved_model = Pipeline(steps=[("tfidf", improved_tfidf), ("clf", improved_clf)])

# Pipeline.fit returns the fitted pipeline, so training and prediction chain.
improved_pred = improved_model.fit(X_train, y_train).predict(X_test)

print(
    "Improved Model",
    classification_report(y_test, improved_pred, target_names=["ham", "spam"]),
    sep="\n",
)

Improved Model
              precision    recall  f1-score   support

         ham       0.99      0.98      0.99       966
        spam       0.90      0.93      0.91       149

    accuracy                           0.98      1115
   macro avg       0.94      0.95      0.95      1115
weighted avg       0.98      0.98      0.98      1115



## Comparación de modelos

In [None]:
# F1 on the held-out test split for both models. With binary targets
# f1_score reports the score of the positive class, label 1 (spam).
baseline_f1 = f1_score(y_test, baseline_pred)
improved_f1 = f1_score(y_test, improved_pred)

print("Baseline F1:", baseline_f1)
print("Improved F1:", improved_f1)

# Side-by-side table; as the last expression of the cell it is rendered.
comparison = pd.DataFrame({
    "Model": ["Baseline", "Improved"],
    "F1 Score": [baseline_f1, improved_f1]
})

comparison

## Prueba de combinaciones

In [None]:
def train_and_evaluate(ngram, stop_words, class_weight):
    """Fit a TF-IDF + logistic-regression pipeline and print its test report.

    The model is trained on the notebook-level ``X_train`` / ``y_train`` and
    evaluated on ``X_test`` / ``y_test``. Results are only printed; nothing is
    returned.

    Args:
        ngram: Value passed to ``TfidfVectorizer`` as ``ngram_range``,
            e.g. ``(1, 2)``.
        stop_words: Value passed to ``TfidfVectorizer`` as ``stop_words``,
            e.g. ``"english"`` or ``None``.
        class_weight: Value passed to ``LogisticRegression`` as
            ``class_weight``, e.g. ``"balanced"`` or ``None``.
    """
    model = Pipeline([
        ("tfidf", TfidfVectorizer(
            ngram_range=ngram,
            stop_words=stop_words
        )),
        ("clf", LogisticRegression(max_iter=300, class_weight=class_weight))
    ])

    model.fit(X_train, y_train)
    pred = model.predict(X_test)

    print("ngram:", ngram,
          "| stop_words:", stop_words,
          "| class_weight:", class_weight)

    # Label the rows ham/spam (0 -> ham, 1 -> spam) so this report reads the
    # same way as the baseline and improved model reports.
    print(classification_report(y_test, pred, target_names=["ham", "spam"]))
    print("-" * 50)

# Each entry is (ngram_range, stop_words, class_weight); evaluated in order.
configurations = [
    ((1, 1), None, None),
    ((1, 2), None, None),
    ((1, 2), "english", None),
    ((1, 2), "english", "balanced"),
]
for config in configurations:
    train_and_evaluate(*config)