## **Desafío 1 — Solución paso a paso (20 Newsgroups, TF‑IDF, Similitud y Naïve Bayes)**

In [7]:
%pip install -q scikit-learn numpy scipy

Note: you may need to restart the kernel to use updated packages.


In [1]:
import numpy as np
import scipy.sparse as sp
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import f1_score, classification_report
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import ParameterGrid, cross_val_score
from sklearn.naive_bayes import MultinomialNB, ComplementNB

def top_k_indices(a, k):
    """Return the indices of the k largest values of a 1-D array, best first.

    Parameters
    ----------
    a : array-like, 1-D
        Scores to rank.
    k : int
        Number of indices wanted. Values larger than ``len(a)`` are clamped,
        and ``k <= 0`` yields an empty result.

    Returns
    -------
    numpy.ndarray of int
        Indices into ``a`` ordered by descending value. The order among exactly
        tied values is unspecified.
    """
    a = np.asarray(a)
    n = a.shape[0]
    k = min(int(k), n)
    # Empty input or a non-positive k: nothing to rank (argpartition would
    # otherwise be called with an out-of-range kth).
    if k <= 0:
        return np.empty(0, dtype=np.intp)
    if k == n:
        # Everything is requested, so no partial selection is needed.
        idx = np.arange(n)
    else:
        # O(n) selection of the k largest values (unordered) ...
        idx = np.argpartition(-a, kth=k - 1)[:k]
    # ... then sort just those k by descending value.
    return idx[np.argsort(-a[idx])]

## **1) Carga del dataset**

In [5]:
# Both splits are loaded with the same `remove` setting so that train and
# test documents go through identical preprocessing.
newsgroup_kwargs = {"remove": ("headers", "footers", "quotes")}
train = fetch_20newsgroups(subset="train", **newsgroup_kwargs)
test = fetch_20newsgroups(subset="test", **newsgroup_kwargs)

n_classes = len(train.target_names)
print(f"Clases: {n_classes}")
print("Ejemplo de documento (train[0]):\n")
# Only the first 800 characters are shown to keep the output short.
first_doc_preview = train.data[0][:800]
print(first_doc_preview, "...")

Clases: 20
Ejemplo de documento (train[0]):

I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 
all I know. If anyone can tellme a model name, engine specs, years
of production, where this car is made, history, or whatever info you
have on this funky looking car, please e-mail. ...


## **2) Vectorización con TF‑IDF**

In [11]:
# TF-IDF vectorizer: it is fitted on the training split only and then reused
# (transform, not fit_transform) on the test split.
tfidf = TfidfVectorizer(
    lowercase=True,            # fold everything to lower case before tokenizing
    stop_words='english',      # drop scikit-learn's built-in English stop words
    max_df=0.5,                # ignore terms present in more than 50% of the documents
    min_df=5,                  # ignore terms present in fewer than 5 documents
    ngram_range=(1,2),         # use unigrams and bigrams as features
    sublinear_tf=True          # replace tf with 1 + log(tf)
)

X_train = tfidf.fit_transform(train.data)
X_test  = tfidf.transform(test.data)

# Integer class ids aligned with the rows of X_train / X_test.
y_train = train.target
y_test  = test.target

# Array of terms; position i is the term behind column i of the matrices.
feature_names = tfidf.get_feature_names_out()
# Last expression of the cell: displays (n_docs, n_features) for both splits.
X_train.shape, X_test.shape

((11314, 28764), (7532, 28764))

## 3) **Ejercicio 1** — Similitud coseno entre documentos

In [15]:
# Number of nearest training documents to list.
k = 5
i_test = 0  # index of the test document to inspect; change it to explore other documents

# Cosine similarity of one test document against the whole training matrix:
# X_test[i_test] is (1 x d) and X_train is (n_train x d), so the result is
# (1 x n_train); ravel() flattens it to one score per training document.
sims = cosine_similarity(X_test[i_test], X_train).ravel()
# Training indices of the k highest scores, most similar first.
topk = top_k_indices(sims, k)

print(f"Documento de test #{i_test} — clase real: {test.target_names[y_test[i_test]]}\n")
print(test.data[i_test][:500], "...\n")

print(f"Top-{k} documentos más similares en train:")
for rank, j in enumerate(topk, start=1):
    print(f"{rank:>2}. idx={j:>5} | clase={train.target_names[y_train[j]]} | cos={sims[j]:.4f}")
    # Uncomment to also show the beginning of the matched text:
    # print(train.data[j][:200], "...\n")

Documento de test #0 — clase real: rec.autos

I am a little confused on all of the models of the 88-89 bonnevilles.
I have heard of the LE SE LSE SSE SSEI. Could someone tell me the
differences are far as features or performance. I am also curious to
know what the book value is for prefereably the 89 model. And how much
less than book value can you usually get them for. In other words how
much are they in demand this time of year. I have heard that the mid-spring
early summer is the best time to buy. ...

Top-5 documentos más similares en train:
 1. idx= 4143 | clase=comp.sys.mac.hardware | cos=0.2385
 2. idx=  401 | clase=rec.sport.hockey | cos=0.1562
 3. idx= 6769 | clase=comp.sys.mac.hardware | cos=0.1550
 4. idx= 4373 | clase=rec.autos | cos=0.1210
 5. idx=10179 | clase=rec.autos | cos=0.1181


## 4) **Ejercicio 2** — Clasificador por prototipo (1‑NN por coseno) 

In [19]:
def predict_1nn_cosine(X_tr, y_tr, X_te, batch=256):
    """Predict labels with a 1-nearest-neighbour rule under cosine similarity.

    The test rows are processed in batches so that only a
    (batch x n_train) similarity matrix is held in memory at a time.

    Parameters
    ----------
    X_tr : array-like or sparse matrix, shape (n_train, d)
        Training feature matrix.
    y_tr : array-like, shape (n_train,)
        Training labels.
    X_te : array-like or sparse matrix, shape (n_test, d)
        Test feature matrix.
    batch : int, default 256
        Number of test rows scored per step.

    Returns
    -------
    numpy.ndarray, shape (n_test,)
        Label of the most similar training row for each test row. Empty (with
        the dtype of ``y_tr``) when ``X_te`` has no rows.

    Raises
    ------
    ValueError
        If ``batch`` is smaller than 1.
    """
    if batch < 1:
        raise ValueError("batch must be a positive integer")
    y_tr = np.asarray(y_tr)  # allow plain lists; fancy indexing needs an array
    n_test = X_te.shape[0]
    if n_test == 0:
        # np.concatenate raises on an empty list, so answer directly.
        return np.empty(0, dtype=y_tr.dtype)

    y_pred = []
    for start in range(0, n_test, batch):
        stop = min(start + batch, n_test)
        # (batch x d) vs (n_train x d) -> (batch x n_train)
        S = cosine_similarity(X_te[start:stop], X_tr)
        nn = S.argmax(axis=1)  # index of the most similar training row
        y_pred.append(y_tr[nn])
    return np.concatenate(y_pred)

# Score the 1-NN cosine classifier on the test split (128 test rows per batch)
# and report the macro-averaged F1.
yhat_1nn = predict_1nn_cosine(X_train, y_train, X_test, batch=128)
f1_1nn = f1_score(y_test, yhat_1nn, average='macro')
print(f"F1-macro (1-NN coseno): {f1_1nn:.3f}")

F1-macro (1-NN coseno): 0.504


In [21]:
def predict_knn_cosine(X_tr, y_tr, X_te, k=5, batch=128):
    """Predict labels by majority vote among the k most cosine-similar train rows.

    Ties in the vote count (including the case where all k neighbours have
    different labels) are broken in favour of the class whose neighbours have
    the largest summed similarity. If that is also tied, the smallest label
    wins.

    Parameters
    ----------
    X_tr : array-like or sparse matrix, shape (n_train, d)
        Training feature matrix.
    y_tr : array-like, shape (n_train,)
        Training labels.
    X_te : array-like or sparse matrix, shape (n_test, d)
        Test feature matrix.
    k : int, default 5
        Number of neighbours; clamped to ``n_train``.
    batch : int, default 128
        Number of test rows scored per step, bounding memory to a
        (batch x n_train) similarity matrix.

    Returns
    -------
    numpy.ndarray, shape (n_test,)
        Predicted labels, with the dtype of ``y_tr``.

    Raises
    ------
    ValueError
        If ``k`` or ``batch`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be a positive integer")
    if batch < 1:
        raise ValueError("batch must be a positive integer")
    y_tr = np.asarray(y_tr)
    n_test, n_train = X_te.shape[0], X_tr.shape[0]
    if n_test == 0:
        return np.empty(0, dtype=y_tr.dtype)
    k = min(k, n_train)

    y_pred = []
    for start in range(0, n_test, batch):
        stop = min(start + batch, n_test)
        S = cosine_similarity(X_te[start:stop], X_tr)
        if k < n_train:
            # Unordered indices of the k largest similarities in each row.
            topk = np.argpartition(-S, kth=k - 1, axis=1)[:, :k]
        else:
            # Every training row is a neighbour.
            topk = np.tile(np.arange(n_train), (S.shape[0], 1))
        topk_sims = np.take_along_axis(S, topk, axis=1)
        topk_labels = y_tr[topk]

        for labels, sims in zip(topk_labels, topk_sims):
            uniq, inverse, counts = np.unique(
                labels, return_inverse=True, return_counts=True
            )
            # Summed similarity of the neighbours belonging to each class.
            sim_sums = np.bincount(inverse, weights=sims, minlength=len(uniq))
            # Classes sharing the top vote count; pick the most similar one.
            winners = np.flatnonzero(counts == counts.max())
            y_pred.append(uniq[winners[np.argmax(sim_sums[winners])]])
    return np.asarray(y_pred, dtype=y_tr.dtype)

# Score the 5-NN cosine voting classifier on the test split (96 test rows per
# batch) and report the macro-averaged F1.
yhat_knn = predict_knn_cosine(X_train, y_train, X_test, k=5, batch=96)
f1_knn = f1_score(y_test, yhat_knn, average='macro')
print(f"F1-macro (5-NN coseno): {f1_knn:.3f}")

F1-macro (5-NN coseno): 0.572


## 5) **Ejercicio 3** — Naïve Bayes (Multinomial y Complement)

Se prueban dos modelos: **MultinomialNB** y **ComplementNB**.

In [27]:
# Candidate values for the additive smoothing strength.
param_grid = {
    "alpha": [0.1, 0.5, 1.0, 2.0],
}

# Per model: (test F1-macro of the selected alpha, selected alpha).
best = {"mnb": (-1, None), "cnb": (-1, None)}

# alpha is selected with 5-fold cross-validation on the TRAINING split only.
# Choosing it by scoring on the test split would leak test information into
# model selection and make the reported F1 optimistically biased.
# NOTE: the TF-IDF weights were fitted on the full training split, so the CV
# folds share IDF statistics; that is unsupervised and does not involve the
# test split.
for key, Estimator in (("mnb", MultinomialNB), ("cnb", ComplementNB)):
    best_cv_f1, best_alpha = -1, None
    for params in ParameterGrid(param_grid):
        alpha = params["alpha"]
        cv_f1 = cross_val_score(
            Estimator(alpha=alpha), X_train, y_train, cv=5, scoring="f1_macro"
        ).mean()
        if cv_f1 > best_cv_f1:
            best_cv_f1, best_alpha = cv_f1, alpha

    # The test split is touched exactly once per model, with the chosen alpha.
    selected = Estimator(alpha=best_alpha).fit(X_train, y_train)
    test_f1 = f1_score(y_test, selected.predict(X_test), average='macro')
    best[key] = (test_f1, best_alpha)

print("Mejores resultados:")
print(f"  MultinomialNB -> F1-macro={best['mnb'][0]:.3f} con alpha={best['mnb'][1]}")
print(f"  ComplementNB  -> F1-macro={best['cnb'][0]:.3f} con alpha={best['cnb'][1]}")

Mejores resultados:
  MultinomialNB -> F1-macro=0.674 con alpha=0.1
  ComplementNB  -> F1-macro=0.690 con alpha=0.5


In [29]:
# Refit both models with the selected smoothing values and print a per-class
# classification report for each. If no alpha was recorded, fall back to 1.0.
best_alpha_mnb = 1.0 if best['mnb'][1] is None else best['mnb'][1]
best_alpha_cnb = 1.0 if best['cnb'][1] is None else best['cnb'][1]

mnb = MultinomialNB(alpha=best_alpha_mnb)
mnb.fit(X_train, y_train)
cnb = ComplementNB(alpha=best_alpha_cnb)
cnb.fit(X_train, y_train)

for banner, fitted in (("=== MultinomialNB ===", mnb), ("=== ComplementNB ===", cnb)):
    print(banner)
    report = classification_report(
        y_test,
        fitted.predict(X_test),
        target_names=test.target_names,
        zero_division=0,
    )
    print(report)

=== MultinomialNB ===
                          precision    recall  f1-score   support

             alt.atheism       0.61      0.43      0.50       319
           comp.graphics       0.61      0.70      0.65       389
 comp.os.ms-windows.misc       0.68      0.55      0.61       394
comp.sys.ibm.pc.hardware       0.64      0.70      0.67       392
   comp.sys.mac.hardware       0.74      0.67      0.70       385
          comp.windows.x       0.77      0.75      0.76       395
            misc.forsale       0.79      0.77      0.78       390
               rec.autos       0.75      0.73      0.74       396
         rec.motorcycles       0.76      0.74      0.75       398
      rec.sport.baseball       0.90      0.80      0.85       397
        rec.sport.hockey       0.58      0.91      0.71       399
               sci.crypt       0.80      0.72      0.76       396
         sci.electronics       0.68      0.58      0.62       393
                 sci.med       0.83      0.76      0.

## 6) **Ejercicio 4** — Similitud entre palabras.

In [33]:
# Transpose so that each row is a term described by its weights across the
# training documents. Transposing a CSR matrix yields CSC; it is converted to
# CSR once here so that row slicing is cheap and cosine_similarity does not
# have to re-convert the whole matrix on every call.
X_td = X_train.T.tocsr()  # (vocab_size x n_docs)

# Interpretable words from the 20 Newsgroups domain.
seed_words = ["space", "hockey", "windows", "jesus", "car"]

# Keep only the words that survived preprocessing; the fitted vocabulary is a
# dict, so membership is an O(1) lookup.
seed_words = [w for w in seed_words if w in tfidf.vocabulary_]
print("Palabras semilla consideradas:", seed_words)

def similar_terms_for(word, topn=10):
    """Return the terms whose document profiles are closest to ``word``.

    Looks ``word`` up in ``feature_names``, compares its row of ``X_td`` with
    every other row by cosine similarity, and returns up to ``topn``
    ``(term, similarity)`` pairs ordered from most to least similar. The query
    term itself is excluded. An unknown word yields an empty list.
    """
    matches = np.where(feature_names == word)[0]
    if len(matches) == 0:
        return []
    term_idx = matches[0]

    scores = cosine_similarity(X_td[term_idx], X_td).ravel()

    # Ask for one extra candidate because the query term normally ranks first
    # and is dropped below.
    candidates = top_k_indices(scores, topn + 1)
    kept = candidates[candidates != term_idx][:topn]

    neighbours = []
    for j in kept:
        neighbours.append((feature_names[j], float(scores[j])))
    return neighbours

# For every seed word, list its 10 nearest terms; each term is right-aligned
# in a 20-character column next to its cosine similarity.
for w in seed_words:
    print(f"\n=== Vecinos de '{w}' ===")
    for term, s in similar_terms_for(w, topn=10):
        print(f"{term:>20s}  cos={s:.4f}")

Palabras semilla consideradas: ['space', 'hockey', 'windows', 'jesus', 'car']

=== Vecinos de 'space' ===
       space station  cos=0.3077
           sci space  cos=0.2716
                nasa  cos=0.2499
       space shuttle  cos=0.2402
             shuttle  cos=0.1993
       space program  cos=0.1986
              launch  cos=0.1814
       space related  cos=0.1801
          disk space  cos=0.1746
          space news  cos=0.1742

=== Vecinos de 'hockey' ===
                 nhl  cos=0.2569
      hockey players  cos=0.2558
                ncaa  cos=0.2216
      college hockey  cos=0.2210
         hockey east  cos=0.2148
       hockey league  cos=0.2042
                game  cos=0.2028
              league  cos=0.2007
             players  cos=0.1928
                 ice  cos=0.1893

=== Vecinos de 'windows' ===
                 dos  cos=0.3104
          ms windows  cos=0.2835
         dos windows  cos=0.2469
          windows nt  cos=0.2465
                  ms  cos=0.2325
          