In [1]:
import os, re, random, numpy as np
from tqdm import tqdm
from datasets import load_dataset
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import gensim
from gensim.models import Word2Vec

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# ---------------------------
# 0) Configuration and seeds
# ---------------------------
SEED = 42
# Seed the stdlib and the NumPy global random generators.
random.seed(SEED)
np.random.seed(SEED)

# Main Word2Vec hyperparameters
VECTOR_SIZE = 100
WINDOW = 5       # how many words ahead of and behind the target we look at
MIN_COUNT = 5    # drops infrequent words
NEGATIVE = 10
SG = 1           # 1 = Skip-gram, 0 = CBOW
EPOCHS_W2V = 5
WORKERS = os.cpu_count() or 1   # cpu_count() may return None -> fall back to 1

In [3]:
# ---------------------------
# 1) IMDb dataset (HF Datasets)
#    25k train / 25k test, binary
# ---------------------------
ds = load_dataset("imdb")  # splits: 'train', 'test', 'unsupervised'
train_ds, test_ds = ds["train"], ds["test"]
print("IMDb -> train: {}, test: {}".format(len(train_ds), len(test_ds)))

IMDb -> train: 25000, test: 25000


In [6]:
# Display one raw training record (index 1) to see what an example looks like.
train_ds[1]

{'text': '"I Am Curious: Yellow" is a risible and pretentious steaming pile. It doesn\'t matter what one\'s political views are because this film can hardly be taken seriously on any level. As for the claim that frontal male nudity is an automatic NC-17, that isn\'t true. I\'ve seen R-rated films with male nudity. Granted, they only offer some fleeting views, but where are the R-rated films with gaping vulvas and flapping labia? Nowhere, because they don\'t exist. The same goes for those crappy cable shows: schlongs swinging in the breeze but not a clitoris in sight. And those pretentious indie movies like The Brown Bunny, in which we\'re treated to the site of Vincent Gallo\'s throbbing johnson, but not a trace of pink visible on Chloe Sevigny. Before crying (or implying) "double-standard" in matters of nudity, the mentally obtuse should take into account one unavoidably obvious anatomical difference between men and women: there are no genitals on display when actresses appears nude, 

In [7]:
# ---------------------------
# 2) Simple tokenizer
# ---------------------------
_word_re = re.compile(r"\b\w+\b", re.UNICODE)

def tokenize(text: str):
    """Lowercase ``text`` and return its word tokens as a list of strings.

    A token is a run of word characters (letters, digits, underscore);
    everything else acts as a separator and is discarded, so for example
    "wouldn't" yields the two tokens "wouldn" and "t".
    """
    lowered = text.lower()
    return [match.group(0) for match in _word_re.finditer(lowered)]

# Pre-tokenize the reviews used to train Word2Vec (train split only)
train_progress = tqdm(train_ds, desc="Tokenizando train")
sentences = [tokenize(review["text"]) for review in train_progress]
sample_tokens = sentences[15000][:20]
print("Ejemplo de oración tokenizada:", sample_tokens)

Tokenizando train: 100%|██████████| 25000/25000 [00:02<00:00, 9335.20it/s] 

Ejemplo de oración tokenizada: ['budget', 'limitations', 'time', 'restrictions', 'shooting', 'a', 'script', 'and', 'then', 'cutting', 'it', 'cutting', 'it', 'cutting', 'it', 'this', 'crew', 'is', 'a', 'group']





In [8]:
# ---------------------------
# 3) Train Word2Vec (gensim)
# ---------------------------
# `seed=SEED` ties gensim's own random generator to the notebook-wide seed;
# seeding `random` / `np.random` alone does not reach it.
# NOTE: per the gensim documentation, fully deterministic training also
# requires workers=1 (and a fixed PYTHONHASHSEED). Multi-threaded training is
# kept here for speed, so small run-to-run differences are still expected.
w2v = Word2Vec(
    sentences=sentences,
    vector_size=VECTOR_SIZE,
    window=WINDOW,
    min_count=MIN_COUNT,
    workers=WORKERS,
    sg=SG,
    negative=NEGATIVE,
    epochs=EPOCHS_W2V,
    seed=SEED,
)
print("Vocab size (w2v):", len(w2v.wv.key_to_index))

Vocab size (w2v): 29114


In [10]:
# `KeyedVectors.similarity` returns the cosine *similarity* (higher = closer),
# not a distance, so the printed labels say "Similitud".
# The dist_* variable names are kept as they were so existing references to
# them keep working; they hold similarities.
dist_1 = w2v.wv.similarity("good", "great")
dist_2 = w2v.wv.similarity("good", "accident")
dist_3 = w2v.wv.similarity("good", "bad")
print(f"Similitud coseno entre 'good' y 'great': {dist_1:.4f}")
print(f"Similitud coseno entre 'good' y 'accident': {dist_2:.4f}")
print(f"Similitud coseno entre 'good' y 'bad': {dist_3:.4f}")

# Nearest neighbours of a single word.
similar_words = w2v.wv.most_similar(positive=["great"], topn=3)
print(f"Palabras similares a 'great': {similar_words}")

# Analogy query: positive terms are added and negative terms subtracted,
# i.e. 'james' - 'cameron' + 'bruce'.
word = w2v.wv.most_similar(positive=["james", "bruce"], negative=["cameron"], topn=1)[0][0]
print(f"Analogía 'james' - 'cameron' + 'bruce' → palabra más cercana: {word}")

Distancia coseno entre 'good' y 'great': 0.8239
Distancia coseno entre 'good' y 'accident': 0.1420
Distancia coseno entre 'good' y 'bad': 0.7401
Palabras similares a 'great': [('wonderful', 0.8540114164352417), ('terrific', 0.8244513869285583), ('good', 0.8238922357559204)]
Analogía 'james' - 'cameron' + 'bruce' → palabra más cercana: willis


In [11]:
# ---------------------------
# 4) Function: document vector
#    (mean of the embeddings of its words)
# ---------------------------
def doc_vector(tokens):
    """Return the mean Word2Vec embedding of ``tokens`` as a float32 vector.

    Tokens that are not in the vocabulary of the module-level ``w2v`` model
    (looked up at call time) are skipped. If none of the tokens is in the
    vocabulary, a zero vector of length ``w2v.vector_size`` is returned.
    """
    keyed = w2v.wv
    vocab = keyed.key_to_index
    known = [tok for tok in tokens if tok in vocab]
    if known:
        stacked = [keyed[tok] for tok in known]
        return np.mean(stacked, axis=0).astype(np.float32)
    # No in-vocabulary token: fall back to the zero vector.
    return np.zeros(w2v.vector_size, dtype=np.float32)

# Build the X / y matrices
def build_xy(split, desc):
    """Vectorize every example of ``split`` into a feature matrix and labels.

    Args:
        split: iterable of examples exposing ``ex["text"]`` and ``ex["label"]``.
        desc: caption shown on the tqdm progress bar.

    Returns:
        ``(X, y)`` where ``X`` stacks one ``doc_vector`` row per example and
        ``y`` is an int64 array of labels (0 = neg, 1 = pos).
    """
    features = []
    labels = []
    for example in tqdm(split, desc=desc):
        features.append(doc_vector(tokenize(example["text"])))
        labels.append(int(example["label"]))
    X = np.vstack(features)
    y = np.array(labels, dtype=np.int64)
    return X, y

# Vectorize both splits with the current `w2v` model.
X_train, y_train = build_xy(split=train_ds, desc="Vectorizando train")
X_test, y_test = build_xy(split=test_ds, desc="Vectorizando test")

print(f"Shapes -> X_train: {X_train.shape}  | X_test: {X_test.shape}")

Vectorizando train:   0%|          | 0/25000 [00:00<?, ?it/s]

Vectorizando train: 100%|██████████| 25000/25000 [00:10<00:00, 2413.33it/s]
Vectorizando test: 100%|██████████| 25000/25000 [00:10<00:00, 2436.15it/s]

Shapes -> X_train: (25000, 100)  | X_test: (25000, 100)





In [12]:
# ---------------------------
# 5) Classifier (logistic regression)
# ---------------------------
clf = LogisticRegression(max_iter=1000, n_jobs=-1).fit(X_train, y_train)

# Evaluation: accuracy on both splits, expressed as a percentage
pred_train, pred_test = clf.predict(X_train), clf.predict(X_test)
acc_train = 100 * accuracy_score(y_train, pred_train)
acc_test = 100 * accuracy_score(y_test, pred_test)

print(f"Accuracy -> train: {acc_train:.2f}% | test: {acc_test:.2f}%\n")

Accuracy -> train: 83.62% | test: 83.04%



In [13]:
# ---------------------------
# 6) Example inference
# ---------------------------
ejemplos = [
    "I absolutely loved this movie, the acting and the story were fantastic.",
    "Boring and predictable. I wouldn't recommend it."
]
for txt in ejemplos:
    # The classifier expects a 2-D input, so add a leading batch axis.
    v = doc_vector(tokenize(txt))[np.newaxis, :]
    pred = clf.predict(v)[0]
    etiqueta = "pos" if pred == 1 else "neg"
    print(f"Texto: {txt[:60]}...  -> Predicción: {etiqueta}")

Texto: I absolutely loved this movie, the acting and the story were...  -> Predicción: pos
Texto: Boring and predictable. I wouldn't recommend it....  -> Predicción: neg


Ejercicios:
1) Comparar Skip-gram (sg=1) vs CBOW (sg=0), y probar distintos valores para window, vector_size, negative, epochs.
2) Hacer un gráfico de los embeddings de palabras usando MDS o t-SNE de sklearn. ¿Se observa algún patrón?
3) Entrenar 2 word2vec: uno con las reviews positivas y otro con las negativas. Comparar las distancias entre palabras.


In [15]:
# Main hyperparameters for this second experiment (CBOW, wider window,
# lower min_count, more epochs).
# NOTE: these assignments rebind the constants from the configuration cell,
# so re-running earlier cells afterwards will pick up these values.
VECTOR_SIZE = 100
WINDOW      = 10 # how many words ahead of and behind the target we look at
MIN_COUNT   = 3  # drops infrequent words
NEGATIVE    = 10
SG          = 0       # 1 = Skip-gram, 0 = CBOW
EPOCHS_W2V  = 10
WORKERS     = max(1, os.cpu_count() or 1)


# Train the model. `w2v` is rebound on purpose: doc_vector() reads the global
# `w2v`, so the vectorization that follows uses this new model.
# `seed=SEED` ties gensim's own random generator to the notebook-wide seed;
# per the gensim documentation, fully deterministic training would also need
# workers=1 (and a fixed PYTHONHASHSEED).
w2v = Word2Vec(
    sentences=sentences,
    vector_size=VECTOR_SIZE,
    window=WINDOW,
    min_count=MIN_COUNT,
    workers=WORKERS,
    sg=SG,
    negative=NEGATIVE,
    epochs=EPOCHS_W2V,
    seed=SEED,
)

print("Vocab size (w2v):", len(w2v.wv.key_to_index))


# Vectorize the train and test splits with the newly trained model
X_train, y_train = build_xy(split=train_ds, desc="Vectorizando train")
X_test, y_test = build_xy(split=test_ds, desc="Vectorizando test")

print(f"Shapes -> X_train: {X_train.shape}  | X_test: {X_test.shape}")


# Classifier and evaluation (accuracy as a percentage)
clf = LogisticRegression(max_iter=1000, n_jobs=-1).fit(X_train, y_train)

pred_train, pred_test = clf.predict(X_train), clf.predict(X_test)
acc_train = 100 * accuracy_score(y_train, pred_train)
acc_test = 100 * accuracy_score(y_test, pred_test)

print(f"Accuracy -> train: {acc_train:.2f}% | test: {acc_test:.2f}%\n")

Vocab size (w2v): 37893


Vectorizando train: 100%|██████████| 25000/25000 [00:10<00:00, 2492.95it/s]
Vectorizando test: 100%|██████████| 25000/25000 [00:09<00:00, 2585.09it/s]


Shapes -> X_train: (25000, 100)  | X_test: (25000, 100)
Accuracy -> train: 84.08% | test: 83.68%



In [None]:
import pandas as pd
import plotly.express as px
from sklearn.manifold import TSNE

# Select the most frequent words. `index_to_key` lists the vocabulary in index
# order (gensim sorts it by descending frequency by default), so slicing it
# avoids materialising the whole vocabulary just to keep the first `top_n`.
top_n = 200
words = w2v.wv.index_to_key[:top_n]
embeddings = w2v.wv[words]  # one row per word, stacked in the same order

# Apply t-SNE, reusing the notebook-wide SEED instead of a second literal.
tsne = TSNE(n_components=2, perplexity=30, random_state=SEED)
embeddings_2d = tsne.fit_transform(embeddings)

# Plot: one point per word; the word is shown on hover.
df_plot = pd.DataFrame({
    "word": words,
    "x": embeddings_2d[:, 0],
    "y": embeddings_2d[:, 1]
})

fig = px.scatter(
    df_plot,
    x="x",
    y="y",
    hover_name="word",
    title=f"t-SNE de las {len(words)} palabras más frecuentes (Word2Vec)",
)
# No `text` column is mapped, so only marker styling applies to the traces.
fig.update_traces(marker=dict(size=8, opacity=0.7))
fig.update_layout(width=900, height=600)
fig.show()