In [2]:
import numpy as np
import pandas as pd

from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report

from sklearn.feature_extraction.text import TfidfVectorizer

from gensim.models import Word2Vec

from sentence_transformers import SentenceTransformer


# LOAD THE DATASET SPLITS FROM CSV
train_path = "data/datasets/rumoureval2019_train.csv"
val_path   = "data/datasets/rumoureval2019_val.csv"
test_path  = "data/datasets/rumoureval2019_test.csv"

train_df, val_df, test_df = (
    pd.read_csv(csv_path) for csv_path in (train_path, val_path, test_path)
)

# CLEAN-UP: rows without a label cannot be used for training or scoring,
# so report how many there are, drop them, and report again.
print("NaN en label (antes de limpiar):")
for split_tag, split_df in (("  train:", train_df),
                            ("  val:  ", val_df),
                            ("  test: ", test_df)):
    print(split_tag, split_df["label"].isna().sum())

train_df, val_df, test_df = (
    split_df.dropna(subset=["label"])
    for split_df in (train_df, val_df, test_df)
)

print("NaN en label (después de limpiar):")
for split_tag, split_df in (("  train:", train_df),
                            ("  val:  ", val_df),
                            ("  test: ", test_df)):
    print(split_tag, split_df["label"].isna().sum())

# Show the distinct label values found in the training split.
print("\nEtiquetas únicas en train:", train_df["label"].unique())

# Helper that joins the source text and the reply text of one row
def concat_text_row(row):
    """Return "<source_text> [SEP] <reply_text>" for one row.

    ``row`` only needs a ``.get(key, default)`` method. A missing or
    NaN/None field is treated as an empty string, any other value is
    converted with ``str``. Leading and trailing whitespace of the joined
    result is stripped.
    """
    pieces = []
    for column in ("source_text", "reply_text"):
        value = row.get(column, "")
        pieces.append("" if pd.isna(value) else str(value))
    return " [SEP] ".join(pieces).strip()

# One concatenated "source [SEP] reply" string per row, for each split.
X_train_text, X_val_text, X_test_text = (
    split_df.apply(concat_text_row, axis=1).tolist()
    for split_df in (train_df, val_df, test_df)
)

# Matching label arrays, in the same row order as the text lists.
y_train, y_val, y_test = (
    split_df["label"].values for split_df in (train_df, val_df, test_df)
)

# Sanity check: show the first training example with its label.
print("\nEjemplo de texto de entrenamiento:")
print(X_train_text[0])
print("Etiqueta:", y_train[0])



# HELPER: KNN + SEARCH OVER k
def train_and_evaluate_knn(X_train_vec, y_train,
                           X_val_vec, y_val,
                           X_test_vec, y_test,
                           k_values=(1, 3, 5, 7, 9),
                           title=""):
    """Pick the best ``k`` for a KNN classifier on validation data and score it on test.

    For every candidate in ``k_values`` a ``KNeighborsClassifier`` is fitted
    on the training vectors and scored by accuracy on the validation
    vectors. The candidate with the highest validation accuracy (the first
    one on ties) is then evaluated on the test vectors. Progress and results
    are printed.

    Args:
        X_train_vec, y_train: training feature matrix and labels.
        X_val_vec, y_val: validation feature matrix and labels, used only
            to choose ``k``.
        X_test_vec, y_test: test feature matrix and labels, used only for
            the final report.
        k_values: iterable of candidate neighbour counts.
        title: text appended to the printed results header.

    Returns:
        Tuple ``(final_knn, best_k, acc_test)``: the fitted classifier using
        the selected ``k``, that ``k``, and its accuracy on the test set.

    Raises:
        ValueError: if ``k_values`` is empty.
    """
    # Materialise once so any iterable (including a generator) is accepted.
    k_values = list(k_values)
    if not k_values:
        raise ValueError("k_values must contain at least one candidate k")

    print("RESULTADOS KNN -", title)

    best_k = None
    # Start below any achievable accuracy so the first candidate is always
    # accepted, even when every validation accuracy is 0.0.
    best_acc = -1.0
    final_knn = None

    for k in k_values:
        knn = KNeighborsClassifier(n_neighbors=k)
        knn.fit(X_train_vec, y_train)
        y_val_pred = knn.predict(X_val_vec)
        acc_val = accuracy_score(y_val, y_val_pred)
        print(f"k = {k} --> Accuracy validación = {acc_val:.4f}")

        # Strict '>' keeps the earliest k when several candidates tie.
        if acc_val > best_acc:
            best_acc = acc_val
            best_k = k
            # Keep the fitted model; refitting with the same k and data
            # would produce the same classifier.
            final_knn = knn

    print("\nMejor número de vecinos (k) encontrado en validación:", best_k)
    print(f"Accuracy de validación con k={best_k}: {best_acc:.4f}")

    y_test_pred = final_knn.predict(X_test_vec)
    acc_test = accuracy_score(y_test, y_test_pred)

    print(f"\nAccuracy en TEST con k={best_k}: {acc_test:.4f}")
    print("\nClassification report (TEST):")
    # zero_division=0 reports 0.0 for classes that are never predicted (the
    # same value as the default) without emitting UndefinedMetricWarning.
    print(classification_report(y_test, y_test_pred, digits=4, zero_division=0))

    print("\nEjemplo de predicciones en test (primeros 20):")
    print("y_test_pred[:20] =", y_test_pred[:20])
    print("y_test[:20]      =", y_test[:20])

    return final_knn, best_k, acc_test



# EXPERIMENT 1: TF-IDF + KNN
print("\n\nEXPERIMENTO 1: TF-IDF + KNN")

# Lowercased unigram + bigram TF-IDF features, capped at 10000 terms,
# with English stop words removed.
tfidf_vectorizer = TfidfVectorizer(
    lowercase=True,
    max_features=10000,
    ngram_range=(1, 2),
    stop_words="english"
)

# The vocabulary and IDF weights are fitted on the training texts only;
# validation and test texts are transformed with that fitted vectorizer.
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train_text)
X_val_tfidf   = tfidf_vectorizer.transform(X_val_text)
X_test_tfidf  = tfidf_vectorizer.transform(X_test_text)

knn_tfidf, best_k_tfidf, acc_test_tfidf = train_and_evaluate_knn(
    X_train_tfidf, y_train,
    X_val_tfidf, y_val,
    X_test_tfidf, y_test,
    k_values=[1, 3, 5, 7, 9],
    title="TF-IDF"
)

# The helper does not return its test predictions, so compute them again
# with the returned model to show a few of them.
y_pred_test = knn_tfidf.predict(X_test_tfidf)
print("\nPREDICCIÓN(primeras 10 líneas)")
# Slicing (instead of range(10)) avoids an IndexError when the test split
# has fewer than 10 rows.
for i, (pred, real) in enumerate(zip(y_pred_test[:10], y_test[:10])):
    print(f"Texto {i}:")
    print("   Predicción:", pred)
    print("   Real:      ", real)


# EXPERIMENT 2: Word2Vec + KNN
print("\n\nEXPERIMENTO 2: Word2Vec + KNN")

def simple_tokenize(text):
    """Return the lowercase whitespace-separated tokens of ``text``.

    ``text`` is first converted with ``str``, so non-string values are
    tokenized by their string form. Punctuation is not removed.
    """
    lowered = str(text).lower()
    return lowered.split()

# Tokenize every split with the same whitespace tokenizer.
train_tokens = list(map(simple_tokenize, X_train_text))
val_tokens   = list(map(simple_tokenize, X_val_text))
test_tokens  = list(map(simple_tokenize, X_test_text))

# Word2Vec is trained on the training tokens only; validation and test
# documents are later embedded with the vectors learned here.
w2v_model = Word2Vec(
    sentences=train_tokens,
    sg=1,
    vector_size=100,
    window=5,
    min_count=2,
    workers=4,
)

# Keep a handle on the trained word -> vector lookup.
word_vectors = w2v_model.wv

def document_embedding(tokens, word_vectors, dim=100):
    """Return the mean vector of the tokens that ``word_vectors`` knows.

    Args:
        tokens: iterable of tokens for one document.
        word_vectors: lookup supporting ``tok in word_vectors`` and
            ``word_vectors[tok]``.
        dim: length of the zero vector returned when no token is found.

    Returns:
        The element-wise mean of the found vectors, or ``np.zeros(dim)``
        when none of the tokens is present in ``word_vectors``.
    """
    known = [word_vectors[token] for token in tokens if token in word_vectors]
    if not known:
        # No known token: fall back to an all-zero document vector.
        return np.zeros(dim)
    return np.mean(known, axis=0)

def build_doc_matrix(list_of_tokens, word_vectors, dim=100):
    """Stack one document embedding per token list into a 2-D matrix.

    Args:
        list_of_tokens: iterable of token lists, one per document.
        word_vectors: word -> vector lookup passed to ``document_embedding``.
        dim: embedding length passed to ``document_embedding``; also the
            number of columns of the result when there are no documents.

    Returns:
        A matrix with one row per document, in input order. With no
        documents the result has shape ``(0, dim)``.
    """
    rows = [
        document_embedding(toks, word_vectors, dim)
        for toks in list_of_tokens
    ]
    if not rows:
        # np.vstack raises ValueError on an empty sequence; return an empty
        # matrix with the right number of columns instead.
        return np.empty((0, dim))
    return np.vstack(rows)

# One averaged Word2Vec vector per document; dim=100 matches the
# vector_size the Word2Vec model was trained with.
X_train_w2v = build_doc_matrix(train_tokens, word_vectors, dim=100)
X_val_w2v   = build_doc_matrix(val_tokens,   word_vectors, dim=100)
X_test_w2v  = build_doc_matrix(test_tokens,  word_vectors, dim=100)

knn_w2v, best_k_w2v, acc_test_w2v = train_and_evaluate_knn(
    X_train_w2v, y_train,
    X_val_w2v, y_val,
    X_test_w2v, y_test,
    k_values=[1, 3, 5, 7, 9],
    title="Word2Vec (media embeddings)"
)

# The helper does not return its test predictions, so compute them again
# with the returned model to show a few of them.
y_pred_test_w2v = knn_w2v.predict(X_test_w2v)
print("\nPREDICCIÓN Word2Vec (primeras 10 líneas)")
# Slicing (instead of range(10)) avoids an IndexError when the test split
# has fewer than 10 rows.
for i, (pred, real) in enumerate(zip(y_pred_test_w2v[:10], y_test[:10])):
    print(f"{i}) pred={pred}  real={real}")




# EXPERIMENT 3: CONTEXTUAL EMBEDDINGS + KNN
print("\n\nEXPERIMENTO 3: EMBEDDINGS (Sentence-BERT) + KNN")

# Pretrained sentence encoder; it is used as-is, with no fine-tuning here.
bert_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Encode each split's concatenated "source [SEP] reply" strings.
X_train_bert = bert_model.encode(X_train_text, batch_size=32, show_progress_bar=True)
X_val_bert   = bert_model.encode(X_val_text,   batch_size=32, show_progress_bar=True)
X_test_bert  = bert_model.encode(X_test_text,  batch_size=32, show_progress_bar=True)

knn_bert, best_k_bert, acc_test_bert = train_and_evaluate_knn(
    X_train_bert, y_train,
    X_val_bert, y_val,
    X_test_bert, y_test,
    k_values=[1, 3, 5, 7, 9],
    title="Embeddings contextuales (Sentence-BERT)"
)

# The helper does not return its test predictions, so compute them again
# with the returned model to show a few of them.
y_pred_test_bert = knn_bert.predict(X_test_bert)
# Header fixed: the original string was missing its closing parenthesis.
print("\nPREDICCIÓN BERT (primeras 10 líneas)")
# Slicing (instead of range(10)) avoids an IndexError when the test split
# has fewer than 10 rows.
for i, (pred, real) in enumerate(zip(y_pred_test_bert[:10], y_test[:10])):
    print(f"{i}) pred={pred}  real={real}")



# FINAL SUMMARY: selected k and test accuracy of each representation
print("\n\nRESUMEN FINAL")
summary_lines = [
    f"TF-IDF:        mejor k = {best_k_tfidf},  accuracy test = {acc_test_tfidf:.4f}",
    f"Word2Vec:      mejor k = {best_k_w2v},    accuracy test = {acc_test_w2v:.4f}",
    f"Sentence-BERT: mejor k = {best_k_bert},   accuracy test = {acc_test_bert:.4f}",
]
print("\n".join(summary_lines))



NaN en label (antes de limpiar):
  train: 2
  val:   0
  test:  0
NaN en label (después de limpiar):
  train: 0
  val:   0
  test:  0

Etiquetas únicas en train: ['comment' 'deny' 'query' 'support']

Ejemplo de texto de entrenamiento:
France: 10 people dead after shooting at HQ of satirical weekly newspaper #CharlieHebdo, according to witnesses http://t.co/FkYxGmuS58 [SEP] MT @euronews France: 10 dead after shooting at HQ of satirical weekly #CharlieHebdo. If Zionists/Jews did this they'd be nuking Israel
Etiqueta: comment


EXPERIMENTO 1: TF-IDF + KNN
RESULTADOS KNN - TF-IDF
k = 1 --> Accuracy validación = 0.6528
k = 3 --> Accuracy validación = 0.7153
k = 5 --> Accuracy validación = 0.7403
k = 7 --> Accuracy validación = 0.7542
k = 9 --> Accuracy validación = 0.7910

Mejor número de vecinos (k) encontrado en validación: 9
Accuracy de validación con k=9: 0.7910

Accuracy en TEST con k=9: 0.8239

Classification report (TEST):
              precision    recall  f1-score   support

     c

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
Batches: 100%|██████████| 153/153 [00:01<00:00, 99.34it/s] 
Batches: 100%|██████████| 45/45 [00:00<00:00, 118.04it/s]
Batches: 100%|██████████| 53/53 [00:00<00:00, 110.95it/s]


RESULTADOS KNN - Embeddings contextuales (Sentence-BERT)
k = 1 --> Accuracy validación = 0.6465
k = 3 --> Accuracy validación = 0.6931
k = 5 --> Accuracy validación = 0.7701
k = 7 --> Accuracy validación = 0.7868
k = 9 --> Accuracy validación = 0.8035

Mejor número de vecinos (k) encontrado en validación: 9
Accuracy de validación con k=9: 0.8035

Accuracy en TEST con k=9: 0.8304

Classification report (TEST):


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


              precision    recall  f1-score   support

     comment     0.8395    0.9900    0.9086      1405
        deny     0.0000    0.0000    0.0000       100
       query     0.0000    0.0000    0.0000        66
     support     0.0000    0.0000    0.0000       104

    accuracy                         0.8304      1675
   macro avg     0.2099    0.2475    0.2271      1675
weighted avg     0.7042    0.8304    0.7621      1675


Ejemplo de predicciones en test (primeros 20):
y_test_pred[:20] = ['comment' 'comment' 'comment' 'comment' 'comment' 'comment' 'comment'
 'comment' 'comment' 'comment' 'comment' 'comment' 'comment' 'comment'
 'comment' 'comment' 'comment' 'comment' 'comment' 'comment']
y_test[:20]      = ['comment' 'comment' 'comment' 'comment' 'comment' 'comment' 'comment'
 'comment' 'comment' 'comment' 'comment' 'comment' 'comment' 'comment'
 'comment' 'comment' 'query' 'comment' 'comment' 'query']

PREDICCIÓN BERT (primeras 10 líneas
0) pred=comment  real=comment
1) pred=

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
