In [None]:
import numpy as np
import pandas as pd

from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.utils.class_weight import compute_class_weight

from gensim.models import Word2Vec

from sentence_transformers import SentenceTransformer

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader, Dataset

from collections import Counter

from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.optim import AdamW


# Paths to the RumourEval 2019 CSV splits. The only column this section reads
# from them is "label".
train_path = "data/datasets/rumoureval2019_train.csv"
val_path   = "data/datasets/rumoureval2019_val.csv"
test_path  = "data/datasets/rumoureval2019_test.csv"

train_df, val_df, test_df = (
    pd.read_csv(csv_path) for csv_path in (train_path, val_path, test_path)
)

# Report how many rows lack a label before they are dropped.
print("NaN en label (antes de limpiar):")
for prefix, frame in (("  train:", train_df), ("  val:  ", val_df), ("  test: ", test_df)):
    print(prefix, frame["label"].isna().sum())

# Rows without a label are removed from every split.
train_df, val_df, test_df = (
    split.dropna(subset=["label"]) for split in (train_df, val_df, test_df)
)

# Same report after cleaning; every count should now be zero.
print("\nNaN en label (después de limpiar):")
for prefix, frame in (("  train:", train_df), ("  val:  ", val_df), ("  test: ", test_df)):
    print(prefix, frame["label"].isna().sum())

print("\nEtiquetas únicas en train:", train_df["label"].unique())

# Per-split class distribution: absolute count and fraction of the split.
print("\nDistribución de clases:")
for name, df in zip(("train", "val", "test"), (train_df, val_df, test_df)):
    counts = df["label"].value_counts()
    total = counts.sum()
    print(f"\n=== {name.upper()} (total = {total}) ===")
    for label, c in counts.items():
        print(f"{label:8s}: {c:4d} ({c/total:.3f})")

def concat_text_row(row):
    """Join a row's source and reply text into a single string.

    Reads ``source_text`` and ``reply_text`` through ``row.get``; a missing
    key or a NaN value counts as the empty string. The two pieces are joined
    with `` [SEP] `` and the result is stripped of surrounding whitespace.
    """
    pieces = []
    for column in ("source_text", "reply_text"):
        value = row.get(column, "")
        pieces.append("" if pd.isna(value) else str(value))
    return " [SEP] ".join(pieces).strip()

def _texts_and_labels(frame):
    """Return (list of concatenated texts, array of labels) for one split."""
    return frame.apply(concat_text_row, axis=1).tolist(), frame["label"].values


X_train_text, y_train = _texts_and_labels(train_df)
X_val_text, y_val = _texts_and_labels(val_df)
X_test_text, y_test = _texts_and_labels(test_df)

# Show one training example so the concatenated format can be inspected.
print("\nEjemplo de texto de entrenamiento:")
print(X_train_text[0])
print("Etiqueta:", y_train[0])

# Integer class indices for the neural models; the encoder is fitted on the
# training labels only and then applied to validation and test.
label_encoder = LabelEncoder()
y_train_idx = label_encoder.fit_transform(y_train)
y_val_idx, y_test_idx = (label_encoder.transform(y) for y in (y_val, y_test))
num_classes = len(label_encoder.classes_)
print("\nClases (label_encoder):", label_encoder.classes_)

# Majority-class baseline: the most frequent label of the TEST split, scored
# on that same split.
major_class = Counter(y_test).most_common(1)[0][0]
baseline_acc = (y_test == major_class).mean()
print(f"\nClase mayoritaria en TEST: {major_class}")
print(f"Accuracy baseline (siempre '{major_class}') = {baseline_acc:.4f}")


def train_and_evaluate_knn(X_train_vec, y_train,
                           X_val_vec, y_val,
                           X_test_vec, y_test,
                           k_values=(1, 3, 5, 7, 9),
                           title=""):
    """Select k for a KNN classifier on validation data and report test metrics.

    One ``KNeighborsClassifier`` is fitted on the training data per candidate
    ``k``. The candidate with the highest validation accuracy wins (the
    earliest one in ``k_values`` on ties) and is then evaluated on the test
    split. Progress and results are printed.

    Args:
        X_train_vec, X_val_vec, X_test_vec: feature matrices passed straight
            to ``fit`` / ``predict``.
        y_train, y_val, y_test: labels for the corresponding splits.
        k_values: iterable of candidate neighbour counts.
        title: text appended to the results banner.

    Returns:
        Tuple ``(fitted_model, best_k, test_accuracy)``.

    Raises:
        ValueError: if ``k_values`` is empty.
    """
    k_values = list(k_values)
    if not k_values:
        # Without a candidate there is nothing to select; fail here with a
        # clear message instead of later with n_neighbors=None.
        raise ValueError("k_values must contain at least one candidate k")

    print("\n" + "="*60)
    print("RESULTADOS KNN -", title)
    print("="*60)

    best_k = None
    # Start below any reachable accuracy so the first candidate is always
    # accepted, even when every validation accuracy is exactly 0.0.
    best_acc = -1.0
    final_knn = None

    for k in k_values:
        knn = KNeighborsClassifier(n_neighbors=k)
        knn.fit(X_train_vec, y_train)
        y_val_pred = knn.predict(X_val_vec)
        acc_val = accuracy_score(y_val, y_val_pred)
        print(f"k = {k} --> Accuracy validación = {acc_val:.4f}")

        if acc_val > best_acc:
            best_acc = acc_val
            best_k = k
            # Keep the fitted model so it does not have to be refitted later.
            final_knn = knn

    print("\nMejor número de vecinos (k) encontrado en validación:", best_k)
    print(f"Accuracy de validación con k={best_k}: {best_acc:.4f}")

    y_test_pred = final_knn.predict(X_test_vec)
    acc_test = accuracy_score(y_test, y_test_pred)

    print(f"\nAccuracy en TEST con k={best_k}: {acc_test:.4f}")
    print("\nClassification report (TEST):")
    print(classification_report(y_test, y_test_pred, digits=4))

    print("\nEjemplo de predicciones en test (primeros 20):")
    print("y_test_pred[:20] =", y_test_pred[:20])
    print("y_test[:20]      =", y_test[:20])

    return final_knn, best_k, acc_test


class ConvNet1D(nn.Module):
    """1-D convolutional classifier applied to a flat feature vector.

    The input vector is treated as a single-channel sequence and passed
    through four Conv1d(kernel_size=5, padding=2) -> BatchNorm1d -> ReLU
    stages with 64 channels each. A global max pool, dropout and a linear
    layer then produce the class logits.

    ``input_dim`` is accepted by the constructor but not used by any layer.
    """

    def __init__(self, input_dim, num_classes, dropout=0.3):
        super().__init__()

        width = 64
        # Register conv1/bn1 ... conv4/bn4 in that order; only the first
        # convolution takes the single input channel.
        for stage in range(1, 5):
            in_channels = 1 if stage == 1 else width
            setattr(
                self,
                f"conv{stage}",
                nn.Conv1d(in_channels=in_channels, out_channels=width, kernel_size=5, padding=2),
            )
            setattr(self, f"bn{stage}", nn.BatchNorm1d(width))

        self.global_pool = nn.AdaptiveMaxPool1d(1)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(width, num_classes)

    def forward(self, x):
        """Map a (B, L) batch of feature vectors to (B, num_classes) logits."""
        h = x.unsqueeze(1)  # (B, 1, L)

        for stage in range(1, 5):
            conv = getattr(self, f"conv{stage}")
            norm = getattr(self, f"bn{stage}")
            h = F.relu(norm(conv(h)))  # (B, 64, L)

        pooled = self.global_pool(h).squeeze(-1)  # (B, 64, 1) -> (B, 64)
        return self.fc(self.dropout(pooled))  # (B, num_classes)



def train_and_evaluate_cnn(
    X_train, y_train_idx,
    X_val, y_val_idx,
    X_test, y_test_idx,
    label_encoder,
    title="CNN",
    num_epochs=20,
    batch_size=32,
    lr=5e-4,
    dropout=0.3,
    device=None
):
    """Train a ConvNet1D on dense features and evaluate it on the test split.

    The model is trained with class-weighted cross-entropy ("balanced"
    weights computed from the training labels) and Adam (weight_decay=1e-4).
    After every epoch the validation accuracy is measured. The weights of the
    best-scoring epoch are snapshotted and restored before the test
    evaluation. Progress and results are printed.

    Args:
        X_train, X_val, X_test: 2-D array-likes of features; converted to
            float32 arrays.
        y_train_idx, y_val_idx, y_test_idx: integer class indices; converted
            to int64 arrays.
        label_encoder: fitted encoder providing ``classes_`` and
            ``inverse_transform``.
        title: text used in the printed banners.
        num_epochs: number of training epochs.
        batch_size: batch size for all three data loaders.
        lr: Adam learning rate.
        dropout: dropout probability passed to ConvNet1D.
        device: torch device; defaults to CUDA when available, else CPU.

    Returns:
        Tuple ``(model, test_accuracy, predicted_test_labels)`` where the
        predicted labels are in the encoder's original label space.
    """

    print("\n" + "="*60)
    print("ENTRENANDO RED NEURONAL CONVOLUCIONAL -", title)
    print("="*60)

    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("Usando dispositivo:", device)

    X_train = np.asarray(X_train, dtype=np.float32)
    X_val   = np.asarray(X_val,   dtype=np.float32)
    X_test  = np.asarray(X_test,  dtype=np.float32)

    y_train_idx = np.asarray(y_train_idx, dtype=np.int64)
    y_val_idx   = np.asarray(y_val_idx,   dtype=np.int64)
    y_test_idx  = np.asarray(y_test_idx,  dtype=np.int64)

    input_dim = X_train.shape[1]
    num_classes = len(label_encoder.classes_)

    train_dataset = TensorDataset(torch.from_numpy(X_train), torch.from_numpy(y_train_idx))
    val_dataset   = TensorDataset(torch.from_numpy(X_val),   torch.from_numpy(y_val_idx))
    test_dataset  = TensorDataset(torch.from_numpy(X_test),  torch.from_numpy(y_test_idx))

    # Only the training loader is shuffled; evaluation order stays fixed.
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader   = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    test_loader  = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    # "balanced" weights derived from the training label frequencies.
    class_weights_np = compute_class_weight(
        class_weight="balanced",
        classes=np.arange(num_classes),
        y=y_train_idx
    )
    class_weights = torch.tensor(class_weights_np, dtype=torch.float32).to(device)
    print("\nPesos de clase (para CrossEntropyLoss):")
    for idx, w in enumerate(class_weights_np):
        print(f"  Clase {idx} ({label_encoder.classes_[idx]}): {w:.4f}")

    model = ConvNet1D(input_dim=input_dim, num_classes=num_classes, dropout=dropout).to(device)
    criterion = nn.CrossEntropyLoss(weight=class_weights)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=1e-4)

    best_val_acc = 0.0
    best_state_dict = None

    for epoch in range(1, num_epochs + 1):
        # ---- training pass ----
        model.train()
        running_loss = 0.0
        correct_train = 0
        total_train = 0

        for batch_X, batch_y in train_loader:
            batch_X = batch_X.to(device)
            batch_y = batch_y.to(device)

            optimizer.zero_grad()
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()

            # Weight the batch loss by its size so the epoch average is
            # correct even when the last batch is smaller.
            running_loss += loss.item() * batch_X.size(0)
            _, preds = torch.max(outputs, 1)
            correct_train += (preds == batch_y).sum().item()
            total_train += batch_X.size(0)

        train_loss = running_loss / total_train
        train_acc = correct_train / total_train

        # ---- validation pass ----
        model.eval()
        correct_val = 0
        total_val = 0
        with torch.inference_mode():
            for batch_X, batch_y in val_loader:
                batch_X = batch_X.to(device)
                batch_y = batch_y.to(device)
                outputs = model(batch_X)
                _, preds = torch.max(outputs, 1)
                correct_val += (preds == batch_y).sum().item()
                total_val += batch_X.size(0)

        val_acc = correct_val / total_val

        print(f"Época {epoch:02d}/{num_epochs} | "
              f"Loss train = {train_loss:.4f} | "
              f"Acc train = {train_acc:.4f} | "
              f"Acc val = {val_acc:.4f}")

        if val_acc > best_val_acc:
            best_val_acc = val_acc
            # state_dict() returns references to the live tensors, which later
            # optimizer steps would overwrite. Clone them so the snapshot
            # really holds this epoch's weights.
            best_state_dict = {
                key: tensor.detach().cpu().clone()
                for key, tensor in model.state_dict().items()
            }

    print(f"\nMejor accuracy de validación alcanzado: {best_val_acc:.4f}")

    # Restore the best validation checkpoint (if any epoch improved on 0.0).
    if best_state_dict is not None:
        model.load_state_dict(best_state_dict)

    # ---- test evaluation ----
    model.eval()
    all_preds = []
    all_true = []
    with torch.inference_mode():
        for batch_X, batch_y in test_loader:
            batch_X = batch_X.to(device)
            outputs = model(batch_X)
            _, preds = torch.max(outputs, 1)
            all_preds.append(preds.cpu().numpy())
            # batch_y is never moved to the device, so it is still on CPU.
            all_true.append(batch_y.numpy())

    all_preds = np.concatenate(all_preds)
    all_true  = np.concatenate(all_true)

    # Report in the original label space rather than as class indices.
    y_test_pred_labels = label_encoder.inverse_transform(all_preds)
    y_test_true_labels = label_encoder.inverse_transform(all_true)

    acc_test = accuracy_score(y_test_true_labels, y_test_pred_labels)
    print(f"\nAccuracy en TEST ({title}) = {acc_test:.4f}")
    print("\nClassification report (TEST):")
    print(classification_report(y_test_true_labels, y_test_pred_labels, digits=4))

    print("\nEjemplo de predicciones en test (primeros 20):")
    print("y_test_pred[:20] =", y_test_pred_labels[:20])
    print("y_test[:20]      =", y_test_true_labels[:20])

    return model, acc_test, y_test_pred_labels



print("\n\nTF-IDF + KNN")

# Lowercased uni-/bi-gram TF-IDF features with English stop words removed;
# max_features caps the vocabulary size.
tfidf_vectorizer = TfidfVectorizer(
    lowercase=True,
    max_features=10000,
    ngram_range=(1, 2),
    stop_words="english"
)

# The vectorizer is fitted on the training texts only; validation and test
# are transformed with that same fitted vocabulary.
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train_text)
X_val_tfidf   = tfidf_vectorizer.transform(X_val_text)
X_test_tfidf  = tfidf_vectorizer.transform(X_test_text)

knn_tfidf, best_k_tfidf, acc_test_tfidf = train_and_evaluate_knn(
    X_train_tfidf, y_train,
    X_val_tfidf, y_val,
    X_test_tfidf, y_test,
    k_values=[1, 3, 5, 7, 9],
    title="TF-IDF"
)

y_pred_test = knn_tfidf.predict(X_test_tfidf)
print("\nPREDICCIÓN TF-IDF + KNN (primeras 10 líneas)")
# Slice instead of indexing 0..9 so a test split with fewer than ten rows
# prints what is available rather than raising IndexError.
for i, (pred, real) in enumerate(zip(y_pred_test[:10], y_test[:10])):
    print(f"Texto {i}:")
    print("   Predicción:", pred)
    print("   Real:      ", real)


print("\n\nTF-IDF + CNN")

# Scale the TF-IDF features with StandardScaler (with_mean=False because the
# matrices are sparse). The scaler is fitted on train only.
scaler_tfidf = StandardScaler(with_mean=False)
X_train_tfidf_scaled = scaler_tfidf.fit_transform(X_train_tfidf)
X_val_tfidf_scaled   = scaler_tfidf.transform(X_val_tfidf)
X_test_tfidf_scaled  = scaler_tfidf.transform(X_test_tfidf)

# Convert to dense arrays for PyTorch.
X_train_tfidf_dense = X_train_tfidf_scaled.toarray()
X_val_tfidf_dense   = X_val_tfidf_scaled.toarray()
X_test_tfidf_dense  = X_test_tfidf_scaled.toarray()

cnn_tfidf, acc_test_tfidf_cnn, y_pred_test_tfidf_cnn = train_and_evaluate_cnn(
    X_train_tfidf_dense, y_train_idx,
    X_val_tfidf_dense,   y_val_idx,
    X_test_tfidf_dense,  y_test_idx,
    label_encoder,
    title="TF-IDF + CNN",
    num_epochs=15,
    batch_size=32,
    lr=5e-4,
    dropout=0.3
)

print("\nPREDICCIÓN TF-IDF + CNN (primeras 10 líneas)")
# Slice instead of indexing 0..9 so a test split with fewer than ten rows
# prints what is available rather than raising IndexError.
for i, (pred, real) in enumerate(zip(y_pred_test_tfidf_cnn[:10], y_test[:10])):
    print(f"Texto {i}:")
    print("   Predicción:", pred)
    print("   Real:      ", real)


# Section banner: start of the Word2Vec + KNN experiment.
print("\n\nWord2Vec + KNN")

def simple_tokenize(text):
    """Lowercase ``str(text)`` and split it on whitespace into a token list."""
    lowered = str(text).lower()
    return lowered.split()

# Tokenize every split with the same whitespace tokenizer.
train_tokens, val_tokens, test_tokens = (
    list(map(simple_tokenize, texts))
    for texts in (X_train_text, X_val_text, X_test_text)
)

# Skip-gram (sg=1) Word2Vec trained on the training tokens only.
w2v_model = Word2Vec(
    sentences=train_tokens, vector_size=100, window=5, min_count=2, workers=4, sg=1
)

# Keyed vectors used for the token lookups further down.
word_vectors = w2v_model.wv

def document_embedding(tokens, word_vectors, dim=100):
    """Average the vectors of the tokens that ``word_vectors`` contains.

    Tokens absent from ``word_vectors`` are skipped. When no token is found,
    a zero vector of length ``dim`` is returned instead.
    """
    known = [word_vectors[token] for token in tokens if token in word_vectors]
    if not known:
        return np.zeros(dim)
    return np.mean(known, axis=0)

def build_doc_matrix(list_of_tokens, word_vectors, dim=100):
    """Stack one document embedding per token list into a 2-D matrix."""
    rows = [
        document_embedding(doc_tokens, word_vectors, dim)
        for doc_tokens in list_of_tokens
    ]
    return np.vstack(rows)

# Take the embedding width from the trained vectors instead of repeating the
# literal 100, so it cannot drift from the Word2Vec configuration.
w2v_dim = word_vectors.vector_size

X_train_w2v = build_doc_matrix(train_tokens, word_vectors, dim=w2v_dim)
X_val_w2v   = build_doc_matrix(val_tokens,   word_vectors, dim=w2v_dim)
X_test_w2v  = build_doc_matrix(test_tokens,  word_vectors, dim=w2v_dim)

knn_w2v, best_k_w2v, acc_test_w2v = train_and_evaluate_knn(
    X_train_w2v, y_train,
    X_val_w2v, y_val,
    X_test_w2v, y_test,
    k_values=[1, 3, 5, 7, 9],
    title="Word2Vec (media embeddings)"
)

y_pred_test_w2v = knn_w2v.predict(X_test_w2v)
print("\nPREDICCIÓN Word2Vec + KNN (primeras 10 líneas)")
# Slice instead of indexing 0..9 so a test split with fewer than ten rows
# prints what is available rather than raising IndexError.
for i, (pred, real) in enumerate(zip(y_pred_test_w2v[:10], y_test[:10])):
    print(f"{i}) pred={pred}  real={real}")


print("\n\nWord2Vec + CNN")

# Standardize the averaged embeddings; the scaler is fitted on train only.
scaler_w2v = StandardScaler()
X_train_w2v_scaled = scaler_w2v.fit_transform(X_train_w2v)
X_val_w2v_scaled   = scaler_w2v.transform(X_val_w2v)
X_test_w2v_scaled  = scaler_w2v.transform(X_test_w2v)

cnn_w2v, acc_test_w2v_cnn, y_pred_test_w2v_cnn = train_and_evaluate_cnn(
    X_train_w2v_scaled, y_train_idx,
    X_val_w2v_scaled,   y_val_idx,
    X_test_w2v_scaled,  y_test_idx,
    label_encoder,
    title="Word2Vec + CNN",
    num_epochs=30,
    batch_size=32,
    lr=5e-4,
    dropout=0.3
)

print("\nPREDICCIÓN Word2Vec + CNN (primeras 10 líneas)")
# Slice instead of indexing 0..9 so a test split with fewer than ten rows
# prints what is available rather than raising IndexError.
for i, (pred, real) in enumerate(zip(y_pred_test_w2v_cnn[:10], y_test[:10])):
    print(f"{i}) pred={pred}  real={real}")


print("\n\nEMBEDDINGS (Sentence-BERT) + KNN")

# Pretrained sentence encoder, used as-is; nothing in this section trains it.
bert_model_st = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# One embedding per concatenated source/reply text, for each split.
X_train_bert = bert_model_st.encode(X_train_text, batch_size=32, show_progress_bar=True)
X_val_bert   = bert_model_st.encode(X_val_text,   batch_size=32, show_progress_bar=True)
X_test_bert  = bert_model_st.encode(X_test_text,  batch_size=32, show_progress_bar=True)

knn_bert, best_k_bert, acc_test_bert = train_and_evaluate_knn(
    X_train_bert, y_train,
    X_val_bert,   y_val,
    X_test_bert,  y_test,
    k_values=[1, 3, 5, 7, 9],
    title="Embeddings contextuales (Sentence-BERT)"
)

y_pred_test_bert = knn_bert.predict(X_test_bert)
print("\nPREDICCIÓN BERT + KNN (primeras 10 líneas)")
# Slice instead of indexing 0..9 so a test split with fewer than ten rows
# prints what is available rather than raising IndexError.
for i, (pred, real) in enumerate(zip(y_pred_test_bert[:10], y_test[:10])):
    print(f"{i}) pred={pred}  real={real}")

print("\n\nBERT Embeddings + CNN ")

# Standardize the sentence embeddings; the scaler is fitted on train only.
scaler_bert = StandardScaler()
X_train_bert_scaled = scaler_bert.fit_transform(X_train_bert)
X_val_bert_scaled   = scaler_bert.transform(X_val_bert)
X_test_bert_scaled  = scaler_bert.transform(X_test_bert)

cnn_bert, acc_test_bert_cnn, y_pred_test_bert_cnn = train_and_evaluate_cnn(
    X_train_bert_scaled, y_train_idx,
    X_val_bert_scaled,   y_val_idx,
    X_test_bert_scaled,  y_test_idx,
    label_encoder,
    title="Sentence-BERT + CNN",
    num_epochs=30,
    batch_size=32,
    lr=5e-4,
    dropout=0.3
)

print("\nPREDICCIÓN BERT + CNN (primeras 10 líneas)")
# Slice instead of indexing 0..9 so a test split with fewer than ten rows
# prints what is available rather than raising IndexError.
for i, (pred, real) in enumerate(zip(y_pred_test_bert_cnn[:10], y_test[:10])):
    print(f"{i}) pred={pred}  real={real}")


# Section banner: start of the transformer fine-tuning experiment.
print("\n\nTRANSFORMER PREENTRENADO + FINE-TUNING")

# Checkpoint name; the tokenizer is loaded from it here, and the same name is
# reused further down to load the classification model and label the results.
transformer_model_name = "distilbert-base-uncased"
tokenizer_hf = AutoTokenizer.from_pretrained(transformer_model_name)

def tokenize_batch_hf(texts, tokenizer, max_length=128):
    """Call ``tokenizer`` on ``texts`` with fixed batching options.

    The tokenizer is invoked with padding and truncation enabled, the given
    ``max_length`` and ``return_tensors="pt"``; its result is returned
    unchanged.
    """
    options = dict(
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    )
    return tokenizer(texts, **options)

# Tokenize each split with the shared tokenizer and the default max_length.
train_encodings_hf, val_encodings_hf, test_encodings_hf = (
    tokenize_batch_hf(texts, tokenizer_hf)
    for texts in (X_train_text, X_val_text, X_test_text)
)

class RumourEvalHFDataset(Dataset):
    """Dataset pairing tokenizer encodings with integer labels.

    ``encodings`` is a mapping whose values are indexable per example, and
    ``labels`` is an indexable sequence of class indices. Each item is a dict
    with one entry per encoding key plus a ``"labels"`` long tensor.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The dataset size is driven by the labels.
        return len(self.labels)

    def __getitem__(self, idx):
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx]
        sample["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

train_dataset_hf = RumourEvalHFDataset(train_encodings_hf, y_train_idx)
val_dataset_hf   = RumourEvalHFDataset(val_encodings_hf,   y_val_idx)
test_dataset_hf  = RumourEvalHFDataset(test_encodings_hf,  y_test_idx)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Dispositivo para Transformer:", device)

# Pretrained encoder with a sequence-classification head sized to the number
# of classes.
model_hf = AutoModelForSequenceClassification.from_pretrained(
    transformer_model_name,
    num_labels=num_classes
).to(device)

optimizer_hf = AdamW(model_hf.parameters(), lr=2e-5)

# Only the training loader is shuffled; evaluation order stays fixed.
train_loader_hf = DataLoader(train_dataset_hf, batch_size=16, shuffle=True)
val_loader_hf   = DataLoader(val_dataset_hf,   batch_size=32, shuffle=False)
test_loader_hf  = DataLoader(test_dataset_hf,  batch_size=32, shuffle=False)

num_epochs_hf = 3
best_val_acc_hf = 0.0
best_state_dict_hf = None

for epoch in range(1, num_epochs_hf + 1):
    # ---- training pass ----
    model_hf.train()
    total_loss = 0.0
    correct_train = 0
    total_train = 0

    for batch in train_loader_hf:
        batch = {k: v.to(device) for k, v in batch.items()}
        optimizer_hf.zero_grad()
        # The batch carries "labels", so the model output includes the loss.
        outputs = model_hf(**batch)
        loss = outputs.loss
        logits = outputs.logits

        loss.backward()
        optimizer_hf.step()

        # Weight the batch loss by its size so the epoch average is correct
        # even when the last batch is smaller.
        total_loss += loss.item() * batch["labels"].size(0)
        preds = logits.argmax(dim=-1)
        correct_train += (preds == batch["labels"]).sum().item()
        total_train += batch["labels"].size(0)

    train_loss = total_loss / total_train
    train_acc = correct_train / total_train

    # ---- validation pass ----
    model_hf.eval()
    correct_val = 0
    total_val = 0
    with torch.inference_mode():
        for batch in val_loader_hf:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model_hf(**batch)
            logits = outputs.logits
            preds = logits.argmax(dim=-1)
            correct_val += (preds == batch["labels"]).sum().item()
            total_val += batch["labels"].size(0)

    val_acc = correct_val / total_val

    print(f"[Transformer] Época {epoch}/{num_epochs_hf} | "
          f"Loss train = {train_loss:.4f} | Acc train = {train_acc:.4f} | Acc val = {val_acc:.4f}")

    if val_acc > best_val_acc_hf:
        best_val_acc_hf = val_acc
        # state_dict() returns references to the live tensors, which later
        # optimizer steps would overwrite. Clone them (on CPU, to spare GPU
        # memory) so the snapshot really holds this epoch's weights.
        best_state_dict_hf = {
            key: tensor.detach().cpu().clone()
            for key, tensor in model_hf.state_dict().items()
        }

print(f"\nMejor accuracy de validación (Transformer) = {best_val_acc_hf:.4f}")

# Restore the best validation checkpoint before evaluating on test.
if best_state_dict_hf is not None:
    model_hf.load_state_dict(best_state_dict_hf)

# ---- test evaluation ----
model_hf.eval()
all_preds_hf = []
all_true_hf = []

with torch.inference_mode():
    for batch in test_loader_hf:
        # Keep a CPU copy of the labels before the batch moves to the device.
        labels = batch["labels"].numpy().copy()
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model_hf(**batch)
        logits = outputs.logits
        preds = logits.argmax(dim=-1).cpu().numpy()
        all_preds_hf.append(preds)
        all_true_hf.append(labels)

all_preds_hf = np.concatenate(all_preds_hf)
all_true_hf  = np.concatenate(all_true_hf)

acc_test_transformer = accuracy_score(all_true_hf, all_preds_hf)
print(f"\nAccuracy en TEST (Transformer fine-tuned: {transformer_model_name}) = {acc_test_transformer:.4f}")

# Report in the original label space rather than as class indices.
y_test_pred_labels_transformer = label_encoder.inverse_transform(all_preds_hf)
y_test_true_labels = label_encoder.inverse_transform(all_true_hf)

print("\nClassification report (TEST) - Transformer fine-tuned:")
print(classification_report(y_test_true_labels, y_test_pred_labels_transformer, digits=4))


# Final summary: test accuracy of every experiment run above.

# KNN results, including the k selected on validation for each representation.
print("\n\nRESUMEN FINAL - KNN")
print(f"TF-IDF (KNN):        mejor k = {best_k_tfidf},  accuracy test = {acc_test_tfidf:.4f}")
print(f"Word2Vec (KNN):      mejor k = {best_k_w2v},    accuracy test = {acc_test_w2v:.4f}")
print(f"Sentence-BERT (KNN): mejor k = {best_k_bert},   accuracy test = {acc_test_bert:.4f}")

# CNN results for the same three representations, followed by the
# majority-class baseline as a reference point.
print("\nRESUMEN FINAL - CNN")
print(f"TF-IDF  + CNN:        accuracy test = {acc_test_tfidf_cnn:.4f}")
print(f"Word2Vec + CNN:       accuracy test = {acc_test_w2v_cnn:.4f}")
print(f"Sentence-BERT + CNN:  accuracy test = {acc_test_bert_cnn:.4f}")
print(f"\nBaseline mayoría ('{major_class}') en TEST: accuracy = {baseline_acc:.4f}")

# Fine-tuned transformer result.
print("\nRESUMEN FINAL - TRANSFORMER FINE-TUNED")
print(f"Transformer ({transformer_model_name}): accuracy test = {acc_test_transformer:.4f}")


NaN en label (antes de limpiar):
  train: 2
  val:   0
  test:  0

NaN en label (después de limpiar):
  train: 0
  val:   0
  test:  0

Etiquetas únicas en train: ['comment' 'deny' 'query' 'support']

Distribución de clases:

=== TRAIN (total = 4877) ===
comment : 3495 (0.717)
support :  642 (0.132)
query   :  373 (0.076)
deny    :  367 (0.075)

=== VAL (total = 1440) ===
comment : 1174 (0.815)
query   :  114 (0.079)
deny    :   79 (0.055)
support :   73 (0.051)

=== TEST (total = 1675) ===
comment : 1405 (0.839)
support :  104 (0.062)
deny    :  100 (0.060)
query   :   66 (0.039)

Ejemplo de texto de entrenamiento:
France: 10 people dead after shooting at HQ of satirical weekly newspaper #CharlieHebdo, according to witnesses http://t.co/FkYxGmuS58 [SEP] MT @euronews France: 10 dead after shooting at HQ of satirical weekly #CharlieHebdo. If Zionists/Jews did this they'd be nuking Israel
Etiqueta: comment

Clases (label_encoder): ['comment' 'deny' 'query' 'support']

Clase mayoritaria e

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])



RESULTADOS KNN - Word2Vec (media embeddings)
k = 1 --> Accuracy validación = 0.6806
k = 3 --> Accuracy validación = 0.7715
k = 5 --> Accuracy validación = 0.8028
k = 7 --> Accuracy validación = 0.8104
k = 9 --> Accuracy validación = 0.8139

Mejor número de vecinos (k) encontrado en validación: 9
Accuracy de validación con k=9: 0.8139

Accuracy en TEST con k=9: 0.8388

Classification report (TEST):
              precision    recall  f1-score   support

     comment     0.8392    0.9993    0.9123      1405
        deny     0.0000    0.0000    0.0000       100
       query     0.0000    0.0000    0.0000        66
     support     0.5000    0.0096    0.0189       104

    accuracy                         0.8388      1675
   macro avg     0.3348    0.2522    0.2328      1675
weighted avg     0.7350    0.8388    0.7664      1675


Ejemplo de predicciones en test (primeros 20):
y_test_pred[:20] = ['comment' 'comment' 'comment' 'comment' 'comment' 'comment' 'comment'
 'comment' 'comment' 'com

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


Época 01/30 | Loss train = 1.6196 | Acc train = 0.3195 | Acc val = 0.4854
Época 02/30 | Loss train = 1.4485 | Acc train = 0.3119 | Acc val = 0.2965
Época 03/30 | Loss train = 1.4104 | Acc train = 0.3213 | Acc val = 0.7000
Época 04/30 | Loss train = 1.3625 | Acc train = 0.3250 | Acc val = 0.2819
Época 05/30 | Loss train = 1.3381 | Acc train = 0.3453 | Acc val = 0.6458
Época 06/30 | Loss train = 1.3383 | Acc train = 0.3742 | Acc val = 0.6479
Época 07/30 | Loss train = 1.3267 | Acc train = 0.3531 | Acc val = 0.2604
Época 08/30 | Loss train = 1.3104 | Acc train = 0.3662 | Acc val = 0.4611
Época 09/30 | Loss train = 1.3033 | Acc train = 0.3514 | Acc val = 0.4444
Época 10/30 | Loss train = 1.3058 | Acc train = 0.3668 | Acc val = 0.5708
Época 11/30 | Loss train = 1.2869 | Acc train = 0.3799 | Acc val = 0.7118
Época 12/30 | Loss train = 1.2762 | Acc train = 0.3631 | Acc val = 0.6438
Época 13/30 | Loss train = 1.2823 | Acc train = 0.3810 | Acc val = 0.6979
Época 14/30 | Loss train = 1.2567 | Ac

Batches: 100%|██████████| 153/153 [00:01<00:00, 96.97it/s] 
Batches: 100%|██████████| 45/45 [00:00<00:00, 123.42it/s]
Batches: 100%|██████████| 53/53 [00:00<00:00, 118.59it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])



RESULTADOS KNN - Embeddings contextuales (Sentence-BERT)
k = 1 --> Accuracy validación = 0.6465
k = 3 --> Accuracy validación = 0.6931
k = 5 --> Accuracy validación = 0.7701
k = 7 --> Accuracy validación = 0.7868
k = 9 --> Accuracy validación = 0.8035

Mejor número de vecinos (k) encontrado en validación: 9
Accuracy de validación con k=9: 0.8035

Accuracy en TEST con k=9: 0.8304

Classification report (TEST):
              precision    recall  f1-score   support

     comment     0.8395    0.9900    0.9086      1405
        deny     0.0000    0.0000    0.0000       100
       query     0.0000    0.0000    0.0000        66
     support     0.0000    0.0000    0.0000       104

    accuracy                         0.8304      1675
   macro avg     0.2099    0.2475    0.2271      1675
weighted avg     0.7042    0.8304    0.7621      1675


Ejemplo de predicciones en test (primeros 20):
y_test_pred[:20] = ['comment' 'comment' 'comment' 'comment' 'comment' 'comment' 'comment'
 'comment' 'c

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])



ENTRENANDO RED NEURONAL CONVOLUCIONAL - Sentence-BERT + CNN (mejorada)
Usando dispositivo: cuda

Pesos de clase (para CrossEntropyLoss):
  Clase 0 (comment): 0.3489
  Clase 1 (deny): 3.3222
  Clase 2 (query): 3.2688
  Clase 3 (support): 1.8991
Época 01/30 | Loss train = 1.6951 | Acc train = 0.2520 | Acc val = 0.6111
Época 02/30 | Loss train = 1.5550 | Acc train = 0.2696 | Acc val = 0.7576
Época 03/30 | Loss train = 1.4227 | Acc train = 0.3168 | Acc val = 0.7771
Época 04/30 | Loss train = 1.4195 | Acc train = 0.2840 | Acc val = 0.8153
Época 05/30 | Loss train = 1.4009 | Acc train = 0.3100 | Acc val = 0.6188
Época 06/30 | Loss train = 1.3958 | Acc train = 0.3047 | Acc val = 0.5194
Época 07/30 | Loss train = 1.3827 | Acc train = 0.3117 | Acc val = 0.6347
Época 08/30 | Loss train = 1.3962 | Acc train = 0.3240 | Acc val = 0.1118
Época 09/30 | Loss train = 1.3688 | Acc train = 0.3258 | Acc val = 0.0674
Época 10/30 | Loss train = 1.3714 | Acc train = 0.3238 | Acc val = 0.7257
Época 11/30 | L

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Dispositivo para Transformer: cuda
[Transformer] Época 1/3 | Loss train = 0.8365 | Acc train = 0.7205 | Acc val = 0.8313
[Transformer] Época 2/3 | Loss train = 0.7163 | Acc train = 0.7542 | Acc val = 0.8208
[Transformer] Época 3/3 | Loss train = 0.5818 | Acc train = 0.7989 | Acc val = 0.8028

Mejor accuracy de validación (Transformer) = 0.8313

Accuracy en TEST (Transformer fine-tuned: distilbert-base-uncased) = 0.8281

Classification report (TEST) - Transformer fine-tuned:
              precision    recall  f1-score   support

     comment     0.8589    0.9530    0.9035      1405
        deny     0.4571    0.1600    0.2370       100
       query     0.3951    0.4848    0.4354        66
     support     0.0000    0.0000    0.0000       104

    accuracy                         0.8281      1675
   macro avg     0.4278    0.3995    0.3940      1675
weighted avg     0.7633    0.8281    0.7892      1675



RESUMEN FINAL - KNN
TF-IDF (KNN):        mejor k = 9,  accuracy test = 0.8239
Word2V

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
