# Carga de datos

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import numpy as np
from sentence_transformers import SentenceTransformer

# Load the two source files: real articles (True.csv) and fake ones (Fake.csv).
# Both are parsed with the Python engine; on_bad_lines="skip" drops malformed
# rows instead of raising, so the resulting row counts may be lower than the
# number of lines in the files.
df_true, df_fake = (
    pd.read_csv(csv_path, engine="python", on_bad_lines="skip")
    for csv_path in ("True.csv", "Fake.csv")
)

# Report (rows, columns) of each frame, one per line
print(df_true.shape, df_fake.shape, sep="\n")

# Preview the first rows of the real-news frame
df_true.head()

In [None]:
# Tag each source frame with its class: 1 for True.csv rows, 0 for Fake.csv rows
df_true["label"], df_fake["label"] = 1, 0

# Stack both frames, shuffle all rows reproducibly (fixed seed) and renumber
# the index from 0 so the original per-file positions are discarded.
df = (
    pd.concat([df_true, df_fake], axis=0)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Preview the merged, shuffled dataset
df.head()

In [None]:
# (rows, columns) of the merged dataset
df.shape

In [None]:
# Number of articles per class (1 = real, 0 = fake), to check class balance
df["label"].value_counts()

# EDA

In [None]:
# Bar chart with the number of articles in each class; the title and axis
# labels are set directly on the Axes object returned by seaborn.
sns.countplot(data=df, x="label").set(
    title="Distribución de noticias Fake vs Real",
    xlabel="Label (0 = Fake, 1 = Real)",
    ylabel="Número de noticias",
)
plt.show()

In [None]:
# Per-article size features.
# Empty CSV cells are parsed by pandas as NaN (a float), on which len() and
# .split() raise. Treat missing texts as empty strings so they get length 0
# and word count 0 instead of crashing this cell.
text_as_str = df['text'].fillna('').astype(str)

# Number of characters
df['text_length'] = text_as_str.apply(len)

# Number of whitespace-separated words
df['word_count'] = text_as_str.apply(lambda x: len(x.split()))

print(df[['text_length', 'word_count']].describe())

# Histogram of the number of words per article
plt.figure(figsize=(10,5))
sns.histplot(df['word_count'], bins=50, kde=True)
plt.title("Distribución del número de palabras por noticia")
plt.xlabel("Número de palabras")
plt.ylabel("Cantidad de noticias")
plt.show()

In [None]:
# Discard articles whose text has no words and renumber the rows from 0
# (reset_index returns a new frame, so later column assignments do not touch
# the unfiltered data).
df = df[df['word_count'].gt(0)].reset_index(drop=True)

print("Nuevo tamaño del dataset:", df.shape)

# Preprocesamiento

In [None]:
# Fetch the NLTK resources required below: the stop-word lists and WordNet
# (used by WordNetLemmatizer).
nltk.download('stopwords')
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()
# Stored as a set so the per-token membership test in clean_text is cheap
stop_words = set(stopwords.words('english'))

# Headline and body joined into a single string per article.
# Missing values are replaced by '' first: string concatenation with NaN
# yields NaN, which would make clean_text fail on `.lower()`.
df['full_text'] = df['title'].fillna('') + " " + df['text'].fillna('')

def clean_text(text):
    """Return a normalized, lemmatized version of *text*.

    Steps, in order: lowercase; strip anything starting with ``http`` up to
    the next whitespace; delete every character that is not ``a-z`` or
    whitespace; split on whitespace; drop tokens present in the module-level
    ``stop_words``; lemmatize the rest with the module-level ``lemmatizer``;
    join with single spaces.

    Note that punctuation and digits are deleted, not replaced by a space,
    so ``"fake-news"`` becomes ``"fakenews"``.
    """
    lowered = text.lower()
    without_urls = re.sub(r'http\S+', '', lowered)
    letters_only = re.sub(r'[^a-z\s]', '', without_urls)
    return " ".join(
        lemmatizer.lemmatize(word)
        for word in letters_only.split()
        if word not in stop_words
    )

# Normalize every article with clean_text and store the result alongside
# the original text
df['clean_text'] = df['full_text'].map(clean_text)

# Preview original vs. cleaned text for the first rows
df.head()[['full_text', 'clean_text']]

In [None]:
# Compact preview table: first 5 articles, each text cut to its first
# 100 characters and suffixed with "..."
example_df = (
    df[['full_text', 'clean_text']]
    .head(5)
    .apply(lambda column: column.str[:100] + "...")
)

# Render as a captioned, left-aligned styled table
display(
    example_df.style
    .set_caption("Tabla: Ejemplo resumido de textos originales y limpios")
    .set_properties(**{'text-align': 'left'})
)

In [None]:
# Word counts (whitespace-separated tokens) of the first 5 articles, before
# and after cleaning; columns are named full_text_len / clean_text_len
example_df = (
    df[['full_text', 'clean_text']]
    .head(5)
    .apply(lambda column: column.map(lambda text: len(text.split())))
    .add_suffix('_len')
)

# Styled table with a per-row colour gradient across the two counts
display(
    example_df.style
    .set_caption("Tabla 3: Longitud de textos originales y limpios (número de palabras)")
    .background_gradient(cmap="Blues", axis=1)
)

# Before vs. after: first 200 characters of the first 3 articles
for i, original, cleaned in df[['full_text', 'clean_text']].head(3).itertuples(name=None):
    print(f"--- Ejemplo {i+1} ---")
    print("Original:", original[:200], "...")
    print("Limpio  :", cleaned[:200], "...\n")


In [None]:
# Target vector: the 0/1 class label of each article
y = df['label']

# TF-IDF representation of the cleaned texts, with the vocabulary capped at
# 10,000 terms via max_features.
# NOTE(review): the vectorizer is fitted here on the whole corpus, i.e.
# before any train/test split is made.
tfidf = TfidfVectorizer(max_features=10000)
X_tfidf = tfidf.fit_transform(df['clean_text'])

# (documents, vocabulary terms)
print("Tamaño TF-IDF:", X_tfidf.shape)

# Entrenamiento

In [None]:
# Hold-out evaluation of logistic regression on TF-IDF features.
#
# The cleaned texts are split first and a fresh vectorizer is fitted on the
# training portion only. Splitting the pre-computed X_tfidf matrix instead
# would evaluate on features whose vocabulary and IDF weights were learned
# from the test articles as well (data leakage).
text_train, text_test, y_train, y_test = train_test_split(
    df['clean_text'], y, test_size=0.2, random_state=42, stratify=y
)

# Unfitted vectorizer with the same hyperparameters as `tfidf`
tfidf_holdout = TfidfVectorizer(**tfidf.get_params())
X_train = tfidf_holdout.fit_transform(text_train)
X_test = tfidf_holdout.transform(text_test)

# Create and train the model
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

# Predict on the held-out 20%
y_pred = clf.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Confusion matrix (rows = actual class, columns = predicted class)
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Fake','Real'], yticklabels=['Fake','Real'])
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Matriz de Confusión")
plt.show()

In [None]:
# Imported locally: make_pipeline is only needed for this evaluation
from sklearn.pipeline import make_pipeline

# 5-fold cross-validation of vectorizer + classifier as a single estimator.
# Cross-validating on the pre-computed X_tfidf matrix would reuse vocabulary
# and IDF weights learned from every fold's validation articles; with a
# pipeline the vectorizer is re-fitted on the training folds of each split.
# cross_val_score clones the estimator it receives, so the already fitted
# `clf` is left untouched.
cv_pipeline = make_pipeline(TfidfVectorizer(**tfidf.get_params()), clf)
scores = cross_val_score(cv_pipeline, df['clean_text'], y, cv=5, scoring='accuracy')
print("Accuracy CV (5 folds):", scores)
print("Mean accuracy:", np.mean(scores))

## Sentence transformers

In [None]:
# Dense sentence embeddings from a pretrained Sentence-Transformers model.
# The texts are passed as a plain list: encode() indexes its input by
# integer, which on a pandas Series is a label lookup and only works
# while the index happens to be a clean 0..n-1 range.
# NOTE(review): the model is fed the lowercased, stop-word-stripped
# `clean_text`; confirm this is intended rather than the raw `full_text`.
model = SentenceTransformer('all-MiniLM-L12-v2')
X_emb = model.encode(df['clean_text'].tolist(), batch_size=64, show_progress_bar=True)

# Same 80/20 stratified hold-out protocol as for the TF-IDF model
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X_emb, y, test_size=0.2, random_state=42, stratify=y)

# Train the model
clf_emb = LogisticRegression(max_iter=1000)
clf_emb.fit(X_train, y_train)

# Predict on the held-out 20%
y_pred = clf_emb.predict(X_test)

print("Accuracy (embeddings):", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

## Validación cruzada con embeddings

In [None]:
# Unfitted logistic regression to be cross-validated on the embeddings
clf_emb_cv = LogisticRegression(max_iter=1000)

# Accuracy on each of the 5 cross-validation folds, then their mean
scores_emb = cross_val_score(
    estimator=clf_emb_cv, X=X_emb, y=y, scoring='accuracy', cv=5
)
print("Accuracy CV embeddings (5 folds):", scores_emb)
print("Mean accuracy embeddings:", scores_emb.mean())