#  Imports

In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

import nltk
import re
import string
import spacy
import contractions


# Carga de datos

In [14]:
# Load the Kaggle CSVs, tag each row with its class (0 = fake, 1 = real)
# and discard the 'subject' and 'date' columns, which are not used here
df_fake = pd.read_csv("Fake.csv").assign(label=0).drop(columns=["subject", "date"])
df_real = pd.read_csv("True.csv").assign(label=1).drop(columns=["subject", "date"])

In [15]:
# Print the (rows, columns) of the fake-news frame, then display its first rows
print(df_fake.shape)
df_fake.head()

(23481, 3)


Unnamed: 0,title,text,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,0
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",0
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,0


In [16]:
# Print the (rows, columns) of the real-news frame, then display its first rows
print(df_real.shape)
df_real.head()

(21417, 3)


Unnamed: 0,title,text,label
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,1
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,1
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,1
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,1
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,1


In [17]:
# Fraction of the combined row count contributed by each class
print("Porcentaje de balanceo de clases:")
print("Fake: ", len(df_fake) / (len(df_fake) + len(df_real)))
print("Real: ", len(df_real) / (len(df_fake) + len(df_real)))

Porcentaje de balanceo de clases:
Fake:  0.5229854336496058
Real:  0.47701456635039424


In [18]:
# Title and body of the first fake article
print("Primera noticia fake:")
print("Title: ", df_fake["title"].iloc[0])
print("Text: ", df_fake["text"].iloc[0])

# Title and body of the first real article
print("Primera noticia real:")
print("Title: ", df_real["title"].iloc[0])
print("Text: ", df_real["text"].iloc[0])

Primera noticia fake:
Title:   Donald Trump Sends Out Embarrassing New Year’s Eve Message; This is Disturbing
Text:  Donald Trump just couldn t wish all Americans a Happy New Year and leave it at that. Instead, he had to give a shout out to his enemies, haters and  the very dishonest fake news media.  The former reality show star had just one job to do and he couldn t do it. As our Country rapidly grows stronger and smarter, I want to wish all of my friends, supporters, enemies, haters, and even the very dishonest Fake News Media, a Happy and Healthy New Year,  President Angry Pants tweeted.  2018 will be a great year for America! As our Country rapidly grows stronger and smarter, I want to wish all of my friends, supporters, enemies, haters, and even the very dishonest Fake News Media, a Happy and Healthy New Year. 2018 will be a great year for America!  Donald J. Trump (@realDonaldTrump) December 31, 2017Trump s tweet went down about as welll as you d expect.What kind of president se

In [19]:
# Stack both classes into a single frame; ignore_index gives the result a
# fresh 0..n-1 index instead of the repeated labels of the two source frames
df = pd.concat([df_fake, df_real], ignore_index=True)

# Shuffle the rows. The fixed random_state makes the row order (and the CSV
# saved from it) identical on every Restart & Run All.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Preview the first rows
print(df.head())

                                               title  \
0  WHATEVER HAPPENED To Trump’s Second Wife? [VIDEO]   
1  ABSOLUTE SUBMISSION: Trump Bows to Neocon Orth...   
2  LONDON’S MAYOR HAS HARSH WORDS For Our Communi...   
3  https://fedup.wpengine.com/wp-content/uploads/...   
4  Trump's top defense and homeland officials to ...   

                                                text  label  
0  It s a pretty safe bet that the press isn t ab...      0  
1  Consortium News Exclusive: In his Mideast trip...      0  
2  Our country is spinning out of control. Obama ...      0  
3  https://fedup.wpengine.com/wp-content/uploads/...      0  
4  BERLIN (Reuters) - U.S. Secretary of Defense J...      1  


In [20]:
# Check that the classes are still balanced after combining the frames
print(df["label"].value_counts())

0    23481
1    21417
Name: label, dtype: int64


In [21]:
# Write the combined dataset to disk without the index column
df.to_csv("FakeAndRealNews.csv", index=False)

# Preprocesado NLP

In [8]:
# Reload the combined dataset from the CSV written in the data-loading section
df = pd.read_csv("FakeAndRealNews.csv")

In [None]:
# Run only once to install the spaCy model.
# 'en_core_web_sm' is the model name passed to spacy.load() further down.
!python -m spacy download en_core_web_sm

In [6]:
from nltk.corpus import stopwords

# Fetch the NLTK stopword corpus and keep the English list as a set,
# which clean_function uses for membership tests
nltk.download("stopwords")
stop_words = set(stopwords.words("english"))

# spaCy English pipeline; clean_function uses it for lemmatization
nlp = spacy.load("en_core_web_sm")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\guigr\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
def clean_function(text):
    """Normalize a raw news string into lowercase, stopword-free lemmas.

    Steps, in order: lowercase, remove URLs, expand contractions, remove
    bracketed spans, remove ASCII punctuation, remove tokens containing
    digits, remove English stopwords, lemmatize with the spaCy pipeline.

    URL removal and contraction expansion run *before* punctuation is
    stripped: the URL pattern needs "://" and "." to be present, and
    contraction expansion relies on the apostrophes.

    Parameters
    ----------
    text : str
        Raw title or article body. Any non-string value (for example the
        NaN that ``pd.read_csv`` yields for an empty cell) is treated as
        empty text.

    Returns
    -------
    str
        Lemmas joined by single spaces; an empty string when nothing is left.
    """
    if not isinstance(text, str):
        return ""

    text = text.lower()  # Lowercase for uniformity
    text = re.sub(r"https?://\S+|www\.\S+", "", text)  # Remove URLs while their punctuation still exists
    text = contractions.fix(text)  # Expand contractions while apostrophes still exist
    text = text.lower()  # Expansions may reintroduce capitals (e.g. the pronoun "I")
    text = re.sub(r"\[.*?\]", "", text)  # Remove anything between square brackets
    # re.escape keeps "\", "]", "^" and "-" literal inside the character class
    text = re.sub(f"[{re.escape(string.punctuation)}]", "", text)
    text = re.sub(r"\w*\d\w*", "", text)  # Remove tokens that contain digits
    text = " ".join(word for word in text.split() if word not in stop_words)  # Remove stopwords
    text = " ".join(token.lemma_ for token in nlp(text))  # Lemmatization
    return text

# Run the first cleaning pass over the raw columns, storing each result
# in its own 'clean_*' column
for raw_col, clean_col in (("title", "clean_title"), ("text", "clean_text")):
    df[clean_col] = df[raw_col].apply(clean_function)

In [None]:
# Show the label, the raw fields and the cleaned fields of one sample row
number_new = 11
print(df.loc[number_new, 'label'])

print(df.loc[number_new, 'title'])
print(df.loc[number_new, 'text'])

print(df.loc[number_new, 'clean_title'])
print(df.loc[number_new, 'clean_text'])


Algunas comillas no se han eliminado. 

Además, quedan restos de contracciones que *contractions* no expande (por ejemplo, la `s` suelta de 's), por lo que eliminamos los tokens de longitud 1, pues no aportan valor.

Por último, nos aseguramos de tener un único espacio entre palabras.


In [70]:
def clean_function_v2(text):
    """Second cleaning pass: drop quotes, one-character tokens and extra spaces.

    Parameters
    ----------
    text : str
        Text already processed by the first cleaning pass.

    Returns
    -------
    str
        The text without straight or curly quote characters, without
        standalone one-character tokens, and with words separated by exactly
        one space (no leading or trailing whitespace).
    """
    # Remove straight and curly quote characters
    text = re.sub(r"[‘’“”\"\']", "", text)
    # Remove standalone one-character tokens. Lookarounds are used instead of
    # matching the surrounding whitespace so that consecutive tokens ("s t")
    # and tokens at the very start or end of the string are removed as well.
    text = re.sub(r"(?<!\S)\S(?!\S)", " ", text)
    # Collapse whitespace runs and trim both ends
    text = re.sub(r"\s+", " ", text).strip()
    return text

# Run the second cleaning pass in place over both cleaned columns
for col in ("clean_title", "clean_text"):
    df[col] = df[col].apply(clean_function_v2)

In [72]:
# Show the label, the raw fields and the cleaned fields of one sample row
number_new = 25000
print(df.at[number_new, 'label'])

print(df.at[number_new, 'title'])
print(df.at[number_new, 'text'])

print(df.at[number_new, 'clean_title'])
print(df.at[number_new, 'clean_text'])

0
SHE GREW UP BELIEVING BLACKS Could Only Support Democrats…Until She Took A Job With ACORN: WATCH The INCREDIBLE Story Of A Woman Who Took On Obama’s LEFTIST MACHINE [VIDEO]
Keep your eye on Anita Moncreif If knowledge is power she is the Democrat Party s worst nightmare. When you re on the left, and all of your friends are leftists, and your parents are leftists, you don t hang around with other people, and you only get the view of folks as what you see on TV, and how they present it to you. And you guys are seen as racist, angry people. Every time they get a chance, that s the image they push out there on TV. They try to find that one crazy Tea Party person and they try to get them to say something, and they make sure they play it on all the black stations. And you see that and you say,  Okay, these people are nuts.  So I didn t expect to find any kind of support from the Right. Everything Anita Moncreif believed to be true about the Left changed when she took a job with ACORN and q

In [None]:
# Remove the raw 'title' and 'text' columns from the frame
df.drop(columns=['title', 'text'], inplace=True)

# Display the first rows of the DataFrame
df.head()

Unnamed: 0,label,clean_title,clean_text
0,0,whatever happen trump second wife,pretty safe bet press able reveal bad blood do...
1,0,absolute submission trump bow neocon orthodoxy,consortium news exclusive mideast trip saudi a...
2,0,london mayor harsh word community organizer ch...,country spin control obama orchestrate effort ...
3,0,,
4,1,trump top defense homeland official attend mun...,berlin reuter us secretary defense james matti...


Por ejemplo, la noticia 3 era sólo una URL. Eliminemos las filas vacías.

In [77]:
# Keep only the rows whose cleaned text is not blank, and report how many
# rows were removed
filas_antes = len(df)
tiene_texto = df['clean_text'].str.strip().ne('')
df = df.loc[tiene_texto]
filas_despues = len(df)
filas_eliminadas = filas_antes - filas_despues
print(f"Se han eliminado {filas_eliminadas} filas.")

# Display the first rows after the empty ones are gone
df.head()

Se han eliminado 705 filas.


Unnamed: 0,label,clean_title,clean_text
0,0,whatever happen trump second wife,pretty safe bet press able reveal bad blood do...
1,0,absolute submission trump bow neocon orthodoxy,consortium news exclusive mideast trip saudi a...
2,0,london mayor harsh word community organizer ch...,country spin control obama orchestrate effort ...
4,1,trump top defense homeland official attend mun...,berlin reuter us secretary defense james matti...
5,1,support brazil pension reform organize lawmaker,brasiliario de janeiro reuters government braz...


In [78]:
# Check that the classes are still balanced after removing the empty rows
print(df["label"].value_counts())

0    22777
1    21416
Name: label, dtype: int64


In [79]:
# Save the cleaned DataFrame to a CSV file, without the index column
df.to_csv("CleanedFakeAndRealNews.csv", index=False)

# Vectorización