In [16]:
import pandas as pd
import numpy as np
import re
import string

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from sklearn.model_selection import train_test_split

from torchtext.data.utils import get_tokenizer


nltk.download('stopwords')
nltk.download('punkt')
src = "/content/drive/MyDrive/data/fake_real_news/"

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Load the data

fake_df = pd.read_csv(src + "Fake.csv")
true_df = pd.read_csv(src + "True.csv")

print("Fake news articles : " + str(len(fake_df)))
print("True news articles : " + str(len(true_df)))

fake_df["category"] = 0
true_df["category"] = 1

news_df = pd.concat([fake_df, true_df])

Fake news articles : 23481
True news articles : 21417


In [3]:
#Any blanks ?

news_df.isna().sum()

title       0
text        0
subject     0
date        0
category    0
dtype: int64

In [4]:


start_date = 2010
end_date = 2022
date_range = np.arange(start_date, end_date)
date_range

def remove_brackets(article):
  return re.sub('\[[^]]*\]', '', article)

def remove_reuters_reference(article):
  return article.split("(Reuters) - ")[-1]

def remove_urls(article):
  return re.sub(r'http\S+', '', article)

def remove_twitter_handles(article):
  return re.sub(r'\(@\S+', '', article)

def remove_twitter_pic_refs(article):
  return re.sub(r'pic.twitter.com\S+', '', article)


def add_space_between_year_and_text(article):
  for year in date_range:
    if str(year) in article:
      article = article.replace(str(year), str(year) + " ")

  return article


for idx, news_text in enumerate(news_df.text):

  article = news_text
  article = remove_brackets(article)
  article = remove_reuters_reference(article)
  article = remove_urls(article)
  article = remove_twitter_handles(article)
  article = remove_twitter_pic_refs(article)
  article = add_space_between_year_and_text(article)

  news_df.iloc[idx, 1] = article

In [31]:
#Remove stop words and punction

stop_words = stopwords.words("english")
stop_words += list(string.punctuation)

max_word_length = 0
for idx, news_text in enumerate(news_df.text):
  words = word_tokenize(news_text)
  updated_words = []
  for word in words:
    if word not in stop_words:
      updated_words.append(word)

      if max_word_length < len(updated_words):
        max_word_length = len(updated_words)

  news_df.iloc[idx, 1] = " ".join(updated_words)

In [32]:
# Check after data cleanup
# list(news_df.text)[0]


5087

# Tokens

In [48]:
tokenizer = get_tokenizer("spacy")
tokens = tokenizer(list(news_df.text)[0])
tokens

['Donald',
 'Trump',
 'just',
 'couldn',
 't',
 'wish',
 'all',
 'Americans',
 'a',
 'Happy',
 'New',
 'Year',
 'and',
 'leave',
 'it',
 'at',
 'that',
 '.',
 'Instead',
 ',',
 'he',
 'had',
 'to',
 'give',
 'a',
 'shout',
 'out',
 'to',
 'his',
 'enemies',
 ',',
 'haters',
 'and',
 ' ',
 'the',
 'very',
 'dishonest',
 'fake',
 'news',
 'media',
 '.',
 ' ',
 'The',
 'former',
 'reality',
 'show',
 'star',
 'had',
 'just',
 'one',
 'job',
 'to',
 'do',
 'and',
 'he',
 'couldn',
 't',
 'do',
 'it',
 '.',
 'As',
 'our',
 'Country',
 'rapidly',
 'grows',
 'stronger',
 'and',
 'smarter',
 ',',
 'I',
 'want',
 'to',
 'wish',
 'all',
 'of',
 'my',
 'friends',
 ',',
 'supporters',
 ',',
 'enemies',
 ',',
 'haters',
 ',',
 'and',
 'even',
 'the',
 'very',
 'dishonest',
 'Fake',
 'News',
 'Media',
 ',',
 'a',
 'Happy',
 'and',
 'Healthy',
 'New',
 'Year',
 ',',
 ' ',
 'President',
 'Angry',
 'Pants',
 'tweeted',
 '.',
 ' ',
 '2018',
 ' ',
 'will',
 'be',
 'a',
 'great',
 'year',
 'for',
 'Americ

In [None]:
#Create the train and test splits
x_train, x_test, y_train, y_test = train_test_split(news_df.text, news_df.category)