In [5]:
import pandas as pd
import numpy as np
import re
import string

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from sklearn.model_selection import train_test_split

from torchtext.data.utils import get_tokenizer


nltk.download('stopwords')
nltk.download('punkt')
src = "/content/drive/MyDrive/data/fake_real_news/"

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


# Data cleanup

In [91]:
# Load the data

fake_df = pd.read_csv(src + "Fake.csv")
true_df = pd.read_csv(src + "True.csv")

print("Fake news articles : " + str(len(fake_df)))
print("True news articles : " + str(len(true_df)))

fake_df["category"] = 0
true_df["category"] = 1

news_df = pd.concat([fake_df, true_df])

Fake news articles : 23481
True news articles : 21417


In [92]:
#Any blanks ?
news_df.isna().sum()

title       0
text        0
subject     0
date        0
category    0
dtype: int64

In [93]:
news_df["text"] = news_df["title"] + " " + news_df["text"]

In [94]:
# Removing unnessary text

start_date = 2010
end_date = 2022
date_range = np.arange(start_date, end_date)
date_range

def remove_brackets(article):
  return re.sub('\[[^]]*\]', '', article)

def remove_reuters_reference(article):
  return article.split("(Reuters) - ")[-1]

def remove_urls(article):
  return re.sub(r'http\S+', 'URL', article)

def remove_twitter_handles(article):
  return re.sub(r'\(@\S+', '', article)

def remove_twitter_pic_refs(article):
  return re.sub(r'pic.twitter.com\S+', '', article)


def add_space_between_year_and_text(article):
  for year in date_range:
    if str(year) in article:
      article = article.replace(str(year), str(year) + " ")

  return article



punctuations = string.punctuation

for idx, news_text in enumerate(news_df.text):

  article = news_text.lower()
  article = remove_brackets(article)
  article = remove_reuters_reference(article)
  article = remove_urls(article)
  article = remove_twitter_handles(article)
  article = remove_twitter_pic_refs(article)
  article = add_space_between_year_and_text(article)
  
  #Replacing the punctuations with spaces
  article = re.sub(r'[^\w\s]',' ',article)

  news_df.iloc[idx, 1] = article

In [97]:
#Remove stop words and punction

stop_words = stopwords.words("english")

max_word_length = 0
for idx, news_text in enumerate(news_df.text):
  words = word_tokenize(news_text)
  updated_words = []
  for word in words:
    if word not in stop_words:
      updated_words.append(word)

      if max_word_length < len(updated_words):
        max_word_length = len(updated_words)

  news_df.iloc[idx, 1] = " ".join(updated_words)


print(max_word_length)

5083


In [98]:
# news_df.isna().sum()
indices, blanks = [], []
for idx, x in enumerate(list(news_df['text'])):
  if x == " ":
    indices.append(idx)

len(indices)

0

In [99]:
#Save the cleanedup data in csv
updated_news_df = pd.DataFrame({"text" : list(news_df["text"]), 
                                "category" : list(news_df["category"])})
updated_news_df.to_csv(src + "updated_news.csv", index=False)

# Tokens

In [26]:

#Original work presented by the below link
#I have referenced this work to develop this solution
#Kindly read the below solution and upvote it as well
# https://www.kaggle.com/rushinaik/mission-torch-1

import sys
import random
import torch
from torchtext.legacy import data
import torch.nn as nn
import torch.optim as optim


from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 

SEED = 2021

In [27]:
news_df = pd.read_csv(src + 'updated_news.csv')

news_df.head()

Unnamed: 0,text,category
0,donald trump sends embarrassing new year eve m...,0
1,drunk bragging trump staffer started russian c...,0
2,sheriff david clarke becomes internet joke thr...,0
3,trump obsessed even obama name coded website i...,0
4,pope francis called donald trump christmas spe...,0


In [28]:
#Train and test split
# news_text = []
# for idx, news_values in enumerate(news_df.text):
#    news_text.append(news_values)

x_train, x_test, y_train, y_test = train_test_split(news_df['text'], news_df['category'], test_size=0.3)

In [29]:
# vectorizer for text field
vectorizer = TfidfVectorizer(ngram_range=(1,1), max_features=10000)

# Vectorize the training text to prepare for Pytorch model
x_train = vectorizer.fit_transform(x_train)

# Vectorize testing text.
x_test = vectorizer.transform(x_test)

In [30]:
x_train = torch.tensor(x_train.todense()).float()
x_test = torch.tensor(x_test.todense()).float()

In [31]:
y_train = torch.tensor(y_train.values)
y_test = torch.tensor(y_test.values)

# Model Definition

In [38]:
model = nn.Sequential(
                nn.Linear(x_train.shape[1],128),
                nn.ReLU(),
                nn.Dropout(0.1),
                nn.Linear(128, news_df['category'].nunique()),
                nn.LogSoftmax(dim=1)
)

model

Sequential(
  (0): Linear(in_features=10000, out_features=128, bias=True)
  (1): ReLU()
  (2): Dropout(p=0.1, inplace=False)
  (3): Linear(in_features=128, out_features=2, bias=True)
  (4): LogSoftmax(dim=1)
)

# Optimizer

In [39]:
# defining the loss 
criterion = nn.NLLLoss()


# Optimizers require the parameters to optimize and a learning rate
optimizer = optim.Adam(model.parameters(), lr=0.002)

In [40]:
train_losses, test_losses, accuracies = [], [], []

epochs = 180


x_train = x_train.to(device)
y_train = y_train.to(device)

x_test = x_test.to(device)
y_test = y_test.to(device)


model = model.to(device)


for epoch in range(epochs):

    model.train()

    optimizer.zero_grad()

    output = model(x_train)
    loss = criterion(output, y_train)
    
    loss.backward()
    train_loss = loss.item()
    train_losses.append(train_loss)
    
    optimizer.step()
        
    
    with torch.set_grad_enabled(True):
        model.eval()

        log_ps = model(x_test)
        test_loss = criterion(log_ps, y_test)
        test_losses.append(test_loss)

        ps = torch.exp(log_ps)
        top_p, top_class = ps.topk(1, dim=1)
        equals = top_class == y_test.view(*top_class.shape)
        test_accuracy = torch.mean(equals.float())
        accuracies.append(test_accuracy)


    if (epoch + 1) % 10 == 0:

        print("Epoch:  {} / {} ".format(epoch + 1, epochs))
        print("Training Loss: {:.3f}".format(train_loss))
        print("Test Loss: {:.3f}".format(test_loss))
        print("Test Accuracy: {:.3f}".format(test_accuracy))
        print("-" * 20)

Epoch:  10 / 180 
Training Loss: 0.492
Test Loss: 0.467
Test Accuracy: 0.925
--------------------
Epoch:  20 / 180 
Training Loss: 0.256
Test Loss: 0.245
Test Accuracy: 0.941
--------------------
Epoch:  30 / 180 
Training Loss: 0.139
Test Loss: 0.144
Test Accuracy: 0.957
--------------------
Epoch:  40 / 180 
Training Loss: 0.086
Test Loss: 0.101
Test Accuracy: 0.969
--------------------
Epoch:  50 / 180 
Training Loss: 0.059
Test Loss: 0.081
Test Accuracy: 0.975
--------------------
Epoch:  60 / 180 
Training Loss: 0.043
Test Loss: 0.069
Test Accuracy: 0.977
--------------------
Epoch:  70 / 180 
Training Loss: 0.032
Test Loss: 0.063
Test Accuracy: 0.979
--------------------
Epoch:  80 / 180 
Training Loss: 0.025
Test Loss: 0.059
Test Accuracy: 0.980
--------------------
Epoch:  90 / 180 
Training Loss: 0.020
Test Loss: 0.056
Test Accuracy: 0.981
--------------------
Epoch:  100 / 180 
Training Loss: 0.016
Test Loss: 0.055
Test Accuracy: 0.981
--------------------
Epoch:  110 / 180 
