In [4]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import Callback

from sklearn.model_selection import train_test_split


In [5]:
import string

from nltk.corpus import stopwords
STOPWORDS = set(stopwords.words('english'))

import plotly
plotly.offline.init_notebook_mode(connected=True)
import plotly.graph_objects as go

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [6]:
legit, fake = pd.read_csv('True.csv'), pd.read_csv('Fake.csv')
legit.sample(10)

Unnamed: 0,title,text,subject,date
1116,"Searching for Trump card, Democrats watch Virg...","RICHMOND, Va. (Reuters) - Speaking as a folksy...",politicsNews,"October 20, 2017"
1887,"In North Dakota, Trump finds Democrat willing ...",WASHINGTON (Reuters) - U.S. President Donald T...,politicsNews,"September 6, 2017"
7379,Timeline: Pivotal moments in Trump's president...,WASHINGTON (Reuters) - Donald Trump’s successf...,politicsNews,"November 9, 2016"
962,Defense rests in Democratic Senator Menendez's...,NEW YORK (Reuters) - The defense in Democratic...,politicsNews,"October 30, 2017"
13603,Fractured French Right struggles to unite agai...,"PARIS (Reuters) - A year ago, France s conserv...",worldnews,"November 29, 2017"
16561,Britain aiming for outline Brexit transition d...,LONDON (Reuters) - Britain wants an outline ag...,worldnews,"October 25, 2017"
18956,IAEA chief calls for clarity on disputed secti...,neVIENNA (Reuters) - The U.N. nuclear watchdog...,worldnews,"September 26, 2017"
5276,Experts dispute Trump's assertion that U.S. nu...,WASHINGTON (Reuters) - While President Donald ...,politicsNews,"February 24, 2017"
11373,EU questions Russia's 2018 vote after Navalny ...,BRUSSELS (Reuters) - The Russian authorities ...,worldnews,"December 26, 2017"
7010,Trump expected to pick investor Wilbur Ross as...,WASHINGTON (Reuters) - U.S. President-elect Do...,politicsNews,"November 29, 2016"


In [7]:
legit['target'] = 1
fake['target'] = 0
data = pd.concat([legit, fake], axis=0)
data.sample(10)

Unnamed: 0,title,text,subject,date,target
14412,MAINSTREAM MEDIA STANDS DOWN: Fire Alarm Pulle...,Free speech is under attack in America like ne...,politics,"Feb 26, 2016",0
5833,Hypocrites: Republican National Convention Wi...,As Republicans stand firm about doing nothing ...,News,"June 17, 2016",0
3887,Trump Gets HILARIOUSLY Mocked For Looking Ove...,Donald Trump showed up to vote in New York Cit...,News,"November 8, 2016",0
2571,JK Rowling Is Absolutely Destroying Piers Mor...,Washed up television personality Piers Morgan ...,News,"February 11, 2017",0
2130,Judge Uses Trump’s OWN WORDS To Prove ‘Travel...,"In the second time in less than two months, Tr...",News,"March 15, 2017",0
12975,Exclusive: Saudi-led blockade cuts fuel lifeli...,LONDON (Reuters) - No fuel shipments have reac...,worldnews,"December 6, 2017",1
7976,U.S. Congress passes funding bill; averts gove...,WASHINGTON (Reuters) - The U.S. Congress on We...,politicsNews,"September 29, 2016",1
22632,GOD SQUAD: Jury Finds Polygamous Mormon Towns ...,Jamie Ross Courthouse News ServicePHOENIX Tw...,US_News,"March 12, 2016",0
11389,"For Iraq's Christians, a bittersweet first Chr...","TELESKOF, Iraq (Reuters) - Inside the newly re...",worldnews,"December 24, 2017",1
3133,Trump Gets Stomped By Twitter After DEMANDING...,"Uh oh, you guys, President Obama is in trouble...",News,"January 6, 2017",0


In [8]:
data.isnull().sum()

title      0
text       0
subject    0
date       0
target     0
dtype: int64

In [9]:
# Hyperparameters for title and text
vocab_size = 100000
embedding_dim_title = 128
max_length_title = 40
embedding_dim_text = 500
max_length_text = 500
trunc_type = 'post'
padding_type = 'post'
oov_tok = '<OOV>'
test_ratio = .3
embedding_dim = 500
max_length_text = 500

In [10]:
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)

    # instantiate a distribution strategy
    tpu_strategy = tf.distribute.experimental.TPUStrategy(tpu)
    TPU_EXIST = True
except Exception as e:
    print(e)
    TPU_EXIST = False

Please provide a TPU Name to connect to.


In [None]:
# DATA CLEANING

In [11]:
# Text cleaning
def clean(text):
    #1. Remove punctuation
    translator1 = str.maketrans(string.punctuation, ' '*len(string.punctuation))
    text = text.translate(translator1)
    
    #2. Convert to lowercase characters
    text = text.lower()
    
    #3. Remove stopwords
    text = ' '.join([word for word in text.split() if word not in STOPWORDS])
    
    return text

In [12]:
# Apply cleaning to title and text in dataset
data['title'] = data['title'].apply(clean)
data['text'] = data['text'].apply(clean)
data.sample(10)

Unnamed: 0,title,text,subject,date,target
6398,here’s conservatives silent 2016 cop killer data,police country great run let face remember 197...,News,"May 13, 2016",0
8273,israeli trump supporters open campaign office ...,jerusalem reuters israeli supporters u republi...,politicsNews,"September 5, 2016",1
14091,joke week dear clueless…,boom dear abby husband liar cheat cheated begi...,politics,"Apr 18, 2016",0
15847,jihad dummies us army enlistee mohammed abdull...,note little johnny might want consider using d...,politics,"Apr 10, 2015",0
14276,huckabee ‘hillary ride life” trump…she bill wo...,reason sitting chair tonight stage like everyo...,politics,"Mar 16, 2016",0
22952,ep 17 patrick henningsen live – ‘parallax poli...,join patrick every wednesday independent talk ...,Middle-east,"March 16, 2017",0
22867,boiler room ep 114 – psychos compromised media,tune alternate current radio network acr anoth...,Middle-east,"June 24, 2017",0
17369,obama’s federalization police force baltimore ...,happened go baltimore mayor facebook page saw ...,Government News,"May 6, 2015",0
10172,role tech set clinton server unknown bosses state,new york reuters soon hillary clinton’s arriva...,politicsNews,"March 25, 2016",1
3515,merkel minister stress u ties critical trump t...,berlin reuters u president donald trump called...,politicsNews,"May 30, 2017",1


In [None]:
# Data preprocessing, model creation and testing functions

In [13]:
def preprocessing(data, dependent_column=None, target='target', max_len=40):
    train_X, test_X, train_y, test_y = train_test_split(data[dependent_column], data[target], test_size=test_ratio)
    tokenizer = Tokenizer(num_words=vocab_size,
                          oov_token=oov_tok)
    tokenizer.fit_on_texts(train_X)
    train_sequences = tokenizer.texts_to_sequences(train_X)
    train_padded = pad_sequences(train_sequences, maxlen=max_len,
                                padding=padding_type,
                                truncating=trunc_type)
    test_sequences = tokenizer.texts_to_sequences(test_X)
    test_padded = pad_sequences(test_sequences, maxlen=max_len,
                               padding=padding_type,
                               truncating=trunc_type)
    return train_padded, test_padded, train_y, test_y

In [14]:
# Create Model
def model_creation(vocab_size=vocab_size, embedding_dim=128):
    if TPU_EXIST:
        with tpu_strategy.scope():
            model = tf.keras.Sequential()
            model.add(tf.keras.layers.Embedding(vocab_size, embedding_dim))
            model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(embedding_dim)))
            model.add(tf.keras.layers.Dense(embedding_dim, activation='relu'))
            model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
            model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
    else:
        model = tf.keras.Sequential()
        model.add(tf.keras.layers.Embedding(vocab_size, embedding_dim))
        model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(embedding_dim)))
        model.add(tf.keras.layers.Dense(embedding_dim, activation='relu'))
        model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
    return model

In [15]:
def train_model(model, train_X, train_Y, test_X, test_Y, epochs):
    class CustomCallback(Callback):
        def on_epoch_end(self, epoch, logs={}):
            if logs.get('acc') > 0.99:
                print(f'Accuracy reached {logs.get("acc")*100:0.2f}. Stopping the training')
                self.model.stop_training = True

    history = model.fit(train_X, train_Y,
                       epochs=epochs,
                       batch_size=64,
                       validation_data=[test_X, test_Y],
                       callbacks=[CustomCallback()])
    return history

In [None]:
# Case 1: Using news title to train and validate predictor model

In [19]:
train_padded, test_padded, train_y, test_y = preprocessing(data, dependent_column='title', max_len=max_length_title)
model = model_creation(embedding_dim=embedding_dim_title)
history_title = train_model(model, train_padded, train_y, test_padded, test_y, 15)

Train on 31428 samples, validate on 13470 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15


In [20]:
train_padded, test_padded, train_y, test_y = preprocessing(data, dependent_column='text', max_len=max_length_title)
model = model_creation(embedding_dim=embedding_dim_text)
history_text = train_model(model, train_padded, train_y, test_padded, test_y, 15)

Train on 31428 samples, validate on 13470 samples
Epoch 1/15


In [21]:
# Plotting accuracies of above two cases
title_max_acc = max(history_title.history.get('acc'))
text_max_acc = max(history_text.history.get('acc'))

fig = go.Figure()
fig.add_trace(go.Scatter(x=['Title', 'Text'],
                        y=[title_max_acc,
                          text_max_acc],
                        mode='lines+markers',
                        name='Accuracies of Models'))
fig.update_layout(title='Accuracies Differences',
                 xaxis_title='Case Name',
                 yaxis_title='Accuracy of Model')
fig.show()