In [1]:
from platform import python_version

print(python_version())

3.7.6


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
import gensim
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score

In [4]:
import re

In [5]:
import nltk
nltk.download("stopwords")
nltk.download("punkt")  # Punkt Sentence Tokenizer
nltk.download("averaged_perceptron_tagger")  # Part of Speech Tagger
nltk.download("wordnet")  # a lexical database of English; useful for synonyms, hyponyms, etc.

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\MondayPC\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\MondayPC\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\MondayPC\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\MondayPC\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [6]:
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist

In [7]:
import random

In [8]:
stop = set(stopwords.words('english'))
random.sample(stopwords.words('english'), 10)

['up',
 'needn',
 'won',
 'shan',
 "hadn't",
 'here',
 'doesn',
 'have',
 'am',
 'having']

# Loading Data

In [9]:
CUR_DATASET = "FakeNews"

In [10]:
train_dataset = pd.read_csv(f"./data/{CUR_DATASET}/train.csv.zip")
test_dataset = pd.read_csv(f"./data/{CUR_DATASET}/test.csv.zip")

In [11]:
train_dataset.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [12]:
labels = train_dataset["label"].values

# Data Preprocessing and Data Preparation

In [13]:
train_dataset = train_dataset.fillna("null data")
test_dataset = test_dataset.fillna("null data")

### Removing stopwords

In [14]:
stop_words = stopwords.words('english')

In [15]:
if_stopwords = True

In [16]:
if if_stopwords:
    train_dataset["title"] = train_dataset["title"].str.lower().str.replace("’", "'")
    train_dataset["title"] = train_dataset["title"].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))
    train_dataset["text"] = train_dataset["text"].str.lower().str.replace("’", "'")
    train_dataset["text"] = train_dataset["text"].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))

In [17]:
train_dataset.head()

Unnamed: 0,id,title,author,text,label
0,0,house dem aide: even see comey's letter jason ...,Darrell Lucus,house dem aide: even see comey's letter jason ...,1
1,1,"flynn: hillary clinton, big woman campus - bre...",Daniel J. Flynn,ever get feeling life circles roundabout rathe...,0
2,2,truth might get fired,Consortiumnews.com,"truth might get fired october 29, 2016 tension...",1
3,3,15 civilians killed single us airstrike identi...,Jessica Purkiss,videos 15 civilians killed single us airstrike...,1
4,4,iranian woman jailed fictional unpublished sto...,Howard Portnoy,print iranian woman sentenced six years prison...,1


### Preprocessing

In [18]:
preprocessing_text_fn = {
    "no_punctuation": lambda txt: re.sub(r'[^\w\s]','', txt),
    "no_special_symbols": lambda txt: re.sub('[$,#,&]', '', txt),
    "no_digits": lambda txt: re.sub('\d*', '', txt),
    "no_www": lambda txt: re.sub('w{3}', '', txt),
    "no_urls": lambda txt: re.sub('http\S+', '', txt),
    "no_spaces": lambda txt: re.sub('\s+', ' ', txt),
    "no_single_chars": lambda txt: re.sub(r'\s+[a-zA-Z]\s+', '', txt)
}

In [19]:
def preprocess_text(text, pipeline = preprocessing_text_fn):
    text = str(text)
    for fn in pipeline.keys():
        text = pipeline[fn](text)
        
    return text

In [20]:
STOP_WORDS = [preprocessing_text_fn["no_punctuation"](word) for word in stop_words]
random.sample(stop_words, 20)

["she's",
 'from',
 "aren't",
 'after',
 'shouldn',
 'same',
 'it',
 'their',
 'in',
 'until',
 'needn',
 'them',
 "you'd",
 'doing',
 'aren',
 "doesn't",
 'me',
 'any',
 'having',
 'how']

In [21]:
def tokenize_without_stopwords(text, stop_words=STOP_WORDS):
    word_tokens = word_tokenize(text)
    filtered_sequence = [word for word in word_tokens if not word.lower() in stop_words]
    return filtered_sequence

In [22]:
train_dataset["title"] = train_dataset["title"].apply(preprocess_text)
train_dataset["text"] = train_dataset["text"].apply(preprocess_text)
train_dataset.head(10)

Unnamed: 0,id,title,author,text,label
0,0,house dem aide even see comeys letter jason ch...,Darrell Lucus,house dem aide even see comeys letter jason ch...,1
1,1,flynn hillary clinton big woman campus breitbart,Daniel J. Flynn,ever get feeling life circles roundabout rathe...,0
2,2,truth might get fired,Consortiumnews.com,truth might get fired october tension intellig...,1
3,3,civilians killed single us airstrike identified,Jessica Purkiss,videos civilians killed single us airstrike id...,1
4,4,iranian woman jailed fictional unpublished sto...,Howard Portnoy,print iranian woman sentenced six years prison...,1
5,5,jackie mason hollywood would love trump bombed...,Daniel Nussbaum,trying times jackie mason voice reason in week...,0
6,6,life life luxury elton johns favorite shark pi...,null data,ever wonder britains iconic pop pianist gets l...,1
7,7,benoît hamon wins french socialist partys pres...,Alissa J. Rubin,paris france chose idealistic traditional cand...,0
8,8,excerpts draft script donald trumps qampa blac...,null data,donaldtrump scheduled make highly anticipated ...,0
9,9,backchannel plan ukraine russia courtesy trump...,Megan Twohey and Scott Shane,week michaelflynn resigned national security a...,0


In [23]:
test_dataset["title"] = test_dataset["title"].apply(preprocess_text)
test_dataset["text"] = test_dataset["text"].apply(preprocess_text)
test_dataset.head(10)

Unnamed: 0,id,title,author,text
0,20800,Specter of Trump Loosens Tongues if Not Purse ...,David Streitfeld,PALO ALTO Calif After years of scorning the po...
1,20801,Russian warships ready to strike terrorists ne...,null data,Russian warships ready to strike terrorists ne...
2,20802,NoDAPL Native American Leaders Vow to Stay All...,Common Dreams,Videos NoDAPL Native American Leaders Vow to S...
3,20803,Tim Tebow Will Attempt Another Comeback This T...,Daniel Victor,If at first you dont succeed trydifferent spor...
4,20804,Keiser Report Meme Wars E,Truth Broadcast Network,mins ago Views Comments Likes For the first t...
5,20805,Trump is USAs antique hero Clinton will be nex...,null data,Trump is USAs antique hero Clinton will be nex...
6,20806,Pelosi Calls for FBI Investigation to Find Out...,Pam Key,Sunday on NBCs Meet the Press House Minority L...
7,20807,Weekly Featured Profile Randy Shannon,Trevor Loudon,You are here Home Articles of the Bound Weekly...
8,20808,Urban Population Booms Will Make Climate Chang...,null data,Urban Population Booms Will Make Climate Chang...
9,20809,null data,cognitive dissident,dont we have the receipt


### Lemmatization and Stemming

In [24]:
if_lemmatize = True

In [25]:
if if_lemmatize:
    
    import nltk
    nltk.download('wordnet')
    nltk.download('omw-1.4')
    from nltk.stem import WordNetLemmatizer
    
    wnl = WordNetLemmatizer()
    
    train_dataset["title"] = train_dataset["title"].str.lower().str.replace("’", "'")
    train_dataset["title"] = train_dataset["title"].apply(lambda x: ' '.join([wnl.lemmatize(word) for word in word_tokenize(x)]))
    train_dataset["text"] = train_dataset["text"].str.lower().str.replace("’", "'")
    train_dataset["text"] = train_dataset["text"].apply(lambda x: ' '.join([wnl.lemmatize(word) for word in word_tokenize(x)]))
    

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\MondayPC\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\MondayPC\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [26]:
train_text = (train_dataset['title'] + " " + train_dataset['text']).values
test_text = (test_dataset['title'] + " " + test_dataset['text']).values

In [27]:
labels = train_dataset['label'].values

## Embedding with word2vec

In [28]:
# Limit length of each article
max_length = 3000
lengths = np.array([len(x) for x in train_text])
train_text = train_text[lengths < max_length]
labels = labels[lengths < max_length]

In [29]:
# Check actual max length of an article
article_length = max(np.array([len(x) for x in train_text]))
article_length

2999

In [30]:
# Word2vec model with vector size = 100
vec_size = 100

# workers - number of CPU threads
word_model = gensim.models.Word2Vec(train_text, vector_size = vec_size, window = 5, workers = 12)
word_model.train(train_text, epochs = 10, total_examples = len(train_text))
wv = word_model.wv

In [31]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_text)
vocabulary_size = len(tokenizer.word_index) + 1
encoded_articles = tokenizer.texts_to_sequences(train_text)

In [32]:
padded_articles = pad_sequences(encoded_articles, maxlen = article_length, padding='post')

In [33]:
padded_articles.shape

(12347, 2999)

In [34]:
emb_matrix = np.zeros(shape=(vocabulary_size, vec_size))
for w, i in tokenizer.word_index.items():
    ind = wv.has_index_for(w)
    if ind:
        emb_matrix[i] = wv.get_vector(w)

In [35]:
x_train, x_test, y_train, y_test = train_test_split(padded_articles, labels, test_size=0.25)

## LSTM model

In [36]:
from keras.initializers import Constant
from keras.optimizers import Adam
from keras.models import Sequential
from keras.layers import Dense, Flatten, Embedding, LSTM
from keras.layers import ReLU
from keras.layers import Dropout
model=Sequential()
model.add(Embedding(input_dim = vocabulary_size, 
                    output_dim = vec_size,
                    input_length = article_length,
                    embeddings_initializer = Constant(emb_matrix))
         )
model.add(LSTM(32, return_sequences=True))
model.add(Flatten())
model.add(Dense(128,activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(16,activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))

In [37]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 2999, 100)         10193000  
                                                                 
 lstm (LSTM)                 (None, 2999, 32)          17024     
                                                                 
 flatten (Flatten)           (None, 95968)             0         
                                                                 
 dense (Dense)               (None, 128)               12284032  
                                                                 
 dropout (Dropout)           (None, 128)               0         
                                                                 
 dense_1 (Dense)             (None, 16)                2064      
                                                                 
 dropout_1 (Dropout)         (None, 16)                0

In [38]:
import keras.backend as K
def keras_f1_score(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    recall = true_positives / (possible_positives + K.epsilon())
    f1_val = 2*(precision*recall)/(precision+recall+K.epsilon())
    return f1_val

In [39]:
model.compile(optimizer=Adam(),loss='binary_crossentropy', metrics=['accuracy', keras_f1_score])

## Training

In [None]:
epochs=5
batch_size=128

In [41]:
model.fit(x_train, y_train, epochs=epochs, batch_size=batch_size, validation_data=(x_test,y_test))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x23e5bc71288>

## Metrics

In [42]:
from sklearn.metrics import f1_score
def accuracy(predictions, labels):
    predictions[predictions >= 0.5] = 1
    predictions[predictions < 0.5] = 0
    return accuracy_score(labels, predictions)

def flat_f1_score(predictions, labels):
    predictions[predictions >= 0.5] = 1
    predictions[predictions < 0.5] = 0
    return f1_score(labels, predictions)

## LIAR dataset tests

In [43]:
liar_dataset_train = pd.read_csv('./data/LIAR/train.tsv', sep='\t', header = None)
liar_dataset_test = pd.read_csv('./data/LIAR/test.tsv', sep='\t', header = None)
liar_dataset_valid = pd.read_csv('./data/LIAR/valid.tsv', sep='\t', header = None)
liar_dataset = pd.concat([liar_dataset_train, liar_dataset_test, liar_dataset_valid], axis = 0).reset_index(drop = True)

In [44]:
liar_dataset = liar_dataset.iloc[:, [1, 2]]
liar_dataset = liar_dataset.rename(columns = {1: 'label', 2: 'statements'})

In [45]:
liar_dataset['label'] = liar_dataset['label'].replace({
    'false' : False,
    'barely-true' : False,
    'pants-fire' : False,
    'half-true' : True,
    'mostly-true' : True,
    'true' : True
})

In [46]:
liar_label = liar_dataset['label']

In [47]:
encoded_liar_text = tokenizer.texts_to_sequences(liar_dataset['statements'])
padded_liar_text = pad_sequences(encoded_liar_text, maxlen = article_length, padding='post')

### Prediction on test dataset (without training on it)

In [48]:
pred = model.predict(padded_liar_text, batch_size)



In [49]:
accuracy(pred, liar_label)

0.5391290751309514

In [52]:
flat_f1_score(pred, liar_label)

0.6894262683736369