In [1]:
from platform import python_version

print(python_version())

3.7.6


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
import gensim
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score

In [4]:
import re

In [5]:
import nltk
nltk.download("stopwords")
nltk.download("punkt")  # Punkt Sentence Tokenizer
nltk.download("averaged_perceptron_tagger")  # Part of Speech Tagger
nltk.download("wordnet")  # a lexical database of English; useful for synonyms, hyponyms, etc.

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\MondayPC\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\MondayPC\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\MondayPC\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\MondayPC\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [6]:
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist

In [7]:
import random

In [8]:
stop = set(stopwords.words('english'))
random.sample(stopwords.words('english'), 10)

['having',
 'being',
 'too',
 'such',
 'and',
 'but',
 'each',
 'doing',
 'after',
 'their']

# Loading Data

In [9]:
CUR_DATASET = "ISOT"

In [10]:
train_dataset_fake = pd.read_csv(f"./data/{CUR_DATASET}/Fake.csv.zip")
train_dataset_fake['label'] = 0
train_dataset_true = pd.read_csv(f"./data/{CUR_DATASET}/True.csv.zip")
train_dataset_true['label'] = 1
train_dataset = pd.concat([train_dataset_fake, train_dataset_true])

In [11]:
train_dataset.head()

Unnamed: 0,title,text,subject,date,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",0
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",0
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",0


In [12]:
labels = train_dataset["label"].values

# Data Preprocessing and Data Preparation

In [13]:
train_dataset = train_dataset.fillna("null data")

### Removing stopwords

In [14]:
stop_words = stopwords.words('english')

In [15]:
if_stopwords = True

In [16]:
if if_stopwords:
    train_dataset["title"] = train_dataset["title"].str.lower().str.replace("’", "'")
    train_dataset["title"] = train_dataset["title"].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))
    train_dataset["text"] = train_dataset["text"].str.lower().str.replace("’", "'")
    train_dataset["text"] = train_dataset["text"].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))

In [17]:
train_dataset.head()

Unnamed: 0,title,text,subject,date,label
0,donald trump sends embarrassing new year's eve...,donald trump wish americans happy new year lea...,News,"December 31, 2017",0
1,drunk bragging trump staffer started russian c...,house intelligence committee chairman devin nu...,News,"December 31, 2017",0
2,sheriff david clarke becomes internet joke thr...,"friday, revealed former milwaukee sheriff davi...",News,"December 30, 2017",0
3,trump obsessed even obama's name coded website...,"christmas day, donald trump announced would ba...",News,"December 29, 2017",0
4,pope francis called donald trump christmas speech,pope francis used annual christmas day message...,News,"December 25, 2017",0


### Preprocessing

In [18]:
preprocessing_text_fn = {
    "no_punctuation": lambda txt: re.sub(r'[^\w\s]','', txt),
    "no_special_symbols": lambda txt: re.sub('[$,#,&]', '', txt),
    "no_digits": lambda txt: re.sub('\d*', '', txt),
    "no_www": lambda txt: re.sub('w{3}', '', txt),
    "no_urls": lambda txt: re.sub('http\S+', '', txt),
    "no_spaces": lambda txt: re.sub('\s+', ' ', txt),
    "no_single_chars": lambda txt: re.sub(r'\s+[a-zA-Z]\s+', '', txt)
}

In [19]:
def preprocess_text(text, pipeline = preprocessing_text_fn):
    text = str(text)
    for fn in pipeline.keys():
        text = pipeline[fn](text)
        
    return text

In [20]:
STOP_WORDS = [preprocessing_text_fn["no_punctuation"](word) for word in stop_words]
random.sample(stop_words, 20)

['both',
 'its',
 "hasn't",
 "shan't",
 'ma',
 'during',
 'other',
 'by',
 'than',
 'myself',
 'while',
 'only',
 'against',
 'ourselves',
 'same',
 "don't",
 'after',
 'doing',
 'any',
 'up']

In [21]:
def tokenize_without_stopwords(text, stop_words=STOP_WORDS):
    word_tokens = word_tokenize(text)
    filtered_sequence = [word for word in word_tokens if not word.lower() in stop_words]
    return filtered_sequence

In [22]:
train_dataset["title"] = train_dataset["title"].apply(preprocess_text)
train_dataset["text"] = train_dataset["text"].apply(preprocess_text)
train_dataset.head(10)

Unnamed: 0,title,text,subject,date,label
0,donald trump sends embarrassing new years eve ...,donald trump wish americans happy new year lea...,News,"December 31, 2017",0
1,drunk bragging trump staffer started russian c...,house intelligence committee chairman devin nu...,News,"December 31, 2017",0
2,sheriff david clarke becomes internet joke thr...,friday revealed former milwaukee sheriff david...,News,"December 30, 2017",0
3,trump obsessed even obamas name coded website ...,christmas day donald trump announced would bac...,News,"December 29, 2017",0
4,pope francis called donald trump christmas speech,pope francis used annual christmas day message...,News,"December 25, 2017",0
5,racist alabama cops brutalize black boy handcu...,number cases cops brutalizing killing people c...,News,"December 25, 2017",0
6,fresh golf course trump lashes fbi deputy dire...,donald trump spent good portion day golf club ...,News,"December 23, 2017",0
7,trump said insanely racist stuff inside oval o...,wake yet another court decision derailed donal...,News,"December 23, 2017",0
8,former cia director slams trump un bullying op...,many people raised alarm regarding fact donald...,News,"December 22, 2017",0
9,watch brandnew protrump ad features muchkissin...,might thought get break watching people kiss d...,News,"December 21, 2017",0


### Lemmatization and Stemming

In [23]:
if_lemmatize = True

In [24]:
if if_lemmatize:
    
    import nltk
    nltk.download('wordnet')
    nltk.download('omw-1.4')
    from nltk.stem import WordNetLemmatizer
    
    wnl = WordNetLemmatizer()
    
    train_dataset["title"] = train_dataset["title"].str.lower().str.replace("’", "'")
    train_dataset["title"] = train_dataset["title"].apply(lambda x: ' '.join([wnl.lemmatize(word) for word in word_tokenize(x)]))
    train_dataset["text"] = train_dataset["text"].str.lower().str.replace("’", "'")
    train_dataset["text"] = train_dataset["text"].apply(lambda x: ' '.join([wnl.lemmatize(word) for word in word_tokenize(x)]))
    

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\MondayPC\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\MondayPC\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [25]:
train_text = (train_dataset['title'] + " " + train_dataset['text']).values

In [26]:
labels = train_dataset['label'].values

## Embedding with word2vec

In [27]:
# Limit length of each article
max_length = 3000
lengths = np.array([len(x) for x in train_text])
train_text = train_text[lengths < max_length]
labels = labels[lengths < max_length]

In [28]:
# Check actual max length of an article
article_length = max(np.array([len(x) for x in train_text]))
article_length

2999

In [29]:
# Word2vec model with vector size = 100
vec_size = 100

# workers - number of CPU threads
word_model = gensim.models.Word2Vec(train_text, vector_size = vec_size, window = 5, workers = 12)
word_model.train(train_text, epochs = 10, total_examples = len(train_text))
wv = word_model.wv

In [30]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_text)
vocabulary_size = len(tokenizer.word_index) + 1
encoded_articles = tokenizer.texts_to_sequences(train_text)

In [31]:
padded_articles = pad_sequences(encoded_articles, maxlen = article_length, padding='post')

In [32]:
padded_articles.shape

(39335, 2999)

In [33]:
emb_matrix = np.zeros(shape=(vocabulary_size, vec_size))
for w, i in tokenizer.word_index.items():
    ind = wv.has_index_for(w)
    if ind:
        emb_matrix[i] = wv.get_vector(w)

In [34]:
x_train, x_test, y_train, y_test = train_test_split(padded_articles, labels, test_size=0.25)

## LSTM model

In [71]:
from keras.initializers import Constant
from keras.optimizers import Adam
from keras.models import Sequential
from keras.layers import Dense, Flatten, Embedding, LSTM
from keras.layers import ReLU
from keras.layers import Dropout
model=Sequential()
model.add(Embedding(input_dim = vocabulary_size, 
                    output_dim = vec_size,
                    input_length = article_length,
                    embeddings_initializer = Constant(emb_matrix))
         )
model.add(LSTM(32, return_sequences=True))
model.add(Flatten())
model.add(Dense(128,activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(16,activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))

In [72]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 2999, 100)         17474700  
                                                                 
 lstm_1 (LSTM)               (None, 2999, 32)          17024     
                                                                 
 flatten_1 (Flatten)         (None, 95968)             0         
                                                                 
 dense_3 (Dense)             (None, 128)               12284032  
                                                                 
 dropout_2 (Dropout)         (None, 128)               0         
                                                                 
 dense_4 (Dense)             (None, 16)                2064      
                                                                 
 dropout_3 (Dropout)         (None, 16)               

In [73]:
import keras.backend as K
def keras_f1_score(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    recall = true_positives / (possible_positives + K.epsilon())
    f1_val = 2*(precision*recall)/(precision+recall+K.epsilon())
    return f1_val

In [74]:
model.compile(optimizer=Adam(),loss='binary_crossentropy', metrics=['accuracy', keras_f1_score])

## Training

In [75]:
epochs=5
batch_size=128

In [76]:
model.fit(x_train, y_train, epochs=epochs, batch_size=batch_size, validation_data=(x_test,y_test))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x1c92b4f0f88>

## Metrics

In [77]:
from sklearn.metrics import f1_score
def accuracy(predictions, labels):
    predictions[predictions >= 0.5] = 1
    predictions[predictions < 0.5] = 0
    return accuracy_score(labels, predictions)

def flat_f1_score(predictions, labels):
    predictions[predictions >= 0.5] = 1
    predictions[predictions < 0.5] = 0
    return f1_score(labels, predictions)

## LIAR dataset tests

In [78]:
liar_dataset_train = pd.read_csv('./data/LIAR/train.tsv', sep='\t', header = None)
liar_dataset_test = pd.read_csv('./data/LIAR/test.tsv', sep='\t', header = None)
liar_dataset_valid = pd.read_csv('./data/LIAR/valid.tsv', sep='\t', header = None)
liar_dataset = pd.concat([liar_dataset_train, liar_dataset_test, liar_dataset_valid], axis = 0).reset_index(drop = True)

In [79]:
liar_dataset = liar_dataset.iloc[:, [1, 2]]
liar_dataset = liar_dataset.rename(columns = {1: 'label', 2: 'statements'})

In [80]:
liar_dataset['label'] = liar_dataset['label'].replace({
    'false' : False,
    'barely-true' : False,
    'pants-fire' : False,
    'half-true' : True,
    'mostly-true' : True,
    'true' : True
})

In [81]:
liar_label = liar_dataset['label']

In [82]:
encoded_liar_text = tokenizer.texts_to_sequences(liar_dataset['statements'])
padded_liar_text = pad_sequences(encoded_liar_text, maxlen = article_length, padding='post')

### Prediction

In [83]:
pred = model.predict(padded_liar_text, batch_size)



In [84]:
accuracy(pred, liar_label)

0.4723633805019154

In [86]:
flat_f1_score(pred, liar_label)

0.21895613933572503