In [1]:
from platform import python_version

print(python_version())

3.7.6


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.metrics import balanced_accuracy_score

In [3]:
import gensim
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences, to_categorical
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score

In [4]:
import re

In [5]:
import nltk
nltk.download("stopwords")
nltk.download("punkt")  # Punkt Sentence Tokenizer
nltk.download("averaged_perceptron_tagger")  # Part of Speech Tagger
nltk.download("wordnet")  # a lexical database of English; useful for synonyms, hyponyms, etc.

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\MondayPC\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\MondayPC\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\MondayPC\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\MondayPC\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [6]:
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist

In [7]:
import random

In [8]:
stop = set(stopwords.words('english'))
random.sample(stopwords.words('english'), 10)

["weren't",
 'theirs',
 'them',
 'some',
 'over',
 'wouldn',
 'themselves',
 'during',
 'was',
 'hasn']

# Loading Data

In [9]:
liar_dataset_train = pd.read_csv('../data/LIAR-PLUS/train2.tsv', sep='\t', header = None)
liar_dataset_test = pd.read_csv('../data/LIAR-PLUS/test2.tsv', sep='\t', header = None)
liar_dataset_valid = pd.read_csv('../data/LIAR-PLUS/val2.tsv', sep='\t', header = None)
liar_dataset = pd.concat([liar_dataset_train, liar_dataset_test, liar_dataset_valid], axis = 0).reset_index(drop = True)

In [10]:
liar_dataset = liar_dataset.iloc[:, [2, 3, 15]]
liar_dataset = liar_dataset.rename(columns = {2: 'label', 3: 'statements', 15: 'justification'})

In [11]:
liar_dataset.dropna(inplace=True)
liar_dataset.reset_index(inplace=True, drop=True)

In [12]:
liar_dataset

Unnamed: 0,label,statements,justification
0,false,Says the Annies List political group supports ...,That's a premise that he fails to back up. Ann...
1,half-true,When did the decline of coal start? It started...,"Surovell said the decline of coal ""started whe..."
2,mostly-true,"Hillary Clinton agrees with John McCain ""by vo...",Obama said he would have voted against the ame...
3,false,Health care reform legislation is likely to ma...,The release may have a point that Mikulskis co...
4,half-true,The economic turnaround started at the end of ...,"Crist said that the economic ""turnaround start..."
...,...,...,...
12687,half-true,"For the first time in more than a decade, impo...","In 2009, 17 percent of the U. S. 's oil import..."
12688,mostly-true,Says Donald Trump has bankrupted his companies...,"Clinton said, Trump has ""bankrupted his compan..."
12689,true,"John McCain and George Bush have ""absolutely n...","""I don't think that there should be a mandate ..."
12690,false,A new poll shows 62 percent support the presid...,But the poll doesn't say that. Several days af...


In [13]:
liar_dataset['label'] = liar_dataset['label'].replace({
    'false' : 0,
    'barely-true' : 1,
    'pants-fire' : 2,
    'half-true' : 3,
    'mostly-true' : 4,
    'true' : 5
})

In [14]:
liar_dataset

Unnamed: 0,label,statements,justification
0,0,Says the Annies List political group supports ...,That's a premise that he fails to back up. Ann...
1,3,When did the decline of coal start? It started...,"Surovell said the decline of coal ""started whe..."
2,4,"Hillary Clinton agrees with John McCain ""by vo...",Obama said he would have voted against the ame...
3,0,Health care reform legislation is likely to ma...,The release may have a point that Mikulskis co...
4,3,The economic turnaround started at the end of ...,"Crist said that the economic ""turnaround start..."
...,...,...,...
12687,3,"For the first time in more than a decade, impo...","In 2009, 17 percent of the U. S. 's oil import..."
12688,4,Says Donald Trump has bankrupted his companies...,"Clinton said, Trump has ""bankrupted his compan..."
12689,5,"John McCain and George Bush have ""absolutely n...","""I don't think that there should be a mandate ..."
12690,0,A new poll shows 62 percent support the presid...,But the poll doesn't say that. Several days af...


In [15]:
train_dataset = liar_dataset

In [16]:
train_dataset.head()

Unnamed: 0,label,statements,justification
0,0,Says the Annies List political group supports ...,That's a premise that he fails to back up. Ann...
1,3,When did the decline of coal start? It started...,"Surovell said the decline of coal ""started whe..."
2,4,"Hillary Clinton agrees with John McCain ""by vo...",Obama said he would have voted against the ame...
3,0,Health care reform legislation is likely to ma...,The release may have a point that Mikulskis co...
4,3,The economic turnaround started at the end of ...,"Crist said that the economic ""turnaround start..."


# Data Preprocessing and Data Preparation

In [17]:
train_dataset = train_dataset.fillna("null data")

### Removing stopwords

In [18]:
stop_words = stopwords.words('english')

In [19]:
if_stopwords = True

In [20]:
if if_stopwords:
    train_dataset["statements"] = train_dataset["statements"].str.lower().str.replace("’", "'")
    train_dataset["statements"] = train_dataset["statements"].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))
    train_dataset["justification"] = train_dataset["justification"].str.lower().str.replace("’", "'")
    train_dataset["justification"] = train_dataset["justification"].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))

In [21]:
train_dataset.head()

Unnamed: 0,label,statements,justification
0,0,says annies list political group supports thir...,that's premise fails back up. annie's list mak...
1,3,decline coal start? started natural gas took s...,"surovell said decline coal ""started natural ga..."
2,4,"hillary clinton agrees john mccain ""by voting ...",obama said would voted amendment present. thou...
3,0,health care reform legislation likely mandate ...,release may point mikulskis comment could open...
4,3,economic turnaround started end term.,"crist said economic ""turnaround started end te..."


### Preprocessing

In [22]:
preprocessing_text_fn = {
    "no_punctuation": lambda txt: re.sub(r'[^\w\s]','', txt),
    "no_special_symbols": lambda txt: re.sub('[$,#,&]', '', txt),
    "no_digits": lambda txt: re.sub('\d*', '', txt),
    "no_www": lambda txt: re.sub('w{3}', '', txt),
    "no_urls": lambda txt: re.sub('http\S+', '', txt),
    "no_spaces": lambda txt: re.sub('\s+', ' ', txt),
    "no_single_chars": lambda txt: re.sub(r'\s+[a-zA-Z]\s+', '', txt)
}

In [23]:
def preprocess_text(text, pipeline = preprocessing_text_fn):
    text = str(text)
    for fn in pipeline.keys():
        text = pipeline[fn](text)
        
    return text

In [24]:
STOP_WORDS = [preprocessing_text_fn["no_punctuation"](word) for word in stop_words]
random.sample(stop_words, 20)

['then',
 'her',
 'needn',
 'because',
 'themselves',
 'from',
 'after',
 'are',
 'is',
 'do',
 "needn't",
 'who',
 'that',
 'not',
 "isn't",
 "should've",
 'did',
 'some',
 'those',
 'these']

In [25]:
def tokenize_without_stopwords(text, stop_words=STOP_WORDS):
    word_tokens = word_tokenize(text)
    filtered_sequence = [word for word in word_tokens if not word.lower() in stop_words]
    return filtered_sequence

In [26]:
train_dataset["statements"] = train_dataset["statements"].apply(preprocess_text)
train_dataset["justification"] = train_dataset["justification"].apply(preprocess_text)
train_dataset.head(10)

Unnamed: 0,label,statements,justification
0,0,says annies list political group supports thir...,thats premise fails back up annies list makes ...
1,3,decline coal start started natural gas took st...,surovell said decline coal started natural gas...
2,4,hillary clinton agrees john mccain by voting g...,obama said would voted amendment present thoug...
3,0,health care reform legislation likely mandate ...,release may point mikulskis comment could open...
4,3,economic turnaround started end term,crist said economic turnaround started end ter...
5,5,chicago bears starting quarterbacks last years...,vos specifically used word fired means faculty...
6,1,jim dunnam lived district represents years now,determining would take significant detective w...
7,3,im person stage worked actively last year pass...,however bill another one sponsored majority le...
8,3,however took million oregon lottery funds port...,johnson correct many factors played role lotte...
9,4,says gop primary opponents glenn grothman joe ...,considering million figure covers years reason...


### Lemmatization and Stemming

In [28]:
if_lemmatize = True

In [29]:
if if_lemmatize:
    
    import nltk
    nltk.download('wordnet')
    nltk.download('omw-1.4')
    from nltk.stem import WordNetLemmatizer
    
    wnl = WordNetLemmatizer()
    
    train_dataset["statements"] = train_dataset["statements"].str.lower().str.replace("’", "'")
    train_dataset["statements"] = train_dataset["statements"].apply(lambda x: ' '.join([wnl.lemmatize(word) for word in word_tokenize(x)]))
    train_dataset["justification"] = train_dataset["justification"].str.lower().str.replace("’", "'")
    train_dataset["justification"] = train_dataset["justification"].apply(lambda x: ' '.join([wnl.lemmatize(word) for word in word_tokenize(x)]))


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\MondayPC\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\MondayPC\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [30]:
SJ = False

if SJ:
    train_text = (train_dataset['justification'] + " " + train_dataset['statements']).values
else:
    train_text = train_dataset['justification']

In [31]:
train_labels = train_dataset['label'].values

In [32]:
label_encoder = preprocessing.LabelEncoder()
train_labels = label_encoder.fit_transform(train_labels)
train_labels = to_categorical(train_labels, 6)

## Embedding with word2vec

In [33]:
# Limit length of each article
max_length = 3300
lengths = np.array([len(x) for x in train_text])
train_text = train_text[lengths < max_length]
train_labels = train_labels[lengths < max_length]

In [34]:
# Check actual max length of an article
article_length = max(np.array([len(x) for x in train_text]))
article_length

3227

In [35]:
# Word2vec model with vector size = 100
vec_size = 100

# workers - number of CPU threads
word_model = gensim.models.Word2Vec(train_text, vector_size = vec_size, window = 5, workers = 12)
word_model.train(train_text, epochs = 10, total_examples = len(train_text))
wv = word_model.wv

In [36]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_text)
vocabulary_size = len(tokenizer.word_index) + 1
encoded_articles = tokenizer.texts_to_sequences(train_text)

In [37]:
padded_articles = pad_sequences(encoded_articles, maxlen = article_length, padding='post')

In [38]:
padded_articles.shape

(12683, 3227)

In [39]:
emb_matrix = np.zeros(shape=(vocabulary_size, vec_size))
for w, i in tokenizer.word_index.items():
    ind = wv.has_index_for(w)
    if ind:
        emb_matrix[i] = wv.get_vector(w)

In [40]:
x_train, x_test, y_train, y_test = train_test_split(padded_articles, train_labels, test_size=0.25)

## LSTM model

In [92]:
from keras.initializers import Constant
from keras.optimizers import Adam
from keras.models import Sequential
from keras.layers import Dense, Flatten, Embedding, LSTM
from keras.layers import ReLU
from keras.layers import Dropout
from tensorflow.keras.losses import CategoricalCrossentropy

model=Sequential()
model.add(Embedding(input_dim = vocabulary_size, 
                    output_dim = vec_size,
                    input_length = article_length,
                    embeddings_initializer = Constant(emb_matrix))
         )
model.add(LSTM(32, return_sequences=True))
model.add(Flatten())
model.add(Dense(64,activation='relu'))
model.add(Dropout(0.1))
model.add(Dense(16,activation='relu'))
model.add(Dropout(0.1))
model.add(Dense(6, activation='softmax'))

In [93]:
model.summary()

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_5 (Embedding)     (None, 3227, 100)         2552300   
                                                                 
 lstm_5 (LSTM)               (None, 3227, 32)          17024     
                                                                 
 flatten_5 (Flatten)         (None, 103264)            0         
                                                                 
 dense_12 (Dense)            (None, 64)                6608960   
                                                                 
 dropout_7 (Dropout)         (None, 64)                0         
                                                                 
 dense_13 (Dense)            (None, 16)                1040      
                                                                 
 dropout_8 (Dropout)         (None, 16)               

In [94]:
import keras.backend as K
def keras_f1_score(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    recall = true_positives / (possible_positives + K.epsilon())
    f1_val = 2*(precision*recall)/(precision+recall+K.epsilon())
    return f1_val

In [95]:
model.compile(optimizer=Adam(),loss='binary_crossentropy', metrics=['accuracy'])

## Training

In [96]:
epochs=10
batch_size=64

In [97]:
model.fit(x_train, y_train, epochs=epochs, batch_size=batch_size, validation_data=(x_test, y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1b85ae7c908>

In [98]:
train_pred = np.argmax(model.predict(x_train), axis=1)
train_truth = np.argmax(y_train, axis=1)



In [99]:
# Accuracy
accuracy_score(train_truth, train_pred)

0.9834945332211943

In [100]:
# F1 score
balanced_accuracy_score(train_pred, train_truth)

0.9833783735277551

## Applying on test dataset

In [101]:
test_pred = np.argmax(model.predict(x_test), axis=1)
test_truth = np.argmax(y_test, axis=1)



In [102]:
# Accuracy
accuracy_score(test_pred, test_truth)

0.18290760012614318

In [103]:
# Balanced accuracy
balanced_accuracy_score(test_truth, test_pred)

0.17367251099982592