In [104]:
import pandas as pd
import numpy as np

# Load Train and Dev data

In [103]:
import json
def load_data(dataName):
    ids = []
    with open('project-data/' + dataName + '.data.txt') as f:
        for line in f.readlines():
            ids.append(line.rstrip('\n').split(','))

    data = []
    for seq in ids:
        texts = []
        try:
            # proceed only if the source tweet exists
            with open('project-data/crawled_tweets/' + dataName + '/' + seq[0] + '.json') as json_file:
                source = json.load(json_file)['text']
            texts.append(source)
            for id in seq[1:]:
                try: 
                    with open('project-data/crawled_tweets/' + dataName + '/' + id + '.json') as json_file:
                        text = json.load(json_file)['text']
                    texts.append(text)
                except:
                    continue
            data.append(texts)
        except:
            data.append(texts)
            continue
    return data

# load data and labels
trainData = load_data('train')
devData = load_data('dev')
trainLabel = [label.rstrip('\n') for label in open("project-data/train.label.txt", "r").readlines()]
devLabel = [label.rstrip('\n') for label in open("project-data/dev.label.txt", "r").readlines()]
assert(len(trainData) == len(trainLabel))
assert(len(devData) == len(devLabel))

# build dataframes
train = pd.DataFrame({"thread" : trainData, "label": trainLabel})
dev = pd.DataFrame({"thread" : devData, "label": devLabel})

# remove empty thread
train = train[train['thread'].map(lambda d: len(d)) > 0]
dev = dev[dev['thread'].map(lambda d: len(d)) > 0]

train.head()

Unnamed: 0,thread,label
0,[19. 5G mobile networks DO NOT spread COVID-19...,nonrumour
1,[@Telegraph we will be very satisfied if #Nawa...,rumour
2,[Coronavirus disease (COVID-19) advice for the...,nonrumour
3,[@WSJ when Canadians of all people start shoot...,nonrumour
4,[if the primary focus of a government isn't to...,nonrumour


In [105]:
# encode labels
from sklearn.preprocessing import LabelEncoder
labels = ['nonrumour', 'rumour']
encoder = LabelEncoder()
encoder.fit(train.label.to_list())
y_train = encoder.transform(train.label.to_list())
y_dev = encoder.transform(dev.label.to_list())

In [106]:
from collections import Counter
print('train: {}'.format(Counter(train.label)))
print('dev: {}'.format(Counter(dev.label)))

train: Counter({'nonrumour': 1390, 'rumour': 400})
dev: Counter({'nonrumour': 456, 'rumour': 136})


# Load test data

In [107]:
ids = []
with open('project-data/test.data.txt') as f:
    for line in f.readlines():
        ids.append(line.rstrip('\n').split(','))

testData = []
for seq in ids:
    texts = []
    try:
        # proceed only if the source tweet exists
        with open('project-data/tweet-objects/' + seq[0] + '.json') as json_file:
            source = json.load(json_file)['text']
        texts.append(source)
        for id in seq[1:]:
            try: 
                with open('project-data/tweet-objects/' + id + '.json') as json_file:
                    text = json.load(json_file)['text']
                texts.append(text)
            except:
                continue
        testData.append(texts)
    except:
        testData.append(texts)
        continue
test = pd.DataFrame({"thread" : testData})
test.head()

Unnamed: 0,thread
0,[How Does COVID-19 Spread? https://t.co/TXHDeU...
1,"[@brain_warrior I hate to keep saying it, but ..."
2,[Q. How are COVID-19 and influenza viruses dif...
3,[Una de les Q&amp;A on coronaviruses de la pàg...
4,[@_truthpolitics We should absolutely blame th...


# Preprocessing

Preprocessing includes:
1. remove twitter handles and urls
2. lower case
3. tokenize each tweet text into word tokens
4. remove any word that does not contain any English alphabets in a list of words
5. remove punctuation

In [108]:
# adapted from Assignment 1 code
import nltk
from nltk.corpus import stopwords
import re
from string import punctuation
from gensim.parsing.preprocessing import remove_stopwords

import preprocessor as p
p.set_options(p.OPT.URL, p.OPT.MENTION, p.OPT.EMOJI)

stopwords = set(stopwords.words('english')) #note: stopwords are all in lowercase

def removeNonEnglish(tokens):
    # remove any word that does not contain any English alphabets in a list of words
    removed = []
    for token in tokens:
        alphabet = False
        for char in token:
            if char.isalpha():
                alphabet = True
                break
        if alphabet:
            removed.append(token)
    return removed

lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()
words = set(nltk.corpus.words.words()) #a list of words provided by NLTK
words = set([ word.lower() for word in words ]) #lowercase all the words for better matching
def lemmatize(word):
    lemma = lemmatizer.lemmatize(word,'v')
    if lemma == word:
        lemma = lemmatizer.lemmatize(word,'n')
    return lemma


def preprocess(thread):
    ts = []
    for tweet in thread:
        t = p.clean(tweet)
        t = t.lower() # lowercase all words
        t = t.split(' ') # tokenize each tweet into individual word tokens
        t = removeNonEnglish(t) # remove any word that does not contain any English alphabets
        # t = [token for token in t if not token in stopwords] # remove stopwords
        t = [lemmatize(w) for w in t]
        t = [w.strip(punctuation) for w in t] # remove punctuation
        ts.append(' '.join(t))
    return ts

train["preprocessed"] = [preprocess(thread) for thread in train['thread']]
dev["preprocessed"] = [preprocess(thread) for thread in dev['thread']]
test["preprocessed"] = [preprocess(thread) for thread in test['thread']]

In [109]:
train.head()

Unnamed: 0,thread,label,preprocessed
0,[19. 5G mobile networks DO NOT spread COVID-19...,nonrumour,"[5g mobile network do not spread covid-19, 5g ..."
1,[@Telegraph we will be very satisfied if #Nawa...,rumour,[we will be very satisfy if nawazsharif resign...
2,[Coronavirus disease (COVID-19) advice for the...,nonrumour,[coronavirus disease covid-19 advice for the p...
3,[@WSJ when Canadians of all people start shoot...,nonrumour,[when canadian of all people start shooting we...
4,[if the primary focus of a government isn't to...,nonrumour,[if the primary focus of a government isn't to...


In [110]:
max_num_words = max(
    max([len(tweet.split()) for thread in dev["preprocessed"] for tweet in thread]),
    max([len(tweet.split()) for thread in test["preprocessed"] for tweet in thread]),
    max([len(tweet.split()) for thread in train["preprocessed"] for tweet in thread]))
print('max number of words in one tweet: {}'.format(max_num_words))

max number of words in one tweet: 30


In [111]:
max_num_tweets = max(max([len(thread) for thread in dev["preprocessed"]]),
    max([len(thread) for thread in train["preprocessed"]]),
    max([len(thread) for thread in test["preprocessed"]]))
print('max number of tweets in one thread: {}'.format(max_num_tweets))

max number of tweets in one thread: 305


# Tokenization and Label Encoding

In [112]:
from keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(oov_token="<UNK>", split = ' ')
tokenizer.fit_on_texts([tweet for thread in train['preprocessed'] for tweet in thread])

word_index = tokenizer.word_index
vocab_size = len(tokenizer.word_index) + 1
vocab_size

15851

In [113]:
# tokenise the input into word sequences
train['xseq'] = [tokenizer.texts_to_sequences(thread) for thread in train.thread]
dev['xseq'] = [tokenizer.texts_to_sequences(thread) for thread in dev.thread]
test['xseq'] = [tokenizer.texts_to_sequences(thread) for thread in test.thread]

train.head()

Unnamed: 0,thread,label,preprocessed,xseq
0,[19. 5G mobile networks DO NOT spread COVID-19...,nonrumour,"[5g mobile network do not spread covid-19, 5g ...","[[29, 406, 644, 1888, 15, 17, 85, 27, 29, 1, 1..."
1,[@Telegraph we will be very satisfied if #Nawa...,rumour,[we will be very satisfy if nawazsharif resign...,"[[1, 34, 48, 2, 152, 1, 37, 5342, 5343, 161, 3..."
2,[Coronavirus disease (COVID-19) advice for the...,nonrumour,[coronavirus disease covid-19 advice for the p...,"[[20, 90, 27, 29, 820, 16, 3, 344, 1, 69, 9, 1..."
3,[@WSJ when Canadians of all people start shoot...,nonrumour,[when canadian of all people start shooting we...,"[[1, 77, 7363, 7, 41, 25, 357, 1498, 125, 2128..."
4,[if the primary focus of a government isn't to...,nonrumour,[if the primary focus of a government isn't to...,"[[37, 3, 5355, 2236, 7, 4, 328, 366, 5, 3318, ..."


## padding

In [114]:
from keras.preprocessing.sequence import pad_sequences

def padding(max_num_tweets, max_num_words, data):
    padded = np.zeros((len(data), max_num_tweets, max_num_words))
    for i in range(len(data)):
        thread_seq = data[i]
        thread_len = len(thread_seq)
        padded[i] = np.concatenate((pad_sequences(thread_seq, padding='post', maxlen=max_num_words), np.zeros((max_num_tweets-thread_len, max_num_words))), axis=0)
    return padded
#train['xseq_padded'] = [pad_sequences(thread, padding='post', maxlen=max_num_words) for thread in train.xseq]
#dev['xseq_padded'] = [pad_sequences(thread, padding='post', maxlen=max_num_words) for thread in dev.xseq]
#test['xseq_padded'] = [pad_sequences(thread, padding='post', maxlen=max_num_words) for thread in test.xseq]

#def padding(data):
    #padded = np.array([], dtype=object)
    #for thread_seq in data:
        #padded = np.concatenate(padded, pad_sequences(thread_seq, padding='post', maxlen=max_num_words), axis=0)
        #padded.append(pad_sequences(thread_seq, padding='post', maxlen=max_num_words))
    #return padded

train_xseq_padded= padding(max_num_tweets, max_num_words, train.xseq.values).astype(int)
dev_xseq_padded = padding(max_num_tweets, max_num_words, dev.xseq.values).astype(int)
test_xseq_padded = padding(max_num_tweets, max_num_words, test.xseq.values).astype(int)
train_xseq_padded[0]

array([[   29,   406,   644, ...,     0,     0,     0],
       [   29,   406,   644, ...,     0,     0,     0],
       [13012,     3,    24, ...,     0,     0,     0],
       ...,
       [    0,     0,     0, ...,     0,     0,     0],
       [    0,     0,     0, ...,     0,     0,     0],
       [    0,     0,     0, ...,     0,     0,     0]])

# Word2Vec Embedding

In [115]:
from gensim.models.word2vec import Word2Vec
import gensim.downloader as api

model_w2v = api.load("glove-twitter-50")

In [116]:
import numpy as np
# creating an matrix with zeroes of shape vocab x embedding dimension
embedding_matrix = np.zeros((vocab_size, 50))
# Iterate through word, index in the dictionary
for word, i in word_index.items():
    # extract the corresponding vector for the vocab indice of same word
    try:
        embedding_vector = model_w2v[word]
        if embedding_vector is not None:
            # Storing it in a matrix
            embedding_matrix[i] = embedding_vector
    except:
        continue

In [118]:
sum([line.any() for line in embedding_matrix])

13116

# Build Model and Parameter tuning

In [122]:
import keras
from keras.models import Sequential
from keras.layers import LSTM, TimeDistributed, Embedding, Dense, Dropout, Masking
from keras.optimizers import Adam
from sklearn.utils import class_weight
from sklearn.model_selection import GridSearchCV
from keras.wrappers.scikit_learn import KerasClassifier

# Batch Size and Epochs

# Learning Rate

In [None]:
lstm1_units = 64
lstm2_units = 32
class_weights = class_weight.compute_class_weight('balanced',np.unique(y_train), y_train)

def create_model(learn_rate=0.01):
	emb_dim = 50
	embedding_layer = Embedding(input_dim=vocab_size, output_dim=emb_dim,
								input_length=max_num_words, weights=[embedding_matrix], trainable=False)
	model = Sequential()
	model.add(TimeDistributed(embedding_layer, input_shape=(max_num_tweets, max_num_words)))
	model.add(Dropout(0.2))
	model.add(Masking(mask_value=0))
	model.add(TimeDistributed(LSTM(lstm1_units), input_shape=(max_num_tweets, max_num_words, emb_dim)))
	model.add(Dropout(0.2))
	model.add(Masking(mask_value=0))
	model.add(LSTM(lstm2_units))
	model.add(Dropout(0.2))
	model.add(Dense(1, activation='sigmoid'))
	model.compile(optimizer = Adam(lr=learn_rate),
				loss='binary_crossentropy',
				metrics = ['accuracy', keras.metrics.FalseNegatives(name="fn"), keras.metrics.FalsePositives(name="fp"),
							keras.metrics.TrueNegatives(name="tn"), keras.metrics.TruePositives(name="tp")])
	return model

# create model
model = KerasClassifier(build_fn=create_model, epochs=5, batch_size=32, verbose=0)
# define the grid search parameters
learn_rate = [0.3, 0.2, 0.1, 0.01, 0.001, 0.0001, 0.00001]
param_grid = dict(learn_rate=learn_rate)
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=3)
grid_result = grid.fit(train_xseq_padded, y_train, class_weight = class_weights)
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

In [141]:
# parameter tuning
lstm1_unitsL = [16]#[64, 32, 16]
lstm2_unitsL = [8] #[32, 16, 8]
lrL = [0.3, 0.1, 0.01, 0.001, 0.0001, 0.00001]
epochs = 5
batch_size = 32
#dropout

emb_dim = 50
class_weights = class_weight.compute_class_weight('balanced',np.unique(y_train), y_train)

for lstm1_units, lstm2_units in zip(lstm1_unitsL, lstm2_unitsL):
    for lr in lrL:
        print('--- ### Testing: lstm1_units = {}, lstm2_units = {}, lr = {}'.format(lstm1_units, lstm2_units, lr))

        model = Sequential()
        embedding_layer = Embedding(input_dim=vocab_size, output_dim=emb_dim,
                                    input_length=max_num_words, weights=[embedding_matrix], trainable=False)
        model.add(TimeDistributed(embedding_layer, input_shape=(max_num_tweets, max_num_words)))
        model.add(Dropout(0.2))
        model.add(Masking(mask_value=0))
        model.add(TimeDistributed(LSTM(lstm1_units, recurrent_dropout = 0.5), input_shape=(max_num_tweets, max_num_words, emb_dim)))
        model.add(Dropout(0.2))
        model.add(Masking(mask_value=0))
        model.add(LSTM(lstm2_units, recurrent_dropout = 0.5))
        model.add(Dropout(0.2))
        model.add(Dense(1, activation='sigmoid'))
        model.compile(optimizer = Adam(lr=lr),
                    loss='binary_crossentropy',
                    metrics = ['accuracy', keras.metrics.FalseNegatives(name="fn"), keras.metrics.FalsePositives(name="fp"),
                                keras.metrics.TrueNegatives(name="tn"), keras.metrics.TruePositives(name="tp")])

        model.fit(
            train_xseq_padded,
            y_train,
            batch_size = batch_size,
            epochs=epochs,
            verbose=1,
            validation_data=(dev_xseq_padded, y_dev),
            class_weight=class_weights
        )

#loss, accuracy = model.evaluate(dev_xseq_padded, y_dev, verbose=False)
#print("Testing Accuracy:  {:.4f}".format(accuracy))

--- ### Testing: lstm1_units = 16, lstm2_units = 8, lr = 0.3


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 1790 samples, validate on 592 samples
Epoch 1/5
Epoch 2/5

KeyboardInterrupt: 

In [138]:
model = Sequential()
embedding_layer = Embedding(input_dim=vocab_size, output_dim=emb_dim,
                                    input_length=max_num_words, weights=[embedding_matrix], trainable=False)
model.add(TimeDistributed(embedding_layer, input_shape=(max_num_tweets, max_num_words)))
model.add(Dropout(0.2))
model.add(Masking(mask_value=0))
model.add(TimeDistributed(LSTM(lstm1_units, recurrent_dropout = 0.5), input_shape=(max_num_tweets, max_num_words, emb_dim)))
model.add(Dropout(0.2))
model.add(Masking(mask_value=0))
model.add(LSTM(lstm2_units, recurrent_dropout = 0.5))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer = Adam(lr=lr),
            loss='binary_crossentropy',
            metrics = ['accuracy', keras.metrics.FalseNegatives(name="fn"), keras.metrics.FalsePositives(name="fp"),
                    keras.metrics.TrueNegatives(name="tn"), keras.metrics.TruePositives(name="tp"),])
model.summary()


Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
time_distributed_76 (TimeDis (None, 305, 30, 50)       792550    
_________________________________________________________________
dropout_48 (Dropout)         (None, 305, 30, 50)       0         
_________________________________________________________________
masking_42 (Masking)         (None, 305, 30, 50)       0         
_________________________________________________________________
time_distributed_77 (TimeDis (None, 305, 64)           29440     
_________________________________________________________________
dropout_49 (Dropout)         (None, 305, 64)           0         
_________________________________________________________________
masking_43 (Masking)         (None, 305, 64)           0         
_________________________________________________________________
lstm_77 (LSTM)               (None, 32)               

In [121]:
from sklearn.utils import class_weight
class_weights = class_weight.compute_class_weight('balanced',np.unique(y_train), y_train)

model.fit(
    train_xseq_padded,
    y_train,
    batch_size=32,
    epochs=epochs,
    verbose=1,
    validation_data=(dev_xseq_padded, y_dev),
    class_weight=class_weights
)

loss, accuracy = model.evaluate(dev_xseq_padded, y_dev, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 1790 samples, validate on 592 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
 320/1790 [====>.........................] - ETA: 2:23 - loss: 0.5164 - accuracy: 0.7906 - fn: 67.0000 - fp: 0.0000e+00 - tn: 253.0000 - tp: 0.0000e+00

KeyboardInterrupt: 

In [89]:
lstm1_units = 32
lstm2_units = 16
lr = 

class_weights = class_weight.compute_class_weight('balanced',np.unique(y_train), y_train)
emb_dim = 50

embedding_layer = Embedding(input_dim=vocab_size, output_dim=50, input_length=max_num_words, weights=[embedding_matrix], trainable=False)

model = Sequential(name="lstm")
model.add(TimeDistributed(embedding_layer, input_shape=(None, max_num_words)))
model.add(TimeDistributed(LSTM(lstm1_units), input_shape=(None, max_num_words, emb_dim)))
model.add(LSTM(lstm2_units))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer = Adam(lr=0.000001),
              loss='binary_crossentropy',
              metrics = metrics)

counts = Counter(y_train)
weight_for_0 = 1.0 / counts[0]
weight_for_1 = 1.0 / counts[1]
class_weight = {0: weight_for_0, 1: weight_for_1}

model.fit(
    xseq_train,
    y_train,
    batch_size=64,
    epochs=5,
    verbose=True,
    validation_data=(xseq_dev, y_dev),
    class_weight=class_weight
)

loss, accuracy = model.evaluate(xseq_dev, y_dev, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))


NameError: name 'xseq_train' is not defined

In [41]:
pred = model.predict(test_xseq_padded)
pred

array([[0.43961695],
       [0.43961695],
       [0.43961695],
       [0.43961695],
       [0.43961695],
       [0.43961695],
       [0.43961695],
       [0.43961695],
       [0.43961695],
       [0.43961695],
       [0.43961695],
       [0.43961695],
       [0.43961695],
       [0.43961695],
       [0.43961695],
       [0.43961695],
       [0.43961695],
       [0.43961695],
       [0.43961695],
       [0.43961695],
       [0.43961695],
       [0.43961695],
       [0.43961695],
       [0.43961695],
       [0.43961695],
       [0.43961695],
       [0.43961695],
       [0.43961695],
       [0.43961695],
       [0.43961695],
       [0.43961695],
       [0.43961695],
       [0.43961695],
       [0.43961695],
       [0.43961695],
       [0.43961695],
       [0.43961695],
       [0.43961695],
       [0.43961695],
       [0.43961695],
       [0.43961695],
       [0.43961695],
       [0.43961695],
       [0.43961695],
       [0.43961695],
       [0.43961695],
       [0.43961695],
       [0.439

In [33]:
xseq_train.shape

(1788, 305, 30)

In [35]:
model.fit(xseq_train, y_train, epochs=20, verbose=True, validation_data=(xseq_dev, y_dev), batch_size=32)

loss, accuracy = model.evaluate(xseq_dev, y_dev, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))

Train on 1788 samples, validate on 590 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20

KeyboardInterrupt: 