In [1]:
import pandas as pd
import numpy as np

# Load Train and Dev data

In [2]:
import json
def load_data(dataName):
    ids = []
    with open('project-data/' + dataName + '.data.txt') as f:
        for line in f.readlines():
            ids.append(line.rstrip('\n').split(','))

    data = []
    for seq in ids:
        texts = []
        try:
            # proceed only if the source tweet exists
            with open('project-data/crawled_tweets/' + dataName + '/' + seq[0] + '.json') as json_file:
                source = json.load(json_file)['text']
            texts.append(source)
            for id in seq[1:]:
                try: 
                    with open('project-data/crawled_tweets/' + dataName + '/' + id + '.json') as json_file:
                        text = json.load(json_file)['text']
                    texts.append(text)
                except:
                    continue
            data.append(texts)
        except:
            data.append(texts)
            continue
    return data

# load data and labels
trainData = load_data('train')
devData = load_data('dev')
trainLabel = [label.rstrip('\n') for label in open("project-data/train.label.txt", "r").readlines()]
devLabel = [label.rstrip('\n') for label in open("project-data/dev.label.txt", "r").readlines()]
assert(len(trainData) == len(trainLabel))
assert(len(devData) == len(devLabel))

# build dataframes
train = pd.DataFrame({"thread" : trainData, "label": trainLabel})
dev = pd.DataFrame({"thread" : devData, "label": devLabel})

# remove empty thread
train = train[train['thread'].map(lambda d: len(d)) > 0]
dev = dev[dev['thread'].map(lambda d: len(d)) > 0]

train.head()

Unnamed: 0,thread,label
0,[19. 5G mobile networks DO NOT spread COVID-19...,nonrumour
1,[@Telegraph we will be very satisfied if #Nawa...,rumour
2,[Coronavirus disease (COVID-19) advice for the...,nonrumour
3,[@WSJ when Canadians of all people start shoot...,nonrumour
4,[if the primary focus of a government isn't to...,nonrumour


In [3]:
# encode labels
from sklearn.preprocessing import LabelEncoder
labels = ['nonrumour', 'rumour']
encoder = LabelEncoder()
encoder.fit(train.label.to_list())
y_train = encoder.transform(train.label.to_list())
y_dev = encoder.transform(dev.label.to_list())

In [4]:
from collections import Counter
print('train: {}'.format(Counter(train.label)))
print('dev: {}'.format(Counter(dev.label)))

train: Counter({'nonrumour': 1390, 'rumour': 400})
dev: Counter({'nonrumour': 456, 'rumour': 136})


In [5]:
1-(1388+455)/(400+135+1388+455)

0.2249789739276703

# Load test data

In [16]:
ids = []
with open('project-data/test.data.txt') as f:
    for line in f.readlines():
        ids.append(line.rstrip('\n').split(','))

testData = []
for seq in ids:
    texts = []
    try:
        # proceed only if the source tweet exists
        with open('project-data/tweet-objects/' + seq[0] + '.json') as json_file:
            source = json.load(json_file)['text']
        texts.append(source)
        for id in seq[1:]:
            try: 
                with open('project-data/tweet-objects/' + id + '.json') as json_file:
                    text = json.load(json_file)['text']
                texts.append(text)
            except:
                continue
        testData.append(texts)
    except:
        testData.append(texts)
        continue
test = pd.DataFrame({"thread" : testData})
test.head()

Unnamed: 0,thread
0,[How Does COVID-19 Spread? https://t.co/TXHDeU...
1,"[@brain_warrior I hate to keep saying it, but ..."
2,[Q. How are COVID-19 and influenza viruses dif...
3,[Una de les Q&amp;A on coronaviruses de la pàg...
4,[@_truthpolitics We should absolutely blame th...


# Preprocessing

Preprocessing includes:
1. remove twitter handles and urls
2. lower case
3. tokenize each tweet text into word tokens
4. remove any word that does not contain any English alphabets in a list of words
5. remove punctuation

In [17]:
from gensim.parsing.preprocessing import remove_stopwords
remove_stopwords('tree and branch')

'tree branch'

In [18]:
# adapted from Assignment 1 code
import nltk
from nltk.corpus import stopwords
import re
from string import punctuation
from gensim.parsing.preprocessing import remove_stopwords

import preprocessor as p
p.set_options(p.OPT.URL, p.OPT.MENTION, p.OPT.EMOJI)

stopwords = set(stopwords.words('english')) #note: stopwords are all in lowercase

def removeNonEnglish(tokens):
    # remove any word that does not contain any English alphabets in a list of words
    removed = []
    for token in tokens:
        alphabet = False
        for char in token:
            if char.isalpha():
                alphabet = True
                break
        if alphabet:
            removed.append(token)
    return removed

lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()
words = set(nltk.corpus.words.words()) # a list of words provided by NLTK
words = set([ word.lower() for word in words ]) #lowercase all the words for better matching
def lemmatize(word):
    lemma = lemmatizer.lemmatize(word,'v')
    if lemma == word:
        lemma = lemmatizer.lemmatize(word,'n')
    return lemma


def preprocess(thread):
    ts = []
    for tweet in thread:
        t = p.clean(tweet)
        t = t.lower() # lowercase all words
        t = t.split(' ') # tokenize each tweet into individual word tokens
        t = removeNonEnglish(t) # remove any word that does not contain any English alphabets
        # t = [token for token in t if not token in stopwords] # remove stopwords
        t = [lemmatize(w) for w in t]
        t = [w.strip(punctuation) for w in t] # remove punctuation
        ts.append(' '.join(t))
    return ts

train["preprocessed"] = [preprocess(thread) for thread in train['thread']]
dev["preprocessed"] = [preprocess(thread) for thread in dev['thread']]
test["preprocessed"] = [preprocess(thread) for thread in test['thread']]

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  dtype=np.int):
  from collections import Mapping, defaultdict
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, copy_X=True, fit_path=True,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, copy_X=True, fit_path=True,
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  eps=np.finfo(np.float).eps, positive=False):
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  max_n_alp

In [19]:
train.head()

Unnamed: 0,thread,label,preprocessed
0,[19. 5G mobile networks DO NOT spread COVID-19...,nonrumour,"[5g mobile network do not spread covid-19, 5g ..."
1,[@Telegraph we will be very satisfied if #Nawa...,rumour,[we will be very satisfy if nawazsharif resign...
2,[Coronavirus disease (COVID-19) advice for the...,nonrumour,[coronavirus disease covid-19 advice for the p...
3,[@WSJ when Canadians of all people start shoot...,nonrumour,[when canadian of all people start shooting we...
4,[if the primary focus of a government isn't to...,nonrumour,[if the primary focus of a government isn't to...


In [20]:
max_num_words = max(
    max([len(tweet.split()) for thread in dev["preprocessed"] for tweet in thread]),
    max([len(tweet.split()) for thread in test["preprocessed"] for tweet in thread]),
    max([len(tweet.split()) for thread in train["preprocessed"] for tweet in thread]))
print('max number of words in one tweet: {}'.format(max_num_words))

max number of words in one tweet: 30


In [21]:
max_num_tweets = max(max([len(thread) for thread in dev["preprocessed"]]),
    max([len(thread) for thread in train["preprocessed"]]),
    max([len(thread) for thread in test["preprocessed"]]))
print('max number of tweets in one thread: {}'.format(max_num_tweets))

max number of tweets in one thread: 305


# Tokenization and Label Encoding

In [22]:
from keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(oov_token="<UNK>", split = ' ')
tokenizer.fit_on_texts([tweet for thread in train['preprocessed'] for tweet in thread])

word_index = tokenizer.word_index
vocab_size = len(tokenizer.word_index) + 1
vocab_size

15851

In [23]:
# tokenise the input into word sequences
train['xseq'] = [tokenizer.texts_to_sequences(thread) for thread in train.thread]
dev['xseq'] = [tokenizer.texts_to_sequences(thread) for thread in dev.thread]
test['xseq'] = [tokenizer.texts_to_sequences(thread) for thread in test.thread]

train.head()

Unnamed: 0,thread,label,preprocessed,xseq
0,[19. 5G mobile networks DO NOT spread COVID-19...,nonrumour,"[5g mobile network do not spread covid-19, 5g ...","[[29, 406, 644, 1888, 15, 17, 85, 27, 29, 1, 1..."
1,[@Telegraph we will be very satisfied if #Nawa...,rumour,[we will be very satisfy if nawazsharif resign...,"[[1, 34, 48, 2, 152, 1, 37, 5342, 5343, 161, 3..."
2,[Coronavirus disease (COVID-19) advice for the...,nonrumour,[coronavirus disease covid-19 advice for the p...,"[[20, 90, 27, 29, 820, 16, 3, 344, 1, 69, 9, 1..."
3,[@WSJ when Canadians of all people start shoot...,nonrumour,[when canadian of all people start shooting we...,"[[1, 77, 7363, 7, 41, 25, 357, 1498, 125, 2128..."
4,[if the primary focus of a government isn't to...,nonrumour,[if the primary focus of a government isn't to...,"[[37, 3, 5355, 2236, 7, 4, 328, 366, 5, 3318, ..."


## padding

In [24]:
a = np.array([[1,2,3], [4,5,6]])
np.reshape(a, -1)

array([1, 2, 3, 4, 5, 6])

In [25]:
from keras.preprocessing.sequence import pad_sequences

def padding(max_num_tweets, max_num_words, data):
    padded = np.zeros((len(data), max_num_tweets, max_num_words))
    for i in range(len(data)):
        thread_seq = data[i]
        thread_len = len(thread_seq)
        padded[i] = np.concatenate((pad_sequences(thread_seq, padding='post', maxlen=max_num_words), np.zeros((max_num_tweets-thread_len, max_num_words))), axis=0)
    result = np.zeros((len(data), max_num_tweets*max_num_words))
    for i in range(len(data)):
        thread = padded[i]
        result[i] = np.reshape(thread, -1)
    return result


train_xseq_padded= padding(max_num_tweets, max_num_words, train.xseq.values).astype(int)
dev_xseq_padded = padding(max_num_tweets, max_num_words, dev.xseq.values).astype(int)
test_xseq_padded = padding(max_num_tweets, max_num_words, test.xseq.values).astype(int)
train_xseq_padded[0]

array([ 29, 406, 644, ...,   0,   0,   0])

# Word2Vec Embedding

In [26]:
from gensim.models.word2vec import Word2Vec
import gensim.downloader as api

model_w2v = api.load("glove-twitter-50")

In [27]:
import numpy as np
# creating an matrix with zeroes of shape vocab x embedding dimension
embedding_matrix = np.zeros((vocab_size, 50))
# Iterate through word, index in the dictionary
for word, i in word_index.items():
    # extract the corresponding vector for the vocab indice of same word
    try:
        embedding_vector = model_w2v[word]
        if embedding_vector is not None:
            # Storing it in a matrix
            embedding_matrix[i] = embedding_vector
    except:
        continue

In [28]:
sum([line.any() for line in embedding_matrix])

13116

# Build Model and Parameter Tuning

In [29]:
import keras
from keras.models import Sequential
from keras.layers import LSTM, TimeDistributed, Embedding, Dense, Dropout, Masking
from keras.optimizers import Adam
from sklearn.utils import class_weight

# Batch Size and Epochs

In [32]:
class_weights = class_weight.compute_class_weight('balanced',np.unique(y_train), y_train)
emb_dim = 50
def train_model(lstm_units, lr, epochs, batch_size):
    model = Sequential(name="lstm")
    embedding_layer = Embedding(input_dim=vocab_size, output_dim=emb_dim,
                                input_length=max_num_words*max_num_tweets, weights=[embedding_matrix], trainable=False)
    model.add(embedding_layer)
    model.add(Dropout(0.2))
    model.add(Masking(mask_value=0))
    model.add(LSTM(lstm_units))
    model.add(Dropout(0.2))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer = Adam(lr=lr),
                loss='binary_crossentropy',
                metrics = ['accuracy', keras.metrics.FalseNegatives(name="fn"), keras.metrics.FalsePositives(name="fp"),
                            keras.metrics.TrueNegatives(name="tn"), keras.metrics.TruePositives(name="tp")])
    model.fit(
    train_xseq_padded,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    verbose=1,
    validation_data=(dev_xseq_padded, y_dev),
    class_weight=class_weights)

    return model

In [40]:
epochsL = [5, 10, 15, 20]
batch_sizeL = [128, 64, 32, 16, 8]
lstm_units = 128
lr=0.01

models = []
for epochs in epochsL:
    for batch_size in batch_sizeL:
        print('--- ### Testing: batch_size = {}, epochs = {}, lr = {}, lstm_units = {}'.format(batch_size, epochs, lr, lstm_units))
        models.append(train_model(lstm_units, lr, epochs, batch_size))

--- ### Testing: batch_size = 128, epochs = 5, lr = 0.01, lstm_units = 128
Train on 1790 samples, validate on 592 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
--- ### Testing: batch_size = 64, epochs = 5, lr = 0.01, lstm_units = 128
Train on 1790 samples, validate on 592 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
--- ### Testing: batch_size = 32, epochs = 5, lr = 0.01, lstm_units = 128
Train on 1790 samples, validate on 592 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
--- ### Testing: batch_size = 16, epochs = 5, lr = 0.01, lstm_units = 128
Train on 1790 samples, validate on 592 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
--- ### Testing: batch_size = 8, epochs = 5, lr = 0.01, lstm_units = 128
Train on 1790 samples, validate on 592 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5

KeyboardInterrupt: 

### Best Batch size: 64

In [19]:
epochsL = [5, 10, 15, 20]
batch_sizeL = [128, 64, 32, 16, 8]
lstm_units = 128
lr=0.01

models = []
for epochs in epochsL:
    for batch_size in batch_sizeL:
        print('--- ### Testing: batch_size = {}, epochs = {}, lr = {}, lstm_units = {}'.format(batch_size, epochs, lr, lstm_units))
        models.append(train_model(lstm_units, lr, epochs, batch_size))

array([0.64409222, 2.235     ])

In [22]:
model.fit(
    train_xseq_padded,
    y_train,
    batch_size=64,
    epochs=10,
    verbose=1,
    validation_data=(dev_xseq_padded, y_dev),
    class_weight=class_weights
)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 1788 samples, validate on 590 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


ValueError: too many values to unpack (expected 2)

In [27]:
pred = model.predict(test_xseq_padded, batch_size=64)

In [32]:
def predict_label(score):
    if score > 0.5:
        return 1
    return 0
pred_label = [predict_label(s) for s in pred]
Counter(pred_label)

Counter({0: 421, 1: 137})

In [33]:
pd.DataFrame({'Id': list(range(len(test_xseq_padded))),'Predicted': pred_label}).to_csv('test.pred.csv', index=False)

In [31]:
import csv

with open('test.pred.csv', 'wb') as mf:
    wr = csv.writer(mf, quoting=csv.QUOTE_ALL)
    wr.writerow(pred_label)

## model 2

In [34]:
lstm_units = 256
trainable = False
lr = 0.008
class_weights = class_weight.compute_class_weight('balanced', np.unique(y_train), y_train)
# epochs, learning rate, batch_size, early_stopping

emb_dim = 50 # fixed

model = Sequential(name="lstm")
model.add(Embedding(input_dim = vocab_size, output_dim = emb_dim,
                    input_length = max_num_words * max_num_tweets, weights=[embedding_matrix], trainable=trainable))
model.add(Dropout(0.2))
model.add(Masking(mask_value=0))
model.add(LSTM(lstm_units))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer = Adam(lr=lr),
              loss='binary_crossentropy',
              metrics = ['accuracy', keras.metrics.FalseNegatives(name="fn"), keras.metrics.FalsePositives(name="fp"),
                        keras.metrics.TrueNegatives(name="tn"), keras.metrics.TruePositives(name="tp"),])

model.fit(
    train_xseq_padded,
    y_train,
    batch_size=64,
    epochs=10,
    verbose=1,
    validation_data=(dev_xseq_padded, y_dev),
    class_weight=class_weights
)
pred = model.predict(test_xseq_padded, batch_size=64)
pred_label = [predict_label(s) for s in pred]

Train on 1788 samples, validate on 590 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10