In [14]:
import pandas as pd
import numpy as np

from string import punctuation
from nltk.corpus import stopwords as sw
from nltk.tokenize import TweetTokenizer
import nltk
import re
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')

%matplotlib inline

FEATURE_LIST = ['created_at', 'id', 'full_text', 'user', 'retweet_count', 'favorite_count', 'coordinates', 'place', 'class']

stopwords = sw.words('english') + ["'d", "'ll", "'re", "'s", "'ve", 'doe', 'ha', "n't", 'sha', 'wa', 'wo']

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\david\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\david\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\david\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
def transform(text):

    if pd.isnull(text):
        return ''

    if type(text) != str or text=='':
        return ''

    # Clean the text
    text = re.sub("\'s", " ", text) # we have cases like "Sam is" or "Sam's" (i.e. his) these two cases aren't separable, I choose to compromise are kill "'s" directly
    text = re.sub(" whats ", " what is ", text, flags=re.IGNORECASE)
    text = re.sub("\'ve", " have ", text)
    text = re.sub("can't", "can not", text)
    text = re.sub("n't", " not ", text)
    text = re.sub("i'm", "i am", text, flags=re.IGNORECASE)
    text = re.sub("\'re", " are ", text)
    text = re.sub("\'d", " would ", text)
    text = re.sub("\'ll", " will ", text)
    text = re.sub("e\.g\.", " eg ", text, flags=re.IGNORECASE)
    text = re.sub("b\.g\.", " bg ", text, flags=re.IGNORECASE)
    text = re.sub("(\d+)(kK)", " \g<1>000 ", text)
    text = re.sub("e-mail", " email ", text, flags=re.IGNORECASE)
    text = re.sub("(the[\s]+|The[\s]+)?U\.S\.A\.", " America ", text, flags=re.IGNORECASE)
    text = re.sub("(the[\s]+|The[\s]+)?United State(s)?", " America ", text, flags=re.IGNORECASE)
    text = re.sub("\(s\)", " ", text, flags=re.IGNORECASE)
    text = re.sub("[c-fC-F]\:\/", " disk ", text)

    text = re.sub("#{2,3}", "#", text)
    text = re.sub("#2020prot.*", "#2020protest", text, flags=re.IGNORECASE)
    text = re.sub("#abolish.*police", "#abolishpolice", text, flags=re.IGNORECASE)
    text = re.sub("#acab.*", "#", text, flags=re.IGNORECASE)
    text = re.sub("#ahmauda.*y", "#ahmaudaubrey", text, flags=re.IGNORECASE)
    text = re.sub(".*black_*li[f,v]e[s]*_*m\w*", "#blacklivesmatter", text, flags=re.IGNORECASE)
    text = re.sub(".*al{1,4}_*i[f,v]e[s]*_*m\w*", "#alllivesmatter", text, flags=re.IGNORECASE)


    # remove comma between numbers, i.e. 15,000 -> 15000
    text = re.sub('(?<=[0-9])\,(?=[0-9])', "", text)

    punct = punctuation.replace('@','').replace('#', '')
    # Remove punctuation from text
    text = ''.join([c for c in text if c not in punct])

    # Return a list of words
    return text

class CustomTokenizer(object):
    def __init__(self, tokenizer=TweetTokenizer(preserve_case=False)):
        self.tokenizer = tokenizer
        self.wordDict = {'PAD':0}
        self.num_words = 1

    def word2idx(self, df):
        re_digit = re.compile("[0-9]") # regular expression to filter digit tokens
        re_emoji = re.compile("["
         u"\U0001F600-\U0001F64F"  # emoticons
         u"\U0001F300-\U0001F5FF"  # symbols & pictographs
         u"\U0001F680-\U0001F6FF"  # transport & map symbols
         u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
         u"\U00002702-\U000027B0"
         u"\U000024C2-\U0001F251"
         "]+", flags=re.UNICODE)

        tweets = []

        for index, row in df.iterrows():
            transformed_text = transform(row["full_text"])
            tokens=self.tokenizer.tokenize(transformed_text)
            tweet = []

            for token in tokens:
                if  re_emoji.match(token) or (
                token not in punctuation
                and len(token) > 1
                ):
                    if token not in self.wordDict:
                        self.wordDict[token]=self.num_words
                        self.num_words+=1

                    tweet.append(self.wordDict[token])

            tweets.append(tweet)

        return tweets, self.wordDict


In [3]:
training_set = pd.read_json('development.jsonl', lines=True)
training_set = training_set[FEATURE_LIST]

test_set = pd.read_json('evaluation.jsonl', lines=True)

In [4]:
tokenizer = CustomTokenizer()
train_words, _ = tokenizer.word2idx(training_set)
test_words, wordsDict = tokenizer.word2idx(test_set)

sorted(wordsDict)

['##nopolicestate',
 '#039s',
 '#03nov2020',
 '#10',
 '#100dayproject2020',
 '#100daysofcode',
 '#100kdead',
 '#12',
 '#1268botaheliodoro',
 '#13th',
 '#13thnetflix',
 '#13yearold',
 '#13yearoldboy',
 '#17',
 '#18',
 '#1965immigrationact',
 '#1968',
 '#1984',
 '#1984sikhgenocide',
 '#1989',
 '#19times',
 '#1a',
 '#1arights',
 '#1dstans',
 '#1miracle',
 '#1october',
 '#1stamendment',
 '#1stjune',
 '#1u',
 '#1up',
 '#1world1people',
 '#2020',
 '#2020btsfesta',
 '#2020census',
 '#2020election',
 '#2020elections',
 '#2020isbadbut',
 '#2020landslide',
 '#2020pandemic',
 '#2020protest',
 '#2020riot',
 '#2020riots',
 '#2020showedus',
 '#2020vision',
 '#2020wtf',
 '#2020年のtoday',
 '#20dollars',
 '#21jumpstreetforever',
 '#24thamendmentnow',
 '#254',
 '#25thamendmentbeforewealldie',
 '#25thamendmentnow',
 '#25theamendmentnow',
 '#290ink',
 '#2a',
 '#2amendment',
 '#2ashallnotbeinfringed',
 '#2baba',
 '#2millionofftumcial',
 '#2ndamendment',
 '#2nddegree',
 '#2ndwave',
 '#2ne1',
 '#2tothechest1t

In [5]:
word_count = lambda tweet: len(tweet)
longest_tweet = max(train_words, key=word_count)
len_longest_tweet = len(longest_tweet)
len_longest_tweet

220

In [16]:
from tensorflow.keras.preprocessing import sequence


X_train = sequence.pad_sequences(train_words, maxlen=len_longest_tweet)
X_test = sequence.pad_sequences(test_words, maxlen=len_longest_tweet)

y_train = training_set["class"].to_numpy()

In [17]:
from keras import Sequential
from keras.layers import Embedding, LSTM, Dense

vocabulary_size = len(wordsDict.keys())
embedding_size = 100

model=Sequential()
model.add(Embedding(vocabulary_size, embedding_size, input_length=len_longest_tweet))
model.add(LSTM(128))
model.add(Dense(1, activation='sigmoid'))

print(model.summary())

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 220, 100)          7638400   
_________________________________________________________________
lstm_3 (LSTM)                (None, 128)               117248    
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 129       
Total params: 7,755,777
Trainable params: 7,755,777
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
from keras.callbacks import EarlyStopping
model.compile(loss='binary_crossentropy',
             optimizer='adam',
             metrics=['accuracy'])

callback = [EarlyStopping(monitor='loss', patience=2)]
batch_size = 128
num_epochs = 15

history = model.fit(X_train, y_train, validation_split=0.2, batch_size=batch_size, epochs=num_epochs, callbacks=callback)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
103/500 [=====>........................] - ETA: 11:24 - loss: 0.0281 - accuracy: 0.9906

In [None]:
import matplotlib.pyplot as plt

plt.plot(history.history['acc'])
plt.plot(history.history['val_acc'])

plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train','test'], loc='upper left')
plt.show()

plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])

plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train','test'], loc='upper left')
plt.show()

In [None]:
predictions = model.predict(X_test)
predictions = np.round(predictions.reshape(-1)).astype(int)
pred = pd.DataFrame()
pred.insert(0, "Predicted", predictions, True)
pred.to_csv('submission_LSTM.csv',sep=',', index_label='Id')
