In [125]:
# Mount Google Drive at /content/drive so the project folder can be read from this runtime.
# Re-running this cell is harmless: the recorded output shows Colab reporting that the
# drive is already mounted rather than failing.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [126]:
# Locate the project folder on the mounted Drive and make it the working directory,
# so later cells can open the data files with relative paths.
import os
import sys

# Path of the project folder relative to the root of "MyDrive".
# It must stay relative: os.path.join() discards every earlier component as soon as it
# meets an absolute one, which would silently ignore the mount point given below.
GOOGLE_DRIVE_PATH_AFTER_MYDRIVE = 'Junior Year/EECS 487/Group Project'
# Full path = Colab mount point + "MyDrive" + project folder.
GOOGLE_DRIVE_PATH = os.path.join('/content/drive', 'MyDrive', GOOGLE_DRIVE_PATH_AFTER_MYDRIVE)

# Listing the folder doubles as a sanity check: it raises if the path does not exist.
print(os.listdir(GOOGLE_DRIVE_PATH))
os.chdir(GOOGLE_DRIVE_PATH)

['Project Proposal Brainstorm.gdoc', 'cleaned_headlines.csv', 'vectors.kv', 'vectors.kv.vectors.npy', 'Questions for Prof.gdoc', 'Notes For Progress Report.gdoc', 'BERT.ipynb', 'cleaned_tweets.csv', 'Notes for Presentation.gdoc', 'LSTM_Headlines.ipynb', 'LSTM_Tweets.ipynb']


In [127]:
# Import tensorflow and other important packages
import tensorflow as tf

from tensorflow.keras.datasets import imdb
from tensorflow.keras.layers import Embedding, Dense, LSTM
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers.legacy import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences

import pandas as pd
import numpy as np
from gensim.models import KeyedVectors
from sklearn.model_selection import train_test_split

In [128]:
# Model configuration

# -- Vocabulary / embedding layer --
num_distinct_words = 10_000      # vocabulary size; token ids at or above this are replaced by 0
embedding_output_dims = 500      # length of each learned word vector

# -- Training loop --
batch_size = 128
number_of_epochs = 5
validation_split = 0.2           # fraction of the training data passed to fit() as validation_split
verbosity_mode = 1

# -- Loss, optimizer and reported metrics --
loss_function = BinaryCrossentropy()
optimizer = Adam()
additional_metrics = ['accuracy']

In [129]:
# Disable eager execution (this runs before the model is built and trained in later cells).
# NOTE(review): the notebook does not record why eager mode is switched off — confirm it
# is still needed before keeping this call.
tf.compat.v1.disable_eager_execution()

In [130]:
# Load in and check the data.
# The relative path is resolved against the current working directory, which an earlier
# cell sets to the project folder with os.chdir().
# The column named "Unnamed: 0" is used as the row index.
# Per the preview, the remaining columns are: tweet, is_sarcastic, no_stopwords,
# tokenized and tokenized_no_stopwords.
data = pd.read_csv("./cleaned_tweets.csv", index_col="Unnamed: 0")
data.head()

Unnamed: 0,tweet,is_sarcastic,no_stopwords,tokenized,tokenized_no_stopwords
0,i love working midnights tweet,1,love working midnights tweet,i love working midnights tweet,love working midnights tweet
1,i hate when i buy a bag of air and there's chi...,1,hate buy bag air there's chips ðÿ˜’ #not,i hate when i buy a bag of air and there 's ch...,hate buy bag air there 's chips ðÿ˜ ’ # not
2,my grandad always sounds so ill when i speak t...,0,grandad always sounds ill speak phone,my grandad always sounds so ill when i speak t...,grandad always sounds ill speak phone
3,"i realize i'm annoying to everyone, so i won't...",0,"realize i'm annoying everyone, keep spamming y...","i realize i 'm annoying to everyone , so i wo ...","realize i 'm annoying everyone , keep spamming..."
4,i love when i find these dudes on vine!! #foll...,1,love find dudes vine!! #followme #giveaway #xb...,i love when i find these dudes on vine ! ! # f...,love find dudes vine ! ! # followme # giveaway...


In [131]:
# For now we use the tokenized column that still contains stopwords.
# Split each tokenized tweet on single spaces to get one list of tokens per row.
data["split_tokens"] = [tweet_text.split(" ") for tweet_text in data["tokenized"]]
# The longest token list determines the length every sequence is padded to later.
max_sequence_length = max(len(tokens) for tokens in data["split_tokens"])

In [132]:
# Load up the numeric word encoder: the word -> integer lookup of the Keras IMDB dataset
word_index = imdb.get_word_index()


def _encode_token(token):
    """Return the integer id used for ``token`` in the model input.

    The id is the token's value in ``word_index`` when that value is below
    ``num_distinct_words``. Tokens missing from ``word_index``, or whose id is
    outside that vocabulary size, are mapped to 0.
    """
    token_id = word_index.get(token, 0)
    return token_id if token_id < num_distinct_words else 0


# Encode every tweet as a list of integer ids. These are indices, not embeddings:
# the Embedding layer of the model turns them into vectors.
X = [[_encode_token(token) for token in tokens] for tokens in data.split_tokens]
# dtype=object keeps one Python list per tweet, since the lists differ in length.
X = np.array(X, dtype=object)
print(X)

# Extract the target variable
y = data.is_sarcastic

# Split the data (80% train / 20% test, fixed random_state so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=487)

# Pad all sequences to max_sequence_length, filling with 0.
# NOTE(review): 0 is also the id assigned to unknown / out-of-vocabulary tokens above, so
# padding and unknown words are indistinguishable to the model — confirm this is intended.
padded_inputs = pad_sequences(X_train, maxlen=max_sequence_length, value=0)
padded_inputs_test = pad_sequences(X_test, maxlen=max_sequence_length, value=0)

[list([10, 116, 777, 0, 0])
 list([10, 781, 51, 10, 815, 3, 3116, 4, 942, 2, 47, 3576, 0, 8, 9, 0, 0, 0, 21])
 list([58, 0, 207, 931, 35, 1812, 51, 10, 1125, 5, 87, 20, 1, 1696]) ...
 list([123, 66, 3, 610, 1125, 5, 22, 35, 73, 12, 22, 0, 915, 5, 867, 8, 126, 516, 2, 1629, 5, 9, 0, 10, 116, 9, 51, 12, 568, 0])
 list([256, 0, 12, 159, 0, 4973, 0, 10, 78, 0, 37, 5, 987, 15, 225, 0])
 list([115, 93, 5, 377, 1, 248, 0, 0, 0, 0, 0])]


In [133]:
# Check the paddings: display the padded training matrix.
# In the recorded output the fill zeros appear at the start of each row, and NumPy
# abbreviates the middle of the array with "...".
padded_inputs

array([[   0,    0,    0, ...,  147,    0,    0],
       [   0,    0,    0, ...,  447,    0,    0],
       [   0,    0,    0, ...,    0,    0,    0],
       ...,
       [   0,    0,    0, ...,   32, 1188,    0],
       [   0,    0,    0, ...,    0,    0,    0],
       [   0,    0,    0, ...,    0,    0,   21]], dtype=int32)

In [134]:
# Define the Keras model as a three-layer stack: embedding -> LSTM -> sigmoid output
model = Sequential([
    # One trainable vector of size embedding_output_dims per vocabulary id
    Embedding(num_distinct_words, embedding_output_dims, input_length=max_sequence_length),
    # Collapses the embedded sequence into a single 10-unit vector
    LSTM(10),
    # Single sigmoid unit producing the binary prediction
    Dense(1, activation='sigmoid'),
])

In [135]:
# Compile the model with the optimizer, loss and metrics chosen in the configuration cell
model.compile(optimizer=optimizer, loss=loss_function, metrics=additional_metrics)

In [136]:
# Give a summary: print every layer with its output shape and parameter count
model.summary()

Model: "sequential_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_8 (Embedding)     (None, 61, 500)           5000000   
                                                                 
 lstm_8 (LSTM)               (None, 10)                20440     
                                                                 
 dense_8 (Dense)             (None, 1)                 11        
                                                                 
Total params: 5,020,451
Trainable params: 5,020,451
Non-trainable params: 0
_________________________________________________________________


In [137]:
# Train the model on the padded training sequences.
# validation_split makes fit() hold part of the training data out for validation; in the
# recorded run this gave 1276 training and 319 validation samples.
# The return value is kept in `history`; no later cell shown here reads it.
history = model.fit(padded_inputs, y_train, batch_size=batch_size, epochs=number_of_epochs, verbose=verbosity_mode, validation_split=validation_split)

Train on 1276 samples, validate on 319 samples
Epoch 1/5

  updates = self.state_updates


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [138]:
# Score the trained model on the held-out test split
test_results = model.evaluate(padded_inputs_test, y_test, verbose=False)

# test_results[0] is the loss and test_results[1] the accuracy metric requested at compile time
test_loss = round(test_results[0], 3)
test_accuracy_pct = round(100*test_results[1], 4)
print(f'Test results - Loss: {test_loss} - Accuracy: {test_accuracy_pct}%')

Test results - Loss: 0.319 - Accuracy: 88.7218%
