In [1]:
# Mount Google Drive into the Colab runtime so the project folder is reachable
from google.colab import drive
drive.mount('/content/drive')  # Drive contents become available under /content/drive

Mounted at /content/drive


In [2]:
# Add filepath to the project
import os
import sys

# Folder *inside* MyDrive that holds the project files. Keep this relative:
# os.path.join() throws away every earlier component as soon as a later one
# is absolute, which would make the mount-root prefix below meaningless.
GOOGLE_DRIVE_PATH_AFTER_MYDRIVE = 'Junior Year/EECS 487/Group Project'
GOOGLE_DRIVE_PATH = os.path.join('/content', 'drive', 'MyDrive', GOOGLE_DRIVE_PATH_AFTER_MYDRIVE)

# List the folder as a sanity check, then make it the working directory so
# later cells can refer to project files with relative paths.
print(os.listdir(GOOGLE_DRIVE_PATH))
os.chdir(GOOGLE_DRIVE_PATH)

['Project Proposal Brainstorm.gdoc', 'cleaned_headlines.csv', 'vectors.kv', 'vectors.kv.vectors.npy', 'Questions for Prof.gdoc', 'Notes For Progress Report.gdoc', 'BERT.ipynb', 'cleaned_tweets.csv', 'Notes for Presentation.gdoc', 'LSTM_Headlines.ipynb', 'LSTM_Tweets.ipynb']


In [3]:
# Import tensorflow and other important packages
import tensorflow as tf

from tensorflow.keras.datasets import imdb
from tensorflow.keras.layers import Embedding, Dense, LSTM
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers.legacy import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences

import pandas as pd
import numpy as np
from gensim.models import KeyedVectors
from sklearn.model_selection import train_test_split

In [4]:
# Model configuration

# Vocabulary / embedding sizes
num_distinct_words = 15000
embedding_output_dims = 100

# Training schedule
number_of_epochs = 5
batch_size = 128
validation_split = 0.20
verbosity_mode = 1

# Objects handed to model.compile()
loss_function = BinaryCrossentropy()
optimizer = Adam()
additional_metrics = ['accuracy']

In [5]:
# Disable eager execution (use TensorFlow's v1-compatible, non-eager mode).
# NOTE(review): presumably this has to run before any model/tensor is created —
# keep this cell ahead of the model-definition cell and confirm against the TF docs.
tf.compat.v1.disable_eager_execution()

In [7]:
# Load the cleaned headlines; the CSV column named "Unnamed: 0" is used as the row index
data = pd.read_csv("./cleaned_headlines.csv", index_col="Unnamed: 0")
# Preview the first rows to check that the file loaded as expected
data.head()

Unnamed: 0,is_sarcastic,headline,article_link,no_stopwords,tokenized,tokenized_no_stopwords
0,1,thirtysomething scientists unveil doomsday clo...,https://www.theonion.com/thirtysomething-scien...,thirtysomething scientists unveil doomsday clo...,thirtysomething scientists unveil doomsday clo...,thirtysomething scientists unveil doomsday clo...
1,0,dem rep. totally nails why congress is falling...,https://www.huffingtonpost.com/entry/donna-edw...,dem rep. totally nails congress falling short ...,dem rep. totally nails why congress is falling...,dem rep. totally nails congress falling short ...
2,0,eat your veggies: 9 deliciously different recipes,https://www.huffingtonpost.com/entry/eat-your-...,eat veggies: 9 deliciously different recipes,eat your veggies : 9 deliciously different rec...,eat veggies : 9 deliciously different recipes
3,1,inclement weather prevents liar from getting t...,https://local.theonion.com/inclement-weather-p...,inclement weather prevents liar getting work,inclement weather prevents liar from getting t...,inclement weather prevents liar getting work
4,1,mother comes pretty close to using word 'strea...,https://www.theonion.com/mother-comes-pretty-c...,mother comes pretty close using word 'streamin...,mother comes pretty close to using word 'strea...,mother comes pretty close using word 'streamin...


In [8]:
# Work from the tokenized text that still contains stopwords (for now):
# break each space-separated token string into a list of tokens
data["split_tokens"] = list(map(lambda text: text.split(" "), data["tokenized"]))
# Token count of the longest headline
max_sequence_length = max(map(len, data["split_tokens"]))

In [9]:
# Load up the numeric word encoder (the word -> integer id mapping shipped with
# the Keras IMDB dataset)
word_index = imdb.get_word_index()

# Encode every headline as a list of integer word ids (these are vocabulary
# indices, not embedding vectors -- the Embedding layer learns the vectors).
# Id 0 is used for any token that is missing from the IMDB vocabulary or whose
# id is >= num_distinct_words, so every id stays below num_distinct_words.
X = []
for headline in data.split_tokens:
  word_lst = []
  for token in headline:
    # dict.get handles the "unknown word" case explicitly instead of hiding
    # every possible error behind a bare except
    token_id = word_index.get(token, 0)
    word_lst.append(token_id if token_id < num_distinct_words else 0)
  X.append(word_lst)
# Headlines differ in length, so keep them as an object array of Python lists
X = np.array(X, dtype=object)
print(X)

# Target labels come straight from the is_sarcastic column
y = data["is_sarcastic"]

# 80/20 train/test split with a fixed seed so the partition is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=487,
)

# Pad both splits to max_sequence_length, filling with 0.0
# (0.0 because it corresponds with <PAD>)
padded_inputs, padded_inputs_test = [
    pad_sequences(sequences, maxlen=max_sequence_length, value=0.0)
    for sequences in (X_train, X_test)
]

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json
[list([0, 3365, 0, 0, 5428, 4, 1150, 1934])
 list([0, 0, 481, 5570, 135, 11789, 6, 1451, 343, 20, 4669, 0, 4343, 14758])
 list([1897, 126, 0, 0, 787, 6919, 272, 0]) ...
 list([1, 88, 304, 5604, 2488, 11, 1266, 382, 36, 3, 14343, 3451])
 list([4596, 0, 2727, 31, 0, 0, 831, 5058])
 list([1243, 0, 11, 21, 3, 1641, 567])]


In [10]:
# Check the paddings: display the padded training matrix as the cell output
padded_inputs

array([[    0,     0,     0, ..., 13179,   205,   147],
       [    0,     0,     0, ...,     0,  3447,     0],
       [    0,     0,     0, ...,     0,    15,    22],
       ...,
       [    0,     0,     0, ...,     9,    13,  5371],
       [    0,     0,     0, ...,  1630,    77,    64],
       [    0,     0,     0, ...,     1,  5939,     0]], dtype=int32)

In [11]:
# Define the Keras model as an ordered stack of layers:
# word ids -> learned embedding vectors -> LSTM -> single sigmoid output
model = Sequential([
    Embedding(
        num_distinct_words,
        embedding_output_dims,
        input_length=max_sequence_length,
    ),
    LSTM(10),
    Dense(1, activation='sigmoid'),
])

In [12]:
# Compile the model with the optimizer, loss and metrics chosen in the
# configuration cell
model.compile(
    optimizer=optimizer,
    loss=loss_function,
    metrics=additional_metrics,
)

In [13]:
# Give a summary: print the layers with their output shapes and parameter counts
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 166, 100)          1500000   
                                                                 
 lstm (LSTM)                 (None, 10)                4440      
                                                                 
 dense (Dense)               (None, 1)                 11        
                                                                 
Total params: 1,504,451
Trainable params: 1,504,451
Non-trainable params: 0
_________________________________________________________________


In [14]:
# Train the model on the padded training sequences; a validation_split
# fraction of them is held out by Keras for per-epoch validation
history = model.fit(
    padded_inputs,
    y_train,
    batch_size=batch_size,
    epochs=number_of_epochs,
    verbose=verbosity_mode,
    validation_split=validation_split,
)

Train on 18316 samples, validate on 4579 samples
Epoch 1/5

  updates = self.state_updates


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [15]:
# Score the trained model on the held-out test split
test_results = model.evaluate(
    padded_inputs_test,
    y_test,
    verbose=False,
)
# Entry 0 is the loss, entry 1 the accuracy metric requested at compile time
test_loss = round(test_results[0], 3)
test_accuracy_pct = round(100*test_results[1], 4)
print(f'Test results - Loss: {test_loss} - Accuracy: {test_accuracy_pct}%')

Test results - Loss: 0.443 - Accuracy: 83.84%
