<a href="https://colab.research.google.com/github/emilynairnn/msci598_final_project/blob/main/LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Imports

In [2]:
import pandas as pd
from google.colab import drive 
drive.mount('/content/gdrive')

import sys
sys.path.insert(0,'gdrive/My Drive/MSCI598 Project')
from utils.score import report_score, LABELS, score_submission

import tensorflow as tf

import numpy as np

from sklearn import feature_extraction
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer, TfidfTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from keras import Sequential, regularizers
from keras import optimizers
from keras import backend as K
from keras import initializers, constraints, regularizers
from keras.layers import Reshape, Dot, Concatenate, Input, Embedding, Dropout, Dense, LSTM, Bidirectional, Activation, BatchNormalization
from keras.utils import np_utils
from keras.models import Model
from keras.preprocessing.text import text_to_word_sequence, Tokenizer
from keras.preprocessing.sequence import pad_sequences

from scipy import sparse
from scipy.sparse import csr_matrix

import re

import gensim
from gensim.models import Word2Vec
from gensim.models import KeyedVectors

Mounted at /content/gdrive


In [3]:
# build word2vec model 
dir = 'gdrive/My Drive/MSCI598 Project/GoogleNews-vectors-negative300.bin'
wv = gensim.models.KeyedVectors.load_word2vec_format(dir, binary=True)

Pre Processing Data

In [4]:
# set parameters
max_feat = 5000
batch = 128
num_epochs = 10
max_vocab = 30000
max_sent_len = 30
embedding_dim = 300
lstm_dim = 128

In [5]:
# this will tokenize the take, make all words lower case and extract any stopwords 
def tokenize(content):
  list = " ".join(re.findall(r'\w+', content, flags=re.UNICODE)).lower()
  return " ".join([word for word in content.split(" ") if word not in feature_extraction.text.ENGLISH_STOP_WORDS])

# the raw data in csv files are stored in the google drive 

# the training data it processed by reading in the files and running the tokenize method above to clean the data 
train_bodies = pd.read_csv('gdrive/My Drive/MSCI598 Project/fnc-1-master/train_bodies.csv')
train_stances = pd.read_csv('gdrive/My Drive/MSCI598 Project/fnc-1-master/train_stances.csv')
train_combined = train_stances.join(train_bodies.set_index('Body ID'), on='Body ID')
train_headlines_clean = [tokenize(headline) for headline in train_combined['Headline']]
train_bodies_clean = [tokenize(article_body) for article_body in train_combined['articleBody']]

# the test data it processed by reading in the files and running the tokenize method above to clean the data 
test_bodies = pd.read_csv('gdrive/My Drive/MSCI598 Project/fnc-1-master/competition_test_bodies.csv')
test_stances = pd.read_csv('gdrive/My Drive/MSCI598 Project/fnc-1-master/competition_test_stances.csv')
test_combined = test_stances.join(test_bodies.set_index('Body ID'), on='Body ID')
test_headlines_clean = [tokenize(headline) for headline in test_combined['Headline']]
test_bodies_clean = [tokenize(article_body) for article_body in test_combined['articleBody']]


In [6]:
# the keras preprocessing package text_to_word_sequence is ran on the test and training headlines and bodies
# this splits up the text into a list of words so that it can be iterated on 

train_headlines_words = [text_to_word_sequence(headline) for headline in train_headlines_clean]
train_bodies_words = [text_to_word_sequence(article_body) for article_body in train_bodies_clean]
test_headlines_words = [text_to_word_sequence(headline) for headline in test_headlines_clean]
test_bodies_words = [text_to_word_sequence(article_body) for article_body in test_bodies_clean]

# words from headlines and bodies are iterated on and added to an array
train_words_all = [None]*len(train_headlines_words)
for i in range(len(train_headlines_words)):
  train_words_all[i] = train_headlines_words[i] + train_bodies_words[i]

test_words_all = [None]*len(test_headlines_words)
for i in range(len(test_headlines_words)):
  test_words_all[i] = test_headlines_words[i] + test_bodies_words[i]


In [7]:
# find the amount of words in different percentiles 
all_words = train_words_all + test_words_all
lengths = []
totlength = 0
for list in all_words[:10]:
  length = len(list)
  lengths.append(length)
  totlength = totlength + length

avglength = np.percentile (lengths, 50)
ninetieth = np.percentile(lengths, 90)
tenth = np.percentile(lengths, 10)
print("average length: " + str(avglength))
print("ninetieth percentile: " + str(ninetieth))
print("tenth percentile: " + str(tenth))


average length: 217.5
ninetieth percentile: 352.5999999999999
tenth percentile: 111.4


In [8]:
# use tokenizer from Keras preprocessing to create train and test data sets using label encoder 
# have 4 classes for the 4 different ways to classify article 
tokenizer = Tokenizer(num_words = max_vocab)
tokenizer.fit_on_texts([' '.join(seq[:max_sent_len]) for seq in all_words])

trainX = tokenizer.texts_to_sequences([' '.join(seq[:max_sent_len]) for seq in train_words_all])
trainX = pad_sequences(trainX, maxlen = max_sent_len, padding = 'post',truncating = 'post')
label_encoder_train = LabelEncoder().fit_transform(train_combined['Stance'])
trainY = np_utils.to_categorical(label_encoder_train, num_classes = 4)

testX = tokenizer.texts_to_sequences([' '.join(seq[:max_sent_len]) for seq in test_words_all])
testX = pad_sequences(testX, maxlen = max_sent_len, padding = 'post',truncating = 'post')
label_encoder_test = LabelEncoder().fit_transform(test_combined['Stance'])
testY = np_utils.to_categorical(label_encoder_test, num_classes = 4)

# split train into train and validation datasets 
trainX, valX, trainY, valY = train_test_split(trainX, trainY, random_state=10, test_size=0.1)


Build Model

In [None]:
# build model 

# create embeddings matrix
embeddings_matrix = np.random.uniform(-0.05, 0.05, size=(len(tokenizer.word_index)+1, embedding_dim))

for word, i in tokenizer.word_index.items():
  try:
    embeddings_vector = wv[word]
  except KeyError:
    embeddings_vector = None
  if embeddings_vector is not None:
    embeddings_matrix[i] = embeddings_vector

# build LSTM RNN 
model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1,
                    output_dim=embedding_dim,
                    weights=[embeddings_matrix], trainable=False, name='word_embedding_layer',
                    mask_zero=True))
model.add(LSTM(lstm_dim, return_sequences=False, name='lstm_layer'))
model.add(Dense(64, name='dense_1'))
model.add(BatchNormalization(name='bn_1'))
model.add(Dropout(rate=0.25, name='dropout_1'))
model.add(Activation(activation='relu', name='activation_1'))   #change hidden layer here (sigmoid, tanh, relu)
model.add(Dense(4, activation='softmax', name='output_layer'))  #was sigmoid
model.summary()

# optimizer to change learning rate
opt = tf.keras.optimizers.Adam(learning_rate=0.001)

# compile the model and calculate accuracy
model.compile(loss='categorical_crossentropy',
              optimizer=opt,
              metrics=['accuracy'])

# fit the model using training data and validation data
model.fit(trainX,trainY,
          batch_size = batch,
          epochs = num_epochs,
          validation_data=(valX, valY))


Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 word_embedding_layer (Embed  (None, None, 300)        3470400   
 ding)                                                           
                                                                 
 lstm_layer (LSTM)           (None, 128)               219648    
                                                                 
 dense_1 (Dense)             (None, 64)                8256      
                                                                 
 bn_1 (BatchNormalization)   (None, 64)                256       
                                                                 
 dropout_1 (Dropout)         (None, 64)                0         
                                                                 
 activation_1 (Activation)   (None, 64)                0         
                                                      

Evaluate Model

In [None]:
# evaluate on test data
model.evaluate(testX, testY,
               batch_size = batch)

predicted = [LABELS[np.argmax(i)] for i in model.predict(testX)]
actual = [LABELS[np.argmax(i)] for i in testY]
np.savetxt("predicted.csv", predicted, delimiter=",", fmt='%s')
np.savetxt("actual.csv", actual, delimiter=",", fmt='%s')
!cp predicted.csv "gdrive/My Drive/MSCI598 Project"
!cp actual.csv "gdrive/My Drive/MSCI598 Project"
report_score(actual,predicted)

-------------------------------------------------------------
|           |   agree   | disagree  |  discuss  | unrelated |
-------------------------------------------------------------
|   agree   |    592    |     9     |    207    |   1095    |
-------------------------------------------------------------
| disagree  |    142    |     7     |    72     |    476    |
-------------------------------------------------------------
|  discuss  |    647    |    16     |   1492    |   2309    |
-------------------------------------------------------------
| unrelated |   1227    |    51     |   1086    |   15985   |
-------------------------------------------------------------
Score: 6360.5 out of 11651.25	(54.590709151378604%)


54.590709151378604

#save

In [None]:
from google.colab import files
df.to_csv('output.csv', index=False, encoding = 'utf-8') 
files.download('answer.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>