<a href="https://colab.research.google.com/github/emilynairnn/msci598_final_project/blob/main/Feed_Forward_NN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Imports

In [1]:
pip install keras_self_attention

Collecting keras_self_attention
  Downloading keras-self-attention-0.51.0.tar.gz (11 kB)
Building wheels for collected packages: keras-self-attention
  Building wheel for keras-self-attention (setup.py) ... [?25l[?25hdone
  Created wheel for keras-self-attention: filename=keras_self_attention-0.51.0-py3-none-any.whl size=18912 sha256=0e57a1c0931608b805db62525b402f1c7c6285df96f31e2a6e290fcb8bb2c805
  Stored in directory: /root/.cache/pip/wheels/95/b1/a8/5ee00cc137940b2f6fa198212e8f45d813d0e0d9c3a04035a3
Successfully built keras-self-attention
Installing collected packages: keras-self-attention
Successfully installed keras-self-attention-0.51.0


In [2]:
import pandas as pd
from google.colab import drive 
drive.mount('/content/gdrive')

import sys
sys.path.insert(0,'gdrive/My Drive/MSCI598 Project')
from utils.score import report_score, LABELS, score_submission

import numpy as np
from sklearn import feature_extraction
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer, TfidfTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from keras import Sequential, regularizers
from keras import optimizers
from keras import backend as K
from keras import initializers, constraints, regularizers
from keras.layers import Reshape, Dot, Concatenate, Input, Embedding, Dropout, Dense, LSTM, Bidirectional, Activation, BatchNormalization
from keras.utils import np_utils
from keras.models import Model
from keras.preprocessing.text import text_to_word_sequence, Tokenizer
from keras.preprocessing.sequence import pad_sequences
from scipy import sparse
from scipy.sparse import csr_matrix
import re
import gensim
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from keras_self_attention import SeqSelfAttention

import keras
import tensorflow as tf
from keras_self_attention import SeqSelfAttention
from keras.layers import Dense, Activation, Flatten, SimpleRNN, GRU
from keras import optimizers

Mounted at /content/gdrive


In [3]:
dir = 'gdrive/My Drive/MSCI598 Project/GoogleNews-vectors-negative300.bin'
wv = gensim.models.KeyedVectors.load_word2vec_format(dir, binary=True)

Pre Processing Data

In [11]:
# set parameters
max_feat = 5000
batch = 128
num_epochs = 10
max_vocab = 30000
max_sent_len = 30
embedding_dim = 300
lstm_dim = 128

In [15]:
# this will tokenize the take, make all words lower case and extract any stopwords 
def tokenize(content):
  list = " ".join(re.findall(r'\w+', content, flags=re.UNICODE)).lower()
  return " ".join([word for word in content.split(" ") if word not in feature_extraction.text.ENGLISH_STOP_WORDS])

# the raw data in csv files are stored in the google drive 

# the training data it processed by reading in the files and running the tokenize method above to clean the data 
train_bodies = pd.read_csv('gdrive/My Drive/MSCI598 Project/fnc-1-master/train_bodies.csv')
train_stances = pd.read_csv('gdrive/My Drive/MSCI598 Project/fnc-1-master/train_stances.csv')
train_combined = train_stances.join(train_bodies.set_index('Body ID'), on='Body ID')
train_headlines_clean = [tokenize(headline) for headline in train_combined['Headline']]
train_bodies_clean = [tokenize(article_body) for article_body in train_combined['articleBody']]

# the test data it processed by reading in the files and running the tokenize method above to clean the data 
test_bodies = pd.read_csv('gdrive/My Drive/MSCI598 Project/fnc-1-master/competition_test_bodies.csv')
test_stances = pd.read_csv('gdrive/My Drive/MSCI598 Project/fnc-1-master/competition_test_stances.csv')
test_combined = test_stances.join(test_bodies.set_index('Body ID'), on='Body ID')
test_headlines_clean = [tokenize(headline) for headline in test_combined['Headline']]
test_bodies_clean = [tokenize(article_body) for article_body in test_combined['articleBody']]

In [16]:
# the keras preprocessing package text_to_word_sequence is ran on the test and training headlines and bodies
# this splits up the text into a list of words so that it can be iterated on 

train_headlines_words = [text_to_word_sequence(headline) for headline in train_headlines_clean]
train_bodies_words = [text_to_word_sequence(article_body) for article_body in train_bodies_clean]
test_headlines_words = [text_to_word_sequence(headline) for headline in test_headlines_clean]
test_bodies_words = [text_to_word_sequence(article_body) for article_body in test_bodies_clean]

# words from headlines and bodies are iterated on and added to an array
train_words_all = [None]*len(train_headlines_words)
for i in range(len(train_headlines_words)):
  train_words_all[i] = train_headlines_words[i] + train_bodies_words[i]

test_words_all = [None]*len(test_headlines_words)
for i in range(len(test_headlines_words)):
  test_words_all[i] = test_headlines_words[i] + test_bodies_words[i]


In [17]:
all_words = train_words_all + test_words_all
# use tokenizer from Keras preprocessing to create train and test data sets using label encoder 
# have 4 classes for the 4 different ways to classify article 
tokenizer = Tokenizer(num_words = max_vocab)
tokenizer.fit_on_texts([' '.join(seq[:max_sent_len]) for seq in all_words])

trainX = tokenizer.texts_to_sequences([' '.join(seq[:max_sent_len]) for seq in train_words_all])
trainX = pad_sequences(trainX, maxlen = max_sent_len, padding = 'post',truncating = 'post')
label_encoder_train = LabelEncoder().fit_transform(train_combined['Stance'])
trainY = np_utils.to_categorical(label_encoder_train, num_classes = 4)

testX = tokenizer.texts_to_sequences([' '.join(seq[:max_sent_len]) for seq in test_words_all])
testX = pad_sequences(testX, maxlen = max_sent_len, padding = 'post',truncating = 'post')
label_encoder_test = LabelEncoder().fit_transform(test_combined['Stance'])
testY = np_utils.to_categorical(label_encoder_test, num_classes = 4)

# split train into train and validation datasets 
trainX, valX, trainY, valY = train_test_split(trainX, trainY, random_state=10, test_size=0.1)


In [21]:
# build model
model = Sequential() 
model.add(Input(shape=(4,), name='Input')) 
model.add(Dense(32, activation='softmax', name='Hidden'))
model.add(Dense(4, activation='sigmoid', name='Output')) 

opt = tf.keras.optimizers.Adam(learning_rate=0.0001)

model.compile(
    optimizer=opt,
    loss='categorical_crossentropy',
    metrics=['accuracy']
)
model.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 Hidden (Dense)              (None, 32)                160       
                                                                 
 Output (Dense)              (None, 4)                 132       
                                                                 
Total params: 292
Trainable params: 292
Non-trainable params: 0
_________________________________________________________________


In [22]:
# fit the model using training data and validation data
model.fit(trainX,trainY,
          batch_size = batch,
          epochs = num_epochs,
          validation_data=(valX, valY))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f187c0e4a50>

In [23]:
model.evaluate(testX, testY,
               batch_size = batch)

predicted = [LABELS[np.argmax(i)] for i in model.predict(testX)]
actual = [LABELS[np.argmax(i)] for i in testY]
np.savetxt("answer.csv", predicted, delimiter=",", fmt='%s')
report_score(actual,predicted)

-------------------------------------------------------------
|           |   agree   | disagree  |  discuss  | unrelated |
-------------------------------------------------------------
|   agree   |     0     |     0     |     0     |   1903    |
-------------------------------------------------------------
| disagree  |     0     |     0     |     0     |    697    |
-------------------------------------------------------------
|  discuss  |     0     |     0     |     0     |   4464    |
-------------------------------------------------------------
| unrelated |     0     |     0     |     0     |   18349   |
-------------------------------------------------------------
Score: 4587.25 out of 11651.25	(39.37131209097736%)


39.37131209097736

In [None]:
from google.colab import files
df.to_csv('output.csv', index=False, encoding = 'utf-8') 
files.download('answer.csv')