In [None]:
# importing required libraries
import sys
import gensim
import numpy as np
import pandas as pd

from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import keras
from keras.models import Sequential
from keras.preprocessing.text import text_to_word_sequence, Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, LSTM, Embedding, Dropout, Bidirectional, Activation
from keras.utils import np_utils

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

from string import punctuation
from keras import backend as K
from keras import initializers, regularizers, constraints

In [None]:
# Hyperparameters shared by the cells below.

# -- text / vocabulary --
MAX_VOCAB_SIZE = 40000  # num_words given to the Keras Tokenizer
MAX_SENT_LEN = 150      # tokens kept per headline+body example, and the padded length

# -- network --
EMBEDDING_DIM = 300     # width of the embedding matrix / Embedding output_dim
LSTM_DIM = 100          # units of the LSTM layer

# -- training --
BATCH_SIZE = 128        # batch_size given to model.fit
N_EPOCHS = 10           # epochs given to model.fit

In [None]:
# Alternative (disabled): download the GoogleNews word2vec archive into /root/input/.
# !wget -P /root/input/ -c "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz"
# W2V_DIR = '/root/input/GoogleNews-vectors-negative300.bin.gz'
# Mount Google Drive in the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

# Alternative (disabled): upload an embeddings file by hand and point W2V_DIR at it.
#from google.colab import files
#uploaded = files.upload()
#
#W2V_DIR = 'glove.twitter.27B.50d.txt'

# Path of the embeddings file handed to KeyedVectors.load_word2vec_format further down.
# NOTE(review): this is a relative path, not a location under the mounted
# /content/drive - confirm the file is available in the working directory.
W2V_DIR = 'GoogleNews-vectors-negative300.bin.gz'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# reading in the train and test data
train_stance = pd.read_csv('train_stances.csv')
train_body = pd.read_csv('train_bodies.csv')
test_stance = pd.read_csv('test_stances_unlabeled.csv')
test_body = pd.read_csv('competition_test_bodies.csv')

# replacing the stances with numerical values so that a model can be trained on them.
# The mapping is applied to the 'Stance' column only: a whole-frame replace would also
# rewrite any other cell (e.g. a headline) that happens to equal a stance word, and
# passing `inplace` positionally to DataFrame.replace is rejected by pandas >= 2.0.
STANCE_TO_ID = {'unrelated': 1, 'agree': 2, 'disagree': 3, 'discuss': 4}
# astype fails loudly if a label outside the mapping turned into NaN.
train_stance['Stance'] = train_stance['Stance'].map(STANCE_TO_ID).astype('int64')

# merging datasets so that the bodies and titles can be together
# (left join: every stance row is kept and gets the columns of its 'Body ID')
df_train = train_stance.join(train_body.set_index('Body ID'), on='Body ID')
df_test = test_stance.join(test_body.set_index('Body ID'), on='Body ID')


In [None]:
# Imports for the text-preprocessing helpers defined in this cell.
# NOTE(review): os and tqdm are not referenced in the visible cells, and numpy is
# already imported in the first cell.
import os
import re
import nltk
# Download the 'punkt' and 'wordnet' NLTK packages; nltk.word_tokenize and
# nltk.WordNetLemmatizer are used by the helpers below.
nltk.download('punkt')
nltk.download('wordnet')
import numpy as np
from sklearn import feature_extraction
from tqdm import tqdm

def clean(s):
  """Return str(s) lower-cased, with every non-ASCII-letter character replaced by a space.

  Replacement is one-for-one, so the result has the same length as str(s).
  """
  text = str(s)
  letters_only = re.sub("[^a-zA-Z]", " ", text)
  return letters_only.lower()

# Single lemmatizer instance reused by every normalize_word call.
_wnl = nltk.WordNetLemmatizer()

def normalize_word(w):
  """Return the lower-cased WordNet lemma of the token `w`."""
  lemma = _wnl.lemmatize(w)
  return lemma.lower()

def get_tokenized_lemmas(s):
  """Tokenize the string `s` with nltk.word_tokenize and return the normalized tokens as a list."""
  tokens = nltk.word_tokenize(s)
  return list(map(normalize_word, tokens))

def remove_stopwords(l):
    """Return a new list with the items of `l` that are not in sklearn's ENGLISH_STOP_WORDS.

    The order of the surviving items is preserved.
    """
    kept = []
    for token in l:
        if token in feature_extraction.text.ENGLISH_STOP_WORDS:
            continue
        kept.append(token)
    return kept

def preprocess(data, title):
  """Normalize the text column `title` of `data` in place.

  Every entry is passed through clean -> get_tokenized_lemmas -> remove_stopwords
  and the surviving tokens are joined back into one space-separated string.

  Args:
    data: frame-like object supporting `data[title]` for reading and assignment.
    title: name of the text column to rewrite.

  Returns:
    None; `data[title]` is replaced with the processed strings.
  """
  processed = []
  for line in data[title]:
    # Each stage consumes the previous stage's output. The earlier version re-read
    # data[title] at every stage, so cleaning and stop-word removal were thrown
    # away and only lemmatization took effect.
    tokens = get_tokenized_lemmas(clean(line))
    # Stop words are filtered after tokenizing: remove_stopwords expects a list of
    # tokens, and iterating a raw string would compare single characters instead.
    tokens = remove_stopwords(tokens)
    processed.append(' '.join(tokens))
  data[title] = processed


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Run the text preprocessing, in place, on every (frame, text column) pair we use
for frame, column in (
    (train_stance, 'Headline'),
    (train_body, 'articleBody'),
    (test_stance, 'Headline'),
    (test_body, 'articleBody'),
):
  preprocess(frame, column)


In [None]:
# function to merge the headlines and articlebody datasets
def merge(d1, d2):
  """Inner-join the frames `d1` and `d2` on their 'Body ID' columns and return the result."""
  return d1.merge(d2, how='inner', left_on=['Body ID'], right_on=['Body ID'])

In [None]:
# Pair every (preprocessed) headline row with the body that shares its 'Body ID'
train_data = merge(train_stance, train_body)
test_data = merge(test_stance, test_body)

In [None]:
# getting the words sequences
wsh_train = [text_to_word_sequence(text) for text in train_data['Headline']]
wsb_train = [text_to_word_sequence(text) for text in train_data['articleBody']]
# The test sequences are read from `test_data` (built from the preprocessed frames),
# not from `df_test`, which was joined before preprocess() ran and therefore still
# holds the raw text. Train and test now go through the same text pipeline.
wsh_test = [text_to_word_sequence(text) for text in test_data['Headline']]
wsb_test = [text_to_word_sequence(text) for text in test_data['articleBody']]

# Corpus for fitting the tokenizer: all train and test headlines and bodies,
# one token list per entry, in the same order the append loops produced.
seq = wsh_train + wsb_train + wsh_test + wsb_test


In [None]:

#special_chars = '!"\'#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'

# Fit the tokenizer on every token list gathered in `seq`
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE)
tokenizer.fit_on_texts(list(seq))

# One token list per example: the headline tokens followed by the body tokens
ws_train = [headline + body for headline, body in zip(wsh_train, wsb_train)]
ws_test = [headline + body for headline, body in zip(wsh_test, wsb_test)]

# Keep the first MAX_SENT_LEN tokens of each training example, then convert to id sequences
train_texts = [' '.join(tokens[:MAX_SENT_LEN]) for tokens in ws_train]
X_train = tokenizer.texts_to_sequences(train_texts)

In [None]:
# Pad / truncate at the end so every training sequence has length MAX_SENT_LEN
X_train = pad_sequences(X_train, maxlen=MAX_SENT_LEN, padding='post', truncating='post')

# Same conversion for the test examples: first MAX_SENT_LEN tokens -> ids -> padded
test_texts = [' '.join(tokens[:MAX_SENT_LEN]) for tokens in ws_test]
X_test = pad_sequences(
    tokenizer.texts_to_sequences(test_texts),
    maxlen=MAX_SENT_LEN, padding='post', truncating='post')


In [None]:
# encoding the target variable (stance)
# Labels are read from `train_data`, the frame X_train was built from, so features
# and labels are guaranteed to have the same rows in the same order.
y_train = train_data['Stance']
# The encoder instance gets its own name: rebinding `LabelEncoder` replaced the
# imported class with an instance, so running this cell a second time raised
# "'LabelEncoder' object is not callable".
label_encoder = LabelEncoder()
train_encode = label_encoder.fit_transform(y_train)
# one hot encoding
y_train = np_utils.to_categorical(train_encode)

In [None]:
# Load the pretrained vectors stored at W2V_DIR (binary format, limit=50000)
embeddings = gensim.models.KeyedVectors.load_word2vec_format(W2V_DIR, binary=True, limit=50000)

# One row per tokenizer word id (plus row 0), initialised with small random values
vocab_rows = len(tokenizer.word_index) + 1
embeddings_matrix = np.random.uniform(-0.05, 0.05, size=(vocab_rows, EMBEDDING_DIM))

# Overwrite the rows of the words that have a pretrained vector; the others keep
# their random initialisation
for word, row in tokenizer.word_index.items():
    try:
        pretrained = embeddings[word]
    except KeyError:
        continue
    embeddings_matrix[row] = pretrained

# The loaded vectors are not needed once the matrix is filled
del embeddings

In [None]:
# First model: trainable pretrained embeddings -> LSTM -> Dropout -> 4-way softmax
import time

model = Sequential([
    Embedding(input_dim=len(tokenizer.word_index) + 1,
              output_dim=EMBEDDING_DIM,
              weights=[embeddings_matrix],
              trainable=True,
              mask_zero=True,
              name='word_embedding_layer'),
    LSTM(LSTM_DIM, return_sequences=False, name='lstm_layer'),
    Dropout(rate=0.8, name='dropout_1'),
    Dense(4, activation='softmax', name='output_layer'),
])
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# Time the fit; training_time is the elapsed time in minutes, rounded to 2 decimals
starting_time = time.time()
model.fit(X_train, y_train, batch_size=BATCH_SIZE, epochs=N_EPOCHS)
elapsed_seconds = time.time() - starting_time
training_time = round(elapsed_seconds / 60, 2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# Print the layer-by-layer summary (output shapes, parameter counts) of the first model.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 word_embedding_layer (Embed  (None, None, 300)        9754200   
 ding)                                                           
                                                                 
 lstm_layer (LSTM)           (None, 100)               160400    
                                                                 
 dropout_1 (Dropout)         (None, 100)               0         
                                                                 
 output_layer (Dense)        (None, 4)                 404       
                                                                 
Total params: 9,915,004
Trainable params: 9,915,004
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Save the trained model under the path 'LSTM' so it can be loaded again without retraining.
model.save('LSTM')



INFO:tensorflow:Assets written to: LSTM/assets


INFO:tensorflow:Assets written to: LSTM/assets


In [None]:
# Training time of the first model, in minutes (rounded to 2 decimals).
print(training_time)

51.14


In [None]:
# Reload the model that was saved under 'LSTM'.
new_model = keras.models.load_model('LSTM')

In [None]:
# Get predictions for the padded test sequences from the reloaded model.
predictions = new_model.predict(X_test)

In [None]:
# Each prediction row holds one softmax score per class; the position of the largest
# score is the predicted class index.
# Class indices follow the encoding of y_train: the stances were coded
# unrelated=1, agree=2, disagree=3, discuss=4 and LabelEncoder maps the sorted codes
# to 0..3. Index 1 is therefore "agree" and index 2 is "disagree" (the previous
# decoding had these two swapped).
STANCE_BY_CLASS = ["unrelated", "agree", "disagree", "discuss"]

# `stance_integer` keeps its name for compatibility; it holds the decoded stance strings.
stance_integer = [STANCE_BY_CLASS[class_id] for class_id in np.argmax(predictions, axis=-1)]

predictions_df = pd.DataFrame({'Stance': stance_integer})

In [None]:
# Labelled test stances; their 'Stance' column is the gold standard the predictions are compared to.
competition_test_stance = pd.read_csv('competition_test_stances.csv')

In [None]:
def get_accuracy(real, test):
  """Print the fraction of positions i in range(len(real)) where real[i] == test[i].

  Nothing is returned. An empty `real` raises ZeroDivisionError.
  """
  total = len(real)
  correct = sum(1 for i in range(total) if real[i] == test[i])
  print( correct/total)

# Accuracy of the first model's decoded predictions against the gold test stances.
get_accuracy(competition_test_stance['Stance'], predictions_df['Stance'])

0.7094794003069296


In [None]:
def score_submission(gold_labels, test_labels):
    """Return the weighted stance score of `test_labels` against `gold_labels`.

    For every position i in range(len(gold_labels)):
      * exact match on 'unrelated'                         -> +0.25
      * exact match on any other stance                    -> +0.75
      * gold is not 'unrelated' and the prediction is a different stance
        among 'agree' / 'disagree' / 'discuss'             -> +0.25

    Args:
        gold_labels: indexable sequence of gold stance strings.
        test_labels: indexable sequence of predicted stance strings, aligned with
            `gold_labels`.

    Returns:
        The accumulated score as a float (0.0 for empty input).

    NOTE(review): the weights look like the FNC-1 competition metric, whose reference
    scorer gives 1.0 for an exact match on a related stance - confirm whether 0.75
    is intended here.
    """
    # Stance names as they appear in the data. The previous check used
    # 'agrees' / 'disagrees' / 'discusses', which never occur, so the partial-credit
    # branch could not fire.
    related = ('agree', 'disagree', 'discuss')
    score = 0.0

    for i in range(len(gold_labels)):
        gold = gold_labels[i]
        predicted = test_labels[i]
        if gold == predicted:
            score += 0.25 if gold == 'unrelated' else 0.75
        elif gold != 'unrelated' and predicted in related:
            score += 0.25

    return score

In [None]:
# Weighted stance score of the first model's decoded predictions against the gold test stances.

score_submission(competition_test_stance['Stance'], predictions_df['Stance'])


4533.5

In [None]:
# Second model: same embedding matrix and LSTM, but the embedding layer is frozen
# (trainable=False) and an extra Dropout -> ReLU -> Dense head follows the first
# softmax layer. Rebinding `model` replaces the first model in memory.
import time
model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index)+1,
                          output_dim=EMBEDDING_DIM,
                          weights = [embeddings_matrix], trainable=False, name='word_embedding_layer', 
                          mask_zero=True))

model.add(LSTM(LSTM_DIM, return_sequences=False, name='lstm_layer')) # a Bidirectional wrapper could be used here too
model.add(Dropout(rate=0.8, name='dropout1'))
# NOTE(review): despite its name, 'activation1' is a Dense layer with a softmax
# activation, so everything after it only sees 4 non-negative values; the ReLU in
# 'activation2' presumably leaves them unchanged. Confirm this stacking is intended.
model.add(Dense(4, activation='softmax', name='activation1'))

model.add(Dropout(rate=0.5, name='dropout2'))
model.add(Activation(activation='relu', name='activation2'))

# Final 4-way softmax output, one unit per stance class.
model.add(Dense(4, activation='softmax', name='output_layer2'))
model.summary()

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# Time the fit; training_time is the elapsed time in minutes, rounded to 2 decimals.
starting_time = time.time()
model.fit(X_train, y_train,
          batch_size=BATCH_SIZE,
          epochs=N_EPOCHS
         )
training_time = round((time.time() - starting_time) /60, 2)

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 word_embedding_layer (Embed  (None, None, 300)        9754200   
 ding)                                                           
                                                                 
 lstm_layer (LSTM)           (None, 100)               160400    
                                                                 
 dropout1 (Dropout)          (None, 100)               0         
                                                                 
 activation1 (Dense)         (None, 4)                 404       
                                                                 
 dropout2 (Dropout)          (None, 4)                 0         
                                                                 
 activation2 (Activation)    (None, 4)                 0         
                                                      

In [None]:
# Predictions of the second model (used directly from memory, not reloaded from disk).
predictions2 = model.predict(X_test)

In [None]:
# Training time of the second model, in minutes (rounded to 2 decimals).
print(training_time)

36.43


In [None]:
# Decode the second model's predictions: the position of the largest softmax score
# is the predicted class index.
# Class indices follow the encoding of y_train: the stances were coded
# unrelated=1, agree=2, disagree=3, discuss=4 and LabelEncoder maps the sorted codes
# to 0..3. Index 1 is therefore "agree" and index 2 is "disagree" (the previous
# decoding had these two swapped).
STANCE_BY_CLASS = ["unrelated", "agree", "disagree", "discuss"]

# `stance_integer` keeps its name for compatibility; it holds the decoded stance strings.
stance_integer = [STANCE_BY_CLASS[class_id] for class_id in np.argmax(predictions2, axis=-1)]

# Overwrites the first model's predictions_df.
predictions_df = pd.DataFrame({'Stance': stance_integer})


In [None]:
# Accuracy of the second model's decoded predictions against the gold test stances.
get_accuracy(competition_test_stance['Stance'], predictions_df['Stance'])

0.7220320308503522


In [None]:
# Weighted stance score of the second model's decoded predictions against the gold test stances.

score_submission(competition_test_stance['Stance'], predictions_df['Stance'])


4587.25