In [21]:
import sys
import os
import numpy as np
from numpy import array
from numpy import asarray
from tensorflow.keras.utils import to_categorical
import pandas as pd
import tensorflow as tf
#from gensim.models import Word2Vec
from tensorflow.keras.models import load_model, Model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, Dropout, BatchNormalization, Activation, Bidirectional, Flatten
from tensorflow.keras.preprocessing.text import text_to_word_sequence, Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import callbacks, regularizers
from utils.dataset import DataSet
from utils.generate_test_splits import train_vali_split, get_stances_for_folds
from utils.score import report_score, LABELS, score_submission
from tensorflow.keras.utils import plot_model
from utils.system import parse_params, check_version
import nltk

import string


In [22]:
LSTM_DIM = 300
EMBEDDING_DIM = 300

In [23]:

#Load the training dataset and generate folds
d = DataSet()
training, hold_out = train_vali_split(d)
training_stances, hold_out_stances = get_stances_for_folds(d,training,hold_out)
# Load the competition dataset
competition_dataset = DataSet("competition_test")
#X_competition, y_competition = generate_features(competition_dataset.stances, competition_dataset, "competition")

Xs = dict()
ys = dict()






Reading dataset
Total stances: 49972
Total bodies: 1683
Reading dataset
Total stances: 25413
Total bodies: 904


In [24]:
def prepareData(dataset, stances):
    headline = []
    body = []
    stanceFinal = []
    for stance in stances:
        headline.append(stance['Headline'])
        body.append(dataset.articles[stance['Body ID']])
        stanceFinal.append(stance['Stance'])
    return headline, body, stanceFinal
        
        

In [25]:
train_headline, train_body, train_stanceFinal = prepareData(d, training_stances)
hold_out_headline, hold_out_body, hold_out_stanceFinal = prepareData(d, hold_out_stances)

In [26]:
competition_headline, competition_body, competition_stanceFinal = prepareData(competition_dataset, competition_dataset.stances)

In [27]:
def stance_to_onehot(stance):
    li = []
    LABELS = ['agree', 'disagree', 'discuss', 'unrelated']
    for i in stance:
        if i == LABELS[0]:
            li.append([1,0,0,0])
        elif i == LABELS[1]:
            li.append([0,1,0,0])
        elif i == LABELS[2]:
            li.append([0,0,1,0])
        elif i == LABELS[3]:
            li.append([0,0,0,1])
    return np.array(li)
        

In [28]:
competition_stance_onehot = stance_to_onehot(competition_stanceFinal)
train_stance_onehot = stance_to_onehot(train_stanceFinal)
hold_out_stance_onehot = stance_to_onehot(hold_out_stanceFinal)

In [29]:
def get_tokens(text):
    lowers = text.lower()

    remove_punctuation_map = dict((ord(char), None) for char in string.punctuation)
    no_punctuation = lowers.translate(remove_punctuation_map)
    tokens = nltk.word_tokenize(no_punctuation)

    return tokens
train_headline = [get_tokens(text) for text in train_headline]
hold_out_headline = [get_tokens(text) for text in hold_out_headline]
train_body = [get_tokens(text) for text in train_body]
hold_out_body = [get_tokens(text) for text in hold_out_body]
competition_headline = [get_tokens(text) for text in competition_headline]
competition_body = [get_tokens(text) for text in competition_body]

In [30]:
tokenizer = Tokenizer()

In [31]:
corpus = train_headline+train_body+hold_out_headline+hold_out_body+competition_headline+competition_body

In [32]:
tokenizer.fit_on_texts([' '.join(seq) for seq in corpus])

In [33]:
print("Number of words in vocabulary:", len(tokenizer.word_index))

Number of words in vocabulary: 34118


In [34]:
vocab_size = len(tokenizer.word_index) + 1

In [35]:
train_headline = tokenizer.texts_to_sequences(train_headline)
train_body = tokenizer.texts_to_sequences(train_body)
hold_out_headline = tokenizer.texts_to_sequences(hold_out_headline)
hold_out_body = tokenizer.texts_to_sequences(hold_out_body)
competition_headline = tokenizer.texts_to_sequences(competition_headline)
competition_body = tokenizer.texts_to_sequences(competition_body)

In [37]:
MAX_HEADLINE_LENGTH = int(np.percentile([len(doc) for doc in train_headline+hold_out_headline], 90))
print('90th Percentile headline Length:', MAX_HEADLINE_LENGTH)
MAX_BODY_LENGTH = int(np.mean([len(doc) for doc in train_body+hold_out_body]))
print('90th Percentile body Length:', MAX_BODY_LENGTH)
MAX_BODY_TRUNC = 50

90th Percentile headline Length: 16
90th Percentile body Length: 378


In [38]:
train_headline = pad_sequences(train_headline, maxlen=MAX_HEADLINE_LENGTH, padding='post', truncating='post')
hold_out_headline = pad_sequences(hold_out_headline, maxlen=MAX_HEADLINE_LENGTH, padding='post', truncating='post')
train_first_body = pad_sequences(train_body, maxlen=MAX_BODY_TRUNC, padding='post', truncating='post')
train_last_body = pad_sequences(train_body, maxlen=MAX_BODY_TRUNC, padding='pre', truncating='pre')
hold_out_first_body = pad_sequences(hold_out_body, maxlen=MAX_BODY_TRUNC, padding='post', truncating='post')
hold_out_last_body = pad_sequences(hold_out_body, maxlen=MAX_BODY_TRUNC, padding='pre', truncating='pre')
competition_headline = pad_sequences(competition_headline, maxlen=MAX_HEADLINE_LENGTH, padding='post', truncating='post')
competition_first_body = pad_sequences(competition_body, maxlen=MAX_BODY_TRUNC, padding='post', truncating='post')
competition_last_body = pad_sequences(competition_body, maxlen=MAX_BODY_TRUNC, padding='pre', truncating='pre')



In [39]:
# load the whole embedding into memory
embeddings_index = dict()
f = open('glove.42B.300d.txt')
for line in f:
    values = line.split()
    word = values[0]
    coefs = asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print('Loaded %s word vectors.' % len(embeddings_index))

Loaded 1917494 word vectors.


In [40]:
# create a weight matrix for words in training docs
embedding_matrix = np.zeros((vocab_size, 300))
for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
    else:
        print("not found words: ", word)
del embeddings_index

not found words:  sotloff
not found words:  albaghdadi
not found words:  emwazi
not found words:  kobani
not found words:  tridevil
not found words:  jongun
not found words:  usled
not found words:  zehafbibeau
not found words:  vorozhbitsyn
not found words:  chibok
not found words:  alnusra
not found words:  aldulaimi
not found words:  crabzilla
not found words:  fredou
not found words:  21yearold
not found words:  clarkemurphy
not found words:  sergeantatarms
not found words:  hairgate
not found words:  sotloffs
not found words:  kadalim
not found words:  31yearold
not found words:  yojong
not found words:  jongil
not found words:  alanbar
not found words:  lobison
not found words:  tridevils
not found words:  albaghdadis
not found words:  abdelmajed
not found words:  helric
not found words:  adsupported
not found words:  17yearold
not found words:  pumpkinspice
not found words:  canadianisraeli
not found words:  albritani
not found words:  dejeus
not found words:  badeh
not found wo

not found words:  xinggui
not found words:  solju
not found words:  wellconnected
not found words:  syriairaq
not found words:  aotlcpawak
not found words:  midoctober
not found words:  ­entering
not found words:  4npower
not found words:  lady­amanita
not found words:  foxnewspress
not found words:  seoulbased
not found words:  62acre
not found words:  mexicanamerican
not found words:  deirezzour
not found words:  5mmwide
not found words:  espncom
not found words:  catcallers
not found words:  facebookjasmine
not found words:  kirchhner
not found words:  dollarsworth
not found words:  alsuri
not found words:  28yearold
not found words:  jasminetridevil
not found words:  chabadlubavitch
not found words:  wwwattendancegovin
not found words:  alshariah
not found words:  millan…
not found words:  cctvnews
not found words:  httpkvorscomclicks88377c89569subid21987
not found words:  2millionayear
not found words:  33yearold
not found words:  yaqoobi
not found words:  dornella
not found words

not found words:  statebacked
not found words:  alfalluja
not found words:  adygeisk
not found words:  nutmeglaced
not found words:  nesslovntrey247
not found words:  early2015
not found words:  £12000
not found words:  telewonderwomen
not found words:  600pound
not found words:  40pound
not found words:  msnbcwebsitecom
not found words:  ·
not found words:  httptcol01frufkwz
not found words:  58yearold
not found words:  englishcntvcn
not found words:  52yearold
not found words:  professionallooking
not found words:  tumblestyle
not found words:  situationi
not found words:  carbeile
not found words:  aminy
not found words:  39footwide
not found words:  carancas
not found words:  asteroidwatchtwitter
not found words:  isilkulsky
not found words:  snopescom—the
not found words:  myths—said
not found words:  farm—
not found words:  —doesn
not found words:  exiraqi
not found words:  turkishsyria
not found words:  60footwide
not found words:  breitbartcom
not found words:  thaek
not found 

not found words:  81yearsold
not found words:  —————
not found words:  amzno
not found words:  bitly1bu9mcu
not found words:  nflxo
not found words:  arathy
not found words:  hadsurgery
not found words:  byeongchul
not found words:  haedanghwa
not found words:  skindigger
not found words:  hainess
not found words:  preinterview
not found words:  rcalifornia
not found words:  brothernorth
not found words:  50footlong
not found words:  manhood…
not found words:  fatheroffive
not found words:  stillangry
not found words:  alahsa
not found words:  genderrelated
not found words:  australiawhat
not found words:  otherancient
not found words:  ndla
not found words:  militantsindia
not found words:  7inus
not found words:  statuehuge
not found words:  alienlike
not found words:  israel012
not found words:  slowtrack
not found words:  halfsiblings
not found words:  chiefofstaff
not found words:  koreawatchers
not found words:  parthero
not found words:  httptcon2voitsj3m
not found words:  pictw

not found words:  23minute
not found words:  topranked
not found words:  relatedibm
not found words:  wpplenovo
not found words:  businessappleibm
not found words:  ipadmaker
not found words:  cringelys
not found words:  cattleraising
not found words:  balmacedain
not found words:  ultrahardline
not found words:  werewolfchild
not found words:  haissam
not found words:  governmentowned
not found words:  hoaxtracking
not found words:  viningtwitter
not found words:  know…
not found words:  syriankurdish
not found words:  fengs
not found words:  kmovcom
not found words:  righttowork
not found words:  50day
not found words:  relatedisrael
not found words:  netanyahuslayer
not found words:  palestinegaza
not found words:  palestinewest
not found words:  territoriesisrael
not found words:  alshejaiya
not found words:  suburbgaza
not found words:  israelipalestinian
not found words:  strausss
not found words:  418am
not found words:  washingtonwin
not found words:  mcnameegetty
not found wor

not found words:  sunnyvales
not found words:  triplenet
not found words:  10building
not found words:  17millionsquarefoot
not found words:  blackstonestarwood
not found words:  innetwork
not found words:  iamcreesummer
not found words:  gforgames
not found words:  statedespite
not found words:  splitups
not found words:  tombstyle
not found words:  groomtobe
not found words:  brotherinlaws
not found words:  midmauling
not found words:  bieberfication
not found words:  bearandbieber
not found words:  timescen
not found words:  bieberless
not found words:  timesceneuropics
not found words:  selfpublished
not found words:  to…
not found words:  bieberfied
not found words:  jetwaymj
not found words:  outcomeit
not found words:  israelbound
not found words:  newlyfound
not found words:  manmagnet
not found words:  imagereuters
not found words:  humanbug
not found words:  ebolas
not found words:  alsabaah
not found words:  rebelcontrolled
not found words:  edmalki
not found words:  hevals


not found words:  acids…
not found words:  acidsphoto
not found words:  fullfat
not found words:  death…
not found words:  cholesterol…
not found words:  nutritionists…
not found words:  347747
not found words:  643226
not found words:  grains…
not found words:  effect…
not found words:  heatinduced
not found words:  highheat
not found words:  fedraised
not found words:  omega3s…
not found words:  proven…
not found words:  sense…
not found words:  prehumans
not found words:  wholefat
not found words:  foodbased
not found words:  selfreporting
not found words:  lowerfat
not found words:  foodminds
not found words:  obesitycausing
not found words:  fatheart
not found words:  antisaturated
not found words:  acids—found
not found words:  oil—may
not found words:  health—one
not found words:  acids—lauric
not found words:  plametic
not found words:  acid—were
not found words:  items—including
not found words:  cookies—can
not found words:  ceaseanddesist
not found words:  wttgtv
not found w

In [43]:
np.save('embedding_matrix',embedding_matrix)

In [42]:
embedding_matrix = np.load('embedding_matrix.npy')

In [32]:
def builtModel(drop_out_rate=0.5,dense_neuron = 64):
    headline_input = Input(shape=(MAX_HEADLINE_LENGTH,), name='headline_input')
    first_body_input = Input(shape=(MAX_BODY_TRUNC,), name='first_body_input')
    last_body_input = Input(shape=(MAX_BODY_TRUNC,), name='last_body_input')
    headline_embedding = Embedding(input_dim=len(tokenizer.word_index)+1,
                          output_dim=EMBEDDING_DIM,
                          weights = [embedding_matrix], trainable=False, name='headline_word_embedding_layer', 
                          mask_zero=True)(headline_input)
    first_body_embedding = Embedding(input_dim=len(tokenizer.word_index)+1,
                          output_dim=EMBEDDING_DIM,
                          weights = [embedding_matrix], trainable=False, name='first_body_word_embedding_layer', 
                          mask_zero=True)(first_body_input)
    last_body_embedding = Embedding(input_dim=len(tokenizer.word_index)+1,
                          output_dim=EMBEDDING_DIM,
                          weights = [embedding_matrix], trainable=False, name='last_bodyword_embedding_layer', 
                          mask_zero=True)(last_body_input)
    encoder_outputs, state_h, state_c = LSTM(LSTM_DIM, return_sequences=False, return_state = True, name='lstm_layer_encoder')(headline_embedding)
    encoder_states = [state_h, state_c]
    first_body_outputs, _, _ = LSTM(LSTM_DIM, return_sequences=False, return_state = True, name='lstm_layer_first_body')(first_body_embedding, initial_state=encoder_states)
    last_body_outputs, _, _ = LSTM(LSTM_DIM, return_sequences=False, return_state = True, name='lstm_layer_last_body')(last_body_embedding, initial_state=encoder_states)
    x = tf.keras.layers.concatenate([first_body_outputs, last_body_outputs])
    x = Dense(dense_neuron, activation='relu')(x)
    x = Dropout(rate=drop_out_rate, name='dropout_1')(x)
    x = Dense(dense_neuron, activation='relu')(x)
    x = Dropout(rate=drop_out_rate, name='dropout_2')(x)

    # And finally we add the main logistic regression layer
    main_output = Dense(4, activation='softmax', name='main_output')(x)
    model = Model(inputs=[headline_input, first_body_input, last_body_input], outputs=main_output)
    model.summary()
    return model
    
    
    
    
    

In [34]:
def trainModel(model,batch=1000,ep=50):
    model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
    output_directory = ''
    model_checkpoint = callbacks.ModelCheckpoint(os.path.join(output_directory , 'weights.{epoch:02d}-{val_loss:.2f}.hdf5'),save_best_only=True,mode='auto',period=10)
    model.fit([train_headline, train_first_body, train_last_body], train_stance_onehot,
          batch_size=batch,
          epochs=ep,
          validation_data=([hold_out_headline, hold_out_first_body, hold_out_last_body], hold_out_stance_onehot),callbacks=[model_checkpoint])
   # model.evaluate(testX, testY)

In [None]:
trainModel(builtModel())

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
headline_input (InputLayer)     (None, 16)           0                                            
__________________________________________________________________________________________________
first_body_input (InputLayer)   (None, 50)           0                                            
__________________________________________________________________________________________________
headline_word_embedding_layer ( (None, 16, 300)      8362200     headline_input[0][0]             
__________________________________________________________________________________________________
last_body_input (InputLayer)    (None, 50)           0                                            
__________________________________________________________________________________________________
first_body