In [4]:
from gensim.models import KeyedVectors
import numpy as np
from nltk.corpus import stopwords
import pandas as pd

In [15]:
word2vec_location = '../main/GoogleNews-vectors-negative300.bin'

In [1]:
class Sentence :

    def __init__(self, sent) :
        self.sentence = sent
        self.is_tokenized = False
        self.token_list = []
        self.tf = []

    def remove_punct(self) :
        table = str.maketrans('', '', string.punctuation)
        self.sentence = self.sentence.translate(table)

    def remove_num(self) :
        table = str.maketrans('', '', '1234567890')
        self.sentence = self.sentence.translate(table)

    def tokenize(self):
        self.token_list = self.sentence.split()
        self.is_tokenized = True

    def to_lower(self): 
        if self.is_tokenized : 
            for i in range(len(self.token_list)) : 
                self.token_list[i] = self.token_list[i].lower()

    def remove_stop(self) : 
        if self.is_tokenized : 
            stop_words = set(stopwords.words('english'))
            doc_stop = []
            for word in self.token_list :
                if(not word in stop_words) : 
                    doc_stop.append(word)
            self.token_list = doc_stop

    def stem(self): 
        if self.is_tokenized : 
            porter = PorterStemmer()
            for i in range(len(self.token_list)) : 
                self.token_list[i] = porter.stem(self.token_list[i])

    def get_sent(self) :
        return self.sentence

    def get_token_list(self) : 
        return self.token_list
    
    def preprocess(self) : 
        self.remove_punct()
        self.remove_num()
        self.tokenize()
        self.to_lower()
        self.remove_stop()
        self.stem()

    def tf_calc(self, base) :
        tf = []
        for i in range(len(base)) : 
            tf.append(0);
        
        count = []
        for word in self.token_list : 
            c = 0
            for x in self.token_list : 
                if word == x : 
                    c += 1
            count.append(c)
                
        sent_length = len(self.token_list)
        for i in range(len(self.token_list)) : 
            if self.token_list[i] in base :
                index = base.index(self.token_list[i]) 
                tf[index] = count[i] / sent_length
        self.tf = tf

    def get_tf(self) : 
        return self.tf
    
class Corpus(Sentence) : 

    def __init__(self, corpus) : 
        self.corpus = corpus 
        self.corpus_tokens = corpus
        self.is_tokenized = False
        self.idf = []
        self.n = -1
        self.base = []
        self.counted = False
        self.filterd = False
        
    
    def count_words(self) : 
        print("-------Corpus.count_words-------")
        if self.is_tokenized : 
            set_words = set()
            for i in self.corpus_tokens : 
                for word in i : 
                    set_words.add(word)
            dict_words = {}
            for i in set_words : 
                dict_words[i] = 0
            for i in self.corpus_tokens : 
                for word in i : 
                    dict_words[word] += 1
            self.set_words = set_words
            self.dict_words = dict_words

    def contains(self, word) : 
        if self.is_tokenized : 
            count = 0
            for i in self.corpus_tokens :
                if word in i : 
                    count += 1
            return count  

    def filter_top_n(self) : 
        print("-------Corpus.filter_top_n-------")
        list_words = [] 
        for i in self.set_words : 
            list_words.append([self.dict_words[i], i])
        list_words.sort()
        list_words.reverse()
        self.top_n = list_words[0:self.n]

    def preprocess(self) : 
        self.is_tokenized = True
        total = len(self.corpus)
        print("-------Corpus.preprocess-------")
        for i in range(total) : 
            self.sentence = self.corpus[i]
            super().preprocess()
            # print(supre().get_corpus_tokens())
            self.corpus_tokens[i] = super().get_token_list()
            print("-----" + str(i*100 / total) + "-----", end="\r")

    def set_n(self, n) : 
        self.n = n        
    
    def idf_n(self) : 
        if self.counted == False: 
            self.count_words()
            self.counted = True
        if self.filterd == False: 
            self.filter_top_n()
            self.filterd = True
        self.filter_top_n()
        print("-------Corpus.idf_n-------")
        doc_size = len(self.corpus)
        idf = []
        prog = 0
        total = len(self.top_n)
        for i in self.top_n : 
            val = math.log(doc_size / (1 + self.contains(i[1])))
            idf.append(val)
            print("-----" + str(prog*100 / total) + "-----", end="\r")
            prog+= 1
        self.idf = idf

    def tf_n(self) :
        tf = []
        #self.word_base()
        prog = 1
        total = len(self.corpus_tokens)
        for sent in self.corpus_tokens : 
            s = Sentence(sent)
            s.token_list = sent
            s.tf_calc(self.base) 
            tf.append(s.get_tf())
            print("-----" + str(prog*100 / total) + "-----", end="\r")
            prog += 1
        self.tf = tf
    def set_base(self, base) : 
        self.base = base
        
    def get_tf(self) :
        return self.tf 

    def word_base(self) : 
        base = []
        for word in self.top_n : 
            base.append(word[1])
        self.base = base

    def get_base(self) : 
        return self.base

    def get_corpus(self) : 
        return self.corpus
    def get_corpus_tokens(self) : 
        return self.corpus_tokens
    def get_idf(self) : 
        return self.idf

In [72]:
t = 'The young boys are playing outdoors and the man is smiling nearby'
h = 'There is no boy playing outdoors and there is no man smiling'

In [73]:
T = Sentence(t)
T.tokenize()
T.to_lower()
T.remove_stop()

H = Sentence(h)
H.tokenize()
H.to_lower()
H.remove_stop()

t = T.token_list
h = H.token_list

In [16]:
# word2vec = KeyedVectors.load_word2vec_format(word2vec_location, binary=True)

In [31]:
word2vec.similarity("present", "press")

0.13545474825854131

In [74]:
sim = []

In [75]:
h

['boy', 'playing', 'outdoors', 'man', 'smiling']

In [80]:
t

['young', 'boys', 'playing', 'outdoors', 'man', 'smiling', 'nearby']

In [76]:
for i in t : 
    for j in h :
        if (i in word2vec.vocab) & (j in word2vec.vocab) & (i != j):
            print(i + ', ' + j, end=" : ")
            print((word2vec.similarity(i, j)))
            sim.append([word2vec.similarity(i, j), i, j])

young, boy : 0.386192217209
young, playing : 0.236690315662
young, outdoors : 0.198397977277
young, man : 0.344799870874
young, smiling : 0.266963227777
boys, boy : 0.59610577784
boys, playing : 0.25072721552
boys, outdoors : 0.150362475206
boys, man : 0.385427394945
boys, smiling : 0.197837400765
playing, boy : 0.177916052581
playing, outdoors : 0.20239288119
playing, man : 0.176845761504
playing, smiling : 0.256174462226
outdoors, boy : 0.092055530636
outdoors, playing : 0.20239288119
outdoors, man : 0.0820254732602
outdoors, smiling : 0.124396206891
man, boy : 0.682487058299
man, playing : 0.176845761504
man, outdoors : 0.0820254732602
man, smiling : 0.219556866142
smiling, boy : 0.259454960362
smiling, playing : 0.256174462226
smiling, outdoors : 0.124396206891
smiling, man : 0.219556866142
nearby, boy : 0.193545520289
nearby, playing : 0.0504325433476
nearby, outdoors : 0.18477158899
nearby, man : 0.16083101936
nearby, smiling : 0.151701435923


In [35]:
round(0.1232323, 10)

0.1232323

In [77]:
sim.sort()

In [78]:
sim

[[0.050432543347556015, 'nearby', 'playing'],
 [0.082025473260242843, 'man', 'outdoors'],
 [0.082025473260242843, 'outdoors', 'man'],
 [0.092055530636023217, 'outdoors', 'boy'],
 [0.1243962068908044, 'outdoors', 'smiling'],
 [0.1243962068908044, 'smiling', 'outdoors'],
 [0.15036247520636303, 'boys', 'outdoors'],
 [0.15170143592329519, 'nearby', 'smiling'],
 [0.16083101936011146, 'nearby', 'man'],
 [0.17684576150415998, 'man', 'playing'],
 [0.17684576150415998, 'playing', 'man'],
 [0.17791605258092669, 'playing', 'boy'],
 [0.18477158899019641, 'nearby', 'outdoors'],
 [0.1935455202888533, 'nearby', 'boy'],
 [0.19783740076465264, 'boys', 'smiling'],
 [0.19839797727709962, 'young', 'outdoors'],
 [0.20239288118950344, 'outdoors', 'playing'],
 [0.20239288118950344, 'playing', 'outdoors'],
 [0.21955686614221681, 'man', 'smiling'],
 [0.21955686614221681, 'smiling', 'man'],
 [0.23669031566161133, 'young', 'playing'],
 [0.25072721552015187, 'boys', 'playing'],
 [0.2561744622260862, 'playing', 's

In [82]:
word2vec.similarity('no', 'not')

0.5200426502478388

# simple model

In [9]:
import keras as K
import keras
from keras.models import Sequential
from keras.layers import Dense,Dropout
from keras import regularizers


Using TensorFlow backend.


In [38]:
import string 
import math
from nltk.stem.porter import PorterStemmer

In [7]:
sick = pd.read_csv('../main/SICK.txt', sep="\t", header=None)

In [8]:
sick.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,pair_ID,sentence_A,sentence_B,entailment_label,relatedness_score,entailment_AB,entailment_BA,sentence_A_original,sentence_B_original,sentence_A_dataset,sentence_B_dataset,SemEval_set
1,1,A group of kids is playing in a yard and an ol...,A group of boys in a yard is playing and a man...,NEUTRAL,4.5,A_neutral_B,B_neutral_A,"A group of children playing in a yard, a man i...","A group of children playing in a yard, a man i...",FLICKR,FLICKR,TRAIN
2,2,A group of children is playing in the house an...,A group of kids is playing in a yard and an ol...,NEUTRAL,3.2,A_contradicts_B,B_neutral_A,"A group of children playing in a yard, a man i...","A group of children playing in a yard, a man i...",FLICKR,FLICKR,TRAIN
3,3,The young boys are playing outdoors and the ma...,The kids are playing outdoors near a man with ...,ENTAILMENT,4.7,A_entails_B,B_entails_A,"The children are playing outdoors, while a man...","The children are playing outdoors, while a man...",FLICKR,FLICKR,TRAIN
4,4,The young boys are playing outdoors and the ma...,There is no boy playing outdoors and there is ...,CONTRADICTION,3.6,A_contradicts_B,B_contradicts_A,"The children are playing outdoors, while a man...","The children are playing outdoors, while a man...",FLICKR,FLICKR,TRIAL


In [23]:
A = list(sick[1])[1:-1]
B = list(sick[2])[1:-1]

In [25]:
C = A + B

In [76]:
len(A)

9839

In [32]:
c = Corpus(C)

In [33]:
c.preprocess()

-------Corpus.preprocess-------
-----99.99491818274215---------

In [39]:
c.set_n(5000)
c.idf_n()

-------Corpus.filter_top_n-------
-------Corpus.idf_n-------
-----99.94169096209913-------

In [40]:
c.word_base()

In [42]:
len(c.get_base())

1715

In [43]:
base = c.get_base()

In [45]:
len(base)

1715

In [130]:
epochs = 90
stance = Sequential()
stance.add(Dense(100, activation ='relu',input_shape=(3430,),kernel_regularizer = regularizers.l2(0.0001)) )
stance.add(Dense(100, activation ='relu',kernel_regularizer = regularizers.l2(0.0001)) )
stance.add(Dense(100, activation ='relu',kernel_regularizer = regularizers.l2(0.0001)) )
stance.add(Dropout(0.5))
stance.add(Dense(3,activation='softmax'))
opt = keras.optimizers.adam(lr=0.01,clipnorm=5)
stance.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])

In [131]:
stance.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_15 (Dense)             (None, 100)               343100    
_________________________________________________________________
dense_16 (Dense)             (None, 100)               10100     
_________________________________________________________________
dense_17 (Dense)             (None, 100)               10100     
_________________________________________________________________
dropout_6 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_18 (Dense)             (None, 3)                 303       
Total params: 363,603
Trainable params: 363,603
Non-trainable params: 0
_________________________________________________________________


In [56]:
len(y)

9839

In [58]:
y.unique()

array(['NEUTRAL', 'ENTAILMENT', 'CONTRADICTION'], dtype=object)

In [59]:
y = y.map({"NEUTRAL": 0, "ENTAILMENT":1, "CONTRADICTION" : 2})

In [60]:
y.unique()

array([0, 1, 2])

In [61]:
y = list(y)

[0,
 0,
 1,
 2,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 2,
 0,
 0,
 0,
 0,
 0,
 1,
 2,
 0,
 1,
 2,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 2,
 1,
 1,
 2,
 0,
 0,
 0,
 2,
 0,
 0,
 1,
 2,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 2,
 0,
 1,
 2,
 0,
 2,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 2,
 1,
 2,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 2,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 2,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 2,
 1,
 2,
 0,
 0,
 0,
 0,
 0,
 1,
 2,
 1,
 2,
 1,
 2,
 0,
 2,
 1,
 1,
 0,
 1,
 2,
 1,
 0,
 1,
 0,
 0,
 1,
 2,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 2,
 1,
 2,
 0,
 2,
 0,
 2,
 0,
 1,
 2,
 1,
 0,
 1,
 0,
 0,
 2,
 1,
 1,
 2,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 2,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 2,
 1,
 2,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 2,
 0,
 0,
 2,
 0,
 0,
 0,


In [78]:
inputA = Corpus(A) 
inputA.preprocess()
inputA.set_base(base)
inputA.tf_n()

-------Corpus.preprocess-------
-----100.0-----654843---------

In [81]:
len(inputA.tf[0])

1715

In [82]:
inputB = Corpus(B) 
inputB.preprocess()
inputB.set_base(base)
inputB.tf_n()

-------Corpus.preprocess-------
-----100.0-----654843---------

In [104]:
input_ = []

In [105]:
for i in range(len(inputA.tf)) : 
    r = [inputA.tf[i] + inputB.tf[i]]
    
    input_ += r

In [106]:
len(input_)

9839

In [94]:
len(inputA.tf[0] + inputA.tf[0])

3430

In [97]:
d = []
d += [inputA.tf[0] + inputA.tf[0]]
d += [inputA.tf[0] + inputA.tf[0]]

In [100]:
len(d[1])

3430

In [107]:
input_np = np.array(input_)

In [108]:
y_cat = keras.utils.np_utils.to_categorical(y)

In [119]:
type(y_cat)

numpy.ndarray

In [120]:
y_np = np.array(y)

In [121]:
y

[0,
 0,
 1,
 2,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 2,
 0,
 0,
 0,
 0,
 0,
 1,
 2,
 0,
 1,
 2,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 2,
 1,
 1,
 2,
 0,
 0,
 0,
 2,
 0,
 0,
 1,
 2,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 2,
 0,
 1,
 2,
 0,
 2,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 2,
 1,
 2,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 2,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 2,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 2,
 1,
 2,
 0,
 0,
 0,
 0,
 0,
 1,
 2,
 1,
 2,
 1,
 2,
 0,
 2,
 1,
 1,
 0,
 1,
 2,
 1,
 0,
 1,
 0,
 0,
 1,
 2,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 2,
 1,
 2,
 0,
 2,
 0,
 2,
 0,
 1,
 2,
 1,
 0,
 1,
 0,
 0,
 2,
 1,
 1,
 2,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 2,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 2,
 1,
 2,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 2,
 0,
 0,
 2,
 0,
 0,
 0,


In [132]:
batch_size = 500
##HAVE TO TEST THE MODEL
from sklearn.model_selection import cross_val_score
from sklearn import metrics 
from sklearn.model_selection import StratifiedKFold

seed = 6 
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
cvscores = []

In [127]:
stance.fit(input_np, y_cat, epochs=epochs, verbose=1, batch_size=batch_size)

Epoch 1/90
Epoch 2/90
Epoch 3/90
Epoch 4/90
Epoch 5/90
Epoch 6/90
2000/9839 [=====>........................] - ETA: 1s - loss: 0.4607 - acc: 0.8605

KeyboardInterrupt: 

In [None]:
for train, test in kfold.split(input_np, y_np):
    
    epochs = 200
    stance = Sequential()
    stance.add(Dense(50, activation ='relu',input_shape=(3430,),kernel_regularizer = regularizers.l2(0.0001)) )
    
#     stance.add(Dropout(0.4))
#     stance.add(Dense(200, activation ='relu',kernel_regularizer = regularizers.l2(0.0001)) )
#     stance.add(Dropout(0.4))
#     stance.add(Dense(200, activation ='relu',kernel_regularizer = regularizers.l2(0.0001)) )
    stance.add(Dropout(0.6))
    stance.add(Dense(3,activation='softmax'))
    opt = keras.optimizers.adam(lr=0.01,clipnorm=5)
    stance.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])
    
    stance.fit(input_np[train], y_cat[train], epochs=epochs, verbose=1, batch_size=batch_size)
    
    # evaluate the model
    scores = stance.evaluate(input_np[test], y_cat[test], verbose=0)
    print("%s: %.2f%%" % (stance.metrics_names[1], scores[1]*100))
    cvscores.append(scores[1] * 100)

print("%.2f%% (+/- %.2f%%)" % (np.mean(cvscores), np.std(cvscores)))

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
1500/7870 [====>.........................] - ETA: 0s - loss: 0.6524 - acc: 0.7720