In [0]:
# Mount Google Drive so the CSV files under '/content/drive' can be read (Colab only).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Textual entailment task

In [0]:
import os
import numpy as np
import csv
import matplotlib.pyplot as plt
import pandas as pd
import json
import string
import collections
import itertools
from tqdm import tqdm
import tensorflow as tf
from tensorflow import keras
from keras_preprocessing import sequence, text
from tensorboard.plugins.hparams import api as hp
import tensorflow.keras.backend as K
from nltk.corpus import stopwords
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [0]:
# Count the GPUs TensorFlow can see before any training starts.
gpu_devices = tf.config.experimental.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))

Num GPUs Available:  1


In [0]:
# Load the Kaggle training pairs, indexed by their 'id' column and ordered by it.
datafile_train = '/content/drive/My Drive/Colab Notebooks/train.csv'
df_train = pd.read_csv(datafile_train, index_col='id')
df_train = df_train.sort_index()
df_train.shape

(320552, 7)

In [0]:
# Load the Kaggle test pairs, indexed by 'id' (row order is kept as stored in the file).
datafile_test = '/content/drive/My Drive/Colab Notebooks/test.csv'
df_test = pd.read_csv(datafile_test, index_col='id')
df_test.shape

(80126, 6)

In [0]:
# Load the (reduced) SNLI training pairs.
datafile_snli = '/content/drive/My Drive/Colab Notebooks/train_snli.csv'
df_snli = pd.read_csv(datafile_snli)
df_snli.shape

(550152, 4)

# Data exploration

In [0]:
df_train.head()

Unnamed: 0_level_0,tid1,tid2,title1_zh,title2_zh,title1_en,title2_en,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0,1,2017养老保险又新增两项，农村老人人人可申领，你领到了吗,警方辟谣“鸟巢大会每人领5万” 仍有老人坚持进京,There are two new old-age insurance benefits f...,"Police disprove ""bird's nest congress each per...",unrelated
1,2,4,"""你不来深圳，早晚你儿子也要来""，不出10年深圳人均GDP将超香港",GDP首超香港？深圳澄清：还差一点点……,"""If you do not come to Shenzhen, sooner or lat...",The GDP overtopped Hong Kong? Shenzhen clarifi...,unrelated
2,2,5,"""你不来深圳，早晚你儿子也要来""，不出10年深圳人均GDP将超香港",去年深圳GDP首超香港？深圳统计局辟谣：还差611亿,"""If you do not come to Shenzhen, sooner or lat...",Shenzhen's GDP topped Hong Kong last year? She...,unrelated
3,2,3,"""你不来深圳，早晚你儿子也要来""，不出10年深圳人均GDP将超香港",深圳GDP首超香港？深圳统计局辟谣：只是差距在缩小,"""If you do not come to Shenzhen, sooner or lat...",Shenzhen's GDP outstrips Hong Kong? Shenzhen S...,unrelated
4,2,8,"""你不来深圳，早晚你儿子也要来""，不出10年深圳人均GDP将超香港",深圳GDP首超香港？统计局辟谣：未超但差距再度缩小,"""If you do not come to Shenzhen, sooner or lat...",Shenzhen's GDP overtakes Hong Kong? Bureau of ...,unrelated


In [0]:
# Training-set columns that contain at least one missing value, most affected first.
null_counts = df_train.isna().sum()
null_counts.loc[null_counts > 0].sort_values(ascending=False)

title2_zh    7
dtype: int64

In [0]:
# Test-set columns that contain at least one missing value, most affected first.
null_counts = df_test.isna().sum()
null_counts.loc[null_counts > 0].sort_values(ascending=False)

title2_zh    1
dtype: int64

In [0]:
# No id 247 !
#df_train['tid1'][247]

#### The labels are heavily imbalanced (68% of the pairs are `unrelated`), so an LSTM may predict this label more often than it should. A BERT model therefore also has to be implemented so that the two models can be compared.

In [0]:
from collections import Counter

# Number of training pairs per label.
Counter(df_train['label'])

Counter({'agreed': 92973, 'disagreed': 8266, 'unrelated': 219313})

In [0]:
# Share of the majority class: 219313 'unrelated' pairs out of 320552 training rows.
219313/320552

0.68417292670144

In [0]:
# Word counts per English title (split on single spaces), computed once per column
# and then summarised.
title1_nb_words = df_train['title1_en'].apply(lambda title: len(title.split(" ")))
title2_nb_words = df_train['title2_en'].apply(lambda title: len(title.split(" ")))

print("Min nb words title 1  :", title1_nb_words.min())
print("Min nb words title 2  :", title2_nb_words.min())
print("Max nb words title 1  :", title1_nb_words.max())
print("Max nb words title 2  :", title2_nb_words.max())
print("Mean nb words title 1 :", title1_nb_words.mean())
print("Mean nb words title 2 :", title2_nb_words.mean())
print("99 percentile nb words title 1 :", title1_nb_words.quantile(0.99))
print("99 percentile nb words title 2 :", title2_nb_words.quantile(0.99))

Min nb words title 1  : 1
Min nb words title 2  : 1
Max nb words title 1  : 500
Max nb words title 2  : 539
Mean nb words title 1 : 16.383588310164967
Mean nb words title 2 : 16.572528014175546
Mean nb words title 1 : 30.0
Mean nb words title 2 : 30.0
Min nb words title 1  : 1
Min nb words title 2  : 1
Max nb words title 1  : 12
Max nb words title 2  : 12
Mean nb words title 1 : 1.1453679902168759
Mean nb words title 2 : 1.1562991339938606
Mean nb words title 1 : 3.0
Mean nb words title 2 : 3.0


## SNLI, csv reduced

In [0]:
df_snli.head()

Unnamed: 0.1,Unnamed: 0,gold_label,sentence1,sentence2
0,0,neutral,A person on a horse jumps over a broken down a...,A person is training his horse for a competition.
1,1,contradiction,A person on a horse jumps over a broken down a...,"A person is at a diner, ordering an omelette."
2,2,entailment,A person on a horse jumps over a broken down a...,"A person is outdoors, on a horse."
3,3,neutral,Children smiling and waving at camera,They are smiling at their parents
4,4,entailment,Children smiling and waving at camera,There are children present


In [0]:
# SNLI columns that contain at least one missing value, most affected first.
null_counts = df_snli.isna().sum()
null_counts.loc[null_counts > 0].sort_values(ascending=False)

Series([], dtype: int64)

In [0]:
# Word counts per SNLI sentence (split on single spaces), computed once per column.
sentence1_nb_words = df_snli['sentence1'].apply(lambda sentence: len(sentence.split(" ")))
sentence2_nb_words = df_snli['sentence2'].apply(lambda sentence: len(sentence.split(" ")))

print("Min nb words title 1  :", sentence1_nb_words.min())
print("Min nb words title 2  :", sentence2_nb_words.min())
print("Max nb words title 1  :", sentence1_nb_words.max())
print("Max nb words title 2  :", sentence2_nb_words.max())
print("Mean nb words title 1 :", sentence1_nb_words.mean())
print("Mean nb words title 2 :", sentence2_nb_words.mean())
# These two lines report the 99th percentile; they used to be printed under the
# "Mean" label, which made the output misleading.
print("99 percentile nb words title 1 :", sentence1_nb_words.quantile(0.99))
print("99 percentile nb words title 2 :", sentence2_nb_words.quantile(0.99))

Min nb words title 1  : 2
Min nb words title 2  : 1
Max nb words title 1  : 78
Max nb words title 2  : 58
Mean nb words title 1 : 12.850772150242115
Mean nb words title 2 : 7.42041108639067
Mean nb words title 1 : 31.0
Mean nb words title 2 : 18.0


In [0]:
df_snli['gold_label'].value_counts()

entailment       183416
contradiction    183187
neutral          182764
Name: gold_label, dtype: int64

# Preprocessing
- Cleaning data
- Lower case
- Deal with N/A and NaN

In [0]:
# Download the NLTK "popular" collection.
# NOTE(review): presumably this is what provides the English stop-word list used in
# the cleaning cell below — confirm; a smaller download may be enough.
import nltk
nltk.download("popular")

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/cmudict.zip.
[nltk_data]    | Downloading package gazetteers to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/gazetteers.zip.
[nltk_data]    | Downloading package genesis to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/genesis.zip.
[nltk_data]    | Downloading package gutenberg to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/gutenberg.zip.
[nltk_data]    | Downloading package inaugural to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/inaugural.zip.
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping corpora/movie_reviews.zip.
[nltk_data]    | Downloading package names to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/names.zip.
[nltk_data]    | Downloading package shakespeare to /root/nltk_data...
[nlt

True

In [0]:
#Cleaning Kaggle dataset
translator = str.maketrans('','', string.punctuation)
df_train['title1_en'] = df_train['title1_en'].str.lower().str.translate(translator)
df_train['title2_en'] = df_train['title2_en'].str.lower().str.translate(translator)
df_test['title1_en']  = df_test['title1_en'].str.lower().str.translate(translator)
df_test['title2_en']  = df_test['title2_en'].str.lower().str.translate(translator)
#Remove stop words (depends on the model)
stop_words = set(stopwords.words('english'))
df_train['title1_en'] = df_train['title1_en'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))
df_train['title2_en'] = df_train['title2_en'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))
df_test['title1_en'] = df_test['title1_en'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))
df_test['title2_en'] = df_test['title2_en'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))

In [0]:
#Cleaning snli dataset
df_snli = df_snli.drop(df_snli[df_snli.gold_label =="-"].index)
translator = str.maketrans('','', string.punctuation)
df_snli.sentence2.fillna('UNKNOWN', inplace=True)
df_snli['sentence1'] = df_snli['sentence1'].str.lower().str.translate(translator)
df_snli['sentence2'] = df_snli['sentence2'].str.lower().str.translate(translator)
#Removing stop words
# df_snli['sentence1'] = df_snli['sentence1'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))
# df_snli['sentence2'] = df_snli['sentence2'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))

In [0]:
#Cleaning chinese text
# df_train.title2_zh.fillna('UNKNOWN', inplace=True)
# df_test.title2_zh.fillna('UNKNOWN', inplace=True)

In [0]:
df_train.head()

Unnamed: 0_level_0,tid1,tid2,title1_zh,title2_zh,title1_en,title2_en,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0,1,2017养老保险又新增两项，农村老人人人可申领，你领到了吗,警方辟谣“鸟巢大会每人领5万” 仍有老人坚持进京,two new oldage insurance benefits old people r...,police disprove birds nest congress person get...,unrelated
1,2,4,"""你不来深圳，早晚你儿子也要来""，不出10年深圳人均GDP将超香港",GDP首超香港？深圳澄清：还差一点点……,come shenzhen sooner later son also come less ...,gdp overtopped hong kong shenzhen clarified li...,unrelated
2,2,5,"""你不来深圳，早晚你儿子也要来""，不出10年深圳人均GDP将超香港",去年深圳GDP首超香港？深圳统计局辟谣：还差611亿,come shenzhen sooner later son also come less ...,shenzhens gdp topped hong kong last year shenz...,unrelated
3,2,3,"""你不来深圳，早晚你儿子也要来""，不出10年深圳人均GDP将超香港",深圳GDP首超香港？深圳统计局辟谣：只是差距在缩小,come shenzhen sooner later son also come less ...,shenzhens gdp outstrips hong kong shenzhen sta...,unrelated
4,2,8,"""你不来深圳，早晚你儿子也要来""，不出10年深圳人均GDP将超香港",深圳GDP首超香港？统计局辟谣：未超但差距再度缩小,come shenzhen sooner later son also come less ...,shenzhens gdp overtakes hong kong bureau stati...,unrelated


# NN implementation (LSTM, GRU)

## Hyper parameters definition

In [0]:
# Shared settings for tokenisation, padding and the train/validation split.
# vocab_size and max_len are dataset dependent (see the inline notes) and must be
# set before running the embedding cells of the chosen dataset.
vocab_size = 5000 #snli :5000 ; kaggle : 10000
nb_labels = 3
embedding_size = 512
max_len = 15 #df_train = 30 df_snli = 15
trunc_type = 'post'  # passed as `truncating` to pad_sequences
padding_type = 'post'  # passed as `padding` to pad_sequences
oov_tok = '<OOV>'  # passed as `oov_token` to the Tokenizer
training_portion = .8  # fraction of rows used for training; the remainder is validation

In [0]:
# Candidate values for each hyper-parameter of the TensorBoard HParams search.
HP_VOCAB_SIZE = hp.HParam('vocab_size',hp.Discrete([10000,15000,20000]))
# NOTE(review): 1028 looks like a typo for 1024 — confirm before enabling this dimension.
HP_NUM_UNITS = hp.HParam('num_units', hp.Discrete([256,512,1028]))
HP_BATCH_SIZE = hp.HParam('batch_size', hp.Discrete([64, 256]))
HP_DROPOUT = hp.HParam('dropout', hp.Discrete([0.1,0.3]))
HP_LEARNING_RATE = hp.HParam('learning_rate', hp.Discrete([0.01, 0.1]))
HP_OPTIMIZER = hp.HParam('optimizer', hp.Discrete(['adam', 'rmsprop']))
HP_EMBEDDING_LAYER = hp.HParam('embedding', hp.Discrete(['pretrained', 'from_scratch']))
HP_HIDDEN_LAYER = hp.HParam('hidden_layer',hp.Discrete(['LSTM',"GRU"]))
HP_ATTENTION = hp.HParam('attention', hp.Discrete(['yes', 'no']))

In [0]:
# Size of a grid with one 3-valued and five 2-valued dimensions.
# NOTE(review): presumably the number of trials of the hyper-parameter search
# defined above — confirm which dimensions are meant.
3*2*2*2*2*2

96

In [0]:
# Tags under which the metrics of each trial are written to TensorBoard.
METRIC_ACCURACY = 'accuracy'
METRIC_F1_MAC = 'f1_macro' #f1 per class then averaged
METRIC_F1_MIC = 'f1_micro' #global average over all individual instances
METRIC_PRECISION = 'precision'
METRIC_RECALL = 'recall'
METRIC_LOSS = 'loss'

# Describe the experiment for the HParams dashboard.
# - The config is written to 'logs1/hparam_tuning', the directory the trials log to
#   and TensorBoard is started on; written to another directory it is never displayed.
# - HP_BATCH_SIZE is part of the sweep, so it is registered with the other hparams.
# - Both F1 tags are registered so the F1 column is shown whichever tag a trial logs.
with tf.summary.create_file_writer('logs1/hparam_tuning').as_default():
    hp.hparams_config(
        hparams=[HP_OPTIMIZER, HP_NUM_UNITS, HP_DROPOUT, HP_HIDDEN_LAYER,
                 HP_LEARNING_RATE, HP_ATTENTION, HP_EMBEDDING_LAYER, HP_BATCH_SIZE],
        metrics=[hp.Metric(METRIC_ACCURACY, display_name='Accuracy'),
                 hp.Metric(METRIC_F1_MIC, display_name='F1 Micro'),
                 hp.Metric(METRIC_F1_MAC, display_name='F1 Macro'),
                 hp.Metric(METRIC_LOSS, display_name='Loss'),
                 hp.Metric(METRIC_PRECISION, display_name='Precision'),
                 hp.Metric(METRIC_RECALL, display_name='Recall')],
      )

## Word embedding

### Kaggle embedding

In [0]:
# Sequential hold-out split: the first `training_portion` of the rows (by position)
# is used for training, the remaining rows for validation.
train_size = int(len(df_train['title1_en']) * training_portion)

title_columns = ['title1_en', 'title2_en']
x_train = df_train[title_columns].iloc[:train_size]
y_train = df_train['label'].iloc[:train_size]
x_validation = df_train[title_columns].iloc[train_size:]
y_validation = df_train['label'].iloc[train_size:]
x_test = df_test[title_columns]

In [0]:
tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)
# Fit on both title columns stacked one under the other. Adding the two Series
# (title1 + title2) concatenates the strings without a separator, so the last word
# of title 1 and the first word of title 2 were fused into one bogus token.
tokenizer.fit_on_texts(pd.concat([df_train['title1_en'], df_train['title2_en']]))
word_index = tokenizer.word_index
# Preview of the first vocabulary entries.
dict(list(word_index.items())[0:10])

{'<OOV>': 1,
 'days': 8,
 'eat': 3,
 'new': 4,
 'one': 9,
 'people': 6,
 'rumor': 7,
 'rumors': 2,
 'three': 10,
 'years': 5}

In [0]:
# Turn both training title columns into fixed-length sequences of token ids.
X = {'title1': x_train['title1_en'], 'title2': x_train['title2_en']}
x_train_seq = X

for side in ('title1', 'title2'):
    token_ids = tokenizer.texts_to_sequences(x_train_seq[side])
    x_train_seq[side] = pad_sequences(token_ids, maxlen=max_len,
                                      padding=padding_type, truncating=trunc_type)

In [0]:
# Turn both validation title columns into fixed-length sequences of token ids.
X_val = {'title1': x_validation['title1_en'], 'title2': x_validation['title2_en']}
x_validation_seq = X_val

for side in ('title1', 'title2'):
    token_ids = tokenizer.texts_to_sequences(x_validation_seq[side])
    x_validation_seq[side] = pad_sequences(token_ids, maxlen=max_len,
                                           padding=padding_type, truncating=trunc_type)

In [0]:
# Turn both test title columns into fixed-length sequences of token ids.
X = {'title1': x_test['title1_en'], 'title2': x_test['title2_en']}
x_test_seq = X

for side in ('title1', 'title2'):
    token_ids = tokenizer.texts_to_sequences(x_test_seq[side])
    x_test_seq[side] = pad_sequences(token_ids, maxlen=max_len,
                                     padding=padding_type, truncating=trunc_type)

In [0]:
from numpy import array
from numpy import argmax
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

# Two-step encoding of the labels: string -> integer class id -> one-hot row.
label_encoder = LabelEncoder()
onehot_encoder = OneHotEncoder(sparse=False)

values = array(df_train['label'])
integer_encoded = label_encoder.fit_transform(values).reshape(-1, 1)
onehot_encoded = onehot_encoder.fit_transform(integer_encoded)
print(onehot_encoded)

[[0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]
 ...
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]]


In [0]:
# The first train_size rows are the training targets, the rest the validation targets.
onehot_encoded_y_train, onehot_encoded_y_validation = np.split(onehot_encoded, [train_size])

In [0]:
# Inverse vocabulary: token id -> word.
reverse_word_index = {token_id: word for word, token_id in word_index.items()}

def decode_title(text):
    """Map a sequence of token ids back to words.

    Ids with no entry in `reverse_word_index` (such as the padding positions)
    are rendered as '?'.
    """
    words = []
    for token_id in text:
        words.append(reverse_word_index.get(token_id, '?'))
    return ' '.join(words)

# Sanity check: decoded sequence next to the cleaned source title.
print(decode_title(x_train_seq['title2'][59]))
print('---')
print(x_train['title2_en'][59])

<OOV> rumour spinach <OOV> vegetable made iron eating ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?
---
315s rumour spinach greased vegetable made iron eating


## SNLI embedding

In [0]:
# Sequential hold-out split of SNLI: the first `training_portion` of the rows
# (by position) is used for training, the remaining rows for validation.
train_size = int(len(df_snli['sentence1']) * training_portion)

sentence_columns = ['sentence1', 'sentence2']
x_train = df_snli[sentence_columns].iloc[:train_size]
y_train = df_snli['gold_label'].iloc[:train_size]
x_validation = df_snli[sentence_columns].iloc[train_size:]
y_validation = df_snli['gold_label'].iloc[train_size:]

In [0]:
tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)
# Fit on both sentence columns stacked one under the other. Adding the two Series
# (sentence1 + sentence2) concatenates the strings without a separator, so the last
# word of sentence 1 and the first word of sentence 2 were fused into one bogus token.
tokenizer.fit_on_texts(pd.concat([df_snli['sentence1'], df_snli['sentence2']]))
word_index = tokenizer.word_index
# Preview of the first vocabulary entries.
dict(list(word_index.items())[0:10])

{'<OOV>': 1,
 'a': 2,
 'and': 8,
 'are': 9,
 'in': 3,
 'is': 5,
 'man': 6,
 'of': 10,
 'on': 7,
 'the': 4}

In [0]:
# Turn both SNLI training sentence columns into fixed-length sequences of token ids.
X = {'title1': x_train['sentence1'], 'title2': x_train['sentence2']}
x_train_seq = X

for side in ('title1', 'title2'):
    token_ids = tokenizer.texts_to_sequences(x_train_seq[side])
    x_train_seq[side] = pad_sequences(token_ids, maxlen=max_len,
                                      padding=padding_type, truncating=trunc_type)

In [0]:
# Turn both SNLI validation sentence columns into fixed-length sequences of token ids.
X_val = {'title1': x_validation['sentence1'], 'title2': x_validation['sentence2']}
x_validation_seq = X_val

for side in ('title1', 'title2'):
    token_ids = tokenizer.texts_to_sequences(x_validation_seq[side])
    x_validation_seq[side] = pad_sequences(token_ids, maxlen=max_len,
                                           padding=padding_type, truncating=trunc_type)

In [0]:
from numpy import array
from numpy import argmax
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

# Two-step encoding of the SNLI labels: string -> integer class id -> one-hot row.
label_encoder = LabelEncoder()
onehot_encoder = OneHotEncoder(sparse=False)

values = array(df_snli['gold_label'])
integer_encoded = label_encoder.fit_transform(values).reshape(-1, 1)
onehot_encoded = onehot_encoder.fit_transform(integer_encoded)
print(onehot_encoded)

In [0]:
# The first train_size rows are the training targets, the rest the validation targets.
onehot_encoded_y_train, onehot_encoded_y_validation = np.split(onehot_encoded, [train_size])

In [0]:
# Inverse vocabulary: token id -> word.
reverse_word_index = {token_id: word for word, token_id in word_index.items()}

def decode_title(text):
    """Map a sequence of token ids back to words.

    Ids with no entry in `reverse_word_index` (such as the padding positions)
    are rendered as '?'.
    """
    words = []
    for token_id in text:
        words.append(reverse_word_index.get(token_id, '?'))
    return ' '.join(words)

# Sanity check: decoded sequence next to the cleaned source sentence.
print(decode_title(x_train_seq['title2'][9]))
print('---')
print(x_train['sentence2'][9])

## Embedding pre-trained

In [0]:
# Read the GloVe vectors into a dict: word -> float32 coefficient vector.
embeddings_index = {}
GLOVE_DIR='/content/drive/My Drive/Colab Notebooks'
# The `with` block closes the file even when a line fails to parse.
with open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'),encoding="utf8") as f:
    for line in f:
        # Each line holds the word followed by its coefficients, whitespace separated.
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

In [0]:
# One row per vocabulary id (plus row 0), 100 columns. Rows start as uniform random
# values; rows of words present in the pre-trained index are overwritten with the
# pre-trained vector, so words that are NOT found keep their random row.
embedding_matrix = np.random.random((len(word_index) + 1, 100))
words_not_fount = 0
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        # No pre-trained vector for this word: count it and keep the random row.
        words_not_fount += 1
        continue
    embedding_matrix[i] = embedding_vector



In [0]:
words_not_fount

## Chinese Embedding

In [0]:
import jieba.posseg as pseg


def _segment_zh(title):
    """Segment a Chinese title with jieba and join the words with single spaces.

    Tokens whose part-of-speech flag is 'x' are dropped.
    """
    return ' '.join([word for word, flag in pseg.cut(title) if flag != 'x'])


# title2_zh has a few missing values (NaN) in both data sets and jieba cannot cut a
# float, so missing titles are replaced by the 'UNKNOWN' placeholder before segmenting.
for frame in (df_train, df_test):
    for column in ('title1_zh', 'title2_zh'):
        frame[column] = frame[column].fillna('UNKNOWN').apply(_segment_zh)

Building prefix dict from the default dictionary ...
Dumping model to file cache /tmp/jieba.cache
Loading model cost 0.883 seconds.
Prefix dict has been built successfully.


In [0]:
tokenizer_chinese = Tokenizer(num_words = vocab_size, oov_token=oov_tok,lower=False)
# Fit on both title columns stacked one under the other. Adding the two Series
# (title1 + title2) concatenates the strings without a separator, so the last word
# of title 1 and the first word of title 2 were fused into one bogus token.
tokenizer_chinese.fit_on_texts(pd.concat([df_train['title1_zh'], df_train['title2_zh']]))
word_index = tokenizer_chinese.word_index
# Preview of the first vocabulary entries.
dict(list(word_index.items())[0:10])

{'<OOV>': 1,
 '不': 9,
 '了': 3,
 '你': 5,
 '吃': 7,
 '是': 4,
 '的': 2,
 '被': 6,
 '谣言': 8,
 '这': 10}

In [0]:
# Turn both Chinese training title columns into fixed-length sequences of token ids.
X = {'title1': df_train['title1_zh'], 'title2': df_train['title2_zh']}
x_train_seq = X

for side in ('title1', 'title2'):
    token_ids = tokenizer_chinese.texts_to_sequences(x_train_seq[side])
    x_train_seq[side] = pad_sequences(token_ids, maxlen=max_len,
                                      padding=padding_type, truncating=trunc_type)

In [0]:
# Turn both Chinese test title columns into fixed-length sequences of token ids.
X = {'title1': df_test['title1_zh'], 'title2': df_test['title2_zh']}
x_test_seq = X

for side in ('title1', 'title2'):
    token_ids = tokenizer_chinese.texts_to_sequences(x_test_seq[side])
    x_test_seq[side] = pad_sequences(token_ids, maxlen=max_len,
                                     padding=padding_type, truncating=trunc_type)

In [0]:
# Inverse vocabulary: token id -> word.
reverse_word_index = {token_id: word for word, token_id in word_index.items()}

def decode_title(text):
    """Map a sequence of token ids back to words.

    Ids with no entry in `reverse_word_index` (such as the padding positions)
    are rendered as '?'.
    """
    words = []
    for token_id in text:
        words.append(reverse_word_index.get(token_id, '?'))
    return ' '.join(words)

# Sanity check: decoded sequence next to the segmented source title.
print(decode_title(x_train_seq['title2'][9]))
print('---')
print(df_train['title2_zh'][9])

吃 了 30 年 食用油 才 知道 一片 大蒜 轻松 鉴别 地沟油 ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?
---
吃 了 30 年 食用油 才 知道 一片 大蒜 轻松 鉴别 地沟油


## Model creation working without HP parameters

In [0]:
def get_f1(y_true, y_pred): #taken from old keras source code
    """F1 score computed from totals over every element of the given tensors.

    Values are clipped to [0, 1] and rounded before counting, so a prediction
    counts as positive once it rounds to 1. K.epsilon() is added to each
    denominator to avoid dividing by zero.
    """
    def _rounded_total(tensor):
        # Clip to [0, 1], round to 0/1 and add everything up.
        return K.sum(K.round(K.clip(tensor, 0, 1)))

    true_positives = _rounded_total(y_true * y_pred)
    possible_positives = _rounded_total(y_true)
    predicted_positives = _rounded_total(y_pred)
    precision = true_positives / (predicted_positives + K.epsilon())
    recall = true_positives / (possible_positives + K.epsilon())
    return 2*(precision*recall)/(precision+recall+K.epsilon())

In [0]:
#Attention implementation borrowed from https://matthewmcateer.me/blog/getting-started-with-attention-for-classification/

class Attention(tf.keras.Model):
    """Additive (tanh-scored) attention over a sequence of features.

    `units` is the width of the two Dense projections applied to the features
    and to the hidden state before they are summed and scored.
    """

    def __init__(self, units):
        super().__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, features, hidden):
        """Return (context_vector, attention_weights) for `features` given `hidden`."""
        # hidden: (batch_size, hidden size) -> (batch_size, 1, hidden size) so it can
        # be added to the projected features along the time axis.
        hidden_with_time_axis = tf.expand_dims(hidden, 1)

        # (batch_size, max_length, units) before self.V, (batch_size, max_length, 1) after.
        projected = self.W1(features) + self.W2(hidden_with_time_axis)
        score = tf.nn.tanh(projected)
        logits = self.V(score)

        # Normalise over the time axis: one weight per position.
        attention_weights = tf.nn.softmax(logits, axis=1)

        # Weighted sum of the features over the time axis.
        context_vector = tf.reduce_sum(attention_weights * features, axis=1)
        return context_vector, attention_weights

In [0]:
# Encoder applied to both titles: an embedding followed by three stacked
# bidirectional recurrent layers; only the last one collapses the time axis.
encoder_layers = [
    tf.keras.layers.Embedding(vocab_size, 1024, input_length=max_len, trainable=True),
    tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(1024, dropout=0.3, return_sequences=True)),
    tf.keras.layers.Bidirectional(
        tf.keras.layers.GRU(1024, dropout=0.3, return_sequences=True)),
    tf.keras.layers.Bidirectional(
        tf.keras.layers.GRU(1024, dropout=0.3, return_sequences=False)),
]
shared_model = tf.keras.models.Sequential(encoder_layers)

###Model architecture with attention 68%
# title1_input = tf.keras.layers.Input(shape=(max_len,), dtype='int32')
# title2_input = tf.keras.layers.Input(shape=(max_len,), dtype='int32')
# embed1 = tf.keras.layers.Embedding(vocab_size, embedding_size, input_length=max_len, trainable=True)(title1_input)
# embed2 = tf.keras.layers.Embedding(vocab_size, embedding_size, input_length=max_len, trainable=True)(title2_input)

# lstm1 = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(units=100, return_sequences = True), name="bi_lstm_0")(embed1)
# lstm2 = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(units=100, return_sequences = True), name="bi_lstm_1")(embed2)

# lstm1, forward_h1, forward_c1, backward_h1, backward_c1 = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(100, dropout=0.1,return_sequences=True,return_state=True))(lstm1)
# lstm2, forward_h2, forward_c2, backward_h2, backward_c2 = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(100, dropout=0.1,return_sequences=True,return_state=True))(lstm2)

# state_h1 = tf.keras.layers.Concatenate()([forward_h1, backward_h1])
# state_h2 = tf.keras.layers.Concatenate()([forward_h2, backward_h2])

# context_vector1, attention_weights1 = Attention(10)(lstm1, state_h1)
# context_vector2, attention_weights2 = Attention(10)(lstm2, state_h2)
# merged = tf.keras.layers.concatenate([context_vector1,context_vector2])
# output = tf.keras.layers.Dense(units=3, activation='softmax')(merged)
# # output2 = tf.keras.layers.Dense(units=3, activation='softmax')(context_vector2)

# # output = tf.keras.layers.concatenate([output1,output2])
# model = tf.keras.models.Model(inputs=[title1_input, title2_input], outputs=[output])
shared_model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 30, 1024)          8192000   
_________________________________________________________________
bidirectional_3 (Bidirection (None, 30, 2048)          16785408  
_________________________________________________________________
bidirectional_4 (Bidirection (None, 30, 2048)          18886656  
_________________________________________________________________
bidirectional_5 (Bidirection (None, 2048)              18886656  
Total params: 62,750,720
Trainable params: 62,750,720
Non-trainable params: 0
_________________________________________________________________


In [0]:
# Siamese classifier: both titles go through the same encoder, the two encodings
# are concatenated and fed to a 3-way softmax.
title1_input, title2_input = (
    tf.keras.layers.Input(shape=(max_len,), dtype='int32') for _ in range(2)
)
lstm1, lstm2 = shared_model(title1_input), shared_model(title2_input)
merged = tf.keras.layers.concatenate([lstm1, lstm2])
#merged = tf.keras.layers.Dense(3, activation='relu')(merged)
output = tf.keras.layers.Dense(3, activation='softmax')(merged)
model = tf.keras.models.Model(inputs=[title1_input, title2_input],
                              outputs=[output])
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 30)]         0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 30)]         0                                            
__________________________________________________________________________________________________
sequential_1 (Sequential)       (None, 2048)         62750720    input_1[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
concatenate (Concatenate)       (None, 4096)         0           sequential_1[1][0]           

In [0]:
#Choose optimizer and compile model
opt_algo = tf.keras.optimizers.RMSprop(0.001)
#opt_algo = tf.keras.optimizers.Adam(0.001)

model.compile(loss='categorical_crossentropy', optimizer=opt_algo, metrics=[get_f1,'accuracy',
                                                                                      tf.keras.metrics.Precision()
                                                                                      ,tf.keras.metrics.Recall()
                                                                                      ])

In [0]:
# num_epochs = 4
# trained_model = model.fit([x_train_seq['title1'], x_train_seq['title2']], np.array(onehot_encoded),
#                            epochs=num_epochs,
#                            validation_split=0.2,
#                            verbose=2)

In [0]:
num_epochs = 5
# Stop once val_loss has not improved for 2 epochs and roll the model back to the
# best epoch; otherwise the model used for the test predictions keeps the (worse)
# weights of the last epoch. `callbacks` is passed as a list, as documented by Keras.
early_stopping = tf.keras.callbacks.EarlyStopping(patience=2, restore_best_weights=True)
trained_model = model.fit([x_train_seq['title1'], x_train_seq['title2']], np.array(onehot_encoded_y_train),
                           epochs=num_epochs,
                           batch_size = 64,
                           callbacks = [early_stopping],
                           validation_data=([x_validation_seq['title1'], x_validation_seq['title2']], np.array(onehot_encoded_y_validation)),
                           verbose=2)

Epoch 1/5
4007/4007 - 1174s - loss: 0.6145 - get_f1: 0.7177 - accuracy: 0.7202 - precision: 0.7230 - recall: 0.7138 - val_loss: 0.6409 - val_get_f1: 0.6836 - val_accuracy: 0.6842 - val_precision: 0.6849 - val_recall: 0.6825
Epoch 2/5
4007/4007 - 1174s - loss: 0.4776 - get_f1: 0.7905 - accuracy: 0.7910 - precision: 0.7924 - recall: 0.7887 - val_loss: 0.6390 - val_get_f1: 0.6872 - val_accuracy: 0.6902 - val_precision: 0.6921 - val_recall: 0.6834
Epoch 3/5
4007/4007 - 1170s - loss: 0.4576 - get_f1: 0.8057 - accuracy: 0.8062 - precision: 0.8075 - recall: 0.8040 - val_loss: 0.6829 - val_get_f1: 0.6895 - val_accuracy: 0.6895 - val_precision: 0.6899 - val_recall: 0.6891
Epoch 4/5
4007/4007 - 1171s - loss: 0.4475 - get_f1: 0.8131 - accuracy: 0.8134 - precision: 0.8147 - recall: 0.8115 - val_loss: 0.6734 - val_get_f1: 0.6691 - val_accuracy: 0.6734 - val_precision: 0.6777 - val_recall: 0.6631


In [0]:
#loss, f1, accuracy, prec, rec = model.evaluate([x_validation_seq['title1'], x_validation_seq['title2']], np.array(y_validation_seq))

## Model creation with HP parameters

In [0]:
def get_opt_algo(algo, learning_rate):
    """Build the Keras optimizer named by `algo` with the given learning rate.

    'rmsprop' and 'adam' select the matching optimizer; any other value falls
    back to SGD.
    """
    optimizers = tf.keras.optimizers
    if algo == 'rmsprop':
        return optimizers.RMSprop(learning_rate)
    if algo == 'adam':
        return optimizers.Adam(learning_rate)
    # For now every other name defaults to SGD.
    return optimizers.SGD(learning_rate)

In [0]:
def getEmbeddingLayer(embedding):
    """Return the Embedding layer for the requested embedding kind.

    'pretrained' gives a 100-dimensional layer with one row per entry of
    `word_index` (plus one), initialised from `embedding_matrix`; any other value
    gives a `vocab_size` x `embedding_size` layer. Both layers are trainable
    and expect inputs of length `max_len`.
    """
    if embedding != 'pretrained':
        return tf.keras.layers.Embedding(vocab_size, embedding_size,
                                         input_length=max_len, trainable=True)
    return tf.keras.layers.Embedding(len(word_index) + 1,
                                     100,
                                     weights=[embedding_matrix],
                                     input_length=max_len,
                                     trainable=True)

In [0]:
# NOTE(review): this repeats the get_f1 definition from the "Model creation working
# without HP parameters" section; running this cell simply rebinds the same function.
def get_f1(y_true, y_pred): #taken from old keras source code
    """F1 score computed from totals over every element of the given tensors.

    Values are clipped to [0, 1] and rounded before counting; K.epsilon() is
    added to each denominator to avoid dividing by zero.
    """
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    recall = true_positives / (possible_positives + K.epsilon())
    f1_val = 2*(precision*recall)/(precision+recall+K.epsilon())
    return f1_val

In [0]:
def getHiddenLayer(layer):
  """Return a bidirectional recurrent layer: LSTM when `layer` is 'LSTM', GRU otherwise.

  The wrapped layer has 512 units, dropout 0.3 and returns only its last output.
  """
  recurrent_cls = tf.keras.layers.LSTM if layer == 'LSTM' else tf.keras.layers.GRU
  return tf.keras.layers.Bidirectional(recurrent_cls(512,dropout=0.3, return_sequences=False))

In [0]:
def train_and_test_model(hparams):
    """Build, train and evaluate one siamese model for a hyper-parameter combination.

    Reads HP_HIDDEN_LAYER, HP_OPTIMIZER, HP_LEARNING_RATE and HP_BATCH_SIZE from
    `hparams`. HP_EMBEDDING_LAYER is not read here: the getEmbeddingLayer call is
    commented out and a fixed 512-dimensional Embedding is used instead.

    The model is trained on the notebook-level `x_train_seq` /
    `onehot_encoded_y_train` and evaluated on `x_validation_seq` /
    `onehot_encoded_y_validation`.

    Returns:
        The tuple (loss, f1, accuracy, recall, precision) — note that recall
        comes BEFORE precision in the returned tuple.
    """
    # Encoder shared by both titles.
    shared_model = tf.keras.models.Sequential([
        #getEmbeddingLayer(hparams[HP_EMBEDDING_LAYER]),
        tf.keras.layers.Embedding(vocab_size, 512, input_length=max_len, trainable=True),
        getHiddenLayer(hparams[HP_HIDDEN_LAYER]),
        #tf.keras.layers.Bidirectional(tf.keras.layers.GRU(512,dropout=0.3, return_sequences=False)),
        #tf.keras.layers.Bidirectional(tf.keras.layers.GRU(hparams[HP_NUM_UNITS],dropout=hparams[HP_DROPOUT], return_sequences=False)),
        #tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(hparams[HP_NUM_UNITS],dropout=0.3, return_sequences=False)),
        #tf.keras.layers.LSTM(hparams[HP_NUM_UNITS],dropout=hparams[HP_DROPOUT]),
    ])
    title1_input = tf.keras.layers.Input(shape=(max_len,), dtype='int32')
    title2_input = tf.keras.layers.Input(shape=(max_len,), dtype='int32')
    lstm1 = shared_model(title1_input)
    lstm2 = shared_model(title2_input)
    # Concatenate the two encodings and classify with a 3-way softmax.
    merged = tf.keras.layers.concatenate([lstm1,lstm2])
    #merged = tf.keras.layers.Dense(3, activation='relu')(merged)
    output = tf.keras.layers.Dense(3, activation='softmax')(merged)
    model = tf.keras.models.Model(inputs=[title1_input, title2_input], outputs=[output])
    opt_algo = get_opt_algo(hparams[HP_OPTIMIZER], hparams[HP_LEARNING_RATE])
    #opt_algo = tf.keras.optimizers.Adam(hparams[HP_LEARNING_RATE])
    # The order of `metrics` fixes the order of the values returned by evaluate().
    model.compile(loss='categorical_crossentropy', optimizer=opt_algo, metrics=[get_f1,'accuracy',
                                                                                      tf.keras.metrics.Precision(),
                                                                                      tf.keras.metrics.Recall()
                                                                                      ])
    num_epochs = 2
    
    # NOTE(review): with num_epochs = 2 and patience=2 the early-stopping callback
    # presumably never triggers — confirm whether that is intended.
    trained_model = model.fit([x_train_seq['title1'], x_train_seq['title2']], np.array(onehot_encoded_y_train),
                           epochs=num_epochs,
                           validation_split=0.2,
                           batch_size = hparams[HP_BATCH_SIZE],
                           callbacks = tf.keras.callbacks.EarlyStopping(patience=2),
                           verbose=2)

    # evaluate() returns the loss followed by the compiled metrics in their compile order.
    loss, f1,accuracy, prec, recall = model.evaluate([x_validation_seq['title1'], x_validation_seq['title2']], np.array(onehot_encoded_y_validation))
    return loss, f1, accuracy, recall, prec 

In [0]:
def run(run_dir, hparams):
    """Run one trial and write its hyper-parameters and metrics to `run_dir`.

    Args:
        run_dir: directory of the TensorBoard summary writer for this trial.
        hparams: dict mapping hp.HParam objects to the values used in this trial.
    """
    with tf.summary.create_file_writer(run_dir).as_default():
        hp.hparams(hparams)  # record the values used in this trial
        # train_and_test_model returns recall BEFORE precision; unpacking in the
        # other order logged each of the two under the other's tag.
        loss, f1, accuracy, recall, prec = train_and_test_model(hparams)
        tf.summary.scalar(METRIC_ACCURACY, accuracy, step=1)
        tf.summary.scalar(METRIC_LOSS, loss, step=1)
        # get_f1 pools the counts of all classes (micro average), so the value is
        # logged under the micro tag rather than the macro tag.
        tf.summary.scalar(METRIC_F1_MIC, f1, step=1)
        tf.summary.scalar(METRIC_PRECISION, prec, step=1)
        tf.summary.scalar(METRIC_RECALL, recall, step=1)

In [0]:
session_num = 0

# Full grid over the hyper-parameters that are currently enabled. The iteration
# order is optimizer (outermost), batch size, hidden layer, learning rate,
# embedding (innermost). Not swept for now: HP_NUM_UNITS, HP_DROPOUT,
# HP_VOCAB_SIZE and HP_ATTENTION.
search_space = itertools.product(
    HP_OPTIMIZER.domain.values,
    HP_BATCH_SIZE.domain.values,
    HP_HIDDEN_LAYER.domain.values,
    HP_LEARNING_RATE.domain.values,
    HP_EMBEDDING_LAYER.domain.values,
)

for optimizer, batch_size, hidden_layer, learning_rate, embedding_layer in search_space:
    hparams = {
        #HP_NUM_UNITS: num_units,
        # HP_DROPOUT: dropout_rate,
        HP_OPTIMIZER: optimizer,
        HP_HIDDEN_LAYER: hidden_layer,
        HP_EMBEDDING_LAYER: embedding_layer,
        #HP_VOCAB_SIZE: vocab_size,
        HP_LEARNING_RATE: learning_rate,
        HP_BATCH_SIZE: batch_size,
        # HP_ATTENTION: attention
    }
    run_name = "run-%d" % session_num
    print('--- Starting trial: %s' % run_name)
    print({h.name: hparams[h] for h in hparams})
    run('logs1/hparam_tuning/' + run_name, hparams)
    session_num += 1

--- Starting trial: run-0
{'optimizer': 'adam', 'hidden_layer': 'GRU', 'embedding': 'from_scratch', 'learning_rate': 0.01, 'batch_size': 64}
Epoch 1/2


KeyboardInterrupt: ignored

In [0]:
# Open TensorBoard inside the notebook on the hyper-parameter tuning logs.
%load_ext tensorboard
%tensorboard --logdir logs1/hparam_tuning/ --host localhost --port 6010

# Predictions on test data working without HP parameters

In [0]:
# Predict with the two titles in both orders and average the two probability tables.
preds_forward = model.predict([x_test_seq['title1'], x_test_seq['title2']], verbose=1)
preds_swapped = model.predict([x_test_seq['title2'], x_test_seq['title1']], verbose=1)
preds = (preds_forward + preds_swapped) / 2



In [0]:
preds

In [0]:
# Index of the most probable class for every test pair (the first index wins on
# ties), as a plain list of ints. np.argmax replaces the hand-written scan over
# the three columns.
results = np.argmax(preds, axis=1).tolist()

In [0]:
# Class index -> Kaggle label; every index other than 0 and 1 maps to "unrelated".
label_by_index = {0: "agreed", 1: "disagreed"}
pred_labels = [label_by_index.get(a, "unrelated") for a in results]
#pred_labels

In [0]:
# Write the submission file: one row per test pair with its id and predicted label.
with open('sample_submission.csv', 'w', newline='') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(['Id','Category'])
    for row_position, label in enumerate(pred_labels):
        writer.writerow([df_test.index[row_position], label])