# **Word2Vec**

In [0]:
import gensim
import logging
import pandas as pd
from sklearn.model_selection import train_test_split

In [0]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [0]:
data=pd.read_csv("/content/drive/My Drive/Precily/Text_Similarity_Dataset.csv")

In [0]:
train,test = train_test_split(data, random_state=30, test_size = 0.2)
X_train = train.iloc[:,1:3]
X_test = test.iloc[:,1:3]
X_train.shape

(3218, 2)

In [0]:
X_test.head

<bound method NDFrame.head of                                                   text1                                              text2
3661  royal couple watch nation s mood prince charle...  firefox browser takes on microsoft microsoft s...
307   young debut cut short by ginepri fifteen-year-...  top gig award for scissor sisters new york ban...
1670  kennedy predicts bigger turnout voters   pent ...  straw backs ending china embargo uk foreign se...
2268  howard s unfinished business  he s not finishe...  custody death rate  shocks  mps deaths in cust...
1319  bafta to hand out movie honours movie stars fr...  ireland surge past scots ireland maintained th...
...                                                 ...                                                ...
3351  wenger offers mutu hope arsenal boss arsene we...  baghdad blogger on big screen a film based on ...
999   firefox browser takes on microsoft microsoft s...  gamers snap up new sony psp gamers have bought...
1752  o

In [0]:
def extract_text():
    """
    Extract questions for making word2vec model.
    """
    df1 = X_train
    df2 = X_test

    for dataset in [df1, df2]:
        for i, row in dataset.iterrows():
            if i != 0 and i % 1000 == 0:
                logging.info("read {0} sentences".format(i))

            if row['text1']:
                yield gensim.utils.simple_preprocess(row['text1'])
            if row['text2']:
                yield gensim.utils.simple_preprocess(row['text2'])

In [0]:
documents = list(extract_text())
logging.info("Done reading data file")

model = gensim.models.Word2Vec(documents, size=300)
model.train(documents, total_examples=len(documents), epochs=10)
model.save("/content/Text_Pairs.w2v")

2020-02-19 15:46:52,868 : INFO : read 2000 sentences
2020-02-19 15:46:53,235 : INFO : read 4000 sentences
2020-02-19 15:46:54,658 : INFO : read 3000 sentences
2020-02-19 15:46:55,202 : INFO : read 1000 sentences
2020-02-19 15:46:56,315 : INFO : Done reading data file
2020-02-19 15:46:56,316 : INFO : collecting all words and their counts
2020-02-19 15:46:56,317 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-02-19 15:46:56,850 : INFO : collected 27820 word types from a corpus of 2978631 raw words and 8046 sentences
2020-02-19 15:46:56,852 : INFO : Loading a fresh vocabulary
2020-02-19 15:46:57,014 : INFO : effective_min_count=5 retains 18552 unique words (66% of original 27820, drops 9268)
2020-02-19 15:46:57,015 : INFO : effective_min_count=5 leaves 2944874 word corpus (98% of original 2978631, drops 33757)
2020-02-19 15:46:57,071 : INFO : deleting the raw counts dictionary of 27820 items
2020-02-19 15:46:57,073 : INFO : sample=0.001 downsamples 42 most-

# **UTIL**

In [0]:
import re

from tensorflow.python.keras import backend as K
from tensorflow.python.keras.layers import Layer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

from nltk.corpus import stopwords
from gensim.models import KeyedVectors

import gensim

import numpy as np

import itertools

In [0]:
def text_to_word_list(text):
    # Pre process and convert texts to a list of words
    text = str(text)
    text = text.lower()

    # Clean the text
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)

    text = text.split()

    return text

In [0]:
def make_w2v_embeddings(data, embedding_dim=300, empty_w2v=False):
    vocabs = {}
    vocabs_cnt = 0

    vocabs_not_w2v = {}
    vocabs_not_w2v_cnt = 0

    # Stopwords
    stops = set(stopwords.words('english'))

    # Load word2vec
    print("Loading word2vec model(it may takes 2-3 mins) ...")

    if empty_w2v:
        word2vec = EmptyWord2Vec
    else:
        word2vec = KeyedVectors.load_word2vec_format("/content/drive/My Drive/GoogleNews-vectors-negative300.bin.gz", binary=True)
        # word2vec = gensim.models.word2vec.Word2Vec.load("./data/Quora-Question-Pairs.w2v").wv

    for index, row in data.iterrows():
        # Print the number of embedded sentences.
        if index != 0 and index % 1000 == 0:
            print("{:,} sentences embedded.".format(index), flush=True)

        # Iterate through the text of both questions of the row
        for text in ['text1', 'text2']:

            t2t = []  # t2t -> text numbers representation
            for word in text_to_word_list(row[text]):
                # Check for unwanted words
                if word in stops:
                    continue

                # If a word is missing from word2vec model.
                if word not in word2vec.vocab:
                    if word not in vocabs_not_w2v:
                        vocabs_not_w2v_cnt += 1
                        vocabs_not_w2v[word] = 1

                # If you have never seen a word, append it to vocab dictionary.
                if word not in vocabs:
                    vocabs_cnt += 1
                    vocabs[word] = vocabs_cnt
                    t2t.append(vocabs_cnt)
                else:
                    t2t.append(vocabs[word])

            # Append question as number representation
            data.at[index, text + '_n'] = t2t

    embeddings = 1 * np.random.randn(len(vocabs) + 1, embedding_dim)  # This will be the embedding matrix
    embeddings[0] = 0  # So that the padding will be ignored

    # Build the embedding matrix
    for word, index in vocabs.items():
        if word in word2vec.vocab:
            embeddings[index] = word2vec.word_vec(word)
    del word2vec

    return data, embeddings


In [0]:
def split_and_zero_padding(df, max_seq_length):
    # Split to dicts
    X = {'left': df['text1_n'], 'right': df['text2_n']}

    # Zero padding
    for dataset, side in itertools.product([X], ['left', 'right']):
        dataset[side] = pad_sequences(dataset[side], padding='pre', truncating='post', maxlen=max_seq_length)

    return dataset

In [0]:
class ManDist(Layer):
    """
    Keras Custom Layer that calculates Manhattan Distance.
    """

    # initialize the layer, No need to include inputs parameter!
    def __init__(self, **kwargs):
        self.result = None
        super(ManDist, self).__init__(**kwargs)

    # input_shape will automatic collect input shapes to build layer
    def build(self, input_shape):
        super(ManDist, self).build(input_shape)

    # This is where the layer's logic lives.
    def call(self, x, **kwargs):
        self.result = K.exp(-K.sum(K.abs(x[0] - x[1]), axis=1, keepdims=True))
        return self.result

    # return output shape
    def compute_output_shape(self, input_shape):
        return K.int_shape(self.result)


In [0]:
class EmptyWord2Vec:
    """
    Just for test use.
    """
    vocab = {}
    word_vec = {}

# **TRAIN**

In [0]:
from time import time
import pandas as pd

import matplotlib

matplotlib.use('Agg')
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

import tensorflow as tf

from tensorflow.python.keras.models import Model, Sequential
from tensorflow.python.keras.layers import Input, Embedding, LSTM, GRU, Conv1D, Conv2D, GlobalMaxPool1D, Dense, Dropout


In [0]:
# Load training set
train_df = X_train
for q in ['text1', 'text2']:
    train_df[q + '_n'] = train_df[q]


In [0]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [0]:
# Make word2vec embeddings
embedding_dim = 300
max_seq_length = 20
use_w2v = True

train_df, embeddings = make_w2v_embeddings(train_df, embedding_dim=embedding_dim, empty_w2v=not use_w2v)

2020-02-19 16:14:39,139 : INFO : loading projection weights from /content/drive/My Drive/GoogleNews-vectors-negative300.bin.gz


Loading word2vec model(it may takes 2-3 mins) ...


  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
2020-02-19 16:16:13,357 : INFO : loaded (3000000, 300) matrix from /content/drive/My Drive/GoogleNews-vectors-negative300.bin.gz


2,000 sentences embedded.
4,000 sentences embedded.
3,000 sentences embedded.
1,000 sentences embedded.


# **Further code can be executed if labels would be given.**

In [0]:
# Split to train validation
validation_size = int(len(train_df) * 0.1)
training_size = len(train_df) - validation_size

X = train_df[['text1_n', 'text2_n']]
Y = train_df['similarity_score']

X_train, X_validation, Y_train, Y_validation = train_test_split(X, Y, test_size=validation_size)

X_train = split_and_zero_padding(X_train, max_seq_length)
X_validation = split_and_zero_padding(X_validation, max_seq_length)


In [0]:
# Convert labels to their numpy representations
Y_train = Y_train.values
Y_validation = Y_validation.values


In [0]:
# Make sure everything is ok
assert X_train['left'].shape == X_train['right'].shape
assert len(X_train['left']) == len(Y_train)

In [0]:
# Model variables
gpus = 2
batch_size = 400 * gpus
n_epoch = 50
n_hidden = 50

In [0]:
# Define the shared model
x = Sequential()
x.add(Embedding(len(embeddings), embedding_dim,
                weights=[embeddings], input_shape=(max_seq_length,), trainable=False))

x.add(LSTM(n_hidden))

shared_model = x

In [0]:
# The visible layer
left_input = Input(shape=(max_seq_length,), dtype='int32')
right_input = Input(shape=(max_seq_length,), dtype='int32')

In [0]:
# Pack it all up into a Manhattan Distance model
malstm_distance = ManDist()([shared_model(left_input), shared_model(right_input)])
model = Model(inputs=[left_input, right_input], outputs=[malstm_distance])

if gpus >= 2:
    # `multi_gpu_model()` is a so quite buggy. it breaks the saved model.
    model = tf.keras.utils.multi_gpu_model(model, gpus=gpus)
model.compile(loss='mean_squared_error', optimizer=tf.keras.optimizers.Adam(), metrics=['accuracy'])
model.summary()
shared_model.summary()

In [0]:
# Start trainings
training_start_time = time()
malstm_trained = model.fit([X_train['left'], X_train['right']], Y_train,
                           batch_size=batch_size, epochs=n_epoch,
                           validation_data=([X_validation['left'], X_validation['right']], Y_validation))
training_end_time = time()
print("Training time finished.\n%d epochs in %12.2f" % (n_epoch,
                                                        training_end_time - training_start_time))

model.save('/content/SiameseLSTM.h5')

In [0]:
# Plot accuracy
plt.subplot(211)
plt.plot(malstm_trained.history['acc'])
plt.plot(malstm_trained.history['val_acc'])
plt.title('Model Accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation'], loc='upper left')

# Plot loss
plt.subplot(212)
plt.plot(malstm_trained.history['loss'])
plt.plot(malstm_trained.history['val_loss'])
plt.title('Model Loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation'], loc='upper right')

plt.tight_layout(h_pad=1.0)
plt.savefig('/content/history-graph.png')

print(str(malstm_trained.history['val_acc'][-1])[:6] +
      "(max: " + str(max(malstm_trained.history['val_acc']))[:6] + ")")
print("Done.")

In [0]:
import pandas as pd

import tensorflow as tf

In [0]:
TEST_CSV = X_test

In [0]:
# Load training set
test_df = TEST_CSV
for q in ['text1', 'text2']:
    test_df[q + '_n'] = test_df[q]


In [0]:
# Make word2vec embeddings
embedding_dim = 300
max_seq_length = 20
test_df, embeddings = make_w2v_embeddings(test_df, embedding_dim=embedding_dim, empty_w2v=False)

In [0]:
# Split to dicts and append zero padding.
X_test = split_and_zero_padding(test_df, max_seq_length)

In [0]:
# Make sure everything is ok
assert X_test['left'].shape == X_test['right'].shape


In [0]:
# --

model = tf.keras.models.load_model('/content/SiameseLSTM.h5', custom_objects={'ManDist': ManDist})
model.summary()

prediction = model.predict([X_test['left'], X_test['right']])
print(prediction)