# Transfer Learning
In this notebook, different pre-trained transformers are used to solve our problem of detecting duplicate questions on Quora.

# Initial Operations

## Needed imports

In [1]:
import os
import time
import shutil

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random as rn

import tensorflow as tf

!pip install -q tf-models-official
!pip install -q -U tensorflow-text 

import tensorflow_hub as hub
import tensorflow_text as text

from keras import backend as K
from tensorflow.keras import layers
from tensorflow.keras.callbacks import ModelCheckpoint

from sklearn import metrics
from sklearn.metrics import f1_score, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight

from official.nlp import optimization  # to create AdamW optimizer
#import shutil

tf.get_logger().setLevel('ERROR')

import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

import re

from string import punctuation
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from tqdm import tqdm

[K     |████████████████████████████████| 2.2 MB 4.3 MB/s 
[K     |████████████████████████████████| 4.9 MB 32.6 MB/s 
[K     |████████████████████████████████| 234 kB 50.3 MB/s 
[K     |████████████████████████████████| 99 kB 9.0 MB/s 
[K     |████████████████████████████████| 47.7 MB 42.8 MB/s 
[K     |████████████████████████████████| 636 kB 37.3 MB/s 
[K     |████████████████████████████████| 1.2 MB 35.5 MB/s 
[K     |████████████████████████████████| 1.1 MB 35.5 MB/s 
[K     |████████████████████████████████| 90 kB 8.3 MB/s 
[K     |████████████████████████████████| 352 kB 38.5 MB/s 
[K     |████████████████████████████████| 43 kB 1.8 MB/s 
[K     |████████████████████████████████| 462 kB 34.5 MB/s 
[?25h  Building wheel for py-cpuinfo (setup.py) ... [?25l[?25hdone
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package 

In [2]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [3]:
# One seed value shared by every random number generator seeded in this cell.
SEED = 42

# Core Python `random` module: start from a well-defined state.
rn.seed(SEED)
# NumPy global random generator: start from a well-defined state.
np.random.seed(SEED)
# TensorFlow random number generation: start from a well-defined state.
tf.random.set_seed(SEED)

# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so setting it
# here presumably only affects processes spawned later — confirm this is intended.
os.environ["PYTHONHASHSEED"] = "0"

## Load the Datasets

In [4]:
# Folder (on the mounted Drive) holding the three pre-split CSV files.
DATASETS_PATH = '/content/drive/MyDrive/Quora/Dataset/'

# Load the training, validation and test splits, in that order.
train_df, val_df, test_df = (
    pd.read_csv(DATASETS_PATH + csv_name)
    for csv_name in ('training.csv', 'validation.csv', 'test.csv')
)

In [5]:
# Split every dataframe into its two question sides by column position:
# columns 1 and 3 form the Q1 frame, columns 2 and 4 form the Q2 frame.
# Presumably these are (qid1, question1) and (qid2, question2) — later cells
# rename `qid1`/`qid2` and read `.question1`/`.question2`; verify against the CSV header.
Q1_train, Q2_train = train_df.iloc[:, [1, 3]], train_df.iloc[:, [2, 4]]
Q1_val, Q2_val = val_df.iloc[:, [1, 3]], val_df.iloc[:, [2, 4]]
Q1_test, Q2_test = test_df.iloc[:, [1, 3]], test_df.iloc[:, [2, 4]]

# Building Vectors Using doc2Vec

Obtain a training set to be used for training the doc2Vec. This set is obtained as the concatenation of question1 and question2 from the training set

In [None]:
# importing doc2vec from gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [None]:
# Give both question sides the same column names (`qid`, `question`) so they can be
# stacked into one corpus. `rename` returns a new frame, leaving Q1_train / Q2_train untouched.
Q1_train_doc2vec = Q1_train.rename(columns={'qid1': 'qid', 'question1': 'question'})
Q2_train_doc2vec = Q2_train.rename(columns={'qid2': 'qid', 'question2': 'question'})

In [None]:
# Stack the Q1 rows on top of the Q2 rows (with a fresh 0..n-1 index) to build the
# corpus used to train Doc2Vec.
# `DataFrame.append` is deprecated and was removed in pandas 2.0; `pd.concat` is the
# supported equivalent and produces the same frame.
Q_doc2vec_train = pd.concat([Q1_train_doc2vec, Q2_train_doc2vec], ignore_index=True)

In [None]:
Q_doc2vec_train

Unnamed: 0,qid,question
0,322068,How can I sell at Snapdeal? What are the terms...
1,353581,Why are most prosecutors in American courts no...
2,312815,What are some good government jobs without a c...
3,104282,What would happen if humans no longer needed t...
4,178955,How do I shave my bikini line?
...,...,...
646857,34814,Is it really true that US is backing ISIS?
646858,457526,How can I study and invest in the Indian share...
646859,175127,How safe is it to use non-HE detergent in a HE...
646860,21122,Would you consider teaching as a full time job?


In [None]:
# Tokenize every question of the Doc2Vec corpus: one list of tokens per question.
tok_quora = list(map(word_tokenize, Q_doc2vec_train.question))

In [None]:
# Wrap each tokenized question in a TaggedDocument tagged with its position in the corpus.
quora_training_data = [
    TaggedDocument(question_tokens, [doc_id])
    for doc_id, question_tokens in enumerate(tok_quora)
]

In [None]:
# Train Doc2Vec on the tagged corpus: 100-dimensional vectors, context window of 5,
# min_count of 3, 25 epochs.
doc_model = Doc2Vec(
    quora_training_data,
    vector_size=100,
    window=5,
    min_count=3,
    epochs=25,
)

In [None]:
# Persist the trained Doc2Vec model to Drive so it can be reloaded without retraining.
doc_model.save("/content/drive/MyDrive/Quora/Models/Transfer Learning/doc2Vec25.model")

In [None]:
# Reload the saved Doc2Vec model from Drive (rebinds `doc_model`).
doc_model = Doc2Vec.load("/content/drive/MyDrive/Quora/Models/Transfer Learning/doc2Vec25.model")

Let's build a function to get embedding vectors of each sentence. Also, let's make sure we are using only those words from sentences in vocabulary.

In [None]:
# function to get vectors from model
def fetch_embeddings(model, tokens):
  """Infer a Doc2Vec vector for one raw question.

  Args:
    model: trained Doc2Vec model; its `wv` vocabulary, `vector_size` and
      `infer_vector` are used (the passed model, not a notebook global).
    tokens: the raw question text; despite the name it is a string and is
      tokenized here with `word_tokenize`.

  Returns:
    The vector returned by `model.infer_vector` for the in-vocabulary tokens,
    or a float32 zero vector of length `model.vector_size` when no token of
    the question is in the vocabulary.
  """
  # gensim >= 4 exposes the vocabulary as `wv.key_to_index`; older releases use `wv.vocab`.
  vocabulary = getattr(model.wv, 'key_to_index', None)
  if vocabulary is None:
    vocabulary = model.wv.vocab

  # Membership is tested directly on the mapping (hash lookup) instead of
  # rebuilding a list of the whole vocabulary for every token.
  known_tokens = [tok for tok in word_tokenize(tokens) if tok in vocabulary]

  # if no word is present in the vocabulary the vector becomes zero
  if known_tokens:
    return model.infer_vector(known_tokens)
  return np.zeros(getattr(model, 'vector_size', 100), dtype=np.float32)

In [None]:
def single_get_features_doc2vec(questions):
  """Embed every question with the notebook-level `doc_model`.

  Args:
    questions: iterable of raw question strings (progress is shown with tqdm).

  Returns:
    NumPy array built from one embedding list per question, in input order.
  """
  embedded_rows = [list(fetch_embeddings(doc_model, text)) for text in tqdm(questions)]
  return np.asarray(embedded_rows)

In [None]:
def get_features_doc2vec(question1, question2, set_type,
                         features_dir='/content/drive/MyDrive/Quora/Features/Doc2Vec/'):
  """Compute and save Doc2Vec features for a set of question pairs.

  Three `.npy` files are written under `<features_dir>/<set_type>/`:
  `question1_doc2vec`, `question2_doc2vec` and `full_question_doc2vec`
  (row i is the question1 embedding followed by the question2 embedding).

  Args:
    question1: iterable of raw question1 strings.
    question2: iterable of raw question2 strings, aligned with `question1`.
    set_type: sub-folder name, e.g. 'training', 'test' or 'validation'.
    features_dir: base output folder; defaults to the original Drive location.

  Raises:
    ValueError: if the two question collections yield a different number of rows.
  """
  emb_question1 = single_get_features_doc2vec(question1)
  emb_question2 = single_get_features_doc2vec(question2)
  if emb_question1.shape[0] != emb_question2.shape[0]:
    raise ValueError(
        'question1 and question2 must have the same number of rows, got '
        + str(emb_question1.shape[0]) + ' and ' + str(emb_question2.shape[0]))

  # Same locations as before (base + set_type + '/<name>'), created if missing.
  out_dir = os.path.join(features_dir, set_type)
  os.makedirs(out_dir, exist_ok=True)

  # save them
  np.save(os.path.join(out_dir, 'question1_doc2vec'), emb_question1)
  np.save(os.path.join(out_dir, 'question2_doc2vec'), emb_question2)

  # merge them: the shape follows the data instead of a hard-coded (323431, 200),
  # which padded the smaller test/validation sets with all-zero rows.
  # float64 keeps the dtype of the previously saved matrix.
  questions_features = np.concatenate([emb_question1, emb_question2], axis=1).astype(np.float64)

  # save the final vector
  np.save(os.path.join(out_dir, 'full_question_doc2vec'), questions_features)

Test correctness

In [None]:
# Sanity check: embed every question1 of the training split and keep the result as an
# array. `single_get_features_doc2vec` runs the same tqdm loop over `fetch_embeddings`
# and the same list -> array conversion.
doc_embeddings = single_get_features_doc2vec(Q1_train.question1)

100%|██████████| 323431/323431 [46:47<00:00, 115.20it/s]


In [None]:
doc_embeddings.shape

(323431, 100)

Extract the features

In [None]:
# Compute and save the Doc2Vec features for the training, test and validation splits.
# Each call embeds question1 and question2 separately (two progress bars per split).
get_features_doc2vec(Q1_train.question1, Q2_train.question2, 'training')
get_features_doc2vec(Q1_test.question1, Q2_test.question2, 'test')
get_features_doc2vec(Q1_val.question1, Q2_val.question2, 'validation')

100%|██████████| 323431/323431 [1:15:04<00:00, 71.81it/s]
100%|██████████| 323431/323431 [1:07:43<00:00, 79.59it/s]
100%|██████████| 40428/40428 [08:05<00:00, 83.32it/s]
100%|██████████| 40428/40428 [08:04<00:00, 83.47it/s]
100%|██████████| 40428/40428 [08:01<00:00, 83.88it/s]
100%|██████████| 40428/40428 [08:02<00:00, 83.83it/s]


# Building Vectors Using Sentence-BERT

In [6]:
# install SBERT
!pip install sentence-transformers

# import the SBERT
from sentence_transformers import SentenceTransformer 

Collecting sentence-transformers
  Downloading sentence-transformers-2.2.0.tar.gz (79 kB)
[K     |████████████████████████████████| 79 kB 3.2 MB/s 
[?25hCollecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.17.0-py3-none-any.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 9.6 MB/s 
Collecting huggingface-hub
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 885 kB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.47-py2.py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 32.2 MB/s 
Collecting tokenizers!=0.11.3,>=0.11.1
  Downloading tokenizers-0.11.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.5 MB)
[K     |████████████████████████████████| 6.5 MB 23.9 MB/s 
Building wheels for collected packages: sentence-transformers
  Building wheel for sentence-transformers (setup.py) ... [?25l[?25hdone
  Created wheel for sentence-transformers: filename=sentenc

In [7]:
# Load the pre-trained paraphrase-MiniLM-L12-v2 sentence-transformers model.
sbert_model = SentenceTransformer('paraphrase-MiniLM-L12-v2')

Downloading:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.70k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/631 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/134M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/316 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
def get_features(question1, question2, set_type,
                 features_dir='/content/drive/MyDrive/Quora/Features/Sentence BERT/'):
  """Encode question pairs with `sbert_model` and save the features.

  Three `.npy` files are written under `<features_dir>/<set_type>/`:
  `question1_sBERT`, `question2_sBERT` and `full_question_sBERT`
  (row i is the question1 embedding followed by the question2 embedding).

  Args:
    question1: question1 texts, passed as-is to `sbert_model.encode`.
    question2: question2 texts, aligned with `question1`.
    set_type: sub-folder name, e.g. 'training', 'test' or 'validation'.
    features_dir: base output folder; defaults to the original Drive location.

  Raises:
    ValueError: if the two encodings have a different number of rows.
  """
  sentence_embeddings_BERT1 = np.asarray(sbert_model.encode(question1))
  sentence_embeddings_BERT2 = np.asarray(sbert_model.encode(question2))
  if sentence_embeddings_BERT1.shape[0] != sentence_embeddings_BERT2.shape[0]:
    raise ValueError(
        'question1 and question2 must have the same number of rows, got '
        + str(sentence_embeddings_BERT1.shape[0]) + ' and '
        + str(sentence_embeddings_BERT2.shape[0]))

  # Same locations as before (base + set_type + '/<name>'), created if missing.
  out_dir = os.path.join(features_dir, set_type)
  os.makedirs(out_dir, exist_ok=True)

  # save them
  np.save(os.path.join(out_dir, 'question1_sBERT'), sentence_embeddings_BERT1)
  np.save(os.path.join(out_dir, 'question2_sBERT'), sentence_embeddings_BERT2)

  # merge them: the shape follows the data. The old code sized the matrix from
  # `set_type == 'train'`, but callers pass 'training', so the training set was
  # given 40428 rows and the fill loop ran out of bounds.
  # float64 keeps the dtype of the previously saved matrix.
  questions_features = np.concatenate(
      [sentence_embeddings_BERT1, sentence_embeddings_BERT2], axis=1).astype(np.float64)

  # save the final vector
  np.save(os.path.join(out_dir, 'full_question_sBERT'), questions_features)

In [None]:
# Encode and save the Sentence-BERT features for the training, test and validation splits.
get_features(Q1_train.question1, Q2_train.question2, 'training')
get_features(Q1_test.question1, Q2_test.question2, 'test')
get_features(Q1_val.question1, Q2_val.question2, 'validation')

In [8]:
# Encode only question2 of the training split and save it as the training question2_sBERT file.
sentence_embeddings_BERT2 = sbert_model.encode(Q2_train.question2)
np.save('/content/drive/MyDrive/Quora/Features/Sentence BERT/training/question2_sBERT', np.array(sentence_embeddings_BERT2))

In [9]:
# Encode only question2 of the test split and save it as the test question2_sBERT file.
sentence_embeddings_BERT2 = sbert_model.encode(Q2_test.question2)
np.save('/content/drive/MyDrive/Quora/Features/Sentence BERT/test/question2_sBERT', np.array(sentence_embeddings_BERT2))

In [10]:
# Encode only question2 of the validation split and save it as the validation question2_sBERT file.
sentence_embeddings_BERT2 = sbert_model.encode(Q2_val.question2)
np.save('/content/drive/MyDrive/Quora/Features/Sentence BERT/validation/question2_sBERT', np.array(sentence_embeddings_BERT2))

# Building Vectors Using all-distilroberta-v1

In [6]:
# install SBERT
!pip install sentence-transformers

# import the SBERT
from sentence_transformers import SentenceTransformer 

Collecting sentence-transformers
  Downloading sentence-transformers-2.2.0.tar.gz (79 kB)
[K     |████████████████████████████████| 79 kB 383 kB/s 
[?25hCollecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.17.0-py3-none-any.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 23.1 MB/s 
Collecting huggingface-hub
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 4.4 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.47-py2.py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 36.9 MB/s 
Collecting tokenizers!=0.11.3,>=0.11.1
  Downloading tokenizers-0.11.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.5 MB)
[K     |████████████████████████████████| 6.5 MB 32.3 MB/s 
Building wheels for collected packages: sentence-transformers
  Building wheel for sentence-transformers (setup.py) ... [?25l[?25hdone
  Created wheel for sentence-transformers: filename=senten

In [7]:
# Load the pre-trained all-distilroberta-v1 sentence-transformers model (rebinds `sbert_model`).
sbert_model = SentenceTransformer('all-distilroberta-v1')

Downloading:   0%|          | 0.00/737 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/9.86k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/653 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/15.7k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/329M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/333 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [8]:
def get_features(question1, question2, set_type,
                 features_dir='/content/drive/MyDrive/Quora/Features/all-distilroberta-v1/'):
  """Encode question pairs with `sbert_model` and save the features.

  Three `.npy` files are written under `<features_dir>/<set_type>/`:
  `question1_distilroberta`, `question2_distilroberta` and
  `full_question_distilroberta` (row i is the question1 embedding followed
  by the question2 embedding).

  Args:
    question1: question1 texts, passed as-is to `sbert_model.encode`.
    question2: question2 texts, aligned with `question1`.
    set_type: sub-folder name, e.g. 'training', 'test' or 'validation'.
    features_dir: base output folder; defaults to the original Drive location.

  Raises:
    ValueError: if the two encodings have a different number of rows.
  """
  sentence_embeddings_BERT1 = np.asarray(sbert_model.encode(question1))
  sentence_embeddings_BERT2 = np.asarray(sbert_model.encode(question2))
  if sentence_embeddings_BERT1.shape[0] != sentence_embeddings_BERT2.shape[0]:
    raise ValueError(
        'question1 and question2 must have the same number of rows, got '
        + str(sentence_embeddings_BERT1.shape[0]) + ' and '
        + str(sentence_embeddings_BERT2.shape[0]))

  # Same locations as before (base + set_type + '/<name>'), created if missing.
  out_dir = os.path.join(features_dir, set_type)
  os.makedirs(out_dir, exist_ok=True)

  # save them
  np.save(os.path.join(out_dir, 'question1_distilroberta'), sentence_embeddings_BERT1)
  np.save(os.path.join(out_dir, 'question2_distilroberta'), sentence_embeddings_BERT2)

  # merge them: the shape follows the data instead of the hard-coded
  # 40428 / 323431 rows and 1536 columns, so any split size or embedding width works.
  # float64 keeps the dtype of the previously saved matrix.
  questions_features = np.concatenate(
      [sentence_embeddings_BERT1, sentence_embeddings_BERT2], axis=1).astype(np.float64)

  # save the final vector
  np.save(os.path.join(out_dir, 'full_question_distilroberta'), questions_features)

In [9]:
# Encode and save the all-distilroberta-v1 features for the training, test and validation splits.
get_features(Q1_train.question1, Q2_train.question2, 'training')
get_features(Q1_test.question1, Q2_test.question2, 'test')
get_features(Q1_val.question1, Q2_val.question2, 'validation')

# Building Vectors Using all-mpnet-base-v2

In [10]:
# install SBERT
!pip install sentence-transformers

# import the SBERT
from sentence_transformers import SentenceTransformer 



In [11]:
# Load the pre-trained all-mpnet-base-v2 sentence-transformers model (rebinds `sbert_model`).
sbert_model = SentenceTransformer('all-mpnet-base-v2')

Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/10.1k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [12]:
def get_features(question1, question2, set_type,
                 features_dir='/content/drive/MyDrive/Quora/Features/all-mpnet-base-v2/'):
  """Encode question pairs with `sbert_model` and save the features.

  Three `.npy` files are written under `<features_dir>/<set_type>/`:
  `question1_mpnet`, `question2_mpnet` and `full_question_mpnet`
  (row i is the question1 embedding followed by the question2 embedding).

  Args:
    question1: question1 texts, passed as-is to `sbert_model.encode`.
    question2: question2 texts, aligned with `question1`.
    set_type: sub-folder name, e.g. 'training', 'test' or 'validation'.
    features_dir: base output folder; defaults to the original Drive location.

  Raises:
    ValueError: if the two encodings have a different number of rows.
  """
  sentence_embeddings_BERT1 = np.asarray(sbert_model.encode(question1))
  sentence_embeddings_BERT2 = np.asarray(sbert_model.encode(question2))
  if sentence_embeddings_BERT1.shape[0] != sentence_embeddings_BERT2.shape[0]:
    raise ValueError(
        'question1 and question2 must have the same number of rows, got '
        + str(sentence_embeddings_BERT1.shape[0]) + ' and '
        + str(sentence_embeddings_BERT2.shape[0]))

  # Same locations as before (base + set_type + '/<name>'), created if missing.
  out_dir = os.path.join(features_dir, set_type)
  os.makedirs(out_dir, exist_ok=True)

  # save them
  np.save(os.path.join(out_dir, 'question1_mpnet'), sentence_embeddings_BERT1)
  np.save(os.path.join(out_dir, 'question2_mpnet'), sentence_embeddings_BERT2)

  # merge them: the shape follows the data instead of the hard-coded
  # 40428 / 323431 rows and 1536 columns, so any split size or embedding width works.
  # float64 keeps the dtype of the previously saved matrix.
  questions_features = np.concatenate(
      [sentence_embeddings_BERT1, sentence_embeddings_BERT2], axis=1).astype(np.float64)

  # save the final vector
  np.save(os.path.join(out_dir, 'full_question_mpnet'), questions_features)

In [13]:
# Encode and save the all-mpnet-base-v2 features for the training, test and validation splits.
get_features(Q1_train.question1, Q2_train.question2, 'training')
get_features(Q1_test.question1, Q2_test.question2, 'test')
get_features(Q1_val.question1, Q2_val.question2, 'validation')

# GPT

In [None]:
!pip install pytorch_pretrained_bert

# importing required tokenizer, OpenAiGPT model
import torch
from pytorch_pretrained_bert import OpenAIGPTTokenizer, OpenAIGPTModel

Collecting pytorch_pretrained_bert
  Downloading pytorch_pretrained_bert-0.6.2-py3-none-any.whl (123 kB)
[?25l[K     |██▋                             | 10 kB 19.4 MB/s eta 0:00:01[K     |█████▎                          | 20 kB 15.4 MB/s eta 0:00:01[K     |████████                        | 30 kB 8.5 MB/s eta 0:00:01[K     |██████████▋                     | 40 kB 7.9 MB/s eta 0:00:01[K     |█████████████▎                  | 51 kB 4.3 MB/s eta 0:00:01[K     |███████████████▉                | 61 kB 5.1 MB/s eta 0:00:01[K     |██████████████████▌             | 71 kB 5.1 MB/s eta 0:00:01[K     |█████████████████████▏          | 81 kB 5.6 MB/s eta 0:00:01[K     |███████████████████████▉        | 92 kB 5.8 MB/s eta 0:00:01[K     |██████████████████████████▌     | 102 kB 5.0 MB/s eta 0:00:01[K     |█████████████████████████████▏  | 112 kB 5.0 MB/s eta 0:00:01[K     |███████████████████████████████▊| 122 kB 5.0 MB/s eta 0:00:01[K     |████████████████████████████████| 

ImportError: ignored

In [None]:
# Load the pre-trained 'openai-gpt' tokenizer.
tok_gpt = OpenAIGPTTokenizer.from_pretrained('openai-gpt')

In [None]:
# Load the pre-trained 'openai-gpt' model and call `eval()` on it
# (presumably to switch it to inference mode — it is only used for feature extraction here).
model_gpt = OpenAIGPTModel.from_pretrained('openai-gpt')
model_gpt.eval()

In [None]:
def fetch_gpt_vectors(question):
  """Tokenize a question for the GPT model (unfinished).

  NOTE(review): this function is incomplete in the notebook: `emb` is never
  updated, the model is never called and nothing is returned (implicit None).
  Only the misspelled tokenizer call has been corrected here.

  Args:
    question: raw question text.
  """
  # tokenize words
  words = word_tokenize(question)
  emb = np.zeros((1,768))

  # get vector for each word
  for word in words:
    w = tok_gpt.tokenize(word)
    # was `conver_tokens_to_ids`, which raised AttributeError on the first word
    indexed_words = tok_gpt.convert_tokens_to_ids(w)
    

# Find similar questions

In [None]:
# import
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
def top_n_questions(user, embeddings, df, n=5):
  """Return the rows of `df` whose embeddings are most similar to a query.

  Args:
    user: query vector(s) passed to `cosine_similarity`; only the first row
      of the similarity matrix is used.
    embeddings: matrix of candidate embeddings, one row per row of `df`.
    df: dataframe aligned with `embeddings`.
    n: number of rows to return (default 5, the previously hard-coded value).

  Returns:
    The first two columns of the `n` most similar rows of `df`, most similar first.
  """
  # cosine similarity between the user query and every row of the data set
  similarities = cosine_similarity(user, embeddings).tolist()[0]

  # Ascending stable sort, then reversed: the same ordering as before,
  # including how ties are ordered.
  ascending = sorted(range(len(similarities)), key=similarities.__getitem__)
  top_indices = ascending[::-1][:n]

  # Print the positions of the rows being returned. The old code printed the
  # first entries of the ascending sort, i.e. the least similar rows.
  print(top_indices)

  # get the rows from the dataframe
  return df.iloc[top_indices, [0, 1]]

In [None]:
def get_input_vector(query, model):
  """Embed a user query with the selected model.

  Args:
    query: the question text to embed.
    model: one of 'Doc2Vec', 'BERT' or 'GPT'.

  Returns:
    For 'Doc2Vec' and 'BERT', the embedding reshaped to a single row (1, dim).
    For 'GPT', whatever `fetch_gpt_vectors` returns.

  Raises:
    ValueError: if `model` is not one of the supported names (previously this
      fell through to `return k` and raised UnboundLocalError).
  """
  if model == 'Doc2Vec':
    return fetch_embeddings(doc_model, query).reshape(1, -1)
  if model == 'BERT':
    # `sbert_model` is whichever sentence-transformers model was loaded last.
    return sbert_model.encode(str(query)).reshape(1, -1)
  if model == 'GPT':
    # No reshape here, as in the original branch.
    return fetch_gpt_vectors(query)
  raise ValueError("Unknown model " + repr(model) + "; expected 'Doc2Vec', 'BERT' or 'GPT'")