In [0]:
# Input files: test dataset (or whole dataset), paragraphs file, addInfo model
# Output: zip file containing the testing dataset with the selected paragraph

In [0]:
# Disabled environment setup: the pip commands below are wrapped in a string
# literal, so nothing is installed when this cell runs.
'''!pip install tensorflow-gpu==2.0
!pip install tensorflow_hub
!pip install bert-for-tf2
!pip install sentencepiece'''

In [25]:
import tensorflow as tf
import tensorflow_hub as hub
print("TF version: ", tf.__version__)
print("Hub version: ", hub.__version__)

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler

TF version:  2.0.0
Hub version:  0.8.0


In [0]:
import bert
from tensorflow.keras.models import Model       # Keras is the new high level API for TensorFlow
#import math
import numpy as np
import zipfile
import json
import os
import glob
from sklearn.metrics.pairwise import cosine_similarity

In [0]:
# Root directory under which every input and output path below is built.
basePath = '/content/'

# data_dir: dataset root; the test questions are read from its 'test' subfolder.
# drive_dir: only referenced by the disabled zip-extraction cell.
# paraFile: JSON file whose keys 'para1', 'para2', ... hold the candidate paragraphs.
data_dir = basePath+'dataset'
drive_dir = basePath+'drive/My Drive/temp/DatasetN'
paraFile = basePath + 'paragraph2.json'

# Disabled alternative paths (kept as a string literal, so they have no effect).
'''data_dir = basePath+'tempdataset'
drive_dir = basePath+'drive/My Drive/temp/tempDataset'
paraFile = basePath + 'temp.json' '''

# modelSave: checkpoint loaded into the addInfo network by restorModel().
# outDir: folder that receives the rewritten test files before they are zipped.
modelSave = basePath+'bert_weights_sum_800.pth'
outDir = basePath+'test'

# embSize: input and output width of the addInfo network (see initialize_model).
# max_seq_length: length of the three BERT input vectors (ids, mask, segments).
embSize = 768
max_seq_length = 512

In [0]:
# Disabled: the code inside this string literal would extract every .zip
# archive found in drive_dir into /content/. It does not run as written.
'''for file_name in os.listdir(drive_dir):
    if file_name.endswith('.zip'):
        with zipfile.ZipFile(drive_dir+'/'+file_name,'r') as zip_dir:
            zip_dir.extractall(path='/content/')'''

In [0]:
def getBERTModel():
    """Build the frozen BERT encoder together with its matching tokenizer.

    Three int32 Keras inputs of length ``max_seq_length`` (word ids, mask and
    segment ids) are fed to the TF-Hub BERT layer, which is loaded with
    ``trainable=False``.

    Returns:
        dict: ``{'model': ..., 'tokenizer': ...}``. ``model`` is a Keras
        ``Model`` whose outputs are ``[pooled_output, sequence_output]``;
        ``tokenizer`` is a ``FullTokenizer`` created from the vocabulary file
        and lower-casing flag exposed by the hub layer.
    """
    def _int_input(layer_name):
        # One fixed-length int32 input tensor with an explicit layer name.
        return tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32, name=layer_name)

    bert_inputs = [
        _int_input(layer_name)
        for layer_name in ("input_word_ids", "input_mask", "segment_ids")
    ]

    # Alternative checkpoint left by the original author:
    # "https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/1"
    encoder = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1", trainable=False)
    pooled_output, sequence_output = encoder(bert_inputs)

    keras_model = Model(inputs=bert_inputs, outputs=[pooled_output, sequence_output])

    # The hub layer carries the vocabulary and casing settings it was trained with.
    resolved = encoder.resolved_object
    wordpiece_tokenizer = bert.bert_tokenization.FullTokenizer(
        resolved.vocab_file.asset_path.numpy(),
        resolved.do_lower_case.numpy(),
    )

    return {'model': keras_model, 'tokenizer': wordpiece_tokenizer}

In [0]:
def get_masks(tokens, max_seq_length):
    """Return the attention mask for ``tokens`` padded to ``max_seq_length``.

    Args:
        tokens: sequence of tokens (only its length is used).
        max_seq_length: total length of the returned list.

    Returns:
        list[int]: ``1`` for every token position followed by ``0`` padding.

    Raises:
        IndexError: if there are more tokens than ``max_seq_length``.
    """
    n_tokens = len(tokens)
    pad_len = max_seq_length - n_tokens
    if pad_len < 0:
        raise IndexError("Token length more than max seq length!")
    mask = [1] * n_tokens
    mask.extend([0] * pad_len)
    return mask


def get_segments(tokens, max_seq_length):
    """Return segment ids for ``tokens`` padded with zeros to ``max_seq_length``.

    Every token up to and including the first ``"[SEP]"`` gets id ``0``; every
    token after it gets id ``1``.

    Args:
        tokens: sequence of token strings.
        max_seq_length: total length of the returned list.

    Returns:
        list[int]: segment ids followed by ``0`` padding.

    Raises:
        IndexError: if there are more tokens than ``max_seq_length``.
    """
    n_tokens = len(tokens)
    if n_tokens > max_seq_length:
        raise IndexError("Token length more than max seq length!")

    segment_ids = []
    seen_separator = False
    for tok in tokens:
        segment_ids.append(1 if seen_separator else 0)
        # The switch happens *after* the separator itself has been labelled.
        seen_separator = seen_separator or tok == "[SEP]"

    segment_ids.extend([0] * (max_seq_length - n_tokens))
    return segment_ids


def get_ids(tokens, tokenizer, max_seq_length):
    """Convert ``tokens`` to vocabulary ids, zero-padded to ``max_seq_length``.

    Args:
        tokens: sequence of token strings.
        tokenizer: object exposing ``convert_tokens_to_ids(tokens)``.
        max_seq_length: total length of the returned list.

    Returns:
        list[int]: token ids followed by ``0`` padding.

    Raises:
        IndexError: if there are more ids than ``max_seq_length``. This matches
            ``get_masks`` / ``get_segments``; previously an over-long list was
            returned silently because a negative pad count yields no padding.
    """
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    if len(token_ids) > max_seq_length:
        raise IndexError("Token length more than max seq length!")
    return list(token_ids) + [0] * (max_seq_length - len(token_ids))

In [0]:
def getEmbeddings(model,tokenizer,sentence):
    """Run one sentence through the BERT model and return its pooled output.

    The sentence is tokenized, truncated so that it still fits once ``[CLS]``
    and ``[SEP]`` are added, and converted to the three fixed-length inputs
    (ids, mask, segment ids) of length ``max_seq_length``.

    Args:
        model: Keras model taking ``[ids, mask, segments]`` and returning
            ``(pooled_output, sequence_output)`` (see ``getBERTModel``).
        tokenizer: tokenizer exposing ``tokenize`` and ``convert_tokens_to_ids``.
        sentence: text to embed.

    Returns:
        The first model output for a batch of one sentence. Per the original
        author's note this is the embedding of the [CLS] token; the second
        output (per-token embeddings) is discarded.
        # presumably shape (1, hidden_size) — confirm against the model.
    """
    # Reserve two positions for the [CLS] and [SEP] markers added below.
    stokens = tokenizer.tokenize(sentence)[:max_seq_length - 2]
    stokens = ["[CLS]"] + stokens + ["[SEP]"]

    input_ids = get_ids(stokens, tokenizer, max_seq_length)
    input_masks = get_masks(stokens, max_seq_length)
    input_segments = get_segments(stokens, max_seq_length)

    # Hand Keras explicit int32 arrays of shape (1, max_seq_length) rather than
    # nested Python lists, so no implicit list-to-tensor conversion is needed.
    batch = [
        np.asarray([feature], dtype=np.int32)
        for feature in (input_ids, input_masks, input_segments)
    ]

    pool_embs, all_embs = model.predict(batch)
    return pool_embs

In [0]:
def BERTEmbeddings(model,tokenizer,nSentences):
    """Embed every sentence with ``getEmbeddings`` and stack the results.

    Args:
        model: BERT Keras model forwarded to ``getEmbeddings``.
        tokenizer: tokenizer forwarded to ``getEmbeddings``.
        nSentences: sequence of sentences to embed.

    Returns:
        numpy.ndarray: the per-sentence embeddings concatenated along axis 0,
        in input order. For an empty input an empty 1-D array is returned.
    """
    n = len(nSentences)
    chunks = []
    for i, sentence in enumerate(nSentences):
        if i % 500 == 0:
            print( 'Processing ',i,' out of ',n)
        chunks.append(getEmbeddings(model, tokenizer, sentence))

    if not chunks:
        return np.asarray([])

    # Concatenate once at the end: growing the array inside the loop copies
    # every previous row on each iteration (quadratic in the number of sentences).
    return np.concatenate(chunks, axis=0)

In [0]:
def getFileSent(inpFile):
    """Read a question file and build its query text.

    The JSON file must contain ``'article'``, ``'options'`` and ``'questions'``.
    Only the first entry of ``options`` and of ``questions`` is used.

    Args:
        inpFile: path of the JSON question file.

    Returns:
        tuple: ``(fileSent, article)`` where ``fileSent`` is every option of the
        first option set, each followed by a space, then the first question;
        ``article`` is the file's ``'article'`` value.
    """
    with open(inpFile,'r') as handle:
        record = json.load(handle)

    article = record['article']
    pieces = [option + ' ' for option in record['options'][0]]
    pieces.append(record['questions'][0])
    return ''.join(pieces), article

In [0]:
def getQuesEmb(model,tokenizer,inpFile):
    """Embed the options+question text of one question file.

    The text returned by ``getFileSent`` is repeated six times, separated by
    single spaces, before being passed to ``getEmbeddings``.

    Args:
        model: BERT Keras model forwarded to ``getEmbeddings``.
        tokenizer: tokenizer forwarded to ``getEmbeddings``.
        inpFile: path of the JSON question file.

    Returns:
        tuple: ``(embedding, article)``: the embedding of the repeated text and
        the article stored in the file.
    """
    question_text, article = getFileSent(inpFile)
    repeated_text = ' '.join([question_text] * 6)
    return getEmbeddings(model, tokenizer, repeated_text), article

In [0]:
def getMatchingPara(quesEmb,paraEmbs):
    """Return the paragraph whose embedding is most similar to ``quesEmb``.

    Cosine similarity is computed between the question embedding and every row
    of ``paraEmbs``. The best row index ``i`` (0-based, first one on ties) is
    mapped to key ``'para<i+1>'`` of the JSON file at ``paraFile``.

    Args:
        quesEmb: 1-D question embedding.
        paraEmbs: 2-D array-like with one paragraph embedding per row, in the
            same order as the ``para1``, ``para2``, ... keys of ``paraFile``.

    Returns:
        The value stored under the selected ``'para<k>'`` key.

    Raises:
        ValueError: if ``paraEmbs`` contains no rows.
    """
    if len(paraEmbs) == 0:
        raise ValueError("paraEmbs is empty; cannot select a matching paragraph")

    # One vectorized call yields an (n_paragraphs, 1) matrix; this avoids one
    # sklearn call per row and the float() conversion of a 2-D array.
    query = np.asarray(quesEmb).reshape(1, -1)
    scores = cosine_similarity(paraEmbs, query)[:, 0].tolist()
    print(scores)

    mxScoreInd = scores.index(max(scores))
    print(mxScoreInd)

    with open(paraFile,'r') as f:
        paragraphs = json.load(f)
    # Paragraph keys are 1-based ('para1', 'para2', ...).
    return paragraphs['para'+str(mxScoreInd + 1)]

In [0]:
def saveFile(inpFile,outFile,para):
    """Copy a question file, replacing its ``'article'`` with ``para``.

    Args:
        inpFile: path of the JSON file to read.
        outFile: path of the JSON file to write (overwritten if present).
        para: value stored under ``'article'`` in the written file.
    """
    with open(inpFile,'r') as src:
        record = json.load(src)

    record['article'] = para

    with open(outFile, 'w') as dst:
        json.dump(record, dst)

In [0]:
def initialize_model():
    """Create the (untrained) addInfo network.

    The network is an ``nn.Sequential`` of three
    ``Linear -> ReLU -> BatchNorm1d`` stages of width 1024, followed by a final
    ``Linear`` layer projecting back to ``embSize``. Its input width is also
    ``embSize``.

    Returns:
        torch.nn.Sequential: the freshly initialised model.
    """
    hidden_width = 1024
    layers = []
    in_features = embSize
    # Modules are appended in the same order as a literal listing so the
    # positional indices inside nn.Sequential stay 0..9.
    for _ in range(3):
        layers.append(nn.Linear(in_features, hidden_width))
        layers.append(nn.ReLU())
        layers.append(nn.BatchNorm1d(hidden_width))
        in_features = hidden_width
    layers.append(nn.Linear(hidden_width, embSize))

    return nn.Sequential(*layers)

In [0]:
def restorModel():
    """Rebuild the addInfo network and load its weights from ``modelSave``.

    The model is placed on ``cuda:0`` when CUDA is available, otherwise on the
    CPU, and is switched to evaluation mode.

    Returns:
        torch.nn.Module: the restored model, in eval mode on the chosen device.
    """
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    addInfoModel = initialize_model()
    # map_location lets a checkpoint saved from a GPU be restored on a CPU-only
    # runtime; without it torch.load tries to recreate the original CUDA tensors.
    # NOTE: torch.load unpickles the file — only load checkpoints you trust.
    state_dict = torch.load(modelSave, map_location=device)
    addInfoModel.load_state_dict(state_dict)
    addInfoModel = addInfoModel.to(device)
    addInfoModel.eval()
    return addInfoModel

In [0]:
def getInfoEmb(model,emb):
    """Pass a single embedding through ``model`` and return the result.

    Args:
        model: callable (normally a ``torch.nn.Module``) applied to a batch of
            one embedding.
        emb: 1-D numpy array holding the embedding.

    Returns:
        numpy.ndarray: the first (only) row of the model output, on the CPU.
    """
    # Follow the model's own device/dtype when it has parameters, so a float64
    # input or a model left on the CPU does not cause a mismatch error.
    params = model.parameters() if hasattr(model, "parameters") else iter(())
    first_param = next(params, None)
    if first_param is not None:
        device, dtype = first_param.device, first_param.dtype
    else:
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        dtype = None  # keep the input's dtype

    tensorEmb = torch.from_numpy(np.expand_dims(emb, axis=0)).to(device)
    if dtype is not None:
        tensorEmb = tensorEmb.to(dtype)

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        outEmb = model(tensorEmb)

    return outEmb.detach().cpu().numpy()[0]

# **Embedding Generation**

In [0]:
# Build the BERT encoder once and keep the model and tokenizer as notebook
# globals; the cells below pass them to the embedding helpers.
outModel = getBERTModel()
model = outModel['model']
tokenizer = outModel['tokenizer']

In [0]:
# Load the candidate paragraphs. The file is expected to contain only the keys
# 'para1' ... 'paraN'; they are read in numeric order so that row i of the
# embedding matrix corresponds to 'para<i+1>'.
with open(paraFile,'r') as f:
    x = json.load(f)
fileSents = [ x['para'+str(paraNum)] for paraNum in range(1,len(x)+1)]

In [0]:
# Embed every candidate paragraph; row order follows fileSents.
# (The name 'paraEembs' is referenced by later cells, so the spelling is kept.)
paraEembs = BERTEmbeddings(model,tokenizer,fileSents)
print(paraEembs.shape)

In [0]:
# Persist the paragraph embeddings as comma-separated text with 15 decimals.
np.savetxt(basePath+'paraEmbs.csv',paraEembs, delimiter=',',fmt='%.15f')

In [0]:
# Disabled check: the code inside this string literal would reload the saved
# CSV and print its first row. It does not run as written.
'''loaddata = np.loadtxt(basePath+'paraEmbs.csv', delimiter=',')
print(loaddata[0])'''

"loaddata = np.loadtxt(basePath+'paraEmbs.csv', delimiter=',')\nprint(loaddata[0])"

# **Generate Testing Dataset**

In [0]:
# Restore the trained addInfo network from modelSave (eval mode, GPU if available).
addInfoModel = restorModel()

In [0]:
# For up to maxFiles test questions: embed the question, map the embedding
# through the addInfo network, pick the closest paragraph and write a copy of
# the question file whose 'article' is that paragraph.
maxFiles = 100
testDir = os.path.join(data_dir,'test')
# Sorted so that the processed subset does not depend on glob's arbitrary order.
filenames = sorted(glob.glob(testDir+"/*json"))
os.makedirs(outDir, exist_ok=True)

count = 0   # files whose selected paragraph equals the original article
ind = 0     # files actually processed
for filename in filenames[:maxFiles]:
    ind = ind + 1
    emb,article = getQuesEmb(model,tokenizer,filename)
    qusEmb = getInfoEmb(addInfoModel,emb[0])
    print()
    para = getMatchingPara(qusEmb,paraEembs)
    print(article)
    print(para)
    if article == para:
        count += 1
    name = os.path.basename(filename)
    saveFile(filename,os.path.join(outDir,name),para)

print()
# Accuracy is measured over the files that were processed, not over every file
# found: the loop stops after maxFiles, so len(filenames) would understate it.
if ind:
    print('accuracy: ',(count/ind)*100)
else:
    print('No test files found in', testDir)

In [0]:
# Pack everything under outDir into testdata.zip. Files are read through their
# full path, so the cell works regardless of the current working directory, and
# are stored relative to outDir's parent (e.g. 'test/<name>.json').
zipRoot = os.path.dirname(os.path.abspath(outDir))
with zipfile.ZipFile(basePath+'testdata.zip','w') as zf:
    for dirname, subdirs, files in os.walk(outDir):
        for filename in files:
            filePath = os.path.join(dirname, filename)
            zf.write(filePath, arcname=os.path.relpath(os.path.abspath(filePath), zipRoot))

In [0]:
#print("temp")