In [0]:
# input files: whole dataset, paragraphs file, addInfo Model
# output: zip file containing the testing dataset with the selected paragraphs

In [0]:
import numpy as np
from gensim.models import Word2Vec
import re
import json
import glob
import os
from sklearn.metrics.pairwise import cosine_similarity
import zipfile

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

In [0]:
# Root directory under which every other path in this notebook is built.
basePath = '/content/'

# data_dir: dataset root; its sub-folders are scanned for *json files.
# drive_dir: folder holding the zipped dataset (only referenced by the disabled unzip cell).
# paraFile: JSON file of candidate paragraphs stored under the keys 'para1', 'para2', ...
data_dir = basePath+'dataset'
drive_dir = basePath+'drive/My Drive/temp/DatasetN'
paraFile = basePath + 'paragraph2.json'

# Disabled alternative configuration, kept as a bare string so it has no effect.
'''data_dir = basePath+'tempdataset'
drive_dir = basePath+'drive/My Drive/temp/tempDataset'
paraFile = basePath + 'temp.json' '''

# modelSave: state_dict file loaded into the addInfo network.
# outDir: folder that receives the rewritten test files.
modelSave = basePath+'weights.pth'
outDir = basePath+'test'
# alpha: smoothing constant of the SIF word weight alpha / (alpha + p(word)).
alpha = 0.001
# embSize: Word2Vec vector size, and the input/output width of the addInfo network.
embSize = 200

In [7]:
# Disabled: the code below is a bare string literal, so nothing is executed.
# When enabled it extracts every .zip archive found in drive_dir into /content/.
'''for file_name in os.listdir(drive_dir):
    if file_name.endswith('.zip'):
        with zipfile.ZipFile(drive_dir+'/'+file_name,'r') as zip_dir:
            zip_dir.extractall(path='/content/')'''

"for file_name in os.listdir(drive_dir):\n    if file_name.endswith('.zip'):\n        with zipfile.ZipFile(drive_dir+'/'+file_name,'r') as zip_dir:\n            zip_dir.extractall(path='/content/')"

In [0]:
def sifEmbeddings(sentences, model, alpha):
    """Compute one frequency-weighted average word vector per sentence.

    Every in-vocabulary word contributes its vector scaled by the smooth
    inverse frequency weight ``alpha / (alpha + p(word))``, where ``p(word)``
    is the word's count divided by the total count of all vocabulary words.
    The weighted sum is divided by the number of in-vocabulary words of the
    sentence. No common-component removal step is applied.

    Args:
        sentences: iterable of tokenised sentences (each an iterable of words).
        model: object exposing ``wv.vocab`` (word -> entry with ``.count``),
            ``wv[word]`` (the word vector) and ``vector_size``.
        alpha: smoothing constant of the weight formula.

    Returns:
        ``np.float32`` array of shape ``(number of sentences, vector_size)``.
        Sentences without any in-vocabulary word yield an all-zero row, and
        an empty ``sentences`` yields an array of shape ``(0, vector_size)``.
    """
    vocab = model.wv.vocab
    embs = model.wv
    size = model.vector_size

    # Total token count, used to turn raw counts into unigram probabilities.
    total = sum(vocab[word].count for word in vocab)

    output = []
    for sent in sentences:
        count = 0
        v = np.zeros(size, dtype=np.float32)
        for word in sent:
            if word in vocab:
                weight = alpha / (alpha + (vocab[word].count / total))
                v += weight * embs[word]
                count += 1

        # Average over the words that were actually found in the vocabulary.
        if count > 0:
            v *= 1 / count

        output.append(v)

    # np.vstack raises on an empty list; return a well-shaped empty matrix instead.
    if not output:
        return np.zeros((0, size), dtype=np.float32)
    return np.vstack(output)

In [0]:
def splitSent(sent):
    """Split a string into tokens.

    The text is cut at every whitespace character, parenthesis, full stop and
    question mark; empty fragments produced by adjacent separators are dropped.

    Args:
        sent: the string to tokenise.

    Returns:
        List of non-empty token strings (empty list for an empty string).
    """
    # Raw string: '\s', '\.' and '\?' are not valid Python string escapes, so
    # the non-raw literal triggered an invalid-escape-sequence warning.
    regX = r'[\s()\.\?]'
    return [token for token in re.split(regX, sent) if token]

In [0]:
def getFileSent(inpFile):
    """Read one dataset JSON file and return its first question and its article.

    Args:
        inpFile: path of a JSON file with the keys 'article' and 'questions'.

    Returns:
        Tuple ``(question, article)`` where ``question`` is ``questions[0]``.
    """
    with open(inpFile, 'r') as handle:
        record = json.load(handle)

    passage = record['article']
    # Only the first question is used; the answer options are not appended.
    # Concatenating onto '' keeps the requirement that the question is a str.
    question = '' + record['questions'][0]
    return question, passage

In [0]:
def getDatasetAsSents():
    """Collect the raw text strings of every JSON file below ``data_dir``.

    Each sub-folder of ``data_dir`` is scanned for files matching ``*json``.
    For every file, the entries of ``options[0]``, then ``questions[0]``, then
    the article are appended, in that order. The list of sub-folders is
    printed before scanning.

    Returns:
        Flat list of the collected strings.
    """
    collected = []
    splits = os.listdir(data_dir)
    print(splits)
    for split in splits:
        pattern = os.path.join(data_dir, split) + "/*json"
        for path in glob.glob(pattern):
            with open(path, 'r') as handle:
                record = json.load(handle)
            collected.extend(record['options'][0])
            collected.append(record['questions'][0])
            collected.append(record['article'])
    return collected

In [0]:
def getQuesEmb(model,inpFile):
    """Embed the first question of a dataset file.

    Args:
        model: word-vector model handed on to ``sifEmbeddings``.
        inpFile: path of the dataset JSON file.

    Returns:
        Tuple ``(embedding, article)``: the question's embedding vector and
        the article text stored in the same file.
    """
    question, passage = getFileSent(inpFile)
    tokens = splitSent(question)
    # sifEmbeddings works on a batch: wrap the single question, then take row 0.
    batch = sifEmbeddings([tokens], model, alpha=alpha)
    return batch[0], passage

In [13]:
# Disabled earlier version of getMatchingPara, kept as a bare string literal
# (nothing here is executed). It returned only the single paragraph with the
# highest cosine similarity to the question embedding.
'''def getMatchingPara(quesEmb,paraEmbs):
    #paraEmbs = np.loadtxt('paraEmbs.csv',delimiter=',')

    scores = []
    for paraEmb in paraEmbs:
        score=float( cosine_similarity([paraEmb],[quesEmb]) )
        scores.append(score)
    #print(scores)
    mxScoreInd = scores.index(max(scores))
    mxScoreInd += 1

    with open(paraFile,'r') as f:
      x = json.loads(f.read())
      return x['para'+str(mxScoreInd)]'''

"def getMatchingPara(quesEmb,paraEmbs):\n    #paraEmbs = np.loadtxt('paraEmbs.csv',delimiter=',')\n\n    scores = []\n    for paraEmb in paraEmbs:\n        score=float( cosine_similarity([paraEmb],[quesEmb]) )\n        scores.append(score)\n    #print(scores)\n    mxScoreInd = scores.index(max(scores))\n    mxScoreInd += 1\n\n    with open(paraFile,'r') as f:\n      x = json.loads(f.read())\n      return x['para'+str(mxScoreInd)]"

In [0]:
def getMatchingPara(quesEmb, paraEmbs, topK=3):
    """Return the paragraph whose embedding is most similar to ``quesEmb``.

    Cosine similarity is computed between the question embedding and every
    row of ``paraEmbs``. The ``topK`` best paragraphs are printed in ascending
    order of similarity (best last) and the best one is returned. Row ``i`` of
    ``paraEmbs`` corresponds to the key ``'para<i+1>'`` in ``paraFile``.

    Args:
        quesEmb: 1-D embedding of the question.
        paraEmbs: 2-D array (or sequence of 1-D arrays), one row per paragraph.
        topK: number of best-scoring paragraphs to print (default 3).

    Returns:
        Text of the best-matching paragraph, read from ``paraFile``.

    Raises:
        ValueError: if ``paraEmbs`` is empty or ``topK`` is smaller than 1.
    """
    if topK < 1:
        raise ValueError('topK must be at least 1')
    paraMatrix = np.asarray(paraEmbs)
    if len(paraMatrix) == 0:
        raise ValueError('paraEmbs is empty; there is no paragraph to match against')

    # One vectorised call instead of one sklearn call per paragraph. Taking
    # column 0 also avoids float() on a 2-D array, which newer NumPy deprecates.
    scores = cosine_similarity(paraMatrix, [quesEmb])[:, 0]

    # argsort is ascending, so the best match is last; paragraph keys are 1-based.
    topIds = np.argsort(scores)[-topK:] + 1

    with open(paraFile, 'r') as f:
        paragraphs = json.load(f)
    for paraId in topIds:
        print(paragraphs['para' + str(paraId)])
    return paragraphs['para' + str(topIds[-1])]

In [0]:
def saveFile(inpFile,outFile,para):
    """Copy a dataset JSON file to ``outFile`` with its article replaced.

    Args:
        inpFile: path of the source JSON file.
        outFile: path of the JSON file to write.
        para: value stored under the 'article' key of the written file.
    """
    with open(inpFile, 'r') as src:
        record = json.load(src)

    record['article'] = para

    with open(outFile, 'w') as dst:
        json.dump(record, dst)

In [0]:
def initialize_model():
    """Build the untrained addInfo network.

    The network maps an ``embSize``-wide vector to an ``embSize``-wide vector
    through three hidden stages of width 1024, each being
    Linear -> ReLU -> BatchNorm1d, followed by a final Linear layer.

    Returns:
        ``nn.Sequential`` holding the ten layers in that order.
    """
    hidden = 1024
    layers = []
    width_in = embSize
    # Layers are appended in the same order as before so that the positional
    # names used by state_dict (0.weight, 2.running_mean, ...) are unchanged.
    for _ in range(3):
        layers.append(nn.Linear(width_in, hidden))
        layers.append(nn.ReLU())
        layers.append(nn.BatchNorm1d(hidden))
        width_in = hidden
    layers.append(nn.Linear(hidden, embSize))

    return nn.Sequential(*layers)

In [0]:
def restorModel():
    """Rebuild the addInfo network and load its trained weights.

    The network is created by ``initialize_model``, filled with the state
    dict stored at ``modelSave``, moved to the first CUDA device when one is
    available (CPU otherwise) and switched to evaluation mode.

    Returns:
        The restored model, in eval mode, on the selected device.
    """
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    addInfoModel = initialize_model()
    # map_location lets weights that were saved from a GPU be loaded on a
    # CPU-only machine; without it torch.load tries to restore onto CUDA.
    state = torch.load(modelSave, map_location=device)
    addInfoModel.load_state_dict(state)
    addInfoModel = addInfoModel.to(device)
    addInfoModel.eval()
    return addInfoModel

In [0]:
def getInfoEmb(model,emb):
    """Run one embedding through ``model`` and return the output vector.

    The embedding is turned into a batch of one, placed on the device and
    cast to the dtype of the model's parameters, and passed through the model
    with gradient tracking disabled.

    Args:
        model: callable network; an ``nn.Module`` in normal use.
        emb: 1-D numeric array (or sequence) holding the input embedding.

    Returns:
        1-D numpy array with the model output for ``emb``.
    """
    # Follow the model's own device/dtype instead of guessing, so a model kept
    # on the CPU, or a float64 embedding, does not cause a mismatch error.
    params = getattr(model, 'parameters', None)
    firstParam = next(params(), None) if callable(params) else None
    if firstParam is not None:
        device, dtype = firstParam.device, firstParam.dtype
    else:
        # Parameter-free callable: fall back to the default device and keep
        # the dtype of the input.
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        dtype = None

    tensorEmb = torch.as_tensor(np.expand_dims(emb, axis=0), dtype=dtype, device=device)
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        outEmb = model(tensorEmb)
    qusEmb = outEmb.detach().cpu().numpy()[0]
    return qusEmb

# **Word2Vec**

In [19]:
# Gather every option, question and article string found under data_dir, then
# tokenise each string; the token lists are the Word2Vec training corpus.
fileSents = getDatasetAsSents()
sentences = [ splitSent(sent) for sent in fileSents]

['dev', 'train', 'test']


In [20]:
# Train word vectors of width embSize on the tokenised corpus and save the
# model next to the other artefacts under basePath.
# NOTE(review): `size=` is the gensim 3.x keyword (gensim 4 renamed it to
# `vector_size`) - confirm the installed gensim version before upgrading.
# NOTE(review): no seed is passed, so the vectors are presumably not
# reproducible across runs - confirm whether that matters here.
model = Word2Vec(sentences, min_count=1, size=embSize)
model.save(basePath+"word2vec.model")

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


# **Embedding Generation**

In [0]:
# Load the candidate paragraphs. They are stored under the keys 'para1' ..
# 'paraN', so reading them by index keeps list position i aligned with key
# 'para<i+1>'. The context manager closes the file even if parsing fails.
with open(paraFile, 'r') as f:
    x = json.load(f)
fileSents = [x['para' + str(paraId)] for paraId in range(1, len(x) + 1)]
# Tokenise every paragraph for the embedding step.
sentences = [splitSent(sent) for sent in fileSents]

In [22]:
#model = Word2Vec.load("word2vec.model")
# Embed every tokenised paragraph; row i of paraEembs belongs to the paragraph
# stored under the key 'para<i+1>'. The printed shape is (paragraphs, vector size).
paraEembs = sifEmbeddings(sentences, model, alpha=alpha)
print(paraEembs.shape)
#np.savetxt(basePath+'paraEmbs.csv',paraEembs, delimiter=',',fmt='%.15f')

(12304, 200)


# **Generate Testing Dataset**

In [0]:
# Rebuild the addInfo network and load the weights stored at modelSave.
addInfoModel = restorModel()

In [0]:
# Rewrite test files so that their article is the paragraph selected for the
# question: embed the question, pass it through the addInfo model, pick the
# closest paragraph and save the modified copy into outDir.
filenames = glob.glob(os.path.join(data_dir,'test')+"/*json")

# exist_ok avoids the separate existence check and also creates missing parents.
os.makedirs(outDir, exist_ok=True)

# Cap on the number of files processed per run (the value used so far).
maxFiles = 20

count = 0   # files whose selected paragraph equals their original article
ind = 0     # files processed so far
for filename in filenames:
    if ind == maxFiles:
        break
    emb,article = getQuesEmb(model,filename)
    qusEmb = getInfoEmb(addInfoModel,emb)

    print()
    para = getMatchingPara(qusEmb,paraEembs)
    print(article)
    if article == para:
        count += 1
    # basename is the portable way to take the file name off a path.
    name = os.path.basename(filename)
    saveFile(filename,os.path.join(outDir,name),para)
    ind += 1

print()
# Accuracy is measured over the files that were actually processed; dividing
# by len(filenames) under-reported it whenever the cap stopped the loop early,
# and raised ZeroDivisionError when no file was found.
print('accuracy: ',(count/ind)*100 if ind else 0.0)

In [25]:
# Disabled: the code below is a bare string literal, so no archive is written.
# When enabled it adds every file found under outDir to testdata.zip.
# NOTE(review): zf.write is given a path built from the last component of
# dirname, i.e. relative to the current working directory - confirm that this
# resolves correctly before re-enabling.
'''with zipfile.ZipFile(basePath+'testdata.zip','w') as zf:
    for dirname, subdirs, files in os.walk(outDir):
        for filename in files:
            zf.write(os.path.join(dirname.split('/')[-1], filename))'''

"with zipfile.ZipFile(basePath+'testdata.zip','w') as zf:\n    for dirname, subdirs, files in os.walk(outDir):\n        for filename in files:\n            zf.write(os.path.join(dirname.split('/')[-1], filename))"