In [1]:
import os
import re
import string

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

from gensim.models import Word2Vec
import spacy
from copy import deepcopy


# Word2Vec

In [2]:
corpus = "./nasa/"

filelist = os.listdir(corpus)

files = {}

def prepreprocess(s):
    s = s.lower()
    s = re.sub(r'\n', ' ', s)
    s = s.strip()
    return s

for file in filelist:
    if file.endswith(".txt"):
        with open(corpus + file, "r") as f:
            files[file] = prepreprocess(f.read())

In [3]:
def split_sentences(s):
    sentence_list = re.split(r'[.?!]\s*', s)
    if sentence_list[-1]:
        return sentence_list
    else:
        return sentence_list[:-1]

def preprocess(s):
    subbed = re.sub(r'[^A-Za-z]+', ' ', s)
    return subbed

# files['emt01995.txt']
# split_sentences(files['emt01995.txt'])

sentences = []
for file in files:
    sentences += split_sentences(deepcopy(files[file]))


for file in files:
    files[file] = preprocess(files[file])


for i, sentence in enumerate(sentences):
    sentences[i] = preprocess(sentence)

for i, sentence in enumerate(sentences):
    sentences[i] = sentence.split(' ')
sentences

[['integration',
  'of',
  'mechanical',
  'design',
  'analysis',
  'and',
  'fabrication',
  'processes',
  'mechanical',
  'design',
  'has',
  'been',
  'integrated',
  'with',
  'thermal',
  'structural',
  'and',
  'optical',
  'analysis',
  'and',
  'with',
  'fabrication'],
 ['electronic',
  'import',
  'of',
  'the',
  'model',
  'geometry',
  'eliminates',
  'the',
  'repetitive',
  'steps',
  'of',
  'geometry',
  'input',
  'to',
  'develop',
  'each',
  'analysis',
  'model',
  'leading',
  'to',
  'faster',
  'and',
  'more',
  'accurate',
  'analyses'],
 ['electronic',
  'transfer',
  'of',
  'a',
  'part',
  'to',
  'fabrication',
  'eliminates',
  'the',
  'need',
  'to',
  'manually',
  'input',
  'a',
  'complex',
  'geometry',
  'into',
  'a',
  'numeric',
  'control',
  'nc',
  'machine'],
 ['potential',
  'commercial',
  'uses',
  'any',
  'design',
  'or',
  'manufacturing',
  'process',
  'e'],
 ['g'],
 ['o',
  'automotive',
  'o',
  'appliance',
  'o',
  'plast

In [4]:
model = Word2Vec(sentences, min_count=1)

model.build_vocab([["<UNK>"]], update=True)

words = list(model.wv.index_to_key)

print(model)
print(model.wv)

Word2Vec<vocab=6205, vector_size=100, alpha=0.025>
KeyedVectors<vector_size=100, 6205 keys>


In [5]:
def mean_pool(words):
    used_words = []
    for word in words:
        if word in model.wv.index_to_key:
            used_words.append(word)
    if len(used_words) >= 1:
        return np.mean(model.wv[used_words], axis=0)
    else:
        return []

def get_doc_embedding(doc):
    return mean_pool(doc.split())

# mean_pool(files['emt01995.txt'].split())

mean_pool_document_vectors = {}
for file in files:
    mean_pool_document_vectors[file] = get_doc_embedding(files[file])

mean_pool_document_vectors

{'emt01995.txt': array([-0.33743027,  0.72110504, -0.01114454,  0.23505709,  0.20772563,
        -0.64276344,  0.08552174,  1.3219808 , -0.46182522, -0.33785713,
        -0.08830535, -0.66720325,  0.03270434,  0.5377204 ,  0.08076148,
        -0.5855853 , -0.02683975, -0.3197186 , -0.18880199, -0.86065716,
         0.3055614 , -0.05371389,  0.2461078 , -0.2982036 ,  0.07878783,
        -0.22442102, -0.15394866,  0.01231919, -0.37670052,  0.10366918,
         0.4921153 , -0.18850006,  0.09384165, -0.4861746 , -0.39013103,
         0.82518256,  0.29190326, -0.3910344 , -0.26582128, -0.9047595 ,
        -0.04405228, -0.37818587, -0.22091211, -0.26827922,  0.21791625,
        -0.18594147, -0.34436277, -0.12803543,  0.11470606,  0.25280184,
         0.36842042, -0.4451095 , -0.23403007, -0.28111434, -0.20791616,
         0.104212  ,  0.18943293, -0.25300995, -0.4444808 , -0.04377148,
         0.33848846,  0.09918541,  0.37836188,  0.03751552, -0.43380728,
         0.28977245,  0.19486558,  

In [6]:
query = "integrated analysis"
query_embedding = get_doc_embedding(query)

query_embedding

array([-4.02933240e-01,  6.58994913e-01, -7.83768296e-02,  2.95271218e-01,
        2.96141297e-01, -4.14105058e-01, -7.00773299e-02,  1.19877338e+00,
       -3.08211029e-01, -5.80908135e-02, -8.25459063e-02, -5.89773893e-01,
        1.03042804e-01,  6.30501032e-01, -7.30502307e-02, -5.37332654e-01,
       -1.04253953e-02, -2.63556302e-01, -1.44050941e-01, -6.55243218e-01,
        2.85752326e-01, -2.21172810e-01,  2.89277643e-01, -3.18653941e-01,
        2.01806545e-01, -2.42463380e-01, -9.34452266e-02,  1.10546887e-01,
       -3.59387279e-01,  4.45013680e-02,  4.50548798e-01, -1.31451026e-01,
        1.46884378e-02, -4.97288674e-01, -2.71900207e-01,  8.05949450e-01,
        2.20411256e-01, -4.24628079e-01, -3.90801907e-01, -7.83462107e-01,
       -1.63914964e-01, -3.45120430e-01, -3.82524282e-01, -2.84786493e-01,
        7.81025812e-02, -1.30066961e-01, -2.52957165e-01, -7.61522949e-02,
        6.04231507e-02,  2.75633365e-01,  3.85950863e-01, -5.03215194e-01,
       -3.28210920e-01, -

In [7]:
import operator

In [8]:
cosine_similarities_with_query = {}
for file in files:
    cosine_similarities_with_query[file] = cosine_similarity([query_embedding], [mean_pool_document_vectors[file]])[0][0]
cosine_similarities_with_query

def get_top(x, d):
    dict_list = []
    for key in d.keys():
        dict_list.append((d[key], key))

    sorted_list = sorted(dict_list, reverse=True)
    return sorted_list[:x]

get_top(10, cosine_similarities_with_query)

[(0.97152615, 'sbr12195.txt'),
 (0.9710187, 'emt10695.txt'),
 (0.969862, 'sbr06195.txt'),
 (0.9689544, 'mat06395.txt'),
 (0.9687294, 'sbr17995.txt'),
 (0.96836835, 'sbr01495.txt'),
 (0.96698225, 'sbr01395.txt'),
 (0.9668887, 'ins20495.txt'),
 (0.966227, 'emt01995.txt'),
 (0.9661464, 'sbr21495.txt')]

# Spacy

In [9]:
nlp = spacy.load("en_core_web_lg")

#embeddings for the documents

spacy_doc_vectors = {}
for file in files:
    spacy_doc_vectors[file] = nlp(files[file])

In [10]:
spacy_doc_vectors

{'emt01995.txt': integration of mechanical design analysis and fabrication processes mechanical design has been integrated with thermal structural and optical analysis and with fabrication electronic import of the model geometry eliminates the repetitive steps of geometry input to develop each analysis model leading to faster and more accurate analyses electronic transfer of a part to fabrication eliminates the need to manually input a complex geometry into a numeric control nc machine potential commercial uses any design or manufacturing process e g o automotive o appliance o plastics o airplane o nuclear laboratory optical testing automated process analysis nuclear plant analysis benefits rapid model development accuracy and precision of models ease of analysis transfer automatic tolerance definition and fit checking true optical performance predictions rapid efficient and exact fabrication the technology in many industries there has recently been a concerted movement toward concurre

In [11]:
spacy_query_embedding = nlp(" ".join(query))

spacy_cosine_similarities = {}
for file in files:
    spacy_cosine_similarities[file] = spacy_query_embedding.similarity(spacy_doc_vectors[file])

get_top(10, spacy_cosine_similarities)

[(0.3569761792529935, 'emt05095.txt'),
 (0.33977543846728053, 'sbr06295.txt'),
 (0.3384800826116411, 'sbr01495.txt'),
 (0.33514201671148763, 'emt04995.txt'),
 (0.3346343367249719, 'eos03795.txt'),
 (0.33456275763460896, 'sbr01395.txt'),
 (0.3337068261071418, 'eos16095.txt'),
 (0.32824074225209005, 'sbr18095.txt'),
 (0.3243281549194393, 'ins20595.txt'),
 (0.32026474294644597, 'inf12795.txt')]

# DPR

In [12]:
from transformers import DPRContextEncoder, DPRQuestionEncoder, DPRContextEncoderTokenizer, DPRQuestionEncoderTokenizer

In [15]:
doc_tokenizer = DPRContextEncoderTokenizer.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")
query_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained("facebook/dpr-question_encoder-single-nq-base")
document_encoder = DPRContextEncoder.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")
query_encoder = DPRQuestionEncoder.from_pretrained("facebook/dpr-question_encoder-single-nq-base")

# dpr_doc_vectors = {}
# for file in files:
#     dpr_doc_vectors[file] = document_encoder(**doc_tokenizer(files[file], return_tensors='pt', padding=True, truncation=True, max_length=512))["pooler_output"].detach().numpy()

dpr_doc_vectors

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRContextEncoderTokenizer'.
Some weights of the model checkpoint at facebook/dpr-ctx_encoder-single-nq-base were not used when initializing DPRContextEncoder: ['ctx_encoder.bert_model.pooler.dense.weight', 'ctx_encoder.bert_model.pooler.dense.bias']
- This IS expected if you are initializing DPRContextEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRContextEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification mod

{'emt01995.txt': array([[ 3.85320514e-01,  3.16337228e-01,  1.22290164e-01,
          2.45374009e-01,  1.05519772e-01,  2.12021977e-01,
         -2.90340871e-01, -2.92630255e-01,  4.39845622e-02,
         -4.91929710e-01, -1.19869970e-01,  9.51675296e-01,
         -1.37416497e-01,  6.80393949e-02, -1.25346124e-01,
          2.13572904e-01,  6.14577293e-01, -2.23123431e-02,
         -3.09059620e-01, -2.28034168e-01, -6.18843675e-01,
         -2.67587006e-01,  4.19049561e-01, -7.12126121e-02,
         -3.33187938e-01,  5.96626624e-02,  1.07286178e-01,
          3.26923788e-01, -2.38086373e-01,  1.64718181e-01,
          1.83013290e-01,  3.71281803e-01, -4.84606266e-01,
         -4.42418158e-01, -8.47403705e-02,  2.21059062e-02,
         -9.18874368e-02, -1.77126199e-01,  1.25063255e-01,
         -3.58002961e-01, -2.17875093e-01,  2.49048918e-01,
          1.20471343e-02,  1.96486354e-01, -7.93448463e-02,
         -4.35990095e-01, -1.61140466e+00, -2.15788409e-02,
         -5.57413816e-01

In [14]:
dpr_query_embedding = query_encoder(**query_tokenizer(query, return_tensors='pt'))["pooler_output"].detach().numpy()

dpr_cosine_similarities = {}
for file in files:
    dpr_cosine_similarities[file] = cosine_similarity(dpr_query_embedding.reshape(1,-1), dpr_doc_vectors[file].reshape(1,-1))[0][0]

get_top(10, dpr_cosine_similarities)


[(0.63890994, 'eos11695.txt'),
 (0.57324654, 'inf11495.txt'),
 (0.5717596, 'inf21695.txt'),
 (0.56599337, 'emt01995.txt'),
 (0.5652106, 'sbr17995.txt'),
 (0.54340976, 'emt13895.txt'),
 (0.5345992, 'sbr17895.txt'),
 (0.53454757, 'eos19995.txt'),
 (0.5314743, 'emt02495.txt'),
 (0.52591914, 'eos07195.txt')]