In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import os
# Cache directories (relative to the notebook's working directory) exported
# through environment variables. They are set here, ahead of the later cell
# that imports sentence_transformers.
# NOTE(review): presumably these must be in place before the Hugging Face
# libraries are imported — confirm against the installed versions.
os.environ.update({
    'TRANSFORMERS_CACHE': '../huggingface_cache/model_cache',
    'HF_DATASETS_CACHE': '../huggingface_cache/data_cache',
})

In [2]:
# Load the question/answer pairs. The CSV column named "Unnamed: 0" becomes the
# DataFrame index, leaving the 'q' (question) and 'a' (answer) columns that the
# cells below read.
data_path = Path("../data/pairs1.csv")
data = pd.read_csv(data_path, index_col='Unnamed: 0')
# Preview the first rows to check the file parsed as expected.
data.head()

Unnamed: 0,q,a
0,1. How expensive is NMSU?,New Mexico State University is proud to offer ...
1,2. What kind of university is NMSU?,"NMSU is a public, land-grant university, which..."
2,3. Is NMSU ranked?,Yes! Visit this page for NMSU rankings and exp...
3,4. What are the most popular fields of study?,NMSU is a comprehensive university. Popular un...
4,5. What is the academic calendar at NMSU?,"The academic calendar at NMSU, like most U.S. ..."


In [20]:
import re

def remove_number_start(sentence):
    """Strip a leading enumeration prefix such as ``"12. "`` from ``sentence``.

    Only a run of ASCII digits followed by a period and one space at the very
    beginning of the string is removed. The same pattern occurring later in
    the text (for example ``"... founded in 1888. Or ..."``) is left untouched.

    Args:
        sentence: The text to clean.

    Returns:
        ``sentence`` without its leading ``"<digits>. "`` prefix, or the
        unchanged text when it does not start with one.
    """
    # Raw string: avoids the invalid "\." / "\ " escape sequences of a normal
    # string literal. "^" anchors the match to the start of the sentence so
    # numbers that end a clause in the middle of the text are preserved.
    return re.sub(r'^[0-9]+\. ', '', sentence)

# Example of the raw question format ("<number>. <question>").
# NOTE(review): `s` is not referenced by any other visible cell — possibly
# leftover scratch; confirm before removing.
s = "1. How expensive is NMSU?"
# Overwrite the 'q' column in place with remove_number_start applied to each
# question, so every later cell sees the cleaned text.
data['q'] = data['q'].apply(remove_number_start)
# Preview up to 20 rows of the cleaned frame.
data.head(20)

Unnamed: 0,q,a
0,How expensive is NMSU?,New Mexico State University is proud to offer ...
1,What kind of university is NMSU?,"NMSU is a public, land-grant university, which..."
2,Is NMSU ranked?,Yes! Visit this page for NMSU rankings and exp...
3,What are the most popular fields of study?,NMSU is a comprehensive university. Popular un...
4,What is the academic calendar at NMSU?,"The academic calendar at NMSU, like most U.S. ..."
5,How many international students are there at N...,Currently NMSU enrolls more than 700 internati...
6,"How do NMSU students pay their tuition, fees, ...",Payment for tuition (course costs) and fees ar...
7,Is financial aid available to international st...,There are limited numbers of scholarships and ...
8,What is the health insurance requirement? How ...,All F-1 students and J-1 students/scholars are...
9,Can I purchase a health insurance plan from my...,If you would like to have a non-NMSU health in...


In [39]:
import torch
from sentence_transformers import SentenceTransformer

# Device handed to the encode() calls in the cells below. The CUDA
# auto-detection variant is kept for reference but is switched off, so the
# CPU is always selected.
# DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
DEVICE = torch.device('cpu')

# Identifier of the pretrained sentence-embedding checkpoint to load.
MODEL_NAME = 'sentence-transformers/multi-qa-MiniLM-L6-cos-v1'
sbert_model = SentenceTransformer(MODEL_NAME)

In [40]:
# Embed every stored question once; row i of the result corresponds to the
# i-th row of `data`. encode() is given a plain list of strings instead of the
# pandas Series so that any integer indexing it performs on its input is
# positional and cannot be affected by the DataFrame's index labels.
question_embeddings = sbert_model.encode(
    data['q'].tolist(), device=DEVICE, convert_to_tensor=True
)
# Expected: (number of questions, embedding dimension).
question_embeddings.shape

torch.Size([45, 384])

In [43]:
# Embed a user question and retrieve the closest stored question (and its
# answer) by cosine similarity.
user_q = "what is your name?"
uq_embedding = sbert_model.encode(user_q, device=DEVICE, convert_to_tensor=True)

# Compare one (1, dim) query row against all stored questions via broadcasting
# instead of tiling the query a hard-coded 45 times, so this cell keeps
# working when the number of rows in the dataset changes.
calc_score = torch.nn.CosineSimilarity(dim=1, eps=1e-08)
dists = calc_score(uq_embedding.reshape(1, -1), question_embeddings)

# Use a plain Python int and positional (.iloc) access so the lookup does not
# depend on the DataFrame's index labels happening to be 0..N-1.
ans_idx = int(torch.argmax(dists))
question = data['q'].iloc[ans_idx]
ans = data['a'].iloc[ans_idx]
print('Matching score: ', dists[ans_idx])
print('Question: ', question)
print('Answer: ', ans)

6.38 ms ± 183 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [13]:
# Normalise the query embedding to a single (1, dim) row. torch.cdist in the
# next cell already compares each query row with every stored question, so
# tiling the query is unnecessary. The previous hard-coded repeat(44, 1) did
# not match the 45 stored questions and enlarged the tensor again on every
# re-run of this cell; this form is idempotent.
uq_embedding = uq_embedding.reshape(-1, uq_embedding.shape[-1])[:1]

In [14]:
# Alternative retrieval: nearest stored question by Euclidean (L2) distance.
# Only the first query row is used, so `dists` has shape (1, n_questions) and
# the flat argmin below is directly a row position in `data`, even if
# uq_embedding was tiled to several identical rows by an earlier cell.
query = uq_embedding.reshape(-1, question_embeddings.shape[-1])[:1]
dists = torch.cdist(query, question_embeddings, p=2.0)

# Convert the index tensor to a Python int (works for CPU and CUDA tensors)
# and look rows up by position rather than using a tensor as a pandas key.
ans_idx = int(torch.argmin(dists))
question = data['q'].iloc[ans_idx]
# The original assigned `question` twice; the second lookup is taken to be the
# matching answer, mirroring the cosine-similarity cell.
ans = data['a'].iloc[ans_idx]

tensor(0, device='cuda:0')