In [78]:
import pandas as pd
from transformers import AutoTokenizer, AutoModel
import torch
import faiss
import numpy as np

In [148]:
# Load the question/answer table; the `Question` column is embedded further down.
qa_csv_path = "Data/data.csv"
data = pd.read_csv(qa_csv_path)

In [124]:
# Tokenizer and encoder must come from the same checkpoint, so name it once.
checkpoint_name = 'sentence-transformers/paraphrase-albert-small-v2'
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)
model = AutoModel.from_pretrained(checkpoint_name)

In [162]:
def mean_pooling(model_output, attention_mask):
    """Average token embeddings along dimension 1, ignoring masked-out positions.

    Args:
        model_output: Indexable model result whose element 0 is the tensor of
            per-token embeddings (presumably shaped (batch, tokens, hidden)).
        attention_mask: Tensor with one fewer trailing dimension than the token
            embeddings; non-zero entries mark tokens that count toward the mean.

    Returns:
        Tensor of mask-weighted sums divided by the mask totals. The divisor is
        clamped to at least 1e-9 so a fully masked row yields zeros, not NaN.
    """
    hidden_states = model_output[0]
    # Broadcast the mask over the embedding dimension so it can weight each value.
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    weighted_total = torch.sum(hidden_states * mask, 1)
    mask_total = torch.clamp(mask.sum(1), min=1e-9)
    return weighted_total / mask_total

In [157]:
def create_embedding(data):
    """Embed one or more sentences with the notebook's `tokenizer` and `model`.

    Args:
        data: A single string, or an iterable of strings (e.g. a list or a
            pandas Series). A lone string is treated as one sentence.

    Returns:
        NumPy array of mean-pooled embeddings, one row per input sentence.
    """
    # list("some text") would yield individual characters and embed each one
    # as its own "sentence", so wrap a bare string instead of iterating it.
    sentences = [data] if isinstance(data, str) else list(data)
    encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")
    # Inference only: no gradients are needed for the forward pass.
    with torch.no_grad():
        model_output = model(**encoded_input)
    # Pool with the attention mask so padding tokens do not affect the average.
    sentence_embedding = mean_pooling(model_output, encoded_input['attention_mask'])
    return sentence_embedding.numpy()

In [158]:
# Embed every entry of the Question column in one call.
test = create_embedding(data["Question"])

In [154]:
# Build an exact L2 index over the question embeddings held in `test`.
# (`sentence_embedding_np` only exists inside create_embedding, so referring
# to it here raises NameError on a fresh kernel.)
# FAISS expects a C-contiguous float32 matrix; coerce defensively.
question_embeddings = np.ascontiguousarray(test, dtype=np.float32)
# Take the dimensionality from the data so the index always matches the model.
index = faiss.IndexFlatL2(question_embeddings.shape[1])
index.add(question_embeddings)

In [159]:
def search(query, k=1):
    """Find the stored questions closest to each query sentence.

    Args:
        query: A single string or an iterable of strings to look up.
        k: Number of nearest neighbours to return per query (default 1).

    Returns:
        Tuple `(distance, ques_index)` as returned by `index.search`: one row
        per query, `k` columns each. `ques_index` holds positions in the order
        the embeddings were added to the index.
    """
    # A bare string would otherwise be iterated character by character downstream.
    if isinstance(query, str):
        query = [query]
    query_encoding = create_embedding(query)
    distance, ques_index = index.search(query_encoding, k)
    return (distance, ques_index)

In [165]:
# Sample lookup: find the closest stored question and print its row.
query = ["Who is your father"]

distance, ques_index = search(query)
# Row 0 is the only query; column 0 is its nearest neighbour.
print(data.iloc[ques_index[0, 0]])


Question     Who is your father?
Answer      I don't have family.
Name: 1182, dtype: object
