In [11]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel
import re
from datasets import Dataset
from tqdm import tqdm
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS, ElasticVectorSearch
import elasticsearch
import torch.nn as nn
import numpy as np



In [2]:
#### load data

df = pd.read_csv("pt_question_answers.csv")

df["text"] = df["pt_title"] + "\n" + df["pt_body"] + "\n" + df["pt_answer"]

df = df[["text"]]

CLEANR = re.compile('<.*?>') 

def cleanhtml(raw_html):
    cleantext = re.sub(CLEANR, '', raw_html)
    return cleantext
df["text"] = df["text"].apply(lambda x: cleanhtml(x))

df["text"] = df["text"].str.lower()

df

Unnamed: 0,text
0,extracting the top-k value-indices from a 1-d ...
1,how to display custom images in tensorboard (e...
2,python wheels: cp27mu not supported\ni'm tryin...
3,loading torch7 trained models (.t7) in pytorch...
4,pytorch: how to use dataloaders for custom dat...
...,...
14588,how to disable neptune callback in transformer...
14589,bgr to rgb for cub_200 images by image.split()...
14590,neural networks extending learning domain\ni h...
14591,how do i multiply tensors like this?\ni am wor...


## Custom Embeddings and FAISS index using dataset

In [3]:

model_ckpt = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModel.from_pretrained(model_ckpt)
model= nn.DataParallel(model)

device = torch.device("cuda")
model.to(device)

def cls_pooling(model_output):
    return model_output.last_hidden_state[:, 0]

def get_embeddings(text_list):
    encoded_input = tokenizer(
        text_list, padding=True, truncation=True, return_tensors="pt"
    ).to(device)
    encoded_input = {k: v.to(device) for k, v in encoded_input.items()}
    model_output = model(**encoded_input)
    return cls_pooling(model_output)

def get_context(question, truncate_length=512, k=5):

    question_embedding = get_embeddings([question]).cpu().detach().numpy()

    scores, samples = embeddings_dataset.get_nearest_examples(
        "embeddings", question_embedding, k=k
    )

    samples_df = pd.DataFrame.from_dict(samples)
    samples_df["scores"] = scores
    samples_df.sort_values("scores", ascending=False, inplace=True)
    samples_df["text"] = samples_df["text"].str[:truncate_length]

    return samples_df


In [5]:
df.shape

(14593, 1)

In [6]:
from tqdm import tqdm

tqdm.pandas()

In [7]:
df["embeddings"] = df.progress_apply(lambda x: get_embeddings(x["text"]).detach().cpu().numpy()[0], axis=1)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 14593/14593 [04:54<00:00, 49.61it/s]


In [8]:
# df.to_csv("embeddings.csv", index=False)

In [9]:
df.loc[0, "embeddings"].shape[0]

768

In [13]:
x = df.embeddings.tolist()
x = np.array(x)

x.shape

(14593, 768)

In [14]:
# x = np.random.rand(100, 512)
# print(x.shape)
x = x.reshape(x.shape[0], -1).astype('float32')
d = x.shape[1]

In [15]:
x.shape

(14593, 768)

In [16]:
d

768

In [38]:
import faiss

ngpus = faiss.get_num_gpus()

ngpus

1

In [43]:
import time

start_time = time.time()

res = faiss.StandardGpuResources()
flat_config = faiss.GpuIndexFlatConfig()
flat_config.useFloat16 = False
flat_config.device = 0
index = faiss.GpuIndexFlatL2(res, d, flat_config)

index.add(x)

print("time taken: ", (time.time() - start_time) * 1000)

time taken:  202.40092277526855


In [27]:
%%time
dataset = Dataset.from_pandas(df)

embeddings_dataset = dataset.map(
    lambda x: {"embeddings": get_embeddings(x["text"]).detach().cpu().numpy()[0]}
)

  0%|          | 0/14593 [00:00<?, ?ex/s]

CPU times: user 18min 51s, sys: 52.9 s, total: 19min 44s
Wall time: 4min 57s


In [28]:
embeddings_dataset

Dataset({
    features: ['text', 'embeddings'],
    num_rows: 14593
})

In [46]:
import time

start_time = time.time()

embeddings_dataset.add_faiss_index(column="embeddings", device=0)

print("time taken: ", (time.time() - start_time) * 1000)

  0%|          | 0/15 [00:00<?, ?it/s]

time taken:  263.9632225036621


In [18]:
# embeddings_dataset.save_faiss_index("embeddings", "embeddings.faiss")

In [47]:
%%time
get_context('How do I check if PyTorch is using the GPU?', k=5)

CPU times: user 23.2 ms, sys: 47 µs, total: 23.2 ms
Wall time: 21.9 ms


Unnamed: 0,text,embeddings,scores
4,different method of running pytorch on gpu\nse...,"[-0.3892856538295746, -0.34232109785079956, -0...",24.639446
3,is there a way to allocate remaining gpu to yo...,"[-0.23101496696472168, -0.38610121607780457, -...",23.941151
2,pytorch is not using gpu even it detects the g...,"[-0.2389562726020813, -0.2919696867465973, -0....",21.565796
1,how to confirm that pytorch lightning is using...,"[0.0017349837580695748, -0.5566844344139099, -...",21.380344
0,how do i check if pytorch is using the gpu?\nh...,"[-0.2592145800590515, -0.6313626170158386, -0....",17.068993
