In [1]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel
import re
from datasets import Dataset
from tqdm import tqdm
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS, ElasticVectorSearch
import elasticsearch


In [89]:
#### load data

df = pd.read_csv("pt_question_answers.csv")

df["text"] = df["pt_title"] + "\n" + df["pt_body"] + "\n" + df["pt_answer"]

df = df[["text"]]

CLEANR = re.compile('<.*?>') 

def cleanhtml(raw_html):
    cleantext = re.sub(CLEANR, '', raw_html)
    return cleantext
df["text"] = df["text"].apply(lambda x: cleanhtml(x))

df["text"] = df["text"].str.lower()

df

Unnamed: 0,text
0,extracting the top-k value-indices from a 1-d ...
1,how to display custom images in tensorboard (e...
2,python wheels: cp27mu not supported\ni'm tryin...
3,loading torch7 trained models (.t7) in pytorch...
4,pytorch: how to use dataloaders for custom dat...
...,...
14588,how to disable neptune callback in transformer...
14589,bgr to rgb for cub_200 images by image.split()...
14590,neural networks extending learning domain\ni h...
14591,how do i multiply tensors like this?\ni am wor...


## Custom Embeddings and FAISS index using dataset

In [90]:

model_ckpt = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModel.from_pretrained(model_ckpt)

device = torch.device("cuda")
model.to(device)

def cls_pooling(model_output):
    return model_output.last_hidden_state[:, 0]

def get_embeddings(text_list):
    encoded_input = tokenizer(
        text_list, padding=True, truncation=True, return_tensors="pt"
    )
    encoded_input = {k: v.to(device) for k, v in encoded_input.items()}
    model_output = model(**encoded_input)
    return cls_pooling(model_output)

def get_context(question, truncate_length=512, k=5):

    question_embedding = get_embeddings([question]).cpu().detach().numpy()

    scores, samples = embeddings_dataset.get_nearest_examples(
        "embeddings", question_embedding, k=k
    )

    samples_df = pd.DataFrame.from_dict(samples)
    samples_df["scores"] = scores
    samples_df.sort_values("scores", ascending=False, inplace=True)
    samples_df["text"] = samples_df["text"].str[:truncate_length]

    return samples_df


In [4]:
%%time
dataset = Dataset.from_pandas(df)

embeddings_dataset = dataset.map(
    lambda x: {"embeddings": get_embeddings(x["text"]).detach().cpu().numpy()[0]}
)

  0%|          | 0/14593 [00:00<?, ?ex/s]

CPU times: user 17min 13s, sys: 46.1 s, total: 17min 59s
Wall time: 4min 31s


In [5]:
embeddings_dataset

Dataset({
    features: ['text', 'embeddings'],
    num_rows: 14593
})

In [6]:
%%time
embeddings_dataset.add_faiss_index(column="embeddings")

  0%|          | 0/15 [00:00<?, ?it/s]

CPU times: user 150 ms, sys: 43.9 ms, total: 194 ms
Wall time: 301 ms


Dataset({
    features: ['text', 'embeddings'],
    num_rows: 14593
})

In [11]:
%%time
get_context('How do I check if PyTorch is using the GPU?', k=5)

CPU times: user 33.7 ms, sys: 0 ns, total: 33.7 ms
Wall time: 32.6 ms


Unnamed: 0,text,embeddings,scores
4,different method of running pytorch on gpu\nse...,"[-0.3892856538295746, -0.34232109785079956, -0...",24.639463
3,is there a way to allocate remaining gpu to yo...,"[-0.23101496696472168, -0.38610121607780457, -...",23.941139
2,pytorch is not using gpu even it detects the g...,"[-0.2389562726020813, -0.2919696867465973, -0....",21.565807
1,how to confirm that pytorch lightning is using...,"[0.0017349837580695748, -0.5566844344139099, -...",21.380346
0,how do i check if pytorch is using the gpu?\nh...,"[-0.2592145800590515, -0.6313626170158386, -0....",17.069004


## langchain.vectorstores FAISS

In [12]:
model_name = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
hf_embeddings = HuggingFaceEmbeddings(model_name=model_name)

In [13]:
text = df['text'].tolist()

In [14]:
%%time
faiss_docsearch = FAISS.from_texts(text, hf_embeddings)

CPU times: user 6min 34s, sys: 48.9 s, total: 7min 23s
Wall time: 2min 19s


In [15]:
%%time
docs = faiss_docsearch.similarity_search(query='How do I check if PyTorch is using the GPU?', k=5)

CPU times: user 26.5 ms, sys: 3.65 ms, total: 30.1 ms
Wall time: 29.2 ms


In [17]:
docs[0]

Document(page_content="how do i check if pytorch is using the gpu?\nhow do i check if pytorch is using the gpu? the nvidia-smi command can detect gpu activity, but i want to check it directly from inside a python script.\n\nso the learning rate is stored in optim.param_groups[i]['lr'].\noptim.param_groups is a list of the different weight groups which can have different learning rates. thus, simply doing:\nfor g in optim.param_groups:\n    g['lr'] = 0.001\n\nwill do the trick.\n  \n  \n**alternatively,**\nas mentionned in the comments, if your learning rate only depends on the epoch number, you can use a learning rate scheduler.\nfor example (modified example from the doc):\ntorch.optim.lr_scheduler import lambdalr\noptimizer = torch.optim.sgd(model.parameters(), lr=0.1, momentum=0.9)\n# assuming optimizer has two groups.\nlambda_group1 = lambda epoch: epoch // 30\nlambda_group2 = lambda epoch: 0.95 ** epoch\nscheduler = lambdalr(optimizer, lr_lambda=[lambda1, lambda2])\nfor epoch in r

## langchain.vectorstores ElasticSearch

In [20]:
%%time
elastic_vector_search = ElasticVectorSearch.from_texts(text, hf_embeddings, elasticsearch_url="http://localhost:9200")

CPU times: user 6min 43s, sys: 49.6 s, total: 7min 32s
Wall time: 2min 39s


In [103]:
%%time
docs = elastic_vector_search.similarity_search(query='How do I check if PyTorch is using the GPU?', k=5)

CPU times: user 24 ms, sys: 327 µs, total: 24.4 ms
Wall time: 50.8 ms


In [34]:
docs[0]

Document(page_content="how do i check if pytorch is using the gpu?\nhow do i check if pytorch is using the gpu? the nvidia-smi command can detect gpu activity, but i want to check it directly from inside a python script.\n\nso the learning rate is stored in optim.param_groups[i]['lr'].\noptim.param_groups is a list of the different weight groups which can have different learning rates. thus, simply doing:\nfor g in optim.param_groups:\n    g['lr'] = 0.001\n\nwill do the trick.\n  \n  \n**alternatively,**\nas mentionned in the comments, if your learning rate only depends on the epoch number, you can use a learning rate scheduler.\nfor example (modified example from the doc):\ntorch.optim.lr_scheduler import lambdalr\noptimizer = torch.optim.sgd(model.parameters(), lr=0.1, momentum=0.9)\n# assuming optimizer has two groups.\nlambda_group1 = lambda epoch: epoch // 30\nlambda_group2 = lambda epoch: 0.95 ** epoch\nscheduler = lambdalr(optimizer, lr_lambda=[lambda1, lambda2])\nfor epoch in r

| VectorStore | Compute embeddings on 14593 rows | similarity search top 5 |
| --- | --- | --- |
| Custom | 4min 31s (embeddings) + 301 ms (add_faiss_index) | 32.6 ms |
| FAISS | 2min 19s | 29.2 ms |
| ElasticSearch | 2min 39s | 50.8 ms |