In [1]:
!pip install faiss-gpu

Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m21.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: faiss-gpu
Successfully installed faiss-gpu-1.7.2


In [15]:
! pip install datasets

Collecting datasets
  Downloading datasets-2.9.0-py3-none-any.whl (462 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m462.8/462.8 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting xxhash
  Downloading xxhash-3.2.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.2/212.2 kB[0m [31m15.3 MB/s[0m eta [36m0:00:00[0m
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Installing collected packages: xxhash, responses, datasets
Successfully installed datasets-2.9.0 responses-0.18.0 xxhash-3.2.0


In [37]:
import pandas as pd

comments_df = pd.read_csv("PytorchDocs.csv")

In [38]:
comments_df.columns

Index(['Question', 'Answer'], dtype='object')

In [40]:
comments_df.drop(columns=["Question"], inplace=True)

In [41]:
# comments_df = comments_df.drop_duplicates(["document"])

In [42]:
from datasets import Dataset

comments_dataset = Dataset.from_pandas(comments_df)

In [43]:
from transformers import AutoTokenizer, AutoModel

model_ckpt = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModel.from_pretrained(model_ckpt)

In [47]:
import torch

device = torch.device("cuda")
model.to(device)

MPNetModel(
  (embeddings): MPNetEmbeddings(
    (word_embeddings): Embedding(30527, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): MPNetEncoder(
    (layer): ModuleList(
      (0): MPNetLayer(
        (attention): MPNetAttention(
          (attn): MPNetSelfAttention(
            (q): Linear(in_features=768, out_features=768, bias=True)
            (k): Linear(in_features=768, out_features=768, bias=True)
            (v): Linear(in_features=768, out_features=768, bias=True)
            (o): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (intermediate): MPNetIntermediate(
          (dense): Linear(in_features

In [48]:
def cls_pooling(model_output):
    return model_output.last_hidden_state[:, 0]

In [49]:
def get_embeddings(text_list):
    encoded_input = tokenizer(
        text_list, padding=True, truncation=True, return_tensors="pt"
    )
    encoded_input = {k: v.to(device) for k, v in encoded_input.items()}
    model_output = model(**encoded_input)
    return cls_pooling(model_output)

In [50]:
embedding = get_embeddings(comments_dataset["Answer"][0])
embedding.shape

torch.Size([1, 768])

In [51]:
embeddings_dataset = comments_dataset.map(
    lambda x: {"embeddings": get_embeddings(x["Answer"]).detach().cpu().numpy()[0]}
)

  0%|          | 0/55 [00:00<?, ?ex/s]

In [52]:
embeddings_dataset.add_faiss_index(column="embeddings")

  0%|          | 0/1 [00:00<?, ?it/s]

Dataset({
    features: ['Answer', 'embeddings'],
    num_rows: 55
})

In [69]:
question = "How to load a torchscript model"
question_embedding = get_embeddings([question]).cpu().detach().numpy()
question_embedding.shape

(1, 768)

In [70]:
scores, samples = embeddings_dataset.get_nearest_examples(
    "embeddings", question_embedding, k=5
)

In [71]:
samples_df = pd.DataFrame.from_dict(samples)
samples_df["scores"] = scores
samples_df.sort_values("scores", ascending=False, inplace=True)

In [72]:
samples_df

Unnamed: 0,Answer,embeddings,scores
4,One common way to do inference with a trained ...,"[-0.6122680902481079, -0.3593311607837677, 0.0...",36.68549
3,"Save:\ntorch.save(modelA.state_dict(), PATH)\n...","[-0.26079243421554565, -0.5940628051757812, 0....",34.921864
2,"Save:\n\ntorch.save(model, PATH)\nLoad:\n\n# M...","[-0.12381523102521896, -0.2200237363576889, 0....",34.642708
1,"Save on GPU, Load on CPU\nSave:\n\ntorch.save(...","[-0.2662140727043152, -0.4992186725139618, -0....",32.611095
0,This document provides solutions to a variety ...,"[0.19851192831993103, -0.399810791015625, -0.0...",31.892815


In [73]:
samples["Answer"]

['This document provides solutions to a variety of use cases regarding the saving and loading of PyTorch models. Feel free to read the whole document, or just skip to the code you need for a desired use case.\n\nWhen it comes to saving and loading models, there are three core functions to be familiar with:\n\ntorch.save: Saves a serialized object to disk. This function uses Python’s pickle utility for serialization. Models, tensors, and dictionaries of all kinds of objects can be saved using this function.\n\ntorch.load: Uses pickle’s unpickling facilities to deserialize pickled object files to memory. This function also facilitates the device to load the data into (see Saving & Loading Model Across Devices).\n\ntorch.nn.Module.load_state_dict: Loads a model’s parameter dictionary using a deserialized state_dict. For more information on state_dict, see What is a state_dict?.',
 'Save on GPU, Load on CPU\nSave:\n\ntorch.save(model.state_dict(), PATH)\nLoad:\n\ndevice = torch.device(\'cp

### SO dataset

In [85]:
so_df = pd.read_csv("questions_with_accepted_df.csv")

In [86]:
so_df.shape

(7558, 12)

In [87]:
import re
CLEANR = re.compile('<.*?>') 

def cleanhtml(raw_html):
  cleantext = re.sub(CLEANR, '', raw_html)
  return cleantext

In [94]:
so_df = so_df[["question", "context", "pt_answer"]]

In [96]:
so_df["question"] = so_df["question"].apply(lambda x: cleanhtml(x))
so_df["context"] = so_df["context"].apply(lambda x: cleanhtml(x))
so_df["pt_answer"] = so_df["pt_answer"].apply(lambda x: cleanhtml(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  so_df["question"] = so_df["question"].apply(lambda x: cleanhtml(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  so_df["context"] = so_df["context"].apply(lambda x: cleanhtml(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  so_df["pt_answer"] = so_df["pt_answer"].apply(lambda x: cleanhtml(x))


In [101]:
so_df["answer"] = so_df["context"] + "\n" + so_df["pt_answer"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  so_df["answer"] = so_df["context"] + "\n" + so_df["pt_answer"]


In [103]:
so_df = so_df[["question", "answer"]]

In [104]:
embedding = get_embeddings(so_df["answer"][0])
embedding.shape

torch.Size([1, 768])

In [105]:
so_dataset = Dataset.from_pandas(so_df)

In [106]:
embeddings_dataset = so_dataset.map(
    lambda x: {"embeddings": get_embeddings(x["answer"]).detach().cpu().numpy()[0]}
)

  0%|          | 0/7558 [00:00<?, ?ex/s]

In [107]:
embeddings_dataset.add_faiss_index(column="embeddings")

  0%|          | 0/8 [00:00<?, ?it/s]

Dataset({
    features: ['question', 'answer', 'embeddings'],
    num_rows: 7558
})

In [136]:
question = "what is a state_dict ?"
question_embedding = get_embeddings([question]).cpu().detach().numpy()
question_embedding.shape

(1, 768)

In [137]:
scores, samples = embeddings_dataset.get_nearest_examples(
    "embeddings", question_embedding, k=5
)

In [138]:
samples_df = pd.DataFrame.from_dict(samples)
samples_df["scores"] = scores
samples_df.sort_values("scores", ascending=False, inplace=True)

In [139]:
samples_df

Unnamed: 0,question,answer,embeddings,scores
4,Libtorch C++: Efficient/correct way for saving...,"To answer my own question, the model state dic...","[-0.47472235560417175, -0.45282137393951416, 0...",41.877724
3,Translating Conv1D Layer from pytorch to tenso...,In tensorflow's keras you write something like...,"[-0.25582146644592285, -0.5839678645133972, -0...",41.720638
2,MT5ForConditionalGeneration with Pytorch-light...,Try inheriting pl.LightingModule instead of pl...,"[-0.15955837070941925, -0.28728607296943665, -...",41.645695
1,What is the last line of this Rnn function mea...,"The line out = self.fc(out[:, -1, :]) is using...","[-0.3125688135623932, -0.34171342849731445, -0...",39.558189
0,How to print the model's parameters'shape and ...,The state dictionary of does not contain any i...,"[0.06976506859064102, -0.15413105487823486, -0...",36.894581


In [140]:
samples["answer"]

["The state dictionary of does not contain any information about the structure of forward logic of its corresponding nn.Module. Without prior knowledge about it's content, you can't get which key of the dict contains the first layer of the module... it's possibly the first one but this method is rather limited if you want to beyond just the first layer. You can inspect the content of the nn.Module but you won't be able to extract much more from it, without having the actual nn.Module class at your disposal.\n\nI solved this by extending the DataCollatorForSeq2Seq class and overriding the __call__ method in it to also pad my 'spk_utt_pos' list appropriately.\n",
 "The line out = self.fc(out[:, -1, :]) is using negative indexing: out is a tensor of shape batch_size x seq_length x hidden_size, so out[:, 1, :] would return the first element along the second dimension (or axis), and out[:, -1, :] returns the last element along the second dimension. It would be equivalent to out[:, seq_lengt