In [1]:
import html
import os

import pandas as pd
import torch
from datasets import Dataset
from langchain import PromptTemplate, HuggingFaceHub, LLMChain
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel

2023-02-03 18:21:36.573717: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-02-03 18:21:37.536856: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /opt/amazon/efa/lib:/opt/amazon/openmpi/lib:/usr/local/cuda/efa/lib:/usr/local/cuda/lib:/usr/local/cuda:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:/usr/local/cuda/targets/x86_64-linux/lib:/usr/local/lib:/usr/lib:/opt/amazon/efa/lib:/opt/amazon/openmpi/lib:/usr/local/cuda/efa/lib:/usr/local/cuda/lib:/usr/local/cuda:/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64:/usr/local/cud

In [2]:
# Load the source table from a CSV file in the working directory.
df = pd.read_csv("pt_question_answers.csv")

In [3]:
# (rows, columns) of the loaded table.
df.shape

(14593, 11)

In [4]:
# Preview the title, body and answer columns.
df[["pt_title", "pt_body", "pt_answer"]]

Unnamed: 0,pt_title,pt_body,pt_answer
0,Extracting the top-k value-indices from a 1-D ...,<p>Given a 1-D tensor in Torch (<code>torch.Te...,"<p>As of pull request <a href=""https://github...."
1,How to Display Custom Images in Tensorboard (e...,"<p>The <a href=""https://github.com/tensorflow/...",<p>It is quite easy to do if you have the imag...
2,Python wheels: cp27mu not supported,"<p>I'm trying to install pytorch (<a href=""htt...","<p>Yes, that is possible. Just create the obje..."
3,Loading Torch7 trained models (.t7) in PyTorch,<p>I am using Torch7 library for implementing ...,<p><code>view()</code> reshapes the tensor wit...
4,PyTorch: How to use DataLoaders for custom Dat...,<p>How to make use of the <code>torch.utils.da...,<p>While you will not get as detailed informat...
...,...,...,...
14588,How to disable Neptune callback in transformer...,"<p>After installing <a href=""https://docs.nept...",<p>To disable Neptune callback in transformers...
14589,BGR to RGB for CUB_200 images by Image.split(),<p>I am creating a PyTorch dataset and dataloa...,<p>I would strongly recommend you use skimage....
14590,Neural Networks Extending Learning Domain,<p>I have a simple function <strong>f</strong>...,<p>What you want is called extrapolation (as o...
14591,How do I multiply tensors like this?,<p>I am working on a project where I need to m...,<p>You should familiarize yourself with the te...


In [5]:
# Concatenate title, body and answer into one newline-separated field.
# Missing pieces are replaced with "" first: string concatenation with NaN
# would turn the whole row into NaN, and the HTML cleaning step would then
# fail on a float instead of a string.
df["text"] = (
    df["pt_title"].fillna("")
    + "\n"
    + df["pt_body"].fillna("")
    + "\n"
    + df["pt_answer"].fillna("")
)

In [6]:
# Keep only the combined text column. The explicit copy makes the result an
# independent frame, so later column assignments do not write into a slice
# of the original (which pandas flags with SettingWithCopyWarning).
df = df[["text"]].copy()

In [7]:
import re
# Non-greedy match of anything between "<" and the next ">" on the same line.
CLEANR = re.compile('<.*?>')


def cleanhtml(raw_html):
    """Strip HTML tags from a string and decode its HTML entities.

    Tags are removed first and entities decoded second, so escaped markup
    such as ``&lt;div&gt;`` inside code samples survives as literal text
    instead of being mistaken for a tag and deleted.

    Args:
        raw_html: String that may contain HTML tags and entities.

    Returns:
        The text with tags removed and entities (``&lt;``, ``&amp;``, ...)
        converted to the characters they stand for.
    """
    without_tags = CLEANR.sub('', raw_html)
    return html.unescape(without_tags)

In [8]:
# Run every row of the text column through cleanhtml.
df["text"] = df["text"].map(cleanhtml)

In [9]:
# Lower-case the whole text column (this also lower-cases any code snippets in it).
df["text"] = df["text"].str.lower()

In [10]:
# Preview the cleaned, lower-cased text column.
df

Unnamed: 0,text
0,extracting the top-k value-indices from a 1-d ...
1,how to display custom images in tensorboard (e...
2,python wheels: cp27mu not supported\ni'm tryin...
3,loading torch7 trained models (.t7) in pytorch...
4,pytorch: how to use dataloaders for custom dat...
...,...
14588,how to disable neptune callback in transformer...
14589,bgr to rgb for cub_200 images by image.split()...
14590,neural networks extending learning domain\ni h...
14591,how do i multiply tensors like this?\ni am wor...


In [11]:
# One checkpoint name drives both the tokenizer and the encoder, so the two
# always come from the same pretrained model.
model_ckpt = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModel.from_pretrained(model_ckpt)

In [12]:
# Prefer the GPU, but fall back to the CPU so the notebook still runs on
# machines without CUDA instead of failing here.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

MPNetModel(
  (embeddings): MPNetEmbeddings(
    (word_embeddings): Embedding(30527, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): MPNetEncoder(
    (layer): ModuleList(
      (0): MPNetLayer(
        (attention): MPNetAttention(
          (attn): MPNetSelfAttention(
            (q): Linear(in_features=768, out_features=768, bias=True)
            (k): Linear(in_features=768, out_features=768, bias=True)
            (v): Linear(in_features=768, out_features=768, bias=True)
            (o): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (intermediate): MPNetIntermediate(
          (dense): Linear(in_features

In [13]:
def cls_pooling(model_output):
    """Pick index 0 along the second axis of ``model_output.last_hidden_state``.

    Going by the function name, that position holds the [CLS] token; the
    result has the second axis removed.
    """
    token_states = model_output.last_hidden_state
    first_position = token_states[:, 0]
    return first_position

def get_embeddings(text_list):
    """Embed text with the notebook-level ``tokenizer`` and ``model``.

    The input is tokenized with padding and truncation enabled, moved to
    ``device``, run through ``model`` and reduced with ``cls_pooling``.

    Args:
        text_list: A string or a list of strings to embed.

    Returns:
        The pooled output tensor, located on ``device``. It carries no
        autograd history, so callers may still call ``.detach()`` on it.
    """
    encoded_input = tokenizer(
        text_list, padding=True, truncation=True, return_tensors="pt"
    )
    encoded_input = {k: v.to(device) for k, v in encoded_input.items()}
    # Inference only: without no_grad every call records an autograd graph
    # and keeps the activations alive for a backward pass that never happens.
    with torch.no_grad():
        model_output = model(**encoded_input)
    return cls_pooling(model_output)

In [14]:
# Convert the DataFrame into a datasets.Dataset.
dataset = Dataset.from_pandas(df)

In [15]:
# Smoke test: embed the first row's text and inspect the resulting shape.
first_text = dataset[0]["text"]
embedding = get_embeddings(first_text)
embedding.shape

torch.Size([1, 768])

In [16]:
def _embed_example(example):
    """Embed one row's text and return it as a NumPy vector under "embeddings"."""
    vectors = get_embeddings(example["text"])
    return {"embeddings": vectors.detach().cpu().numpy()[0]}


# Add an "embeddings" column by embedding the rows one at a time.
embeddings_dataset = dataset.map(_embed_example)

  0%|          | 0/14593 [00:00<?, ?ex/s]

In [17]:
# Build a FAISS index over the "embeddings" column for nearest-neighbour search.
# NOTE(review): no metric_type is passed, so the library's default metric is used;
# confirm it matches the dot-product scoring that the "-dot-v1" checkpoint name suggests.
embeddings_dataset.add_faiss_index(column="embeddings")

  0%|          | 0/15 [00:00<?, ?it/s]

Dataset({
    features: ['text', 'embeddings'],
    num_rows: 14593
})

In [18]:
def get_context(question, truncate_length=512, k=5):
    """Retrieve the nearest indexed texts for a question as a single string.

    Args:
        question: Query text to embed and search with.
        truncate_length: Maximum number of characters kept from each
            retrieved text.
        k: Number of nearest examples requested from the index.

    Returns:
        The retrieved texts, each cut to ``truncate_length`` characters and
        joined with newlines, in the order the index returned them.
    """
    question_embedding = get_embeddings([question]).cpu().detach().numpy()

    # The hits are used in the order the index returns them, which is expected
    # to be closest-first. The previous re-sort by descending score put the
    # farthest hit first whenever the scores are distances (lower is better).
    _scores, samples = embeddings_dataset.get_nearest_examples(
        "embeddings", question_embedding, k=k
    )

    return '\n'.join(text[:truncate_length] for text in samples["text"])


In [19]:
# Load the list of questions to run through the QA pipeline.
questions_df = pd.read_csv("top100questions.csv")


In [20]:
# question_list = []
# context_list = []
# for question in tqdm(questions_df["question"]):
#     print(question)
#     question_list.append(question)
#     context_list.append(get_context(question))
    

In [21]:
# context_df = pd.DataFrame({
#     "question": question_list,
#     "context": context_list
# })

In [22]:
# Preview the first ten rows by position.
questions_df.iloc[:10]

Unnamed: 0,question
0,How do I check if PyTorch is using the GPU?\n
1,How do I save a trained model in PyTorch?\n
2,What does .view() do in PyTorch?\n
3,Why do we need to call zero_grad() in PyTorch?\n
4,How do I print the model summary in PyTorch?\n
5,How do I initialize weights in PyTorch?\n
6,What does model.eval() do in pytorch?\n
7,What's the difference between reshape and view...
8,What does model.train() do in PyTorch?\n
9,What does .contiguous() do in PyTorch?\n


In [23]:
# Read the Hugging Face Hub API token from the environment so it is never
# stored in the notebook; falls back to an empty string when it is unset.
hf_token = os.environ.get("HUGGINGFACEHUB_API_TOKEN", "")

In [24]:
def run_qa(question, context, repo_id="google/flan-t5-xl", temperature=0):
    """Ask a Hugging Face Hub model to answer a question from a context.

    The prompt is built from a fixed template, sent through an ``LLMChain``
    backed by ``HuggingFaceHub`` (authenticated with the notebook-level
    ``hf_token``), and the model's reply is printed.

    Args:
        question: The question to answer.
        context: Text the model is told to answer from.
        repo_id: Hub repository of the model to query.
        temperature: Sampling temperature passed in ``model_kwargs``.

    Returns:
        None. The answer is printed rather than returned.
    """
    template = """from the context: {context} answer the question: {question}."""
    prompt = PromptTemplate(template=template, input_variables=["context", "question"])
    llm = HuggingFaceHub(
        huggingfacehub_api_token=hf_token,
        repo_id=repo_id,
        model_kwargs={"temperature": temperature},
    )
    llm_chain = LLMChain(prompt=prompt, llm=llm)

    print(llm_chain.predict(question=question, context=context))


In [25]:
# Ad-hoc check: retrieve three passages for the question and print the model's answer.
question = "how to save pytorch model"
context = get_context(question=question, k=3)
run_qa(question=question, context=context)

torch.save()/torch.load() is for saving/loading


In [26]:
# Ad-hoc check: retrieve three passages for the question and print the model's answer.
question = "Why my training is slow"
context = get_context(question=question, k=3)
run_qa(question=question, context=context)

acc_fn is not a float


In [27]:
# Ad-hoc check: retrieve three passages for the question and print the model's answer.
question = "what is model.eval()"
context = get_context(question=question, k=3)
run_qa(question=question, context=context)

to evaluate the model


In [28]:
# Ad-hoc check: retrieve three passages for the question and print the model's answer.
question = "how to typecast tensor from float to long"
context = get_context(question=question, k=3)
run_qa(question=question, context=context)

import sys import os import python import os


In [29]:
# Run the QA pipeline over the leading questions, printing a separator,
# the question, and then the model's answer for each.
# NOTE(review): .loc slices by label and includes the end label, so with the
# default integer index this covers rows 0 through 10 (eleven questions),
# confirm that is intended rather than the first ten.
for raw_question in questions_df.loc[:10, "question"]:
    print("***********************************")
    # The CSV entries end with a newline; strip trailing whitespace first.
    question = raw_question.rstrip()
    print(question)
    context = get_context(question=question, k=3)
    run_qa(question=question, context=context)


***********************************
How do I check if PyTorch is using the GPU?
nvidia-smi command can detect gpu activity, but i
***********************************
How do I save a trained model in PyTorch?
torch.save()/torch.load() is for saving/loading
***********************************
What does .view() do in PyTorch?
reshapes the tensor to a different but compatible shape
***********************************
Why do we need to call zero_grad() in PyTorch?
zero_grad(self) | sets gradients of all model parameters to zero.
***********************************
How do I print the model summary in PyTorch?
how do i print the summary of a model in pytorch like
***********************************
How do I initialize weights in PyTorch?
solved it by saving only the model's state_dict() via torch.save
***********************************
What does model.eval() do in pytorch?
modifies certain modules (layers) which are required to behave differently during training and inference
***************

In [30]:
# question = "how to save pytorch model"
# context = get_context(question, k=3)

In [31]:
# context = get_context(question, k=3)

In [32]:
# prompt = PromptTemplate(
#     template="Content: {page_content}",
#     input_variables=["page_content"],
# )


# llm_chain = LLMChain(prompt=prompt, llm=
#                      HuggingFaceHub(
#                          huggingfacehub_api_token=hf_token,
#                          repo_id="google/flan-t5-xl",
#                          model_kwargs={"temperature":0.01}))

# question = question

# page_content=context
# print(llm_chain.run(question))

In [33]:
# DEFAULT_TEXT_QA_PROMPT_TMPL = (
#     "Context information is below. \n"
#     "---------------------\n"
#     "{context_str}"
#     "\n---------------------\n"
#     "Given the context information and not prior knowledge, "
#     "answer the question: {question}\n"
# )


# prompt = PromptTemplate(
#     input_variables=["context_str", "question"], template=DEFAULT_TEXT_QA_PROMPT_TMPL
# )

# llm_chain = LLMChain(prompt=prompt, llm=
#                      HuggingFaceHub(
#                          huggingfacehub_api_token=hf_token,
#                          repo_id="google/flan-t5-xl",
#                          model_kwargs={"temperature":1e-10}))

# question = question

# context_str = context
# print(llm_chain.run({"question": question, "context_str" : context_str}))

In [34]:
# def get_context_df(question, truncate_length=512, k=5):

#     question_embedding = get_embeddings([question]).cpu().detach().numpy()


#     scores, samples = embeddings_dataset.get_nearest_examples(
#         "embeddings", question_embedding, k=k
#     )

#     samples_df = pd.DataFrame.from_dict(samples)
#     samples_df["scores"] = scores
#     samples_df.sort_values("scores", ascending=False, inplace=True)
#     samples_df["text"] = samples_df["text"].str[:truncate_length]
# #     print(samples_df)

#     return samples_df

In [35]:
# question

In [36]:
# context_str = get_context(question, k=3)

In [37]:
# answer_df = get_context_df(question)
# existing_answer = answer_df.iloc[0]["text"]

In [38]:
# DEFAULT_REFINE_PROMPT_TMPL = (
#     "The original question is as follows: {question}\n"
#     "We have provided an existing answer, including sources: {existing_answer}\n"
#     "We have the opportunity to refine the existing answer"
#     "(only if needed) with some more context below.\n"
#     "------------\n"
#     "{context_str}\n"
#     "------------\n"
#     "Given the new context, refine the original answer to better "
#     "answer the question. "
#     "If the context isn't useful, return the original answer."
# )
# DEFAULT_REFINE_PROMPT = PromptTemplate(
#     input_variables=["question", "existing_answer", "context_str"],
#     template=DEFAULT_REFINE_PROMPT_TMPL,
# )

# llm_chain = LLMChain(prompt=DEFAULT_REFINE_PROMPT, llm=
#                      HuggingFaceHub(
#                          huggingfacehub_api_token=hf_token,
#                          repo_id="google/flan-t5-xl",
#                          model_kwargs={"temperature":0}), verbose=True)

# print(llm_chain.run({"question": question, "context_str": context_str, "existing_answer": existing_answer}))