In [1]:
import html
import re

import pandas as pd
import torch
from datasets import Dataset
from langchain import PromptTemplate, HuggingFaceHub, LLMChain
from langchain.llms.huggingface_pipeline import HuggingFacePipeline
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM, pipeline

In [2]:
# Load the question/answer table; the path is relative to the working directory.
# The cells below read its pt_title, pt_body and pt_answer columns.
df = pd.read_csv("pt_question_answers.csv")

In [3]:
# Preview the three text columns that are combined into one document below.
df[["pt_title", "pt_body", "pt_answer"]]

Unnamed: 0,pt_title,pt_body,pt_answer
0,Extracting the top-k value-indices from a 1-D ...,<p>Given a 1-D tensor in Torch (<code>torch.Te...,"<p>As of pull request <a href=""https://github...."
1,How to Display Custom Images in Tensorboard (e...,"<p>The <a href=""https://github.com/tensorflow/...",<p>It is quite easy to do if you have the imag...
2,Python wheels: cp27mu not supported,"<p>I'm trying to install pytorch (<a href=""htt...","<p>Yes, that is possible. Just create the obje..."
3,Loading Torch7 trained models (.t7) in PyTorch,<p>I am using Torch7 library for implementing ...,<p><code>view()</code> reshapes the tensor wit...
4,PyTorch: How to use DataLoaders for custom Dat...,<p>How to make use of the <code>torch.utils.da...,<p>While you will not get as detailed informat...
...,...,...,...
14588,How to disable Neptune callback in transformer...,"<p>After installing <a href=""https://docs.nept...",<p>To disable Neptune callback in transformers...
14589,BGR to RGB for CUB_200 images by Image.split(),<p>I am creating a PyTorch dataset and dataloa...,<p>I would strongly recommend you use skimage....
14590,Neural Networks Extending Learning Domain,<p>I have a simple function <strong>f</strong>...,<p>What you want is called extrapolation (as o...
14591,How do I multiply tensors like this?,<p>I am working on a project where I need to m...,<p>You should familiarize yourself with the te...


In [4]:
# Build one document per row: title, body and answer separated by newlines.
# NOTE(review): if any of the three columns is missing for a row, the `+`
# concatenation yields NaN for that row — confirm the CSV has no gaps.
df["text"] = df["pt_title"] + "\n" + df["pt_body"] + "\n" + df["pt_answer"]

In [5]:
# Keep only the combined `text` column. The explicit copy makes `df` an
# independent frame, so the column assignments made to it afterwards do not
# raise SettingWithCopyWarning or write into a temporary slice.
df = df[["text"]].copy()

In [6]:
# Non-greedy match of anything between "<" and the next ">" on the same line.
CLEANR = re.compile('<.*?>')

def cleanhtml(raw_html):
    """Strip HTML tags from ``raw_html`` and decode its HTML entities.

    Args:
        raw_html: String that may contain HTML markup.

    Returns:
        The text with every ``<...>`` span removed and entities such as
        ``&lt;`` or ``&amp;`` replaced by the characters they stand for.
    """
    without_tags = CLEANR.sub('', raw_html)
    # Decode entities only after the tags are gone, so escaped markup such as
    # "&lt;b&gt;" survives as literal text instead of being stripped as a tag.
    return html.unescape(without_tags)
# Run cleanhtml over every row, then lowercase the result in the same pass.
df["text"] = df["text"].apply(cleanhtml).str.lower()

In [7]:
# Display the frame to inspect the text after the cleaning steps.
df

Unnamed: 0,text
0,extracting the top-k value-indices from a 1-d ...
1,how to display custom images in tensorboard (e...
2,python wheels: cp27mu not supported\ni'm tryin...
3,loading torch7 trained models (.t7) in pytorch...
4,pytorch: how to use dataloaders for custom dat...
...,...
14588,how to disable neptune callback in transformer...
14589,bgr to rgb for cub_200 images by image.split()...
14590,neural networks extending learning domain\ni h...
14591,how do i multiply tensors like this?\ni am wor...


In [8]:
# Checkpoint used to embed both the documents and the questions.
model_ckpt = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
# Tokenizer and encoder are loaded from the same checkpoint so they match.
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModel.from_pretrained(model_ckpt)

In [9]:
# Prefer the GPU, but fall back to CPU so the notebook still runs on machines
# without CUDA instead of failing when the model is moved.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

MPNetModel(
  (embeddings): MPNetEmbeddings(
    (word_embeddings): Embedding(30527, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): MPNetEncoder(
    (layer): ModuleList(
      (0): MPNetLayer(
        (attention): MPNetAttention(
          (attn): MPNetSelfAttention(
            (q): Linear(in_features=768, out_features=768, bias=True)
            (k): Linear(in_features=768, out_features=768, bias=True)
            (v): Linear(in_features=768, out_features=768, bias=True)
            (o): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (intermediate): MPNetIntermediate(
          (dense): Linear(in_features

In [10]:
def cls_pooling(model_output):
    """Return the hidden state at sequence position 0 for every batch item.

    NOTE(review): assumes ``last_hidden_state`` is laid out as
    (batch, sequence, hidden), so index 0 on the second axis is the leading
    special token — confirm against the model in use.
    """
    hidden_states = model_output.last_hidden_state
    return hidden_states[:, 0]

def get_embeddings(text_list):
    """Embed text with the module-level ``tokenizer`` and ``model``.

    Args:
        text_list: A string or list of strings accepted by ``tokenizer``.
            Inputs are padded to a common length and truncated by the
            tokenizer.

    Returns:
        The tensor returned by ``cls_pooling`` for the batch, still on
        ``device``. It carries no autograd history because the forward pass
        runs under ``torch.no_grad()``.
    """
    encoded_input = tokenizer(
        text_list, padding=True, truncation=True, return_tensors="pt"
    )
    # Move every tokenizer output to the device the model was moved to.
    encoded_input = {k: v.to(device) for k, v in encoded_input.items()}
    # Inference only: skip building the autograd graph so intermediate
    # activations are released immediately instead of being kept for backward.
    with torch.no_grad():
        model_output = model(**encoded_input)
    return cls_pooling(model_output)

In [11]:
dataset = Dataset.from_pandas(df)


def _embed_example(example):
    """Return the ``embeddings`` entry for one dataset example.

    ``map`` is called without batching, so presumably each call receives a
    single example; ``[0]` `` is not needed here — see inline comment.
    """
    vectors = get_embeddings(example["text"])
    # Take row 0 of the returned batch so a single vector is stored per example.
    return {"embeddings": vectors.detach().cpu().numpy()[0]}


embeddings_dataset = dataset.map(_embed_example)

# Build the FAISS index over the new column so nearest-neighbour lookups work.
embeddings_dataset.add_faiss_index(column="embeddings")

  0%|          | 0/14593 [00:00<?, ?ex/s]

  0%|          | 0/15 [00:00<?, ?it/s]

Dataset({
    features: ['text', 'embeddings'],
    num_rows: 14593
})

In [20]:
def get_context(question, truncate_length=512, k=5):
    """Build a newline-joined context string from the nearest indexed texts.

    Args:
        question: Query text; embedded with ``get_embeddings``.
        truncate_length: Maximum number of characters kept from each hit.
        k: Number of nearest examples requested from the index.

    Returns:
        The truncated ``text`` fields of the hits joined by ``"\\n"``, in the
        order the index returned them.
    """
    question_embedding = get_embeddings([question]).cpu().detach().numpy()

    _scores, samples = embeddings_dataset.get_nearest_examples(
        "embeddings", question_embedding, k=k
    )

    # Keep the ranking produced by the index: the search returns hits
    # best-first. Re-sorting by descending score assumes "larger is better",
    # but a FAISS flat index built with the default metric reports L2
    # distances (smaller is closer), so that sort put the weakest hit first.
    # NOTE(review): confirm the metric if the index is built differently.
    snippets = [text[:truncate_length] for text in samples["text"]]

    return '\n'.join(snippets)

In [40]:
# Causal language model used to generate the answers.
model_id = "gpt2"
tokenizer_hfp = AutoTokenizer.from_pretrained(model_id)
model_hfp = AutoModelForCausalLM.from_pretrained(model_id)

# Generation is capped at 50 new tokens per call.
pipe = pipeline(
    "text-generation",
    model=model_hfp,
    tokenizer=tokenizer_hfp,
    max_new_tokens=50,
)

# Wrap the transformers pipeline so it can be passed to LangChain as an LLM.
hf = HuggingFacePipeline(pipeline=pipe)

In [41]:
# Display the wrapper's repr to check how it was configured.
hf

HuggingFacePipeline(cache=None, verbose=False, callback_manager=<langchain.callbacks.shared.SharedCallbackManager object at 0x7f493f281fa0>, pipeline=<transformers.pipelines.text_generation.TextGenerationPipeline object at 0x7f4846eed520>, model_id='gpt2', model_kwargs=None)

In [42]:
# Load the evaluation questions; the loop further down reads the `question` column.
questions_df = pd.read_csv("top100questions.csv")

In [43]:
def run_qa(question, context):
    """Ask the module-level ``hf`` LLM to answer ``question`` given ``context``.

    Args:
        question: Question text substituted into the prompt.
        context: Supporting text substituted into the prompt.

    Returns:
        Whatever ``LLMChain.predict`` returns for the filled-in prompt.
    """
    qa_prompt = PromptTemplate(
        input_variables=["context", "question"],
        template="""from this context: {context} answer the question: {question}.""",
    )
    chain = LLMChain(llm=hf, prompt=qa_prompt)
    answer = chain.predict(context=context, question=question)
    return answer


In [44]:
# NOTE: .loc slices by label and includes the end label, so this iterates over
# every row up to and including label 10.
for raw_question in questions_df.loc[:10, "question"]:
    print("*************************************************************************")
    # Drop trailing whitespace before printing and querying.
    question = raw_question.rstrip()
    print(question)
    # Retrieve the three nearest documents and let the LLM answer from them.
    context = get_context(question, k=3)
    answer = run_qa(question, context)
    print('answer:', answer)


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


*************************************************************************
How do I check if PyTorch is using the GPU?


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


answer:  this is just an example of this being something other than a python script that looks through gpu, which can include many other processes (like the user's work), and also includes a couple of others like the python script.

so, the
*************************************************************************
How do I save a trained model in PyTorch?


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


answer: 

I decided to take a different approach : i created my model library in an order of magnitude greater than pytorch.


from the pytorch subroutine (where the model is being trained) which compiles to a binary
*************************************************************************
What does .view() do in PyTorch?


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


answer: 

from v2.1.1.1 [a-z]:


import net def __init__(self):

self.shape = nn.shape

self.shape.add(100, nn.
*************************************************************************
Why do we need to call zero_grad() in PyTorch?


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


answer: 

i wonder why detach_() is necessary? in python

@import os ( """

"""

for group in self.param_groups:

if not group['params'] is None:

for group in
*************************************************************************
How do I print the model summary in PyTorch?


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


answer: 
output_2 (inputlayer)                    input_1.txt                  
*************************************************************************
How do I initialize weights in PyTorch?


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


answer: 

The problem with this approach is that

furthermore the question: how do i initialize weights in pytorch?

if m.object is None :

weight_init()

if m is not None :
*************************************************************************
What does model.eval() do in pytorch?


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


answer:  it performs one major function, which its result may not have, if it has no input at all. we won't go into detail about that here, or even in the comments post the module which is involved (for a list of all the modules
*************************************************************************
What's the difference between reshape and view in pytorch?


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


answer: 

im not going to explain the difference, its easy to see.

for(i in [6, 2, 3]) {

inputs = tensor(input.shape[i].shape[0], 0) output
*************************************************************************
What does model.train() do in PyTorch?


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


answer: 

and the

model = tf.train.Model((model.data, (params[:], params[:], "test_weights_kh"), np.zeros, b.tensor.Loss.DistantN
*************************************************************************
What does .contiguous() do in PyTorch?


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


answer:  i was working on this in python, so I figured it probably is for my own use. no-obvious-feature, also not implemented well in a general purpose python library (like pumpy). I guess prytow is just a
*************************************************************************
Why do we "pack" the sequences in PyTorch?
answer:  this is because i have the idea that the R n-packers will always keep the order in which the sequences are sorted by their diodes. we think of this as the "racket" of the random array. because the rnn
