In [4]:
from pypdf import PdfReader
import os
import torch.nn.functional as F
from torch import Tensor
from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM, AutoModelForSeq2SeqLM
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn.functional as F
from Agent import Agent
import re
import faiss
import pickle as pkl

In [2]:
@torch.no_grad()
def average_pool(last_hidden_states: Tensor,
                 attention_mask: Tensor) -> Tensor:
    """Mean-pool token vectors over dim 1, ignoring positions the mask marks as 0.

    Args:
        last_hidden_states: Per-token hidden states; dim 1 is the token axis
            that gets reduced.
        attention_mask: Mask with one fewer dimension than
            ``last_hidden_states``; non-zero entries mark tokens to keep.

    Returns:
        The masked mean over dim 1. A row whose mask is entirely zero yields
        a zero vector (previously this was 0/0 and produced NaN).
    """
    keep = attention_mask[..., None].bool()
    summed = last_hidden_states.masked_fill(~keep, 0.0).sum(dim=1)

    token_counts = attention_mask.sum(dim=1)[..., None]
    # Guard the 0/0 case: the numerator is already all zeros for such rows,
    # so dividing by 1 gives a clean zero vector instead of NaN.
    token_counts = token_counts.masked_fill(token_counts == 0, 1)
    return summed / token_counts


# Input convention for 'intfloat/e5-base-v2':
#   * each input text should start with "query: " or "passage: ";
#   * for tasks other than retrieval, the "query: " prefix alone is enough.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

tokenizer = AutoTokenizer.from_pretrained('intfloat/e5-base-v2')

# .to() and .eval() both return the module itself, so chaining them into the
# assignment moves the model to `device`, switches it to inference mode and
# leaves the cell without a displayed value.
embeddings_model = AutoModel.from_pretrained('intfloat/e5-base-v2').to(device).eval()


In [3]:
@torch.no_grad()
def get_embeddings(texts):
    """Embed texts with the e5 model and return normalised vectors as NumPy.

    Args:
        texts: Text(s) handed directly to the tokenizer. Inputs are padded to
            a common length and truncated at 512 tokens.

    Returns:
        A CPU NumPy array with one mean-pooled embedding per input text,
        normalised with ``F.normalize`` (its default norm and dimension).
    """
    encoded = tokenizer(
        texts,
        max_length=512,
        padding=True,
        truncation=True,
        return_tensors='pt',
    ).to(device)

    hidden_states = embeddings_model(**encoded).last_hidden_state
    pooled = average_pool(hidden_states, encoded['attention_mask'])

    # Move to the CPU first, then normalise and convert to NumPy.
    return F.normalize(pooled.to('cpu')).numpy()

In [11]:
# Load the FAISS index stored in 'all_embeddings.index' (path is relative to the working directory).
index = faiss.read_index('all_embeddings.index')

In [12]:
# Rows of text_info are looked up later by the ids returned from the FAISS search.
# NOTE: unpickling can execute arbitrary code; only load a file you produced or trust.
# The context manager closes the file handle, which the bare open() call never did.
with open('text_info.pkl', 'rb') as text_info_file:
    text_info = pkl.load(text_info_file)

In [13]:
# The e5 model expects retrieval queries to carry the "query: " prefix,
# so the prefix and the question are combined into one string.
query = (
    'query: '
    'What is the difference between an encoder and a decoder?'
)

In [14]:
# get_embeddings works on a batch of texts, so the single query is wrapped in a one-element list.
query_embedding = get_embeddings([query])

In [15]:
# Number of nearest neighbours requested from the index search.
k = 3

In [16]:
# Look up the k nearest stored vectors for the query embedding.
# NOTE(review): whether a smaller or larger score means a better match depends on the
# metric the index was built with — confirm against the index-building code.
scores, text_idx = index.search(query_embedding,k)

In [17]:
# Collapse the search result ids to a 1-D array so they can be used directly as row indices.
text_idx = text_idx.flatten()

In [18]:
# Display the ids of the retrieved entries.
text_idx

array([ 12, 383,  22])

In [19]:
# Display the scores returned by the search for the retrieved entries.
scores

array([[0.30136153, 0.35455027, 0.35456172]], dtype=float32)

In [21]:
# FAISS pads the id array with -1 when it finds fewer than k neighbours. NumPy
# would treat -1 as "last row" and silently inject an unrelated passage, so
# drop those ids before indexing.
valid_idx = text_idx[text_idx >= 0]

# Column 1 of each text_info row is the text that gets joined into the context.
retrieved_passages = np.array(text_info)[valid_idx][:, 1]
info = '.'.join(retrieved_passages)

# Remove the "passage: " prefixes from the joined text before showing it to the generator.
info = info.replace('passage: ', '').strip()

In [22]:
# Show the retrieved context that will be given to the generator model.
print(info)

2Figure 1: The Transformer - model architecture  3 1 Encoder and Decoder Stacks Encoder: The encoder is composed of a stack of N= 6 identical layers  Each layer has two sub-layers  The ﬁrst is a multi-head self-attention mechanism, and the second is a simple, positionwise fully connected feed-forward network. ⟨decoder⟩denotes the decoding procedure employed by the LMQL runtime when solving the query  The presented version of LMQL enables argmax ,sample and beam argmax and sample work as discussed in §2 1  beamhowever, denotes a novel procedure called scripted beam search which performs beam search jointly over all holes and control flow. The encoder contains self-attention layers  In a self-attention layer all of the keys, values and queries come from the same place, in this case, the output of the previous layer in the encoder  Each position in the encoder can attend to all positions in the previous layer of the encoder  Similarly, self-attention layers in the decoder allow each pos

In [23]:
model_name = 'databricks/dolly-v2-3b'

# NOTE(review): trust_remote_code=True allows Python code shipped with the hub
# repository to run locally — confirm it is actually required for this checkpoint.
# .to(device) returns the module itself, so it is chained into the assignment
# and the cell ends without a displayed value.
gen_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    trust_remote_code=True,
).to(device)
gen_tokenizer = AutoTokenizer.from_pretrained(model_name)

In [24]:
# Tokenize a short sample prompt into PyTorch tensors.
# NOTE(review): `tokens` is not referenced by any later cell visible here — confirm whether this cell is still needed.
tokens = gen_tokenizer('Human: What is your name?\n Assistant: ', return_tensors='pt')

In [25]:
# Fixed preamble handed to the Agent. It has no placeholders, so a plain
# triple-quoted string is used; the text itself is unchanged.
prompt = '''
Below is an user query that describes a question. Write a response that appropriately answers the query using the 
information given in the input. The information is extracted from a research paper.


'''


# NOTE(review): presumably break_words are the strings at which generation
# stops — confirm against the Agent implementation.
agent = Agent(
    gen_model,
    gen_tokenizer,
    prompt,
    break_words=['### End'],
    device=device,
)

In [26]:
# Strip only the leading retrieval prefix. str.replace would also delete any
# later occurrence of "query: " that belongs to the question text itself.
_QUERY_PREFIX = 'query: '
instruction = query[len(_QUERY_PREFIX):] if query.startswith(_QUERY_PREFIX) else query

# Instruction block followed by the retrieved context as the input block.
gen_text = f'### Instruction:\n{instruction}\n\nInput:\n{info}'

In [27]:
# Show the full instruction + input text that will be sent to the agent.
print(gen_text)

### Instruction:
What is the difference between an encoder and a decoder?

Input:
2Figure 1: The Transformer - model architecture  3 1 Encoder and Decoder Stacks Encoder: The encoder is composed of a stack of N= 6 identical layers  Each layer has two sub-layers  The ﬁrst is a multi-head self-attention mechanism, and the second is a simple, positionwise fully connected feed-forward network. ⟨decoder⟩denotes the decoding procedure employed by the LMQL runtime when solving the query  The presented version of LMQL enables argmax ,sample and beam argmax and sample work as discussed in §2 1  beamhowever, denotes a novel procedure called scripted beam search which performs beam search jointly over all holes and control flow. The encoder contains self-attention layers  In a self-attention layer all of the keys, values and queries come from the same place, in this case, the output of the previous layer in the encoder  Each position in the encoder can attend to all positions in the previous lay

In [28]:
# Generate the answer; the returned string is the cell's displayed value.
# NOTE(review): a temperature is passed to a method named "greedy" — confirm
# how Agent.generate_response_greedy uses `temp`.
agent.generate_response_greedy(
    gen_text,
    verbose=True,
    temp=0.5,
    name='### Response:',
    max_length=1024,
)


The decoder contains self-attention layers which allow each position in the decoder to attend to all positions in the decoder up to and including that position.

### End

'\nThe decoder contains self-attention layers which allow each position in the decoder to attend to all positions in the decoder up to and including that position.\n\n'