In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import os.path as osp
import sys
# Parent of the current working directory. Presumably the repository root that
# holds the local `lib` package imported further down — confirm if the notebook
# is ever launched from a different folder.
ROOT_DIR = osp.dirname(os.getcwd())
# Guarded so that re-running this cell does not pile up duplicate entries.
if ROOT_DIR not in sys.path:
    sys.path.append(ROOT_DIR)

# Set up models, vectorstore and retriever

In [3]:
from langchain_chroma import Chroma
from langchain_community.embeddings import GPT4AllEmbeddings
from langchain_community.cross_encoders import HuggingFaceCrossEncoder
from langchain.retrievers.document_compressors import CrossEncoderReranker
from langchain.retrievers import ContextualCompressionRetriever
from tqdm import tqdm

In [4]:
# Folder the indexed source documents come from (the `source` metadata of the
# stored chunks points into it). Not referenced by any other cell shown here.
DOCUMENT_PATH = '../data/rel18/'
# Parent folder of the persisted vector store; the Chroma database lives in a
# "chromadb" sub-path appended to this prefix, so the trailing slash matters.
VECTOR_STORE_PATH = '../data/vectorstore/'

In [5]:
# Embedding model file used to embed queries against the vector store.
model_name = "all-MiniLM-L6-v2.gguf2.f16.gguf"
# Extra keyword arguments handed to GPT4AllEmbeddings as `gpt4all_kwargs`.
# `allow_download` is a boolean flag: the string 'True' only worked because any
# non-empty string is truthy (the string 'False' would have enabled downloads
# as well), so pass a real bool.
encode_kwargs = {'allow_download': True}
embeddings = GPT4AllEmbeddings(
    model_name=model_name,
    device='cpu',
    gpt4all_kwargs=encode_kwargs
)

In [6]:
# Chroma store persisted under "<VECTOR_STORE_PATH>chromadb"; queries are
# embedded with the `embeddings` model configured above.
vectorstore = Chroma(
    embedding_function=embeddings,
    persist_directory=f"{VECTOR_STORE_PATH}chromadb",
)

In [7]:
# Peek at one stored record to confirm the persisted collection is populated.
# Uses the public Chroma.get() wrapper instead of the private `_collection`
# attribute, which is an implementation detail of the wrapper class.
vectorstore.get(limit=1)

{'ids': ['00000a5c-dc75-47d5-83c5-4763f5cf0ef1'],
 'embeddings': None,
 'metadatas': [{'source': '../data/rel18/28550-i30.docx'}],
 'documents': ['The Performance Data Stream Units are described using ASN.1 as specified in ITU-T Rec. X.680 [15] and X.681 [16]. Transfer syntax for Performance Data Stream Units is derived from their ASN.1 definitions by use of Packed Encoding Rules (PER), aligned as'],
 'uris': None,
 'data': None}

In [8]:
from transformers import AutoTokenizer,AutoModelForCausalLM
from peft import PeftModel
import transformers
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
# Identifier of the base causal language model; passed to the tokenizer and
# model `from_pretrained` calls in the next cell.
answer_model = 'microsoft/phi-2'

In [10]:
# Tokenizer for the base checkpoint named by `answer_model`.
tokenizer = AutoTokenizer.from_pretrained(answer_model)
# Reuse the end-of-sequence token for padding (presumably because the
# checkpoint ships without a dedicated pad token — confirm).
tokenizer.pad_token = tokenizer.eos_token
# The loaded weights get their own name so `answer_model` stays the checkpoint
# id: rebinding it to the model object made this cell fail when re-run, because
# from_pretrained() would then receive a model instead of an id.
base_model = AutoModelForCausalLM.from_pretrained(answer_model, device_map="auto")
# Attach the adapter stored under ../bin/pretrained/ on top of the base model.
peft_model = PeftModel.from_pretrained(base_model, '../bin/pretrained/', device_map="auto")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Loading checkpoint shards: 100%|██████████| 2/2 [00:03<00:00,  1.68s/it]


In [11]:
# Text-generation pipeline around the PEFT-wrapped model and its tokenizer.
# transformers logs that 'PeftModelForCausalLM' is not in its list of supported
# text-generation models (see the cell output); the inference run further down
# nevertheless completed with this pipeline.
answer_generator = transformers.pipeline(
    task="text-generation",
    tokenizer=tokenizer,
    model=peft_model,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)

The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'DbrxForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'JambaForCausalLM', 'JetMoeForCausalLM', 'LlamaForCausalLM', 'MambaForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCausalLM', 'MegatronBertForCausalLM', 'MistralForCausalLM', 'MixtralForCausalLM', 'MptForCausalLM', 'MusicgenForCausalL

In [12]:
# Cross-encoder used by the reranking compressor built in the next cell.
rerank_model = HuggingFaceCrossEncoder(model_name="BAAI/bge-reranker-base")



In [13]:
# Two-stage retrieval:
#   1. similarity search over the vector store fetches k=50 candidate chunks;
#   2. the cross-encoder reranker keeps top_n=10 of those candidates.
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs=dict(k=50))
compressor = CrossEncoderReranker(top_n=10, model=rerank_model)
compression_retriever = ContextualCompressionRetriever(
    base_retriever=retriever,
    base_compressor=compressor,
)

# Inference

In [14]:
from lib.prompt import get_inference_prompt
import json
import pandas as pd

In [45]:
import re

# "<digits>)" is the option marker expected in the generated text, e.g. "option 3)".
_OPTION_MARKER_RE = re.compile(r'(\d+)\s*\)')
# Fallback when the closing parenthesis is missing (e.g. truncated output).
_FIRST_NUMBER_RE = re.compile(r'\d+')


def _parse_question_id(question_key):
    """Return the second whitespace-separated token of a key such as "question 42"; keys with a single token are returned stripped."""
    parts = question_key.split()
    return parts[1] if len(parts) > 1 else question_key.strip()


def _extract_answer_id(generated_text):
    """Return the option number found in generated text as a string, or '' when the text contains no digit."""
    marker = _OPTION_MARKER_RE.search(generated_text)
    if marker:
        return marker.group(1)
    number = _FIRST_NUMBER_RE.search(generated_text)
    return number.group(0) if number else ''


def answer_questions(qst_filename, answer_model, retriever, max_new_tokens=4, return_full_text=False, batch_size=128):
    """Answer every question in a JSON file with retrieval-augmented generation.

    Args:
        qst_filename: Path to a JSON file mapping question keys (e.g.
            "question 12") to question records; each record must have a
            'question' entry and is passed whole to `get_inference_prompt`.
        answer_model: Callable invoked as
            `answer_model(prompts, max_new_tokens=..., return_full_text=...)`
            that returns, per prompt, a list whose first item is a dict with a
            'generated_text' entry (the shape produced by the text-generation
            pipeline built in this notebook).
        retriever: Object whose `invoke(text)` returns documents exposing
            `page_content`; their text becomes the prompt context.
        max_new_tokens: Forwarded to `answer_model`.
        return_full_text: Forwarded to `answer_model`.
        batch_size: Number of prompts handed to `answer_model` per call. It
            only slices the prompt list; it is not forwarded to
            `answer_model` as a batching option.

    Returns:
        List of `(question_id, answer_id)` string tuples in file order.
        `answer_id` is '' when no option number can be found in the output.

    Raises:
        ValueError: If `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        # A negative step previously produced an empty range and silently
        # returned no answers at all.
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(qst_filename, encoding='utf-8') as file:
        questions = json.load(file)

    # Phase 1: retrieve context and build one prompt per question.
    qstn_ids, prompts = [], []
    for qstn_key, qstn_data in tqdm(list(questions.items())):
        docs = retriever.invoke(qstn_data['question'])
        # Flatten the retrieved chunks into one line-break-free context string.
        context = ' '.join(doc.page_content for doc in docs).replace('\n', '. ')
        prompts.append(get_inference_prompt(qstn_data, context)['question'])
        qstn_ids.append(_parse_question_id(qstn_key))

    # Phase 2: generate answers chunk by chunk and pull out the option number.
    solutions = []
    for start in tqdm(range(0, len(prompts), batch_size)):
        chunk = slice(start, start + batch_size)
        responses = answer_model(prompts[chunk], max_new_tokens=max_new_tokens, return_full_text=return_full_text)
        answer_ids = [_extract_answer_id(response[0]['generated_text']) for response in responses]
        solutions.extend(zip(qstn_ids[chunk], answer_ids))

    return solutions

In [46]:
def save_solution(filename, solution, task=''):
    """Write (question id, answer id) pairs to a CSV file.

    Args:
        filename: Destination path handed to `DataFrame.to_csv`.
        solution: Iterable of two-item records, e.g. the list returned by
            `answer_questions`.
        task: Value stored in the `Task` column of every row.

    The file has the columns Question_ID, Answer_ID and Task, and no index
    column.
    """
    column_names = ['Question_ID', 'Answer_ID']
    table = pd.DataFrame(solution, columns=column_names)
    # Scalar assignment: the same task label is repeated on every row.
    table['Task'] = task
    table.to_csv(filename, index=False)

In [47]:
# Run retrieval + generation over the question file. Per the recorded output,
# 366 questions took about 2 minutes of prompt building and about 6 minutes of
# generation.
# NOTE(review): despite the name `train_soln`, the input here is the *testing*
# question file.
train_soln = answer_questions('../data/TeleQnA_testing1.txt',answer_generator,compression_retriever)

100%|██████████| 366/366 [02:08<00:00,  2.84it/s]
100%|██████████| 3/3 [05:45<00:00, 115.12s/it]


In [48]:
# Persist the predictions to testing_result.csv in the notebook's working
# directory, labelling every row with the task name 'Phi-2'.
save_solution('testing_result.csv',train_soln,'Phi-2')

In [None]:
# Reload the file written by the previous cell and display its Answer_ID column.
pd.read_csv('testing_result.csv')['Answer_ID']

In [40]:
# Predictions and reference answers used for the accuracy check below.
# NOTE(review): this reads './result.csv', not the 'testing_result.csv' written
# earlier in this notebook, and this cell's execution count (40) predates the
# inference run (47) — confirm which run produced the file before trusting the
# score.
pred = pd.read_csv('./result.csv')
act = pd.read_csv('../data/Q_A_ID_training.csv')

In [42]:
# Cast predicted ids to int, presumably so they compare like-for-like with the
# reference column — confirm the dtype of act['Answer_ID'].
# The cast raises if any prediction is missing or non-numeric.
pred['Answer_ID']=pred['Answer_ID'].astype(int)

In [44]:
# Accuracy: fraction of rows whose predicted Answer_ID equals the reference.
# NOTE(review): rows are matched by index, not joined on a question id — this
# assumes both files list the same questions in the same order (TODO confirm).
(pred['Answer_ID'] == act['Answer_ID']).mean()

0.6344969199178645

In [None]:
#context from datastore
# docs = compression_retriever.invoke(question['question'])
# context =  (' '.join(list(map(lambda d:d.page_content,docs)))).replace('\n', '. ')
# question
# prompt = get_inference_prompt(question, context)
# refined_prompt = prompt['question']
# answer = prompt['answer']
# print(refined_prompt)
# gen_result = answer_generator(refined_prompt,max_new_tokens=128,return_full_text=False,)
# "option 1) Avoid monitoring neighbo"
# print(answer)
# print(gen_result[0]['generated_text'])