In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules
# (e.g. the project's lib.* package) are reloaded before each cell executes.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import os.path as osp
import sys
# Parent of the notebook's working directory; put on sys.path so packages
# that live next to the notebook folder (presumably `lib`) can be imported.
ROOT_DIR = osp.dirname(os.getcwd())
# Guard the append: re-running this cell must not keep stacking duplicate
# entries onto sys.path.
if ROOT_DIR not in sys.path:
    sys.path.append(ROOT_DIR)

# Set up models, vectorstore and retriever

In [3]:
from langchain_chroma import Chroma
from langchain_community.embeddings import GPT4AllEmbeddings
from langchain_community.cross_encoders import HuggingFaceCrossEncoder
from langchain.retrievers.document_compressors import CrossEncoderReranker
from langchain.retrievers import ContextualCompressionRetriever
from langchain.chains.query_constructor.base import AttributeInfo
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.retrievers import EnsembleRetriever
from tqdm import tqdm
from lib.config import LLM_MODEL_NAME, EMBEDDING_MODEL_NAME, VECTOR_STORE_NAME,COMPRESSION_RETRIEVER_TOP_N,VECTOR_RETRIEVER_K,RERANKER_MODEL_NAME

In [4]:
# Data locations, relative to the notebook's working directory. Both values
# end with a trailing slash.
# Source-document directory (not referenced by any other visible cell).
DOCUMENT_PATH='../data/rel18/'
# Parent directory under which the persisted vector stores live.
VECTOR_STORE_PATH = '../data/vectorstore/'

In [5]:
# Embedding function handed to the vector store; the model name comes from
# lib.config and the 'gpu' device is requested.
embeddings = GPT4AllEmbeddings(model_name=EMBEDDING_MODEL_NAME, device='gpu')

# embeddings=HuggingFaceBgeEmbeddings(model_name=EMBEDDING_MODEL_NAME, model_kwargs={"device": "cuda"}, encode_kwargs={"normalize_embeddings": True})

In [6]:
# Open the persisted Chroma store. The directory is built with os.path.join
# rather than string concatenation, so the path stays valid even if
# VECTOR_STORE_PATH is later written without its trailing slash.
vectorstore = Chroma(
    persist_directory=osp.join(VECTOR_STORE_PATH, VECTOR_STORE_NAME),
    embedding_function=embeddings,
)

In [7]:
from transformers import AutoTokenizer,AutoModelForCausalLM
from peft import PeftModel
import transformers
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
from langchain_community.llms.llamafile import Llamafile
from langchain.retrievers.multi_query import MultiQueryRetriever, LineListOutputParser

In [9]:
# Set logging for the queries
import logging

# Raise the multi-query retriever's logger to INFO, then install the default
# root handler so those records are actually printed.
logger = logging.getLogger("langchain.retrievers.multi_query")
logger.setLevel(logging.INFO)
logging.basicConfig()

In [10]:
from typing import List

from langchain.chains import LLMChain
from langchain.output_parsers import PydanticOutputParser
from langchain_core.prompts import PromptTemplate
from pydantic import BaseModel, Field


# Output parser will split the LLM result into a list of queries
# class LineList(BaseModel):
#     # "lines" is the key (attribute name) of the parsed output
#     lines: List[str] = Field(description="Lines of text")


# class LineListOutputParser(PydanticOutputParser):
#     def __init__(self) -> None:
#         super().__init__(pydantic_object=LineList)

#     def parse(self, text: str) -> LineList:
#         logger.info(text)
#         lines = list(filter(lambda t: t.strip() != '',text.strip().split("\n")))
#         return LineList(lines=lines)


# output_parser = LineListOutputParser()

# QUERY_PROMPT = PromptTemplate(
#     input_variables=["question"],
#     template="""<|system|>
#     You are an AI language model assistant. Your task is to generate five different versions of the given user question to retrieve relevant documents from a vector database. By generating multiple perspectives on the user question, your goal is to help the user overcome some of the limitations of the distance-based similarity search. Provide these alternative questions separated by newlines.</s>
#     <|user|>
#     {question}</s>
#     <|assistant|>""")

# llm =  Llamafile()
# # Chain
# llm_chain = LLMChain(llm=llm, prompt=QUERY_PROMPT, output_parser=output_parser)

In [11]:
# VECTOR_RETRIEVER_K = 50
# COMPRESSION_RETRIEVER_TOP_N=5

In [12]:
# Similarity-search retriever over the vector store; it returns
# VECTOR_RETRIEVER_K documents per query and serves as the base retriever
# for the reranking stage.
vstore_retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": VECTOR_RETRIEVER_K})


In [13]:
# pipe = pipeline("text-generation", model="TinyLlama/TinyLlama-1.1B-Chat-v1.0", torch_dtype=torch.bfloat16, device_map="auto")
# tiny_token = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
# messages = [
#     {
#         "role": "system",
#         "content": "You are a friendly chatbot who always responds in the style of a pirate",
#     },
#     {"role": "user", "content": "How many helicopters can a human eat in one sitting?"},
# ]
# prompt = tiny_token.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
# print(prompt)

In [14]:

#self 
# metadata_field_info = [
#     AttributeInfo(
#         name="source",
#         description="The name of the document where the content was taken from",
#         type="string",
#     ),
#     AttributeInfo(
#         name="year",
#         description="The year the standard was released",
#         type="integer",
#     ),
#     AttributeInfo( #3GPP Release
#         name="tag",
#         description="The 3GPP release information",
#         type="string",
#     ),
# ]
# self_llm = CustomTransformersLLM(model_name='microsoft/phi-2', max_length=128) 
# document_content_description = 'contains technical details about telecommunications standards'
# self_retriever = SelfQueryRetriever.from_llm(
#     self_llm,
#     vectorstore,
#     document_content_description,
#     metadata_field_info,
#     structured_query_translator= ChromaTranslator()
# )


#compression


# Reranking stage: the vector-store retriever supplies the candidates and the
# cross-encoder keeps the COMPRESSION_RETRIEVER_TOP_N highest-ranked ones.
rerank_model = HuggingFaceCrossEncoder(
    model_name=RERANKER_MODEL_NAME,
    model_kwargs={"device": "cuda"},
)
compressor = CrossEncoderReranker(top_n=COMPRESSION_RETRIEVER_TOP_N, model=rerank_model)
compression_retriever = ContextualCompressionRetriever(
    base_retriever=vstore_retriever,
    base_compressor=compressor,
)


# multiquery
# llm = Llamafile()
# retriever_from_llm = MultiQueryRetriever.from_llm(llm=llm,
#     retriever=vstore_retriever
# )
# retriever_from_llm = MultiQueryRetriever(
#     retriever=compression_retriever, llm_chain=llm_chain, include_original=True
# )



In [15]:
# Smoke test: run one sample question through the reranking retriever.
docs = compression_retriever.invoke("What does the NEF notify to the AF after determining the suitable DNAI(s)? [3GPP Release 18]")

In [16]:
# Display the documents returned for the sample question.
docs

[Document(page_content='4a.\tIf late notification via NEF is requested by the AF, the SMF notifies the NEF of the target DNAI of the PDU Session or indication of EAS rediscovery and may indicate capability of supporting EAS IP replacement in 5GC by invoking Nsmf_EventExposure_Notify service operation. The SMF may provide the target AF ID if it determines that the target DNAI is not supported by the source AF.', metadata={'source': '23502-i40.docx'}),
 Document(page_content='When NEF gets the AF request for determine DNAI information with the AF/EAS IP address or AF/EAS IP address range, optionally with the target geographic area, DNN, S-NSSAI. The NEF can map the geographic area into TA list. The NEF may find the target SMF(s) through NRF, with using the TA list, DNN, S-NSSAI. Then, the NEF requests the SMF with the AF/EAS IP address range, optionally TA list, DNN, S-NSSAI, in order to find the suitable DNAI(s).', metadata={'source': '23700-48-i00.docx'}),
 Document(page_content='1.\tA

In [17]:
# Show the retrieval settings in effect: documents fetched by the vector
# search, then documents kept by the reranker.
print(VECTOR_RETRIEVER_K, COMPRESSION_RETRIEVER_TOP_N, sep='\n')

50
5


# Inference

In [18]:
# Load the tokenizer named by LLM_MODEL_NAME and the causal-LM weights from a
# local checkpoint directory, then wrap both in a text-generation pipeline.
answer_model_name = LLM_MODEL_NAME
tokenizer = AutoTokenizer.from_pretrained(answer_model_name)
# Reuse the EOS token as the padding token.
tokenizer.pad_token = tokenizer.eos_token 
# Alternative kept for reference: base model plus a PEFT adapter.
# base_model = AutoModelForCausalLM.from_pretrained(answer_model_name,device_map="auto",)
# answer_model = PeftModel.from_pretrained(base_model, '../bin/pretrained_256_64/', device_map="auto")
# pretrained_512_32
# NOTE(review): the tokenizer is loaded from LLM_MODEL_NAME while the weights
# come from a local directory — confirm both share the same vocabulary.
answer_model = AutoModelForCausalLM.from_pretrained('../bin/pretrained_512_32/',device_map="auto",)
# NOTE(review): torch_dtype and device_map are passed to the pipeline although
# the model is already instantiated above — confirm they take effect here; if
# bfloat16 weights are intended they may need to be given to from_pretrained.
answer_generator = transformers.pipeline(
    "text-generation",
    model=answer_model,
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16,
    device_map="auto"
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.06s/it]


In [19]:
from lib.prompt import get_inference_prompt
import json
import pandas as pd

In [20]:
def _extract_answer_id(generated_text):
    """Pull the chosen option number out of one generated answer.

    The prompt asks for ``option Number: Answer``, but the model does not
    always emit the colon (a recorded sample reads ``option 1) ...``), so the
    number that follows the word "option" is preferred. When that pattern is
    absent, fall back to the legacy rule: the last character before the first
    ``:`` in the text.
    """
    import re  # local import: the notebook's import cells do not provide `re`

    match = re.search(r'option\s*(\d+)', generated_text, flags=re.IGNORECASE)
    if match:
        return match.group(1)
    return generated_text.split(':')[0][-1:].strip()


def answer_questions(qst_filename, answer_model, retriever, max_new_tokens=4,return_full_text=False, batch_size = 128):
    """Answer every question in a JSON question file with retrieval-augmented prompts.

    Args:
        qst_filename: Path to a JSON file mapping keys of the form
            ``"<word> <id>"`` to question records that contain a
            ``'question'`` entry.
        answer_model: Callable invoked as
            ``answer_model(prompts, max_new_tokens=..., return_full_text=...)``
            that returns, per prompt, a list whose first item has a
            ``'generated_text'`` entry.
        retriever: Object whose ``invoke(text)`` returns documents exposing
            ``page_content``.
        max_new_tokens: Forwarded to ``answer_model``.
        return_full_text: Forwarded to ``answer_model``.
        batch_size: Number of prompts handed to ``answer_model`` per call.

    Returns:
        A list of ``(question_id, answer_id)`` string tuples in file order;
        an empty list when the file holds no questions.
    """
    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(qst_filename, encoding='utf-8') as file:
        questions = json.load(file)

    def get_question_prompt(qstn_id, qstn_data):
        """Build the (question id, prompt) pair for a single question."""
        qstn_id = qstn_id.split(' ')[1].strip()
        # Search the datastore for context relevant to the question text.
        docs = retriever.invoke(qstn_data['question'])
        context = ' '.join(doc.page_content for doc in docs).replace('\n', '. ')
        return qstn_id, get_inference_prompt(qstn_data, context)['prompt']

    prompts = [
        get_question_prompt(qstn_id, qstn_data)
        for qstn_id, qstn_data in tqdm(list(questions.items()))
    ]
    # Show one sample prompt for a visual sanity check; an empty question
    # file must not crash here.
    if prompts:
        print(prompts[0])

    solutions = []
    for start in tqdm(range(0, len(prompts), batch_size)):
        batch = prompts[start:start + batch_size]
        responses = answer_model(
            [prompt for _, prompt in batch],
            max_new_tokens=max_new_tokens,
            return_full_text=return_full_text,
        )
        solutions.extend(
            (qstn_id, _extract_answer_id(response[0]['generated_text']))
            for (qstn_id, _), response in zip(batch, responses)
        )

    return solutions

In [21]:
def save_solution(filename,solution, task=''):
    """Write (question id, answer id) pairs to ``filename`` as CSV.

    Every row additionally carries ``task`` in a ``Task`` column; the frame
    index is not written.
    """
    table = pd.DataFrame(solution, columns=['Question_ID', 'Answer_ID']).assign(Task=task)
    table.to_csv(filename, index=False)

In [29]:
# Answer every question in the testing file using the reranking retriever.
# NOTE(review): the result is named `train_soln` although the input is the
# testing question set.
train_soln = answer_questions('../data/TeleQnA_testing1.txt',answer_generator,compression_retriever)

100%|██████████| 366/366 [02:48<00:00,  2.17it/s]


('8138', '### Instructions: \nBased on the provided context, select the correct answer from the choices given. Provide your answer in the following format: option Number: Answer.\n\nContext:\nIf a gNB initiates a channel occupancy using the channel access procedures described in clause 4.4.1 on a channel, the gNB may transmit a DL transmission(s) on the channel within the maximum Channel Occupancy Time described in Clause 4.4.1 on the channel after the DL transmission(s) initiating the channel occupancy. The followings are applicable to the DL transmission(s): -\tIf the gap is  or , the gNB can transmit the transmission on the channel after performing Type 2A or Type 2B DL channel access procedures as described in clause 4.1.2.1 and 4.1.2.2, respectively.. . For the case where a gNB shares a channel occupancy initiated by a UE with configured grant PUSCH transmission, the gNB may transmit a transmission that follows the configured grant PUSCH transmission by the UE as follows: A channe

100%|██████████| 3/3 [08:18<00:00, 166.19s/it]


In [31]:
# Persist the predictions as CSV, labelling every row with the task 'Phi-2'.
save_solution('testing_result.csv',train_soln,'Phi-2')

# Evaluation

In [25]:
import pandas as pd

In [26]:
# Load the predictions and the reference answers for the training questions.
# NOTE(review): no visible cell writes 'training_result.csv' (the inference
# section saves 'testing_result.csv') — confirm the file exists from an
# earlier run.
pred = pd.read_csv('training_result.csv')
act = pd.read_csv('../data/Q_A_ID_training.csv')

In [27]:
# Cast the predicted answer ids to int before comparing them with the
# reference (presumably integer) ids. The cast fails if any prediction is
# not a number.
pred['Answer_ID']=pred['Answer_ID'].astype(int)

In [28]:
# Accuracy: fraction of rows whose predicted answer equals the reference.
# NOTE(review): rows are paired by index, not by Question_ID — this assumes
# both files list the same questions in the same order; confirm.
(pred['Answer_ID'] == act['Answer_ID']).mean()

0.7761806981519507

|algo|score|
|--|--|
|similarity|0.6235455167693361|
|mmr|xxx|

In [45]:
# Show which vector store name is configured in lib.config.
VECTOR_STORE_NAME

'chromadb_512_32'

In [None]:
# First few predictions that differ from the reference answers (rows are
# paired by index, not by Question_ID).
pred[pred['Answer_ID'] != act['Answer_ID']].head()

Unnamed: 0,Question_ID,Answer_ID,Task
6,23,3,Phi-2
14,53,1,Phi-2
15,60,1,Phi-2
16,70,1,Phi-2
20,80,1,Phi-2


In [None]:
# context from datastore
# question = q
# docs = compression_retriever.invoke(question['question'])
# context =  (' '.join(list(map(lambda d:d.page_content,docs)))).replace('\n', '. ')
# prompt = get_inference_prompt(question, context)
# refined_prompt = prompt['question']
# answer = prompt['answer']
# print(refined_prompt)
# gen_result = answer_generator(refined_prompt,max_new_tokens=128,return_full_text=False,)
# "option 1) Avoid monitoring neighbo"
# print(answer)
# print(gen_result[0]['generated_text'])