In [1]:
import pyarrow.parquet as pq
from langchain_community.document_loaders.csv_loader import CSVLoader

parquet_file = pq.ParquetFile('rawdata/test-00000-of-00001.parquet')
alldb = parquet_file.read().to_pandas()
labeltable = alldb.loc[alldb['label']==1]
unique_qid = labeltable['question_id'].unique()
loader = CSVLoader(file_path='text.csv',source_column="answer",csv_args={
    # 'delimiter': ',',
    # 'fieldnames': ['question_id', 'question', 'document_title', 'answer','label']
})
docdata = loader.load()

In [2]:
from langchain_community.embeddings import OllamaEmbeddings, HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_text_splitters import RecursiveCharacterTextSplitter

embeddings = HuggingFaceEmbeddings()
vector = FAISS.from_documents(docdata, embeddings)

In [3]:
def retrieve_doc(k, query):
    ret = vector.as_retriever(search_kwargs={"k": k}).get_relevant_documents(query)
    return ret

def get_precision(gtlist, prelist):
    count = 0
    for item in prelist:
        if(item in gtlist):
            count += 1
    return count / len(prelist)

def get_recall(gtlist, prelist):
    count = 0
    for item in gtlist:
        if(item in prelist):
            count += 1
    return count / len(gtlist)

def get_AP(m,gtlist,query):
    all = 0
    for k in range(1,m+1):
        ret_docs = retrieve_doc(k,query)
        prelist = [item.metadata['row'] for item in ret_docs]
        precision = get_precision(gtlist, prelist)
        all+=precision
    return all/m  

def fetch_AP(gtlist,query):
    N = 500
    ret_docs = retrieve_doc(N,query)
    prelist = [item.metadata['row'] for item in ret_docs]
    rem = len(gtlist)
    idx = 0
    plist = []
    while(rem > 0 and idx<N):
        prerow = prelist[idx]
        if(prerow in gtlist):
            plist.append((len(plist)+1)/(idx+1))
            rem -= 1
        else:
            idx += 1
    if(rem > 0):
        print("error")
    return sum(plist)/len(plist)

def get_mAP(labeltable,unique_qid):
    n = len(unique_qid)
    allAP = 0
    for i in range(n):
        qid = unique_qid[i]
        labelpd = labeltable[labeltable['question_id'] == qid]
        gtlist = labelpd.index
        query = labelpd.iloc[0]['question']
        inputm = len(gtlist)
        AP = fetch_AP(gtlist,query)
        allAP += AP
    return allAP/n

def get_RatK(labeltable, unique_qid, m):
    n = len(unique_qid)
    allR = 0
    for i in range(n):
        qid = unique_qid[i]
        labelpd = labeltable[labeltable['question_id'] == qid]
        gtlist = labelpd.index
        query = labelpd.iloc[0]['question']
        ret_docs = retrieve_doc(m,query)
        prelist = [item.metadata['row'] for item in ret_docs]
        recall = get_recall(gtlist, prelist)
        allR += recall 
    return allR / n


def get_PatK(labeltable,unique_qid,m):
    n = len(unique_qid)
    allP = 0
    for i in range(n):
        qid = unique_qid[i]
        labelpd = labeltable[labeltable['question_id'] == qid]
        gtlist = labelpd.index
        query = labelpd.iloc[0]['question']
        ret_docs = retrieve_doc(m,query)
        prelist = [item.metadata['row'] for item in ret_docs]
        precision = get_precision(gtlist, prelist)
        allP += precision
    return allP/n
        

In [4]:
passages = retrieve_doc(5,"Who is the president of United States")

  warn_deprecated(


In [11]:
passages[0]

Document(page_content='answer: The President of the United States of America (POTUS) is the head of state and head of government of the United States .', metadata={'source': 'The President of the United States of America (POTUS) is the head of state and head of government of the United States .', 'row': 382})

In [10]:
alldb.iloc[passages[0].metadata['row']]

question_id                                                    Q216
question          what do presidents make after they leave the w...
document_title                       President of the United States
answer            The President of the United States of America ...
label                                                             0
Name: 382, dtype: object

In [33]:
query = labeltable.iloc[2]['question']

In [34]:
query

'how old was sue lyon when she made lolita'

In [35]:
passages = retrieve_doc(1,query)

In [36]:
passage = " ".join([item.page_content.replace("answer:","") for item in passages])

In [37]:
passage

' The actress who played Lolita, Sue Lyon , was fourteen at the time of filming.'

In [31]:
def prompt(context,query):
    p = f'''
You are given a question, you should summarize answer based on give context
question: {query}
context: {context}
'''
    return p

In [32]:
print(prompt(passage,query))


You are given a question, you should summarize answer based on give context
question: how a water pump works
context:  Pumps operate by some mechanism (typically reciprocating or rotary ), and consume energy to perform mechanical work by moving the fluid.



In [43]:
get_RatK(labeltable,unique_qid,10)

0.9482167352537723

In [11]:
get_PatK(labeltable,unique_qid,)

0.11111111111111147

In [8]:
get_mAP(labeltable, unique_qid)

0.8653579760488987

In [49]:
qid = unique_qid[3]
labelpd = labeltable[labeltable['question_id'] == qid]
gtlist = labelpd.index
query = labelpd.iloc[0]['question']
k = 5
ret_docs = retrieve_doc(k,query)

In [44]:
prelist = [item.metadata['row'] for item in ret_docs]

In [45]:
get_precision(gtlist, prelist)

0.4