In [192]:
import os
from dotenv import load_dotenv
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.retrievers import BM25Retriever
from langchain_chroma import Chroma
import numpy as np

from src.data_io.read_data import read_txt_data
from src.chunk.chunk import chunk_data
from rank_bm25 import BM25Okapi
load_dotenv()
from utils import load_config_from_yaml
from configs.rag_config import RAGConfig

from collections import defaultdict
import uuid

In [22]:
query = 'Was Abraham Lincoln the sixteenth President of the United States?'

In [193]:
config_path = './test_configs/test.yaml'
config = load_config_from_yaml(config_path, RAGConfig)

In [4]:
vectorestore_path = os.getenv('CHROMA_DB_PATH')

In [194]:
df = read_txt_data()
df_list = df.to_dict(orient="records")
chunked_data = chunk_data(df, config=config, page_content_column='file_content')

150 data has been chunked into 1299 pieces.


In [14]:
df_list[:2]

[{'file_name': 'S08_set1_a1.txt.clean',
  'file_content': 'kangaroo\n\n\nA kangaroo is a marsupial from the family Macropodidae (macropods, meaning \'large foot\'). In common use the term is used to describe the largest species from this family, the Red Kangaroo, the Antilopine Kangaroo, and the Eastern and Western Grey Kangaroo of the Macropus genus. The family also includes many smaller species which include the wallabies, tree-kangaroos, wallaroos, pademelons and the Quokka, some 63 living species in all.  Kangaroos are endemic to the continent of Australia, while the smaller macropods are found in Australia and New Guinea.\n\nIn general, larger kangaroos have adapted much better to changes wrought to the Australian landscape by humans and though many of their smaller cousins are endangered, they are plentiful. They are not farmed to any extent, but wild kangaroos are shot for meat, over which there is controversy.  Steve Dow: "An industry that\'s under the gun". Sydney Morning Hera

In [7]:
embedding_model = HuggingFaceEmbeddings(model_name='sentence-transformers/distiluse-base-multilingual-cased-v1')

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
vector_store = Chroma(
    persist_directory=vectorestore_path, 
    collection_name="test_1", 
    embedding_function=embedding_model
)

In [18]:
df_list[0]

{'file_name': 'S08_set1_a1.txt.clean',
 'file_content': 'kangaroo\n\n\nA kangaroo is a marsupial from the family Macropodidae (macropods, meaning \'large foot\'). In common use the term is used to describe the largest species from this family, the Red Kangaroo, the Antilopine Kangaroo, and the Eastern and Western Grey Kangaroo of the Macropus genus. The family also includes many smaller species which include the wallabies, tree-kangaroos, wallaroos, pademelons and the Quokka, some 63 living species in all.  Kangaroos are endemic to the continent of Australia, while the smaller macropods are found in Australia and New Guinea.\n\nIn general, larger kangaroos have adapted much better to changes wrought to the Australian landscape by humans and though many of their smaller cousins are endangered, they are plentiful. They are not farmed to any extent, but wild kangaroos are shot for meat, over which there is controversy.  Steve Dow: "An industry that\'s under the gun". Sydney Morning Herald

In [None]:
for item in df_list:
    print()

In [195]:
chunked_data

[Document(metadata={'file_name': 'S08_set1_a1.txt.clean'}, page_content='kangaroo\n\n\nA kangaroo is a marsupial from the family Macropodidae (macropods, meaning \'large foot\'). In common use the term is used to describe the largest species from this family, the Red Kangaroo, the Antilopine Kangaroo, and the Eastern and Western Grey Kangaroo of the Macropus genus. The family also includes many smaller species which include the wallabies, tree-kangaroos, wallaroos, pademelons and the Quokka, some 63 living species in all.  Kangaroos are endemic to the continent of Australia, while the smaller macropods are found in Australia and New Guinea.\n\nIn general, larger kangaroos have adapted much better to changes wrought to the Australian landscape by humans and though many of their smaller cousins are endangered, they are plentiful. They are not farmed to any extent, but wild kangaroos are shot for meat, over which there is controversy.  Steve Dow: "An industry that\'s under the gun". Sydne

In [196]:
tokenized_documents = [doc.page_content.split() for doc in chunked_data]

# results = bm25_retriever.get_relevant_documents(query=query)

In [198]:
bm25 = BM25Okapi(tokenized_documents)

In [199]:
tokenized_query = query.split()

In [200]:
scores = bm25.get_scores(tokenized_query)

In [201]:
top_k_indices = np.argsort(scores)[::-1][:5]
# top_k_docs = [documents[i] for i in top_k_indices]

In [202]:
np.argsort(scores)[::-1][:5]

array([220, 234, 236, 231, 219], dtype=int64)

In [204]:
top_k_docs = [chunked_data[i] for i in top_k_indices]

In [205]:
top_k_docs

[Document(metadata={'file_name': 'S08_set3_a4.txt'}, page_content='Abraham Lincoln\n\n\n\nAbraham Lincoln (February 12, 1809 – April 15, 1865) was the sixteenth President of the United States, serving from March 4, 1861 until his assassination. As an outspoken opponent of the expansion of slavery in the United States, "[I]n his short autobiography written for the 1860 presidential campaign, Lincoln would describe his protest in the Illinois legislature as one that \'briefly defined his position on the slavery question, and so far as it goes, it was then the same that it is now." This was in reference to the anti-expansion sentiments he had then expressed. Doris Kearns Goodwin, Team of Rivals: The Political Genius of Abraham Lincoln (2005) p. 91.  Holzer pg. 232.  Writing of the Cooper Union  speech, Holzer notes, "Cooper Union proved a unique confluence of political culture, rhetorical opportunity, technological innovation, and human genius, and it brought Abraham Lincoln to the center

In [51]:
vectorstore = Chroma(
    persist_directory=vectorestore_path, 
    collection_name="test_1", 
    embedding_function=embedding_model
)

In [148]:
semantic_results = vectorstore.similarity_search_with_relevance_scores(query, k=10)

In [149]:
# semantic_results
[ {'file_name': item[0].metadata['file_name'], 'file_content': item[0].page_content} for item in semantic_results][:3]

[{'file_name': 'S08_set3_a4.txt',
  'file_content': 'Abraham Lincoln\n\n\n\nAbraham Lincoln (February 12, 1809 – April 15, 1865) was the sixteenth President of the United States, serving from March 4, 1861 until his assassination. As an outspoken opponent of the expansion of slavery in the United States, "[I]n his short autobiography written for the 1860 presidential campaign, Lincoln would describe his protest in the Illinois legislature as one that \'briefly defined his position on the slavery question, and so far as it goes, it was then the same that it is now." This was in reference to the anti-expansion sentiments he had then expressed. Doris Kearns Goodwin, Team of Rivals: The Political Genius of Abraham Lincoln (2005) p. 91.  Holzer pg. 232.  Writing of the Cooper Union  speech, Holzer notes, "Cooper Union proved a unique confluence of political culture, rhetorical opportunity, technological innovation, and human genius, and it brought Abraham Lincoln to the center stage of Amer

In [150]:
semantic_results[0][0].metadata['file_name']

'S08_set3_a4.txt'

In [151]:
semantic_results[0][1]

0.5979493856430054

In [152]:
doc_scores = defaultdict(list)
for chunk in semantic_results:
    doc_id = chunk[0].metadata['file_name']
    score = chunk[1]
    doc_scores[doc_id].append(score)

In [153]:
for doc_id, scores in doc_scores.items():
    print(scores)

[0.5979493856430054, 0.5872262716293335, 0.5836962461471558, 0.5693079233169556, 0.5191542506217957, 0.5147264003753662, 0.5116897225379944, 0.4999133348464966, 0.48484450578689575, 0.4749831557273865]


In [154]:
{doc_id: max(scores) for doc_id, scores in doc_scores.items()}

{'S08_set3_a4.txt': 0.5979493856430054}

In [95]:
aggregated_scores = {doc_id: max(scores) for doc_id, scores in doc_scores.items()}

In [96]:
sorted_doc_ids = sorted(aggregated_scores.items(), key=lambda x: x[1], reverse=True)

In [160]:
vector_result = [{'file_name': item[0].metadata['file_name'], 'file_content': item[0].page_content} for item in semantic_results]

In [161]:
combined_list = [top_k_docs, vector_result]

In [162]:
len(combined_list)

2

In [172]:
rrf_scores = {}
for result in combined_list:
    for rank, doc in enumerate(result):
        doc_id = doc['file_name']
        rrf_scores[doc_id] = rrf_scores.get(doc_id, 0) + 1/(60 + rank)
sorted_results = sorted(rrf_scores.items(), key=lambda item: item[1], reverse=True)

In [173]:
sorted_results

[('S08_set3_a4.txt', 0.172013963733953),
 ('S08_set3_a5.txt.clean', 0.01639344262295082),
 ('S08_set3_a3.txt.clean', 0.016129032258064516),
 ('S08_set3_a7.txt.clean', 0.015873015873015872),
 ('S08_set3_a8.txt.clean', 0.015625)]