In [None]:
## Load and preprocess CSV data
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain_core.documents import Document

# 1. Load the CSV: the `chunk_text` column is passed as the content column,
#    while `topic`, `chunk_id` and `pdf_name` are passed as metadata columns.
loader = CSVLoader(
    file_path='arxiv_data/all_chunks.csv',
    content_columns=["chunk_text"],
    metadata_columns=["topic", "chunk_id", "pdf_name"]
)
data = loader.load()

# 2. Define a function to transform the documents
def remove_key_from_content(documents, key):
    """Strip the leading ``"<key>: "`` label from each document's content.

    Intended for loader output whose ``page_content`` begins with the column
    name followed by ``": "`` (for example ``"chunk_text: ..."``). Only that
    leading label is removed; identical text that happens to occur later in
    the content is left untouched. Content that does not start with the label
    is passed through unchanged.

    Args:
        documents: Iterable of objects exposing ``page_content`` (str) and
            ``metadata`` attributes.
        key: Column name whose ``"<key>: "`` label should be dropped.

    Returns:
        A new list of ``Document`` objects in input order. The input
        documents are not modified; each new document is built with the
        metadata object of its source document.
    """
    prefix = f'{key}: '
    new_documents = []
    for doc in documents:
        content = doc.page_content
        # Slice the label off only when it really is a prefix: a blanket
        # str.replace would also delete matching text inside the body.
        if content.startswith(prefix):
            content = content[len(prefix):]
        new_documents.append(
            Document(page_content=content, metadata=doc.metadata)
        )
    return new_documents

# 3. Drop the "chunk_text: " label from the content of the loaded documents
cleaned_data = remove_key_from_content(documents=data, key='chunk_text')


In [None]:
# Summarise how many chunks carry each topic label instead of printing one
# line per chunk, which floods the cell output on a large corpus.
from collections import Counter

topic_counts = Counter(doc.metadata.get("topic") for doc in cleaned_data)
for topic, count in topic_counts.most_common():
    print(f"{topic}: {count}")

In [None]:
import pickle
from langchain_community.retrievers import BM25Retriever
from langchain_core.documents import Document

# Topic whose chunks are indexed; change it here to index a different topic.
TARGET_TOPIC = "stat.ML"

# Keep only the chunks whose "topic" metadata matches the target topic.
filtered_docs = [doc for doc in cleaned_data if doc.metadata.get("topic") == TARGET_TOPIC]
if not filtered_docs:
    # Fail early with an actionable message; an empty corpus would otherwise
    # fail inside the BM25 backend (presumably a ZeroDivisionError - confirm).
    raise ValueError(
        f"No documents with topic {TARGET_TOPIC!r} found; cannot build a BM25 index."
    )
documents = filtered_docs  # alias kept so code referring to `documents` still works

# Initialize the BM25Retriever
bm25_retriever = BM25Retriever.from_documents(documents)

# Define the file path to save the retriever
file_path = "bm25_retriever.pkl"

# Save the BM25Retriever using pickle.
# NOTE: unpickling can execute arbitrary code, so only load this file back
# from a location you trust.
with open(file_path, "wb") as f:
    pickle.dump(bm25_retriever, f)

print(f"BM25Retriever saved to {file_path}")