# Load data

In [None]:
import glob
import os

In [None]:
!pip install langchain

In [None]:
!pip install langchain-community

In [None]:
# Load every *.txt file found (recursively) under the session-77 / 2022 folder,
# one document per file, using TextLoader for each match.
# The loaders are imported from langchain_community; the legacy
# `langchain.document_loaders` import path is deprecated.
from langchain_community.document_loaders import DirectoryLoader, TextLoader

loader = DirectoryLoader(
    "un/TXT/Session 77 - 2022/",
    glob="**/*.txt",
    loader_cls=TextLoader,
)
data = loader.load()

In [None]:
# Peek at the first five loaded documents.
data[:5]

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Break the loaded documents into overlapping chunks
# (chunk_size=500, chunk_overlap=50) for embedding and retrieval.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=50,
    chunk_size=500,
)
all_splits = text_splitter.split_documents(data)

In [None]:
# Peek at the first ten chunks produced by the splitter.
all_splits[:10]

In [None]:
# Embed every chunk with the "all-MiniLM-L6-v2" model, index the vectors in a
# scikit-learn-backed vector store, and expose the store as a retriever.
#
# The imports come from langchain_community; the equivalent
# `langchain.embeddings` / `langchain.vectorstores` paths are deprecated.
# NOTE: the dedicated `langchain_huggingface` package also provides
# HuggingFaceEmbeddings, but it has to be installed separately
# (`langchain-huggingface`) before it can be imported.
from langchain_community.embeddings import HuggingFaceEmbeddings, SentenceTransformerEmbeddings
from langchain_community.vectorstores.sklearn import SKLearnVectorStore

# k=100 deliberately over-fetches candidates for each query.
retriever = SKLearnVectorStore.from_documents(
    all_splits,
    HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2"),
).as_retriever(search_kwargs={"k": 100})

In [None]:
# Sample statement used as the retrieval query.
question = "The climate crisis is worse for poorer countries"
# Fetch the chunks the retriever considers closest to the query.
docs = retriever.invoke(question)
# Show how many chunks came back.
len(docs)

In [None]:
# Peek at the first ten retrieved chunks.
docs[:10]

In [None]:
import pandas as pd

# Show full cell contents instead of truncating long text; None is the value
# pandas documents for an unlimited column width.
pd.set_option('display.max_colwidth', None)

# One row per retrieved chunk: the file it came from and the chunk text.
df = pd.DataFrame(
    [{"source": doc.metadata["source"], "text": doc.page_content} for doc in docs]
)
df

In [None]:
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import CrossEncoderReranker
from langchain_community.cross_encoders import HuggingFaceCrossEncoder

# Second retrieval stage: a cross-encoder reranks the candidates returned by
# the base retriever and only the top 20 are kept.
cross_encoder = HuggingFaceCrossEncoder(
    model_name="mixedbread-ai/mxbai-rerank-large-v1",
)
compressor = CrossEncoderReranker(top_n=20, model=cross_encoder)
compression_retriever = ContextualCompressionRetriever(
    base_retriever=retriever,
    base_compressor=compressor,
)

In [None]:
# Run the same query through the reranking (compression) retriever.
compressed_docs = compression_retriever.invoke(question)

In [None]:
# Tabulate the reranked chunks: source file and chunk text, one row per chunk.
pd.DataFrame(
    [
        {"source": doc.metadata["source"], "text": doc.page_content}
        for doc in compressed_docs
    ]
)