https://www.youtube.com/watch?v=tcqEUSNCn8I

In [1]:
from langchain_community.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain.embeddings import OpenAIEmbeddings
#from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings.sentence_transformer import (
    SentenceTransformerEmbeddings,
)
#from dotenv import load_dotenv
import os
import shutil
from langchain_community.llms import Ollama

In [2]:
# Directory used as the Chroma persist directory (relative path).
CHROMA_PATH = "chroma"
# Directory scanned for the source PDFs that get loaded and indexed (relative path).
DATA_PATH = "task2/sources"

In [3]:

def load_documents():
    """Load the PDF files under ``DATA_PATH`` that match the ``*.pdf`` glob.

    Returns:
        Whatever ``DirectoryLoader.load()`` produces for those files.
    """
    return DirectoryLoader(DATA_PATH, glob="*.pdf").load()

In [4]:
# Read the source PDFs once; the cells below split and index this list.
documents = load_documents()

In [7]:
# Spot-check the metadata of one loaded document (index 4 is just a sample;
# this raises IndexError if fewer than five documents were loaded).
documents[4].metadata

{'source': 'task2/sources/2307.07889v3.LLM_Comparative_Assessment__Zero_shot_NLG_Evaluation_through_Pairwise_Comparisons_using_Large_Language_Models.pdf'}

In [8]:
def split_text(documents):
    """Split loaded documents into overlapping chunks and preview one of them.

    Each chunk is at most 300 characters (measured with ``len``) with a
    100-character overlap between neighbours, and ``add_start_index=True``
    asks the splitter to record a ``start_index`` in each chunk's metadata.

    Args:
        documents: The documents to split, as returned by ``load_documents()``.

    Returns:
        The list of chunks produced by the splitter (possibly empty).
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=300,
        chunk_overlap=100,
        length_function=len,
        add_start_index=True,
    )
    chunks = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(chunks)} chunks.")

    # Print one chunk as a sanity check. The index is clamped so that a small
    # corpus (fewer than 11 chunks) no longer raises IndexError, and the
    # preview is skipped entirely when nothing was produced.
    preview_index = 10
    if chunks:
        sample = chunks[min(preview_index, len(chunks) - 1)]
        print(sample.page_content)
        print(sample.metadata)

    return chunks

In [9]:
# Split every loaded document; prints the chunk count and a sample chunk.
chunks = split_text(documents)

Split 45 documents into 20321 chunks.
language processing benchmarks. In response to this challenge, recent studies commonly introduce an evaluation approach, namely the Elo rating sys- tem (Elo, 1967), wherein either human or LLM judges are enlisted to adjudicate between two LLM- generated outputs (Askell et al., 2021; Bai et al.,
{'source': 'task2/sources/2307.03025v3.Style_Over_Substance__Evaluation_Biases_for_Large_Language_Models.pdf', 'start_index': 2067}


In [12]:
#chunks[11].page_content
# Keep the first ten chunks as a small subset.
# NOTE(review): `some` is not referenced by any other cell shown here; confirm it is still needed.
some = chunks[0:10]

In [13]:
# Create the open-source embedding function (model "all-MiniLM-L6-v2").
# The same object is used both when writing chunks to Chroma and when
# reopening the persisted store.
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

2024-07-09 16:15:25.799504: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [14]:
def save_to_chroma(chunks: list[Document], reset: bool = False):
    """Embed ``chunks`` and store them in the Chroma database at ``CHROMA_PATH``.

    Embeddings are computed with the notebook-level ``embedding_function``.

    Args:
        chunks: The document chunks to embed and store.
        reset: When True, delete any existing ``CHROMA_PATH`` directory first
            so the store is rebuilt from ``chunks`` alone. Defaults to False,
            which leaves existing on-disk data in place.
            NOTE(review): with ``reset=False`` a repeated call presumably adds
            the same chunks again on top of what is already stored -- confirm
            against the Chroma ``from_documents`` behaviour.
    """
    # Optional clean rebuild. This is opt-in because removing the directory
    # changes what is on disk for any store object already opened on it.
    # NOTE(review): deleting the directory while another Chroma client in the
    # same kernel still has it open may cause errors -- confirm before enabling.
    if reset and os.path.exists(CHROMA_PATH):
        shutil.rmtree(CHROMA_PATH)

    # Ensure the directory exists and has the correct permissions
    # (rwx for the owner, r-x for group and others).
    os.makedirs(CHROMA_PATH, exist_ok=True)
    os.chmod(CHROMA_PATH, 0o755)

    # Build the store from the documents, persisting under CHROMA_PATH.
    db = Chroma.from_documents(
        chunks, embedding_function, persist_directory=CHROMA_PATH
    )
    # NOTE(review): newer Chroma releases are reported to persist automatically;
    # confirm whether this explicit call is still required.
    db.persist()
    print(f"Saved {len(chunks)} chunks to {CHROMA_PATH}.")


In [15]:
# Show the directory listing entry (permissions, owner) for the chroma directory.
! ls -l | grep chroma

drwxr-xr-x 2 deborah deborah     4096 Jul  9 15:48 chroma


In [21]:
# Embed all chunks and write them to the Chroma store at CHROMA_PATH.
save_to_chroma(chunks)

Saved 20321 chunks to chroma.


In [22]:
# Open the persisted store from CHROMA_PATH, using the same embedding function
# that was used when the chunks were saved.
db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding_function)

In [23]:
from langchain.chains import RetrievalQA

In [24]:
# Build a retrieval QA chain: the Chroma store's retriever supplies the context
# and an Ollama "mistral" model generates the answer.
rag = RetrievalQA.from_chain_type(
            llm=Ollama(model="mistral"),
            retriever=db.as_retriever()
            #memory=ConversationSummaryMemory(llm = Ollama(model="mistral")),
            #chain_type_kwargs={"prompt": pt, "verbose": True},
        )

# First test question; the returned dict (with 'query' and 'result' keys) is the cell output.
rag.invoke("What is the recent development of LLM generated text evaluation?")

{'query': 'What is the recent development of LLM generated text evaluation?',
 'result': ' Recent developments in LLM (Large Language Model) generated text evaluation include the use of LLMs to evaluate the quality of their own text outputs, which can provide insights into how a tuned version might perform in various scenarios (Yang et al., 2021; 2023a). This approach is being explored as a scalable and cost-effective alternative to human evaluations (Jain et al., 2023; Taori et al., 2023; Chiang et al., 2023). For instance, Fu et al. (2023) use the predicted text probability from an LLM as the automated score to assess text quality. Additionally, this approach has been shown to significantly enhance the ability of LLMs to evaluate text and speed up evaluation processes for Chinese LLMs, decreasing the average cost per sample by 4.6 times (Fu et al., 2023). However, it is important to note that while the use of LLMs as evaluators for text generation is in the exploratory phase, there a

In [20]:
# Second test question, asking about a specific term (Multi-Elo Rating System).
rag.invoke('What is  Multi-Elo Rating System (MERS)')

{'query': 'What is  Multi-Elo Rating System (MERS)',
 'result': ' The Multi-Elo Rating System (MERS) is a proposed approach for independently evaluating machine-generated text across multiple dimensions using the Elo rating system. This system aims to enhance the quality of evaluations, particularly in terms of factual accuracy, as shown by empirical results from studies. It contrasts with traditional methods that merge all evaluation aspects into a single score.'}

In [None]:
# Standalone check that calls sentence-transformers directly rather than going
# through the LangChain embedding wrapper.
from sentence_transformers import SentenceTransformer
sentences = ["This is an example sentence", "Each sentence is converted"]

model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
# Encode the example sentences; the result is stored but not displayed.
embeddings = model.encode(sentences)