# evaluation

In [1]:
import os
from dotenv import load_dotenv
from sentence_transformers import SentenceTransformer
import chromadb
from transformers import T5ForConditionalGeneration, T5Tokenizer
import json
import time

  from tqdm.autonotebook import tqdm, trange


## Load the .env file

In [2]:
load_dotenv()

openai_api_key = os.getenv('OPENAI_API_KEY')
replicate_api_key = os.getenv('REPLICATE_API_KEY')

## index functions

In [3]:
def create_t5_embedding(encoder, chunks):
    documents = []

    for sentence in chunks:
        temp_dict = {
            'text': sentence['text'],
            'text_vector': encoder.encode(sentence["text"]).tolist()
        }
        documents.append(temp_dict)
    
    return {'documents': documents}

In [4]:
def populate_chroma_collection(documents, collection):
    ids = [str(i) for i in range(len(documents))]
    embeddings = [doc['text_vector'] for doc in documents]
    texts = [doc['text'] for doc in documents]
    
    collection.add(
        ids=ids,
        embeddings=embeddings,
        metadatas=[{"text": text} for text in texts]
    )

## RAG functions

In [5]:
def search(query, **kwargs):
    query_vector = kwargs['encoder'].encode(query).tolist()
    
    # Perform cosine similarity search in ChromaDB
    results = kwargs['vector_db_client'].get_or_create_collection(kwargs['index_name']).query(
        query_embeddings=[query_vector],
        n_results=kwargs['num_results'],
        include=["metadatas", "documents", "distances"]
    )
    
    # return results["metadatas"]
    return results["metadatas"][0]

In [6]:
# prompt
def build_prompt(query, search_results):
    prompt_template = """
    You're a podcast chat bot. Answer the QUESTION based on the CONTEXT from the RESULTS database.
    Use only the facts from the CONTEXT when answering the QUESTION.

    QUESTION: {question}
    
    CONTEXT: 
    {context}
    """.strip()
    
    context = ""
    
    for search_result in search_results:
        doc = search_result['_source']['text'] if '_source' in search_result.keys() else search_result['text']
        context = context + f"{doc}\n\n"
    
    prompt = prompt_template.format(question=query, context=context).strip()

    return prompt

In [7]:
# generate
def llm(prompt, **kwargs):
    inputs = kwargs['llm_tokenizer'](prompt, return_tensors="pt")
    outputs = kwargs['llm_client'].generate(
        inputs["input_ids"], 
        max_length=100,
        num_beams=5,
        do_sample=True,
        temperature=1.0,
        top_k=50,
        top_p=0.95,    
        )
    response = kwargs['llm_tokenizer'].decode(outputs[0], skip_special_tokens=True)
        
    return response

In [8]:
# rag 
def rag(query, **kwargs):

    search_results = search(
        query, 
        encoder=kwargs['encoder'] if 'encoder' in kwargs.keys() else None, 
        index_name=kwargs['index_name'], 
        vector_db_client=kwargs['vector_db_client'] if 'vector_db_client' in kwargs.keys() else None,        
        num_results=5
        )

    prompt = build_prompt(query, search_results)

    answer = llm(
        prompt, 
        llm_client=kwargs['llm_client'], 
        llm_tokenizer=kwargs['llm_tokenizer'] if 'llm_tokenizer' in kwargs.keys() else None
        )
    
    for word in answer.split():
        yield word + " "
        time.sleep(0.05)

## Main

In [9]:
# podcast option
episode_option = "1. Try a sample"
episode_option_selected=True    

In [10]:
# sentence encoder
sentence_encoder = "1. T5"
encoder=SentenceTransformer("sentence-transformers/sentence-t5-base")
sentence_encoder_selected=True



In [11]:
# transcription method
# skip

In [12]:
# vector database
index_name = "podcast-transcriber"
vector_db="3. ChromaDB"
# Initialize ChromaDB client
vector_db_client = chromadb.PersistentClient(path="./chroma_db")

# List existing collections
existing_collections = vector_db_client.list_collections()

# Check if the collection exists
if index_name in existing_collections[0].name:
    # Delete the collection if it exists
    vector_db_client.delete_collection(index_name)

# Create or get a collection with cosine distance
index = vector_db_client.get_or_create_collection(
    name=index_name,
    metadata={"hnsw:space": "cosine"}
)

vector_db_selected=True
index_created=True
# print(f"Index {[k for k,v in index.items()][0]} was created successfully.")

In [13]:
# llm
llm_option = "2. FLAN-5"
llm_client = T5ForConditionalGeneration.from_pretrained("google/flan-t5-large")
llm_tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-large")
llm_option_selected=True

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [14]:
# download
with open('sample/episode_details.json', 'r') as f:
    episode_details = json.load(f)

In [15]:
# transcribe
# skip

In [16]:
# encode
episode_details.update(create_t5_embedding(encoder, episode_details['chunks']))

In [17]:
# populate index
populate_chroma_collection(episode_details['documents'], index)

Add of existing embedding ID: 439
Add of existing embedding ID: 440
Add of existing embedding ID: 441
Add of existing embedding ID: 442
Add of existing embedding ID: 443
Add of existing embedding ID: 444
Add of existing embedding ID: 445
Add of existing embedding ID: 446
Add of existing embedding ID: 447
Add of existing embedding ID: 448
Add of existing embedding ID: 449
Add of existing embedding ID: 450
Add of existing embedding ID: 451
Add of existing embedding ID: 452
Add of existing embedding ID: 453
Add of existing embedding ID: 454
Add of existing embedding ID: 455
Add of existing embedding ID: 456
Add of existing embedding ID: 457
Add of existing embedding ID: 458
Add of existing embedding ID: 459
Add of existing embedding ID: 460
Add of existing embedding ID: 461
Add of existing embedding ID: 462
Add of existing embedding ID: 463
Add of existing embedding ID: 464
Add of existing embedding ID: 465
Add of existing embedding ID: 466
Add of existing embedding ID: 467
Add of existin

## search didn't retrieve anything!!!

In [18]:
# interact
query = "Who were the Mensheviks?"
response = rag(
    query, 
    encoder=encoder,
    index_name=index_name,
    vector_db_client=vector_db_client,
    num_results=5,
    llm_client=llm_client,
    llm_tokenizer=llm_tokenizer
    )
print(" ".join(list(response)))

Marxists 
