<a href="https://colab.research.google.com/github/ethanelkaim/RAG-Models-For-LLM/blob/main/Hierarchical_and_Wikipedia_models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install faiss-cpu sentence-transformers transformers wikipedia-api torch datasets cohere openai -U langchain-community pinecone-client

In [2]:
import wikipediaapi
from sentence_transformers import SentenceTransformer, util, CrossEncoder
import faiss
import numpy as np
from datasets import load_dataset
from transformers import pipeline, GPT2Tokenizer
import cohere
import time as tm
import openai
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings
from langchain.chat_models import ChatOpenAI
from langchain.schema import HumanMessage
import os
from tqdm import tqdm
from pinecone import Pinecone, ServerlessSpec, PineconeApiException
import re
from typing import List, Tuple

In [None]:
# SECURITY: credentials must never be committed to a notebook. The API keys
# that used to be hard-coded in this cell are exposed and should be
# revoked/rotated. Keys are now read from the environment (in Colab, add them
# through the "Secrets" panel or set os.environ before running this cell).
# A missing key yields None; the Cohere cell further down checks for that.
cohere_api_key = os.environ.get("COHERE_API_KEY")
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")

# Sentence-embedding model shared by the indexing and retrieval functions.
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# truncate or split any overly long article text into smaller sections before summarization because the model has a maximum input sequence length (1024 tokens)
summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6", truncation=True)

# Load the cross-encoder model for reranking
reranker = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

# Embedding width used for the vector indexes.
dimension = 384

# Global dictionary to store content after indexing
wiki_content_map = {}

# Hugging Face NER pipeline for keyword extraction
ner_pipeline = pipeline("ner", model="dbmdz/bert-large-cased-finetuned-conll03-english", grouped_entities=True)

# GPT-2 tokenizer, used to count prompt tokens before calling the LLM.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

In [None]:
!huggingface-cli login

In [5]:
# Function to extract keywords using Hugging Face's NER pipeline
def extract_keywords(query):
    """Extract named-entity keywords and a year from a query.

    Args:
        query: The natural-language question.

    Returns:
        tuple: ``(keywords, year)``. ``keywords`` is the list of distinct
        entity strings reported by ``ner_pipeline`` in order of first
        appearance. ``year`` is the last 4-digit year (19xx/20xx) found in
        the query, as a string, or ``None`` when the query has no year.
    """
    keywords = []
    for entity in ner_pipeline(query):
        entity_word = entity['word']
        # Skip word-piece fragments ("##...") and entities already collected.
        if entity_word.startswith("##") or entity_word in keywords:
            continue
        keywords.append(entity_word)

    # The previous loop ended on a no-op `year = year`; its effective result
    # was simply the last year matched, which is now stated explicitly.
    years = re.findall(r'\b(?:19|20)\d{2}\b', query)
    year = years[-1] if years else None
    return keywords, year

# Basic RAG pipeline

In [6]:
# Basic Indexing Function
def index_dataset_basic(dataset_name, keyword):
    """Embed a dataset with the shared sentence-embedding model.

    No vector store is touched here: the embeddings are returned so the
    caller can upsert them into an index.

    Args:
        dataset_name: 'wikipedia', 'natural_questions' or 'cnn_dailymail'.
        keyword: A Wikipedia page title or an iterable of titles. Only used
            when ``dataset_name`` is 'wikipedia'.

    Returns:
        tuple: ``(runtime, data, embeddings)``. ``runtime`` is the elapsed
        time in seconds. For 'wikipedia', ``data`` is the rebuilt global
        ``wiki_content_map`` (row index -> paragraph and metadata); for the
        other datasets it is the loaded dataset object. ``embeddings`` holds
        the encoded texts.

    Raises:
        ValueError: If ``dataset_name`` is not one of the supported names.
    """
    global wiki_content_map
    start_time = tm.time()

    # Validate before clearing shared state, so a typo in the dataset name
    # neither wipes the content map nor ends in an UnboundLocalError.
    if dataset_name not in ('wikipedia', 'natural_questions', 'cnn_dailymail'):
        raise ValueError(
            f"Unsupported dataset name '{dataset_name}'. "
            "Use 'wikipedia', 'natural_questions' or 'cnn_dailymail'."
        )

    wiki_content_map.clear()

    if dataset_name == 'wikipedia':
        # The parameter is the source of page titles (the old code silently
        # read a notebook-global `keywords` instead). A single title is
        # wrapped so that it is not iterated character by character.
        keywords = [keyword] if isinstance(keyword, str) else list(keyword)

        wiki_wiki = wikipediaapi.Wikipedia('english')
        all_paragraphs = []
        all_embeddings = []
        page_metadata = []

        for page_title in keywords:
            print(f"Indexing page: {page_title}")
            page = wiki_wiki.page(page_title)

            if not page.exists():
                print(f"Wikipedia page for '{page_title}' does not exist.")
                continue

            for idx, paragraph in enumerate(page.text.split('\n')):
                if not paragraph.strip():
                    continue
                # Encode the paragraph and keep text, embedding and metadata
                # in three parallel lists.
                all_embeddings.append(model.encode(paragraph, convert_to_tensor=False))
                all_paragraphs.append(paragraph)
                page_metadata.append({"keyword": page_title, "paragraph_idx": idx})

            print(f"Indexed page: {page_title}")

        # Finalize embeddings as a single numpy array for consistency
        all_embeddings = np.array(all_embeddings).astype("float32")

        # Row i of the embedding matrix corresponds to wiki_content_map[i].
        wiki_content_map = {
            i: {"paragraph": paragraph, "metadata": metadata}
            for i, (paragraph, metadata) in enumerate(zip(all_paragraphs, page_metadata))
        }

        runtime = tm.time() - start_time
        print(f"Basic Indexing Time for {dataset_name}: {runtime:.2f} seconds")
        return runtime, wiki_content_map, all_embeddings

    if dataset_name == 'natural_questions':
        ds = load_dataset("google-research-datasets/natural_questions", "default", split='train[:20]')
        embeddings = model.encode(ds['question'])
    else:  # 'cnn_dailymail'
        ds = load_dataset("cnn_dailymail", "3.0.0", split='train')
        # Only the first 400 highlights are embedded; `ds` itself is returned
        # in full, so it has more rows than `embeddings`.
        embeddings = model.encode(ds['highlights'][:400])

    runtime = tm.time() - start_time
    print(f"Basic Indexing Time for {dataset_name}: {runtime:.2f} seconds")
    return runtime, ds, embeddings

In [7]:
def create_pinecone_index(index_name: str, dimension: int, metric: str = 'cosine'):
    """
    Make sure a serverless Pinecone index called ``index_name`` exists.

    Args:
        index_name: Name of the index to look up or create.
        dimension: Vector dimension used when the index has to be created.
        metric: Similarity metric used when the index has to be created.
    Returns:
        Pinecone: The client object, from which the caller obtains index handles.
    """
    print("Creating a Pinecone index...")
    client = Pinecone(api_key=PINECONE_API_KEY)

    # Names of the indexes that already exist for this API key.
    known_names = {description["name"] for description in client.list_indexes()}
    already_present = index_name in known_names

    if not already_present:
        serverless = ServerlessSpec(cloud="aws", region="us-east-1")
        client.create_index(name=index_name, dimension=dimension, metric=metric, spec=serverless)

    print("Done!")
    return client

In [8]:
def upsert_vectors(index: Pinecone, embeddings: np.ndarray, dataset, dataset_name: str, text_field: str = 'highlights', batch_size: int = 128):
    """
    Upsert vectors to a Pinecone index with support for multiple dataset formats.

    Vector ids are the row numbers of ``embeddings`` as strings ("0", "1", ...).

    Args:
        index: The index handle to write to (any object exposing ``upsert(vectors=...)``).
        embeddings: 2-D array with one embedding per row.
        dataset: The metadata source: the ``wiki_content_map``-style dict for
            'wikipedia', or a column-addressable dataset for 'cnn_dailymail'.
        dataset_name: 'wikipedia' or 'cnn_dailymail'.
        text_field: Column used as metadata for 'cnn_dailymail'.
        batch_size: Number of vectors sent per upsert request.
    Returns:
        The same index handle, after the upserts.
    Raises:
        ValueError: For an unsupported ``dataset_name``, or when the dataset
            provides fewer metadata records than there are embeddings.
    """
    from itertools import islice

    print("Upserting the embeddings to the Pinecone index...")
    num_vectors = embeddings.shape[0]

    # Build metadata only for the rows that actually have an embedding; the
    # dataset may be much larger than the embedded subset.
    if dataset_name == "wikipedia":
        meta = [
            {
                "text": dataset[idx]["paragraph"],
                "keyword": dataset[idx]["metadata"]["keyword"],
                "paragraph_idx": dataset[idx]["metadata"]["paragraph_idx"],
            }
            for idx in range(min(len(dataset), num_vectors))
        ]
    elif dataset_name == "cnn_dailymail":
        meta = [{text_field: text} for text in islice(dataset[text_field], num_vectors)]
    else:
        raise ValueError("Unsupported dataset name. Use 'wikipedia' or 'cnn_dailymail'.")

    # zip() used to hide this mismatch by silently dropping vectors.
    if len(meta) < num_vectors:
        raise ValueError(
            f"Got {num_vectors} embeddings but only {len(meta)} metadata records."
        )

    # Upsert (id, vector, metadata) tuples to the index in batches.
    for start in tqdm(range(0, num_vectors, batch_size)):
        stop = min(start + batch_size, num_vectors)
        batch = [(str(i), embeddings[i].tolist(), meta[i]) for i in range(start, stop)]
        index.upsert(vectors=batch)

    return index


In [None]:
def augment_prompt(query: str, model: "SentenceTransformer" = None, index=None) -> Tuple[str, str]:
    """
    Augment the prompt with the top 3 results from the knowledge base
    Args:
        query: The query to augment
        model: Encoder used to embed the query. It must be the model that
            produced the vectors stored in ``index``. When omitted,
            'all-MiniLM-L6-v2' is loaded on first use and cached.
        index: The vectorstore object (must expose ``query(...)``)
    Returns:
        tuple: ``(augmented_prompt, source_knowledge)`` - the full prompt and
        the retrieved passages joined by blank lines.
    Raises:
        ValueError: If no index is supplied.
    """
    if index is None:
        raise ValueError("augment_prompt requires an index to query.")

    if model is None:
        # Loaded lazily: a model in the default argument would be downloaded
        # at definition time even when every caller passes its own model.
        default_model = getattr(augment_prompt, "_default_model", None)
        if default_model is None:
            default_model = SentenceTransformer('all-MiniLM-L6-v2')
            augment_prompt._default_model = default_model
        model = default_model

    query_vector = [float(val) for val in model.encode(query)]

    # get top 3 results from knowledge base (stored vector values are not needed)
    query_results = index.query(
        vector=query_vector,
        top_k=3,
        include_values=False,
        include_metadata=True
    )['matches']

    # The passage text lives under a different metadata key depending on who
    # wrote the vectors: 'content' (hierarchical chunk index), 'text'
    # (wikipedia) or 'highlights' (cnn_dailymail). Reading only 'content'
    # produced an empty context for the basic indexes.
    text_keys = ('content', 'text', 'highlights')
    text_matches = []
    for match in query_results:
        metadata = match['metadata']
        passage = next((metadata.get(key) for key in text_keys if metadata.get(key)), '')
        if passage:
            text_matches.append(passage)

    # get the text from the results
    source_knowledge = "\n\n".join(text_matches)

    # feed into an augmented prompt
    augmented_prompt = f"""Using the contexts below, answer the query.
    Contexts:
    {source_knowledge}
    If the answer is not included in the source knowledge - say that you don't know.
    Query: {query}"""
    return augmented_prompt, source_knowledge


# Hierarchical model

In [186]:
def _collect_hierarchical_documents(dataset_name, keywords, year):
    """Return candidate documents as dicts that carry 'article' and 'id' keys."""
    documents = []

    if dataset_name == 'cnn_dailymail':
        ds = load_dataset("cnn_dailymail", "3.0.0", split='train')
        lowered_keywords = [keyword.lower() for keyword in keywords]
        for doc in ds:
            article_lower = doc['article'].lower()
            if year:
                # With a year: ANY keyword plus the year must appear.
                keep = any(k in article_lower for k in lowered_keywords) and str(year) in doc['article']
            else:
                # Without a year: ALL keywords must appear.
                keep = all(k in article_lower for k in lowered_keywords)
            if keep:
                documents.append(doc)

    elif dataset_name == 'wikipedia':
        wiki_wiki = wikipediaapi.Wikipedia('english')
        for keyword in keywords:
            page = wiki_wiki.page(keyword)
            if not page.exists():
                continue
            # One document per non-empty paragraph of the page.
            for idx, paragraph in enumerate(page.text.split('\n')):
                if paragraph.strip():
                    documents.append({"article": paragraph, "id": f"{keyword}_{idx}"})

    return documents


def _summarize_article(article, dataset_name, max_tokens, max_len):
    """Return the summary text stored for one article."""
    if dataset_name == 'cnn_dailymail':
        if len(article.split()) > max_tokens:
            # NOTE(review): the threshold counts words while the windows below
            # are sliced by characters (overlapping by 100) - kept as it was.
            pieces = [article[i:i + max_tokens] for i in range(0, len(article), max_tokens - 100)]
            return " ".join(
                summarizer(piece, max_length=150, min_length=50, do_sample=False)[0]['summary_text']
                for piece in pieces
            )
        return summarizer(article, max_length=150, min_length=50, do_sample=False)[0]['summary_text']

    # Wikipedia paragraphs: only long ones are summarized, short ones are kept verbatim.
    if len(article) > max_len:
        return summarizer(article, max_length=100, min_length=50, do_sample=False)[0]['summary_text']
    return article


def _create_or_connect_index(pc, index_name, dimension):
    """Return a handle to ``index_name``, creating the index when it is missing."""
    # list_indexes() yields index descriptions, not names, so the names are
    # extracted first (testing the raw result for a string never matched).
    existing_names = [index_info["name"] for index_info in pc.list_indexes()]
    if index_name in existing_names:
        print(f"Connecting to existing index '{index_name}'.")
        return pc.Index(index_name)

    print(f"Creating new index '{index_name}' with dimension {dimension}.")
    try:
        pc.create_index(name=index_name, dimension=dimension, metric='cosine', spec=ServerlessSpec(cloud="aws", region="us-east-1"))
    except PineconeApiException as e:
        if 'ALREADY_EXISTS' not in str(e):
            raise
        print(f"Index '{index_name}' already exists. Connecting to it.")
    return pc.Index(index_name)


def _upsert_in_batches(index, vectors, batch_size=100):
    """Upsert ``vectors`` in slices so a single request never grows unbounded."""
    for start in range(0, len(vectors), batch_size):
        index.upsert(vectors=vectors[start:start + batch_size])


def encode_dataset_hierarchical(dataset_name, keywords, year, pc, query, chunk_size=1000, chunk_overlap=200, max_tokens=900, max_len=400, doc_len=5):
    """Hierarchical encoding and indexing with Sentence Transformers and Pinecone for cnn_dailymail or wikipedia datasets.

    Two indexes are filled: a summary index (one vector per retained
    document, metadata ``source``) and a chunk index (one vector per text
    chunk of every candidate document, metadata ``content``).

    Args:
        dataset_name: 'cnn_dailymail' or 'wikipedia'.
        keywords: Keywords used to select articles / Wikipedia page titles.
        year: Optional year that cnn_dailymail articles must mention.
        pc: Pinecone client used to create and open the indexes.
        query: User query; used to drop irrelevant documents when more than
            50 candidates are found.
        chunk_size: Chunk size for the text splitter.
        chunk_overlap: Overlap between consecutive chunks.
        max_tokens: Length threshold for summarizing cnn_dailymail articles piecewise.
        max_len: Character length above which a Wikipedia paragraph is summarized.
        doc_len: Currently unused.

    Returns:
        tuple: ``(summary_index, chunk_index, summaries, chunks, runtime)``,
        or five ``None`` values when no document is found or none survives
        the relevance filter.

    NOTE(review): vector ids are positional ("0", "1", ...), so vectors left
    in a reused index by an earlier, larger run are not removed here.
    """
    start_time = tm.time()
    documents = _collect_hierarchical_documents(dataset_name, keywords, year)

    if not documents:
        print("No documents found for the given keywords and year.")
        return None, None, None, None, None
    print(f"Found {len(documents)} documents.")

    # The relevance score only matters for large candidate sets, so the
    # cross-encoder is loaded and run only in that case.
    filter_by_relevance = len(documents) > 50
    relevance_model = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2') if filter_by_relevance else None

    # Summarize the relevant documents
    summaries = []
    for doc in tqdm(documents):
        article = doc['article']
        if filter_by_relevance:
            relevance_score = relevance_model.predict([(query, article)])[0]
            if relevance_score < 0.5:
                continue
        summary_text = _summarize_article(article, dataset_name, max_tokens, max_len)
        summaries.append({"content": summary_text, "source": doc['id']})

    print(f"Found and summarized {len(summaries)} relevant documents.")

    # With no summaries there is no embedding matrix to read a dimension from.
    if not summaries:
        print("No documents passed the relevance filter.")
        return None, None, None, None, None

    summary_embeddings = np.array([model.encode(s["content"]) for s in summaries]).astype("float32")

    # Define Pinecone index names based on dataset
    if dataset_name == "wikipedia":
        summary_index_name = "wikipedia-summary"
        chunk_index_name = "wikipedia-chunk"
    else:
        summary_index_name = "cnn-dailymail-summary"
        chunk_index_name = "cnn-dailymail-chunk"

    # Connect to or create summary and chunk indexes
    embedding_dim = summary_embeddings.shape[1]
    summary_index = _create_or_connect_index(pc, summary_index_name, embedding_dim)
    chunk_index = _create_or_connect_index(pc, chunk_index_name, embedding_dim)

    # Upsert summaries to Pinecone summary index
    summary_upserts = [
        (str(i), summary_embeddings[i], {"source": summary["source"]})
        for i, summary in enumerate(summaries)
    ]
    _upsert_in_batches(summary_index, summary_upserts)

    # Split and upsert chunks to Pinecone chunk index
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    chunks = [chunk for doc in documents for chunk in text_splitter.split_text(doc["article"])]
    chunk_embeddings = np.array([model.encode(chunk) for chunk in chunks]).astype("float32")
    chunk_upserts = [
        (str(i), chunk_embeddings[i], {"content": chunk})
        for i, chunk in enumerate(chunks)
    ]
    _upsert_in_batches(chunk_index, chunk_upserts)

    runtime = tm.time() - start_time
    print(f"Hierarchical Indexing Time for {dataset_name}: {runtime:.2f} seconds")
    return summary_index, chunk_index, summaries, chunks, runtime


In [11]:
def retrieve_hierarchical_with_reranking(query: str, summary_index, chunk_index, summaries, chunks, top_k_summaries=3, top_k_chunks=5, rerank_top_n=3) -> Tuple[List[str], float]:
    """
    Retrieve relevant passages using hierarchical indexing with Pinecone,
    and rerank the retrieved chunks based on their relevance to the query.

    Args:
        query: The user question.
        summary_index: Index holding summary vectors (metadata key ``source``).
        chunk_index: Index holding chunk vectors (metadata key ``content``).
        summaries: List of ``{"content", "source"}`` dicts produced at indexing time.
        chunks: Unused; kept so existing callers keep working.
        top_k_summaries: Number of summaries fetched for the query.
        top_k_chunks: Number of chunks fetched per selected summary.
        rerank_top_n: Number of chunks returned after reranking.

    Returns:
        tuple: ``(reranked_chunks, runtime)`` - the best chunk texts, most
        relevant first and without duplicates, and the elapsed seconds.
    """
    start_time = tm.time()

    # Step 1: Retrieve top summaries
    query_embedding = model.encode(query).tolist()
    summary_results = summary_index.query(vector=query_embedding, top_k=top_k_summaries, include_values=False, include_metadata=True)
    relevant_sources = {match["metadata"]["source"] for match in summary_results["matches"]}

    # Step 2: Retrieve the chunks closest to each selected summary.
    # A dict is used as an ordered set: the same chunk is often returned for
    # several summaries, and duplicates would otherwise be scored repeatedly
    # and could fill every slot of the final top-n.
    candidate_pairs = {}
    for summary in summaries:
        if summary["source"] not in relevant_sources:
            continue
        summary_embedding = model.encode(summary["content"]).tolist()
        chunk_results = chunk_index.query(vector=summary_embedding, top_k=top_k_chunks, include_values=False, include_metadata=True)
        for match in chunk_results["matches"]:
            candidate_pairs.setdefault((match["metadata"]["content"], query), None)
    relevant_chunks = list(candidate_pairs)

    print("relevant_chunks :\n", relevant_chunks)

    # Step 3: Rerank chunks based on their relevance to the query
    if relevant_chunks:
        scores = reranker.predict(relevant_chunks)
        scored_chunks = sorted(zip(relevant_chunks, scores), key=lambda x: x[1], reverse=True)

        # Keep only the chunk text of the best-scoring (chunk, query) pairs
        reranked_chunks = [pair[0] for pair, _score in scored_chunks[:rerank_top_n]]
    else:
        reranked_chunks = []

    runtime = tm.time() - start_time
    print(f"Hierarchical Retrieval and Reranking Time: {runtime:.2f} seconds")
    return reranked_chunks, runtime

In [78]:
# Function to generate a response using GPT-2
def generate_response_gpt2(query):
    """Generate a continuation of ``query`` with GPT-2.

    Args:
        query: Prompt text.

    Returns:
        str: The generated text (the pipeline output includes the prompt).
    """
    generator = pipeline("text-generation", model="gpt2")
    # The requested length (prompt + generation) must fit the model's context
    # window; the old hard-coded 5000 is far beyond what GPT-2 can address.
    max_length = min(5000, generator.model.config.n_positions)
    generated_text = generator(query, max_length=max_length, num_return_sequences=1)[0]['generated_text']
    return generated_text

# Function to generate a response using Cohere's API
def generate_response_cohere(query, cohere_api_key):
    """Send ``query`` to Cohere's chat endpoint ('command-r-plus') and return the reply text."""
    client = cohere.Client(api_key=cohere_api_key)
    return client.chat(model='command-r-plus', message=query).text

# Function to generate a response using GPT-2 with retrieved context
def generate_response_gpt2_with_context(query, retrieved_passages):
    """Generate a GPT-2 continuation of the query followed by retrieved context.

    Args:
        query: The user question.
        retrieved_passages: A list of passage strings, or a single string.

    Returns:
        str: The generated text (the pipeline output includes the prompt).
    """
    # A bare string must not reach str.join(): it would be joined character
    # by character, putting every letter on its own line.
    if isinstance(retrieved_passages, str):
        retrieved_passages = [retrieved_passages]

    generator = pipeline("text-generation", model="gpt2")
    context = query + "\n\n" + "\n".join(retrieved_passages)
    # The requested length must fit the model's context window (the old
    # hard-coded 10000 does not).
    # NOTE(review): a prompt longer than the window is still not truncated here.
    max_length = min(10000, generator.model.config.n_positions)
    generated_text = generator(context, max_length=max_length, num_return_sequences=1)[0]['generated_text']
    return generated_text

# Function to generate a response using Cohere with retrieved context
def generate_response_cohere_with_context(query, retrieved_passages, cohere_api_key):
    """Ask Cohere's generate endpoint ('command') to answer using retrieved context.

    Args:
        query: The user question.
        retrieved_passages: A list of passage strings, or a single string.
        cohere_api_key: Cohere API key.

    Returns:
        str: Text of the first generation.
    """
    # A bare string must not reach str.join(): it would be joined character
    # by character, putting every letter on its own line.
    if isinstance(retrieved_passages, str):
        retrieved_passages = [retrieved_passages]

    context = query + "\n\n" + "\n".join(retrieved_passages)
    co = cohere.Client(api_key=cohere_api_key)
    response = co.generate(prompt=context, model="command").generations[0].text
    return response

def generate_response_with_augment_prompt(query, retrieved_passages, cohere_api_key, max_tokens=4081):
    """Answer ``query`` with Cohere, grounded on the retrieved passages.

    The prompt is: query, an instruction header, the passages, then a final
    instruction to admit when the answer is not in the context. Passages are
    dropped from the end while the prompt exceeds ``max_tokens`` (counted
    with the notebook's GPT-2 tokenizer); the first passage is always kept.

    Args:
        query: The user question.
        retrieved_passages: A list of passage strings (best first), or a
            single string. The caller's list is not modified.
        cohere_api_key: Cohere API key.
        max_tokens: Token budget for the whole prompt.

    Returns:
        str: Text of the first generation.
    """
    header = "\n\nUsing the contexts below, answer the query.\nContexts:\n"
    footer = "\nIf the answer is not included in the source knowledge - say that you don't know."

    # Work on a copy: popping from the caller's list is a hidden side effect.
    if isinstance(retrieved_passages, str):
        passages = [retrieved_passages]
    else:
        passages = list(retrieved_passages)

    def build_prompt(selected):
        # The header appears once, after the query. The old expression
        # `query + header.join(passages)` used the header as the separator
        # BETWEEN passages, so it never introduced the first passage.
        return query + header + "\n\n".join(selected) + footer

    # Step 1: drop the lowest-ranked passages until the prompt fits.
    context = build_prompt(passages)
    while len(tokenizer.encode(context)) > max_tokens and len(passages) > 1:
        passages.pop()
        context = build_prompt(passages)

    # Step 2: Initialize Cohere client and generate response
    co = cohere.Client(api_key=cohere_api_key)
    response = co.generate(prompt=context, model="command").generations[0].text
    return response

# Queries

In [188]:
# The question that every later cell of the notebook works on.
# The commented lines are alternative queries; uncomment exactly one to switch.
query = "What can you tell me about NVIDIA's growth and major developments since 2023?"
# query = "Who won the 2023 Turing Award ?"

# query = "How much money did Harry Potter star Daniel Radcliffe have when he was 18?"
# query = "What was the punishment Michael Vick could face for his role in the dogfighting conspiracy?"

In [189]:
# Run configuration read by the cells below: the knowledge source and the LLM backend.
dataset_name = "wikipedia" # "cnn_dailymail" or "wikipedia" or "natural_questions"
llm_choice = "cohere"  # "cohere" or "gpt2"

In [190]:
# Step 1: Extract keywords from the query
# `keywords` and `year` are reused by the indexing cells further down.
keywords, year = extract_keywords(query)
print(f"Extracted Keywords: {keywords}")
# `year` is None when the query contains no 19xx/20xx year.
if year:
  print(f"Extracted Year: {year}")

Extracted Keywords: ['NVIDIA']
Extracted Year: 2023


# Tests

In [191]:
# Step 2: Fetch the data
# `time` receives the indexing runtime in seconds (the time module itself is
# imported as `tm`); `ds` is the indexed content and `embeddings` its vectors.
time, ds, embeddings = index_dataset_basic(dataset_name, keywords)

Indexing page: NVIDIA
Indexed page: NVIDIA
Basic Indexing Time for wikipedia: 8.05 seconds


In [192]:
# Preview the indexed content: the entries with keys 0-4 of the Wikipedia
# content map, or the first five rows of the Hugging Face dataset.
if dataset_name == "wikipedia":
  for idx, paragraph in wiki_content_map.items():
    # Keys are consecutive row numbers, so stopping at key 5 prints five entries.
    if idx == 5:
      break
    print(f"Paragraph {idx} \n{paragraph}")
else:
  pd_dataset = ds.to_pandas()
  print(pd_dataset.head(5))

Paragraph 0 
{'paragraph': 'Nvidia Corporation (, en-VID-ee-ə) is an American multinational corporation and technology company headquartered in Santa Clara, California, and incorporated in Delaware. It is a software and fabless company which designs and supplies graphics processing units (GPUs), application programming interfaces (APIs) for data science and high-performance computing, as well as system on a chip units (SoCs) for the mobile computing and automotive market. Nvidia is also a dominant supplier of artificial intelligence (AI) hardware and software.', 'metadata': {'keyword': 'NVIDIA', 'paragraph_idx': 0}}
Paragraph 1 
{'paragraph': "Nvidia's professional line of GPUs are used for edge-to-cloud computing and in supercomputers and workstations for applications in fields such as architecture, engineering and construction, media and entertainment, automotive, scientific research, and manufacturing design. Its GeForce line of GPUs are aimed at the consumer market and are used in 

In [193]:
# Create (if needed) the basic index for the chosen dataset and upsert the
# embeddings computed above. `index` is read later by the augmented-prompt cell.
if dataset_name == "wikipedia":
    pc = create_pinecone_index('wikipedia', embeddings.shape[1])
    index = pc.Index('wikipedia')
    index_upserted = upsert_vectors(index, embeddings, wiki_content_map, dataset_name="wikipedia")
else:
    # NOTE(review): every non-wikipedia choice, including "natural_questions",
    # is upserted as "cnn_dailymail" - confirm this is intended.
    pc = create_pinecone_index('cnn-dailymail', embeddings.shape[1])
    index = pc.Index('cnn-dailymail')
    index_upserted = upsert_vectors(index, embeddings, ds, dataset_name="cnn_dailymail")

Creating a Pinecone index...
Done!
Upserting the embeddings to the Pinecone index...


100%|██████████| 2/2 [00:01<00:00,  1.55it/s]


In [194]:
# Step 3: Basic LLM response (no retrieval context)
# Baseline: the raw query goes to the chosen LLM without any retrieved passages.
if llm_choice == "gpt2":
    basic_response = generate_response_gpt2(query)
elif llm_choice == "cohere":
    # The Cohere backend cannot be used without a key.
    if cohere_api_key is None:
        raise ValueError("Cohere API key is required for Cohere LLM.")
    basic_response = generate_response_cohere(query, cohere_api_key)
else:
    raise ValueError("Invalid LLM choice. Please choose 'gpt2' or 'cohere'.")

print("\nBasic Response (No Retrieval):")
print(basic_response)


Basic Response (No Retrieval):
Unfortunately, as an AI language model, I only have access to the information available on the internet up until the beginning of January 2023. Therefore, I cannot provide details on NVIDIA's growth and developments in 2023 and beyond as it would require real-time data and access to future events and plans, which are not available to me.

However, I can provide a summary of NVIDIA's growth and major developments prior to 2023, which have likely contributed to its continued success:

1. Data Center Business Expansion: NVIDIA has significantly grown its data center business, which includes providing hardware and software solutions for artificial intelligence (AI), machine learning, and data analytics workloads. This segment has become a major revenue driver for the company, with continued growth expected as more businesses adopt AI and data center technologies.

2. Gaming Dominance: NVIDIA's GeForce graphics processing units (GPUs) have dominated the gamin

In [195]:
# Step 4: Basic augmented LLM response (with retrieved context)
augmented_prompt, source_knowledge = augment_prompt(query, model=model, index=index)

# The *_with_context helpers expect a LIST of passages. Passing the prompt
# string itself made them join it character by character, so it is wrapped in
# a one-element list.
if llm_choice == "gpt2":
    augmented_response = generate_response_gpt2_with_context(query, [augmented_prompt])
elif llm_choice == "cohere":
    augmented_response = generate_response_cohere_with_context(query, [augmented_prompt], cohere_api_key)
else:
    # Same guard as the basic-response cell; otherwise the print below would
    # show a stale or undefined `augmented_response`.
    raise ValueError("Invalid LLM choice. Please choose 'gpt2' or 'cohere'.")

print("\nAugmented Response (With Retrieved Context):")
print(augmented_response)


Augmented Response (With Retrieved Context):
 Here is some information about NVIDIA's growth and major developments between 2023 and now: 

Growth and Performance:
- NVIDIA's revenue has consistently increased throughout the past few years, with a notable jump of nearly 60% in the first quarter of 2024 compared to the previous year. This was driven primarily by strong demand and the successful launch of various AI and graphics-related products. 
- They have also consistently reported record revenues and significant financial gains in recent quarters. These results have been fueled by the company's diversification efforts, expanding its reach beyond gaming and into the realms of AI, cloud computing, and autonomous vehicles. 

Acquisitions and Partnerships: 
- NVIDIA made strategic acquisitions to strengthen its position in key sectors. In 2023, they acquired the British chip firm Arm Limited for a record-breaking US$40 billion. This purchase grew NVIDIA's footprint in the semiconductor

In [196]:
pc2 = Pinecone(api_key=PINECONE_API_KEY)
# Show which indexes exist before the clean-up.
print(pc2.list_indexes())

# The hierarchical indexes of the dataset that is NOT in use are the ones
# removed: the cnn-dailymail pair when working on wikipedia, and vice versa.
if dataset_name == "wikipedia":
    indexes_to_drop = ('cnn-dailymail-summary', 'cnn-dailymail-chunk')
else:
    indexes_to_drop = ('wikipedia-summary', 'wikipedia-chunk')

for index_to_drop in indexes_to_drop:
    # The index list is fetched again before every check.
    current_names = [index.name for index in pc2.list_indexes()]
    if index_to_drop in current_names:
        pc2.delete_index(index_to_drop)
        print(f"Deleted '{index_to_drop}' index.")

# Show the remaining indexes to confirm the deletions.
print(pc2.list_indexes())

{'indexes': [{'deletion_protection': 'disabled',
              'dimension': 384,
              'host': 'wikipedia-kv7z9bj.svc.aped-4627-b74a.pinecone.io',
              'metric': 'cosine',
              'name': 'wikipedia',
              'spec': {'serverless': {'cloud': 'aws', 'region': 'us-east-1'}},
              'status': {'ready': True, 'state': 'Ready'}},
             {'deletion_protection': 'disabled',
              'dimension': 384,
              'host': 'cnn-dailymail-chunk-kv7z9bj.svc.aped-4627-b74a.pinecone.io',
              'metric': 'cosine',
              'name': 'cnn-dailymail-chunk',
              'spec': {'serverless': {'cloud': 'aws', 'region': 'us-east-1'}},
              'status': {'ready': True, 'state': 'Ready'}},
             {'deletion_protection': 'disabled',
              'dimension': 384,
              'host': 'cnn-dailymail-kv7z9bj.svc.aped-4627-b74a.pinecone.io',
              'metric': 'cosine',
              'name': 'cnn-dailymail',
              'spec': 

In [197]:
# Step 5: Hierarchical Indexing
# Builds the summary and chunk indexes for the current query. All five values
# are None when no document is found.
summary_index, chunk_index, summaries, chunks, hierarchical_runtime = encode_dataset_hierarchical(dataset_name, keywords, year, pc2, query)

Found 168 documents.


 34%|███▍      | 57/168 [00:26<03:24,  1.84s/it]Your max_length is set to 100, but your input_length is only 97. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=48)
 35%|███▌      | 59/168 [00:37<05:11,  2.86s/it]Your max_length is set to 100, but your input_length is only 89. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=44)
100%|██████████| 168/168 [01:25<00:00,  1.96it/s]


Found and summarized 16 relevant documents.
Creating new index 'wikipedia-summary' with dimension 384.
Creating new index 'wikipedia-chunk' with dimension 384.
Hierarchical Indexing Time for wikipedia: 108.08 seconds


In [201]:
# Retrieve relevant chunks using hierarchical retrieval and reranking
relevant_chunks, retrieval_time = retrieve_hierarchical_with_reranking(query, summary_index, chunk_index, summaries, chunks)

# Print reranked results
print("\nTop Reranked Chunks:")
for i, chunk in enumerate(relevant_chunks):
    print(f"\nChunk {i+1}:\n{chunk}")

# Build the grounded prompt from the reranked chunks and query the LLM.
# This overwrites `augmented_response` from the basic augmented cell.
augmented_response = generate_response_with_augment_prompt(query, relevant_chunks, cohere_api_key)

print("\nAugmented Response (With Retrieved Context):")
print(augmented_response)


relevant_chunks :
 [("In June 2024, Nvidia's market capitalization reached $3 trillion for the first time. Nvidia, then the third most valuable company in the S&P 500, executed a 10-for-1 stock split on June 10, 2024. This move increased the accessibility of shares to investors and followed a significant rise in the company's value, driven by growing demand for its AI-focused semiconductors. The company's revenue tripled in the most recent fiscal quarter compared to the previous year, reaching $26 billion, with projections for 2025 nearing $117 billion. Nvidia's 53.4% net margin indicated strong profitability within the technology sector. The company became the world's most valuable, surpassing Microsoft and Apple, on June 18, 2024, after its market capitalization exceeded $3.3 trillion.", "What can you tell me about NVIDIA's growth and major developments since 2023?"), ('On March 1, 2024, Nvidia became the third company in the history of the United States to close with a market capita

# Evaluation

Query reformulation

In [202]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

def generate_reformulations(query, cohere_client, num_reformulations=5):
    """
    Ask the Cohere chat model for ``num_reformulations`` rewordings of ``query``.

    One request is sent per reformulation; the reply texts are returned in
    the order they were received.
    """
    request = f"Please reformulate: {query}"
    return [
        cohere_client.chat(model='command-r-plus', message=request).text
        for _ in range(num_reformulations)
    ]

def evaluate_rag_with_reformulations(query, cohere_client, summary_index, chunk_index, summaries, chunks, model, num_reformulations=5):
    """
    Measure how stable the RAG answer is under rewordings of the query.

    The original query and each LLM-generated reformulation are answered
    through hierarchical retrieval plus the augmented prompt. Every answer is
    embedded with ``model`` and compared to the original answer by cosine
    similarity. Returns the mean of those similarities.
    """
    def answer(question):
        # Retrieval followed by grounded generation for one question.
        passages, _elapsed = retrieve_hierarchical_with_reranking(question, summary_index, chunk_index, summaries, chunks)
        return generate_response_with_augment_prompt(question, passages, cohere_api_key)

    # Answer for the original query first, then for every reformulation.
    original_response = answer(query)
    reformulated_queries = generate_reformulations(query, cohere_client, num_reformulations)
    reformulated_responses = [answer(reworded) for reworded in reformulated_queries]

    # Row 0 of the pairwise matrix compares the original answer with all answers.
    all_responses = [original_response] + reformulated_responses
    pairwise = cosine_similarity(model.encode(all_responses))
    similarities_to_original = pairwise[0, 1:]
    average_similarity = np.mean(similarities_to_original)

    # Report
    print("Original Response:", original_response)
    print("\nReformulated Responses and Similarities:")
    report_rows = zip(reformulated_queries, reformulated_responses, similarities_to_original)
    for position, (reform_query, reform_response, sim) in enumerate(report_rows, start=1):
        print(f"\nReformulation {position}: {reform_query}")
        print(f"Response: {reform_response}")
        print(f"Similarity to Original: {sim:.4f}")

    print(f"\nAverage Similarity to Original Response: {average_similarity:.4f}")
    return average_similarity


In [203]:
# Run the reformulation-based evaluation on the current query, reusing the
# hierarchical indexes and summaries built above.
co = cohere.Client(api_key=cohere_api_key)
average_similarity = evaluate_rag_with_reformulations(query, co, summary_index, chunk_index, summaries, chunks, model, num_reformulations=5)
print("Evaluation completed. Average similarity score:", average_similarity)

relevant_chunks :
 [("In June 2024, Nvidia's market capitalization reached $3 trillion for the first time. Nvidia, then the third most valuable company in the S&P 500, executed a 10-for-1 stock split on June 10, 2024. This move increased the accessibility of shares to investors and followed a significant rise in the company's value, driven by growing demand for its AI-focused semiconductors. The company's revenue tripled in the most recent fiscal quarter compared to the previous year, reaching $26 billion, with projections for 2025 nearing $117 billion. Nvidia's 53.4% net margin indicated strong profitability within the technology sector. The company became the world's most valuable, surpassing Microsoft and Apple, on June 18, 2024, after its market capitalization exceeded $3.3 trillion.", "What can you tell me about NVIDIA's growth and major developments since 2023?"), ('On March 1, 2024, Nvidia became the third company in the history of the United States to close with a market capita

Semantic coherence

In [204]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

def compute_coherence(query, response, model):
    """
    Return the cosine similarity between the embeddings of ``query`` and
    ``response``, both encoded with ``model``.
    """
    # Each embedding becomes a single-row matrix, as cosine_similarity expects.
    query_row, response_row = (
        model.encode(text).reshape(1, -1) for text in (query, response)
    )
    return cosine_similarity(query_row, response_row)[0][0]


In [205]:
# Semantic coherence between the query and the latest `augmented_response`.
coherence_score = compute_coherence(query, augmented_response, model)
print("Semantic Coherence Score:", coherence_score)

Semantic Coherence Score: 0.82843757
