In [1]:
%load_ext autoreload
%autoreload 2

%cd llama_index

/Users/ayushthakur/integrations/llamaindex/llama_index


In [2]:
import os
from pathlib import Path

import llama_index
print(llama_index.__version__)
from llama_index import VectorStoreIndex
from llama_index import download_loader

0.7.10.post1


In [10]:
from dotenv import load_dotenv
load_dotenv("/Users/ayushthakur/integrations/llamaindex/apis.env")

import openai

openai.api_key = os.getenv("OPENAI_API_KEY")
os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")

## Simple Query Engine

To set benchmark.

In [4]:
PDFReader = download_loader("PDFReader")

loader = PDFReader()
documents = loader.load_data(file=Path('../llama2.pdf'))

In [5]:
from llama_index import ServiceContext
from llama_index.callbacks import CallbackManager, WandbCallbackHandler

# initialise WandbCallbackHandler and pass any wandb.init args
wandb_args = {"project":"llama-index-report"}
wandb_callback = WandbCallbackHandler(run_args=wandb_args)

# pass wandb_callback to the service context
callback_manager = CallbackManager([wandb_callback])
service_context = ServiceContext.from_defaults(callback_manager=callback_manager)

[34m[1mwandb[0m: Streaming LlamaIndex events to W&B at https://wandb.ai/ayush-thakur/llama-index-report/runs/v7h22l9i
[34m[1mwandb[0m: `WandbCallbackHandler` is currently in beta.
[34m[1mwandb[0m: Please report any issues to https://github.com/wandb/wandb/issues with the tag `llamaindex`.


In [18]:
index = VectorStoreIndex.from_documents(documents, service_context=service_context)

In [19]:
query_engine = index.as_query_engine()

In [20]:
response = query_engine.query("Who wrote this paper?")
print(response, sep="\n")


This paper was written by a large group of contributors, including Science and Engineering Leadership, Technical and Management Leadership, Core Contributors, Contributors, and the GenAI executive team.


In [21]:
response = query_engine.query("What is this paper about?")
print(response, sep="\n")


This paper is about evaluating the performance of a Llama 2-Chat model compared to other open source or closed source models in terms of helpfulness. The evaluation is done by presenting prompts to human annotators and asking them to rate which model response is better. The prompts cover topics such as creative writing, identity/personas, factual questions, personal and professional development, casual advice and recommendations, and reasoning (math/problem-solving).


In [22]:
response = query_engine.query("For how many steps was the Llama2 model trained for?")
print(response, sep="\n")


The Llama2 model was trained for 2 trillion tokens.


In [23]:
wandb_callback.finish()

## Build Evaluation Dataset

In [30]:
import copy
import wandb
import random
import pandas as pd

from llama_index.llms import OpenAI
from llama_index import ServiceContext
from llama_index.evaluation import DatasetGenerator
from llama_index.callbacks import CallbackManager, WandbCallbackHandler

In [79]:
# initialise WandbCallbackHandler and pass any wandb.init args
wandb_args = {"project":"llama-index-report"}
wandb_callback = WandbCallbackHandler(run_args=wandb_args)

# pass wandb_callback to the service context
callback_manager = CallbackManager([wandb_callback])

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016713924299983772, max=1.0…

[34m[1mwandb[0m: Streaming LlamaIndex events to W&B at https://wandb.ai/ayush-thakur/llama-index-report/runs/liu8ghm6
[34m[1mwandb[0m: `WandbCallbackHandler` is currently in beta.
[34m[1mwandb[0m: Please report any issues to https://github.com/wandb/wandb/issues with the tag `llamaindex`.


In [80]:
# setup LLM and chunk size
llm = OpenAI(temperature=0, model="gpt-3.5-turbo")
service_context = ServiceContext.from_defaults(llm=llm, callback_manager=callback_manager)

random_documents = copy.deepcopy(documents)
random.shuffle(random_documents)
random_documents = random_documents[:10]

data_generator = DatasetGenerator.from_documents(
    random_documents, service_context=service_context, num_questions_per_chunk=2
)

In [81]:
eval_questions = data_generator.generate_questions_from_nodes()

[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.


In [231]:
eval_questions

['In the context of religious ideologies, compare the sentiment scores of the Llama 2-Chat7B model for Judaism, Christianity, Islam, Buddhism, and Sikhism with the sentiment scores of the MPT7B model. How do the sentiment scores differ between the two models for each religion?',
 'Analyze the distribution of mean sentiment scores across different political ideologies for the Llama 27B model. Compare the sentiment scores for left-wing, right-wing, communism, socialism, democracy, liberalism, populism, conservatism, nationalism, anarchism, capitalism, and fascism. How do the sentiment scores vary across these ideologies?',
 'Based on the distribution of mean sentiment scores across groups under the political ideology domain from the BOLD prompts, what can we conclude about the sentiment towards the different instructions (MPT-instruct, Falcon-instruct, Llama 2-Chat7B) in the document?',
 'How do the mean sentiment scores vary for different groups (MPT-instruct, Falcon-instruct, Llama 2-C

In [85]:
df = pd.DataFrame(columns=["questions"], data=eval_questions)
df.head()

Unnamed: 0,questions
0,"In the context of religious ideologies, compar..."
1,Analyze the distribution of mean sentiment sco...
2,Based on the distribution of mean sentiment sc...
3,How do the mean sentiment scores vary for diff...
4,In the short story about a dragon who was evil...


In [88]:
# Log the questions to W&B.
wandb.log({"Generated Questions": df})
wandb.finish()

## Evaluate for Response Hallucination

This is system level evaluation.

In [55]:
from llama_index.evaluation import ResponseEvaluator

In [233]:
# Get the questions from the W&B tables (this demonstrates the closing of the loop)
run = wandb.init(project="llama-index-report")
artifact = run.use_artifact('ayush-thakur/llama-index-report/run-liu8ghm6-GeneratedQuestions:v0', type='run_table')
artifact_dir = artifact.download()
run.finish()

[34m[1mwandb[0m: Currently logged in as: [33mayush-thakur[0m. Use [1m`wandb login --relogin`[0m to force relogin


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016752450699762753, max=1.0…

[34m[1mwandb[0m:   1 of 1 files downloaded.  


In [234]:
artifact_dir

'./artifacts/run-liu8ghm6-GeneratedQuestions:v0'

In [239]:
import json

with open(f"{artifact_dir}/Generated Questions.table.json") as f:
    data = json.load(f)

columns = data["columns"]
questions = data["data"]

question_df = pd.DataFrame(columns=columns, data=questions)
question_df.head()

Unnamed: 0,questions
0,"In the context of religious ideologies, compar..."
1,Analyze the distribution of mean sentiment sco...
2,Based on the distribution of mean sentiment sc...
3,How do the mean sentiment scores vary for diff...
4,In the short story about a dragon who was evil...


In [257]:
# initialise WandbCallbackHandler and pass any wandb.init args
wandb_args = {"project":"llama-index-report"}
wandb_callback = WandbCallbackHandler(run_args=wandb_args)

# pass wandb_callback to the service context
callback_manager = CallbackManager([wandb_callback])

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016752349300077185, max=1.0…

[34m[1mwandb[0m: Streaming LlamaIndex events to W&B at https://wandb.ai/ayush-thakur/llama-index-report/runs/6wxg2tbb
[34m[1mwandb[0m: `WandbCallbackHandler` is currently in beta.
[34m[1mwandb[0m: Please report any issues to https://github.com/wandb/wandb/issues with the tag `llamaindex`.


In [258]:
# build service context
llm = OpenAI(temperature=0, model="gpt-3.5-turbo")
service_context = ServiceContext.from_defaults(llm=llm, callback_manager=callback_manager)

# define evaluator
evaluator = ResponseEvaluator(service_context=service_context)

# query index
query_engine = index.as_query_engine()

In [259]:
eval_results = []
responses = []

for _, question in question_df.iterrows():
    response = query_engine.query(question.questions)
    eval_result = evaluator.evaluate(response)
    responses.append(response.response)
    eval_results.append(eval_result)

[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace t

In [268]:
eval_hallucination_df = pd.DataFrame(
    columns=["question", "llm_response", "eval_result"],
    data=list(zip(list(df.questions.values), responses, eval_results))
)

eval_hallucination_df.head()

Unnamed: 0,question,llm_response,eval_result
0,"In the context of religious ideologies, compar...",\nThe Llama 2-Chat7B model has higher sentimen...,NO
1,Analyze the distribution of mean sentiment sco...,\nThe sentiment scores for the Llama 27B model...,YES
2,Based on the distribution of mean sentiment sc...,\nWe can conclude that the sentiment towards t...,NO
3,How do the mean sentiment scores vary for diff...,\nThe mean sentiment scores for the different ...,NO
4,In the short story about a dragon who was evil...,"\nIn the short story, the dragon's realization...",NO


In [269]:
def compute_hallucination_accuracy(eval_results):
    score = 0
    for eval_result in eval_results:
        if eval_result == "YES":
            score += 1
    
    return (score/len(eval_results))*100

hallucination_accuracy = compute_hallucination_accuracy(list(eval_hallucination_df.eval_result))
hallucination_accuracy

43.75

In [262]:
# Log the questions to W&B.
wandb.log({"Hallucination Accuracy": hallucination_accuracy})
wandb.log({"Hallucination Eval": eval_hallucination_df})
wandb.finish()

In [279]:
import wandb
api = wandb.Api()
run = api.run("ayush-thakur/llama-index-report/6wxg2tbb")
run.summary["Hallucination Accuracy"] = hallucination_accuracy
run.summary.update()

## Evaluate the Retrieved Documents

In [251]:
# initialise WandbCallbackHandler and pass any wandb.init args
wandb_args = {"project":"llama-index-report"}
wandb_callback = WandbCallbackHandler(run_args=wandb_args)

# pass wandb_callback to the service context
callback_manager = CallbackManager([wandb_callback])

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01675216041621752, max=1.0)…

[34m[1mwandb[0m: Streaming LlamaIndex events to W&B at https://wandb.ai/ayush-thakur/llama-index-report/runs/hed03acv
[34m[1mwandb[0m: `WandbCallbackHandler` is currently in beta.
[34m[1mwandb[0m: Please report any issues to https://github.com/wandb/wandb/issues with the tag `llamaindex`.


In [252]:
# build service context
llm = OpenAI(temperature=0, model="gpt-4")
service_context = ServiceContext.from_defaults(llm=llm, callback_manager=callback_manager)

# define evaluator
evaluator = ResponseEvaluator(service_context=service_context)

# query index
query_engine = index.as_query_engine()

In [253]:
eval_results = []
responses = []
chunks = []

for _, question in question_df.iterrows():
    response = query_engine.query(question.questions)
    eval_result = evaluator.evaluate_source_nodes(response)
    chunks.append([f"{idx}.chunk: {source_node.node.text}" for idx, source_node in enumerate(response.source_nodes)])
    responses.append(response.response)
    eval_results.append(eval_result)

[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace t

In [254]:
eval_results_to_num = []

for eval_result in eval_results:
    score = 0
    for eval_response in eval_result:
        if eval_response=="YES":
            score+=1
        else:
            score+=0
    eval_results_to_num.append(score/len(eval_result))

In [255]:
eval_retrieve_df = pd.DataFrame(
    columns=["question", "retrieved chunks", "llm_response", "raw_eval_result", "eval_score"],
    data=list(zip(list(df.questions.values), chunks, responses, eval_results, eval_results_to_num))
)

eval_retrieve_df.head()

Unnamed: 0,question,retrieved chunks,llm_response,raw_eval_result,eval_score
0,"In the context of religious ideologies, compar...",[0.chunk: Judaism Christianity Islam Buddhism ...,\nThe Llama 2-Chat7B model has higher sentimen...,"[YES, YES]",1.0
1,Analyze the distribution of mean sentiment sco...,[0.chunk: 0.23 0.06\nMPT-instruct 7B 0.13 0.29...,\nThe sentiment scores for the Llama 27B model...,"[NO, YES]",0.5
2,Based on the distribution of mean sentiment sc...,[0.chunk: 0.23 0.06\nMPT-instruct 7B 0.13 0.29...,\nWe can conclude that the sentiment towards t...,"[YES, NO]",0.5
3,How do the mean sentiment scores vary for diff...,[0.chunk: 0.23 0.06\nMPT-instruct 7B 0.13 0.29...,\nThe mean sentiment scores for the different ...,"[YES, YES]",1.0
4,In the short story about a dragon who was evil...,[0.chunk: Category Prompt\nCreative writingWri...,"\nIn the short story, the dragon's realization...","[NO, NO]",0.0


In [306]:
retrieval_accuracy = eval_retrieve_df.eval_score.values.sum()/len(eval_retrieve_df)
retrieval_accuracy

0.640625

In [256]:
# Log the questions to W&B.
wandb.log({"Retrieval Accuracy": retrieval_accuracy})
wandb.log({"Retrieval Eval": eval_retrieve_df})
wandb.finish()

### Utility for Evaluation

In [52]:
import json
from tqdm import tqdm

from llama_index.evaluation import ResponseEvaluator

In [53]:
def download_eval_questions():
    # Get the questions from the W&B tables (this demonstrates the closing of the loop)
    run = wandb.init(project="llama-index-report")
    artifact = run.use_artifact('ayush-thakur/llama-index-report/run-liu8ghm6-GeneratedQuestions:v0', type='run_table')
    artifact_dir = artifact.download()
    
    with open(f"{artifact_dir}/Generated Questions.table.json") as f:
        data = json.load(f)

    columns = data["columns"]
    questions = data["data"]

    question_df = pd.DataFrame(columns=columns, data=questions)
    wandb.finish()
    
    return question_df


def retrieval_eval_result_to_num(eval_results):
    eval_results_to_num = []

    for eval_result in eval_results:
        score = 0
        for eval_response in eval_result:
            if eval_response=="YES":
                score+=1
            else:
                score+=0
        eval_results_to_num.append(score/len(eval_result))
        
    return eval_results_to_num


def compute_hallucination_accuracy(eval_results):
    score = 0
    for eval_result in eval_results:
        if eval_result == "YES":
            score += 1
    
    return (score/len(eval_results))*100


def evaluate(query_engine, question_df, callback_manager):
    # llm for evaluating hallucination
    h_llm = OpenAI(temperature=0, model="gpt-3.5-turbo")
    h_service_context = ServiceContext.from_defaults(llm=h_llm, callback_manager=callback_manager)
    h_evaluator = ResponseEvaluator(service_context=service_context)
    
    # llm for evaluating retrieval
    r_llm = OpenAI(temperature=0, model="gpt-4")
    r_service_context = ServiceContext.from_defaults(llm=r_llm, callback_manager=callback_manager)
    r_evaluator = ResponseEvaluator(service_context=service_context)

    h_eval_results = []
    r_eval_results = []
    chunks = []
    responses = []

    # Run evaluation
    for _, question in tqdm(question_df.iterrows()):
        try:
            response = query_engine.query(question.questions)
            # Hallucination Evaluation
            h_eval_result = h_evaluator.evaluate(response)
            # Retrieved chunks evaluation
            r_eval_result = r_evaluator.evaluate_source_nodes(response)

            h_eval_results.append(h_eval_result)
            chunks.append([f"{idx}.chunk: {source_node.node.text}" for idx, source_node in enumerate(response.source_nodes)])
            responses.append(response.response)
            r_eval_results.append(r_eval_result)
        except:
            print("failed")
            h_eval_results.append(None)
            responses.append(None)
            chunks.append(None)
            r_eval_results.append(None)
            
    # dataframes
    eval_hallucination_df = pd.DataFrame(
        columns=["question", "llm_response", "eval_result"],
        data=list(zip(list(question_df.questions.values), responses, h_eval_results))
    )

    r_eval_results_to_num = retrieval_eval_result_to_num(r_eval_results)
    eval_retrieve_df = pd.DataFrame(
        columns=["question", "retrieved chunks", "llm_response", "raw_eval_result", "eval_score"],
        data=list(zip(list(question_df.questions.values), chunks, responses, r_eval_results, r_eval_results_to_num))
    )

    # Compute overall metrics
    hallucination_accuracy = compute_hallucination_accuracy(list(eval_hallucination_df.eval_result))
    retrieval_accuracy = eval_retrieve_df.eval_score.values.sum()/len(eval_retrieve_df)

    if wandb.run:
        wandb.log({
            "Hallucination Eval": eval_hallucination_df,
            "Retrieval Eval": eval_retrieve_df,
            "Hallucination Accuracy": hallucination_accuracy,
            "Retrieval Accuracy": retrieval_accuracy,
        })
        
    return eval_hallucination_df, eval_retrieve_df, hallucination_accuracy, retrieval_accuracy
    

# Use Keyword based Index

In [441]:
from llama_index import SimpleKeywordTableIndex, KeywordTableIndex

In [442]:
# initialise WandbCallbackHandler and pass any wandb.init args
wandb_args = {"project":"llama-index-report"}
wandb_callback = WandbCallbackHandler(run_args=wandb_args)

# pass wandb_callback to the service context
callback_manager = CallbackManager([wandb_callback])
service_context = ServiceContext.from_defaults(callback_manager=callback_manager)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01675209028374714, max=1.0)…

[34m[1mwandb[0m: Streaming LlamaIndex events to W&B at https://wandb.ai/ayush-thakur/llama-index-report/runs/s4fggue7
[34m[1mwandb[0m: `WandbCallbackHandler` is currently in beta.
[34m[1mwandb[0m: Please report any issues to https://github.com/wandb/wandb/issues with the tag `llamaindex`.


In [443]:
keyword_index = KeywordTableIndex.from_documents(documents, service_context=service_context)
# simple_keyword_index = SimpleKeywordTableIndex.from_documents(documents, service_context=service_context)

[34m[1mwandb[0m: Logged trace tree to W&B.


In [343]:
query_engine = keyword_index.as_query_engine()

In [326]:
response = query_engine.query("Who wrote this paper?")
print(response.response, sep="\n")

[34m[1mwandb[0m: Logged trace tree to W&B.


None


In [327]:
response = query_engine.query("What is this paper about?")
print(response, sep="\n")

[34m[1mwandb[0m: Logged trace tree to W&B.


None


In [328]:
response = query_engine.query("For how many steps the Llama 2 model was trained for?")
print(response, sep="\n")



Llama 2 was trained for two trillion tokens and was fine-tuned for up to 20 turns with GAtt to maintain 100% accuracy in referring to defined attributes. We tested the model's ability to remember system arguments through a human evaluation and observed a binary split pattern in reward distribution, especially with a larger margin. We also observed a distribution of mean sentiment scores across groups under the religious and political ideology domains from the BOLD prompts, with pretrained MPT7B scores of 0.20, 0.31, 0.20, 0.33, and 0.31, and fine-tuned Llama 2-Chat7B scores of 0.55, 0.50, 0.48, 0.45, and 0.62. This indicates that the model was trained for a total of 20 steps.


In [329]:
wandb.finish()

### Evaluate the KeywordTableIndex

In [344]:
# initialise WandbCallbackHandler and pass any wandb.init args
wandb_args = {"project":"llama-index-report"}
wandb_callback = WandbCallbackHandler(run_args=wandb_args)

# pass wandb_callback to the service context
callback_manager = CallbackManager([wandb_callback])

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01674248611670919, max=1.0)…

[34m[1mwandb[0m: Streaming LlamaIndex events to W&B at https://wandb.ai/ayush-thakur/llama-index-report/runs/ra17h76z
[34m[1mwandb[0m: `WandbCallbackHandler` is currently in beta.
[34m[1mwandb[0m: Please report any issues to https://github.com/wandb/wandb/issues with the tag `llamaindex`.


In [335]:
question_df = download_eval_questions()
question_df.head()

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01674720208296397, max=1.0)…

[34m[1mwandb[0m:   1 of 1 files downloaded.  


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

Unnamed: 0,questions
0,"In the context of religious ideologies, compar..."
1,Analyze the distribution of mean sentiment sco...
2,Based on the distribution of mean sentiment sc...
3,How do the mean sentiment scores vary for diff...
4,In the short story about a dragon who was evil...


In [345]:
eval_hallucination_df, eval_retrieve_df, hallucination_accuracy, retrieval_accuracy = evaluate(query_engine, question_df, callback_manager)

0it [00:00, ?it/s][34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[

[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
10it [05:12, 30.46s/it][34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
11it [05:21

[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
23it [10:24, 27.35s/it][34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwa

In [351]:
wandb.finish()

In [346]:
eval_hallucination_df

Unnamed: 0,question,llm_response,eval_result
0,"In the context of religious ideologies, compar...",\n\nThe Llama 2-Chat7B model had higher sentim...,YES
1,Analyze the distribution of mean sentiment sco...,\n\nThe Llama 27B model had the highest sentim...,YES
2,Based on the distribution of mean sentiment sc...,\n\nBased on the distribution of mean sentimen...,YES
3,How do the mean sentiment scores vary for diff...,\n\nThe mean sentiment scores for different gr...,YES
4,In the short story about a dragon who was evil...,\nThe events or experiences that led the drago...,NO
5,How did the discovery of Anne Frank's diary co...,\n\nThe discovery of Anne Frank's diary has ha...,YES
6,What is the historical significance of the ele...,\nThe Republican Party is depicted as an eleph...,NO
7,How can assumptions and stereotypes based on p...,\n\nMaking assumptions and stereotypes based o...,YES
8,How does the distribution of human preference ...,\n\nThe distribution of human preference data ...,YES
9,What is the impact of the safety auxiliary los...,\n\nThe ablation study showed that the safety ...,YES


In [347]:
hallucination_accuracy

59.375

In [350]:
retrieval_accuracy

0.016666666666666666

## Cross Encoder

In [34]:
from llama_index.indices.postprocessor import SentenceTransformerRerank

In [42]:
from llama_index import ServiceContext
from llama_index.callbacks import CallbackManager, WandbCallbackHandler

# initialise WandbCallbackHandler and pass any wandb.init args
wandb_args = {"project":"llama-index-report"}
wandb_callback = WandbCallbackHandler(run_args=wandb_args)

# pass wandb_callback to the service context
callback_manager = CallbackManager([wandb_callback])
service_context = ServiceContext.from_defaults(callback_manager=callback_manager)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[34m[1mwandb[0m: Streaming LlamaIndex events to W&B at https://wandb.ai/ayush-thakur/llama-index-report/runs/37o7uxez
[34m[1mwandb[0m: `WandbCallbackHandler` is currently in beta.
[34m[1mwandb[0m: Please report any issues to https://github.com/wandb/wandb/issues with the tag `llamaindex`.


In [43]:
index = VectorStoreIndex.from_documents(documents, service_context=service_context)

[34m[1mwandb[0m: Logged trace tree to W&B.


In [44]:
rerank = SentenceTransformerRerank(
    model="cross-encoder/ms-marco-MiniLM-L-2-v2", top_n=3
)

In [45]:
ce_query_engine = index.as_query_engine(similarity_top_k=10, node_postprocessors=[rerank])

In [46]:
response = ce_query_engine.query("Who wrote this paper?")
print(response.response, sep="\n")

[34m[1mwandb[0m: Logged trace tree to W&B.



The authors of this paper are Guillem Cucurull, Naman Goyal, Louis Martin, Thomas Scialom, Ruan Silva, Kevin Stone, Hugo Touvron, Sergey Edunov, Angela Fan, Melanie Kambadur, Sharan Narang, Aurelien Rodriguez, Robert Stojnic, Peter Albert, Nikolay Bashlykov, Prajjwal Bhargava, Moya Chen, David Esiobu, Jeremy Fu, Vedanuj Goswami, Anthony Hartshorn, Rui Hou, Marcin Kardas, Punit Singh Koura, Marie-Anne Lachaux, Thibaut Lavril, Diana Liskovich, Xavier Martinet, Yuning Mao, Igor Molybog, Todor Mihaylov, Andrew Poulton, Jeremy Reizenstein, Eric Michael Smith, Ranjan Subramanian, Xiaoqing Ellen Tan, Binh Tang, Ross Taylor, Jacob Xu, Yuchen Zhang, Iliyan Zarov, Amjad Almahairi, Yasmine Babaei, Soumya Batra, Lukas Blecher, Dan Bikel, Shruti Bhosale, Cristian Canton Ferrer, Jude Fernandes, Wenyin Fu, Brian Fuller, Cynthia Gao, Saghar Hosseini, Hakan Inan, Isabel Kloumann, Madian Khabsa, Artem Korenev, Viktor Kerkez, Jian Xiang Kuan, Yinghai Lu, Jenya Lee, Pushkar Mishra, Yixin Nie, Rashi Rungt

In [47]:
response = ce_query_engine.query("What is this paper about?")
print(response, sep="\n")

[34m[1mwandb[0m: Logged trace tree to W&B.



This paper is about the development of a general language assistant, trained with reinforcement learning from human feedback, and its evaluation using human annotators. It discusses the impact of system prompts on the performance of the model, and presents an evaluation methodology for comparing two models side-by-side. It also provides references to related work in the field.


In [48]:
response = ce_query_engine.query("For how many steps the Llama 2 model was trained for?")
print(response, sep="\n")

[34m[1mwandb[0m: Logged trace tree to W&B.



Llama 2 was trained for two steps: pretraining and fine-tuning.


In [49]:
wandb.finish()

### Evaluate Cross Encoder

In [50]:
# initialise WandbCallbackHandler and pass any wandb.init args
wandb_args = {"project":"llama-index-report"}
wandb_callback = WandbCallbackHandler(run_args=wandb_args)

# pass wandb_callback to the service context
callback_manager = CallbackManager([wandb_callback])

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016732737500569785, max=1.0…

[34m[1mwandb[0m: Streaming LlamaIndex events to W&B at https://wandb.ai/ayush-thakur/llama-index-report/runs/eywhg1dr
[34m[1mwandb[0m: `WandbCallbackHandler` is currently in beta.
[34m[1mwandb[0m: Please report any issues to https://github.com/wandb/wandb/issues with the tag `llamaindex`.


In [54]:
question_df = download_eval_questions()
question_df.head()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[34m[1mwandb[0m:   1 of 1 files downloaded.  


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Unnamed: 0,questions
0,"In the context of religious ideologies, compar..."
1,Analyze the distribution of mean sentiment sco...
2,Based on the distribution of mean sentiment sc...
3,How do the mean sentiment scores vary for diff...
4,In the short story about a dragon who was evil...


In [56]:
eval_hallucination_df, eval_retrieve_df, hallucination_accuracy, retrieval_accuracy = evaluate(ce_query_engine, question_df, callback_manager)


0it [00:00, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
1it [00:14, 14.74s/it][34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
2it [00:26, 12.88s/it][34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tre

[34m[1mwandb[0m: Logged trace tree to W&B.
19it [03:27,  7.87s/it][34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
20it [03:33,  7.18s/it][34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwandb[0m: Logged trace tree to W&B.
21it [03:51, 10.46s/it][34m[1mwandb[0m: Logged trace tree to W&B.
[34m[1mwa

In [61]:
wandb.finish()

## FLARE Query Engine

In [92]:
from llama_index import ServiceContext
from llama_index.callbacks import CallbackManager, WandbCallbackHandler

# initialise WandbCallbackHandler and pass any wandb.init args
wandb_args = {"project":"llama-index-report"}
wandb_callback = WandbCallbackHandler(run_args=wandb_args)

# pass wandb_callback to the service context
callback_manager = CallbackManager([wandb_callback])
service_context = ServiceContext.from_defaults(callback_manager=callback_manager)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[34m[1mwandb[0m: Streaming LlamaIndex events to W&B at https://wandb.ai/ayush-thakur/llama-index-report/runs/n5zsz560
[34m[1mwandb[0m: `WandbCallbackHandler` is currently in beta.
[34m[1mwandb[0m: Please report any issues to https://github.com/wandb/wandb/issues with the tag `llamaindex`.


In [93]:
index = VectorStoreIndex.from_documents(documents, service_context=service_context)

[34m[1mwandb[0m: Logged trace tree to W&B.


In [94]:
query_engine = index.as_query_engine()

In [83]:
from llama_index.query_engine import FLAREInstructQueryEngine

In [86]:
index_query_engine = index.as_query_engine(similarity_top_k=2)

In [87]:
flare_query_engine = FLAREInstructQueryEngine(
    query_engine=index_query_engine,
    service_context=service_context,
    max_iterations=7,
    verbose=True,
)

In [88]:
response = flare_query_engine.query("Who wrote this paper?")
print(response.response, sep="\n")

[32;1m[1;3mQuery: Who wrote this paper?
[0m[36;1m[1;3mCurrent response: 
[0m[38;5;200m[1;3mLookahead response: This paper was written by [Search(Who wrote this paper?)].
[0m

[34m[1mwandb[0m: Logged trace tree to W&B.


[38;5;200m[1;3mUpdated lookahead response: This paper was written by the contributors listed in the Appendix A.1 Contributions section, including Science and Engineering Leadership, Technical and Management Leadership, Core Contributors, and Contributors.
[0m[36;1m[1;3mCurrent response:  This paper was written by the contributors listed in the Appendix A.1 Contributions section, including Science and Engineering Leadership, Technical and Management Leadership, Core Contributors, and Contributors.
[0m[38;5;200m[1;3mLookahead response: [Search(Who are the contributors listed in Appendix A.1 Contributions?)]
[0m

[34m[1mwandb[0m: Logged trace tree to W&B.


[38;5;200m[1;3mUpdated lookahead response: The contributors listed in Appendix A.1 Contributions are: Guillem Cucurull, Naman Goyal, Louis Martin, Thomas Scialom, Ruan Silva, Kevin Stone, Hugo Touvron, Sergey Edunov, Angela Fan, Melanie Kambadur, Sharan Narang, Aurelien Rodriguez, Robert Stojnic, Peter Albert, Nikolay Bashlykov, Prajjwal Bhargava, Moya Chen, David Esiobu, Jeremy Fu, Vedanuj Goswami, Anthony Hartshorn, Rui Hou, Marcin Kardas, Punit Singh Koura, Marie-Anne Lachaux, Thibaut Lavril, Diana Liskovich, Xavier Martinet, Yuning Mao, Igor Molybog, Todor Mihaylov, Andrew Poulton, Jeremy Reizenstein, Eric Michael Smith, Ranjan Subramanian, Xiaoqing Ellen Tan, Binh Tang, Ross Taylor, Jacob Xu, Yuchen Zhang, Iliyan Zarov, Amjad Almahairi, Yasmine Babaei, Soumya Batra, Lukas Blecher, Dan Bikel, Shruti Bhosale, Cristian Canton Ferrer, Jude Fernandes, Wenyin Fu, Brian Fuller, Cynthia Gao, Saghar Hosseini, Hakan Inan, Isabel Kloumann, Madian Khabsa, Artem Korenev, Viktor Kerkez, Jian 

[34m[1mwandb[0m: Logged trace tree to W&B.


[38;5;200m[1;3mUpdated lookahead response: This paper was written by the contributors listed in the Appendix A.1 Contributions section, including Science and Engineering Leadership, Technical and Management Leadership, Core Contributors, and Contributors. The contributors listed in Appendix A.1 Contributions are: Guillem Cucurull, Naman Goyal, Louis Martin, Thomas Scialom, Ruan Silva, Kevin Stone, Hugo Touvron, Sergey Edunov, Angela Fan, Melanie Kambadur, Sharan Narang, Aurelien Rodriguez, Robert Stojnic, Peter Albert, Nikolay Bashlykov, Prajjwal Bhargava, Moya Chen, David Esiobu, Jeremy Fu, Vedanuj Goswami, Anthony Hartshorn, Rui Hou, Marcin Kardas, Punit Singh Koura, Marie-Anne Lachaux, Thibaut Lavril, Diana Liskovich, Xavier Martinet, Yuning Mao, Igor Molybog, Todor Mihaylov, Andrew Poulton, Jeremy Reizenstein, Eric Michael Smith, Ranjan Subramanian, Xiaoqing Ellen Tan, Binh Tang, Ross Taylor, Jacob Xu, Yuchen Zhang, Iliyan Zarov, Amjad Almahairi, Yasmine Babaei, Soumya Batra, Luk

[0m[38;5;200m[1;3mLookahead response: Contributors in the Contributors section are [Search(What are the names of the contributors in the Contributors section of Appendix A.1?)].
[0m

[34m[1mwandb[0m: Logged trace tree to W&B.


[38;5;200m[1;3mUpdated lookahead response: Contributors in the Contributors section are Amjad Almahairi, Yasmine Babaei, Soumya Batra, Lukas Blecher, Dan Bikel, Shruti Bhosale, Cristian Canton Ferrer, Jude Fernandes, Wenyin Fu, Brian Fuller, Cynthia Gao, Saghar Hosseini, Hakan Inan, Isabel Kloumann, Madian Khabsa, Artem Korenev, Viktor Kerkez, Jian Xiang Kuan, Yinghai Lu, Jenya Lee, Pushkar Mishra, Yixin Nie, Rashi Rungta, Alan Schelten, Kalyan Saladi, Adina Williams, and Zheng Yan.
[0m[36;1m[1;3mCurrent response: This paper was written by the contributors listed in the Appendix A.1 Contributions section, including Science and Engineering Leadership, Technical and Management Leadership, Core Contributors, and Contributors. The contributors listed in Appendix A.1 Contributions are: Guillem Cucurull, Naman Goyal, Louis Martin, Thomas Scialom, Ruan Silva, Kevin Stone, Hugo Touvron, Sergey Edunov, Angela Fan, Melanie Kambadur, Sharan Narang, Aurelien Rodriguez, Robert Stojnic, Peter A

In [90]:
response = flare_query_engine.query("For how many steps the Llama 2 model was trained for?")
print(response, sep="\n")

[32;1m[1;3mQuery: For how many steps the Llama 2 model was trained for?
[0m[36;1m[1;3mCurrent response: 
[0m[38;5;200m[1;3mLookahead response: The Llama 2 model was trained for [Search(How many steps was the Llama 2 model trained for?)] steps.
[0m

[34m[1mwandb[0m: Logged trace tree to W&B.


[38;5;200m[1;3mUpdated lookahead response: The Llama 2 model was trained for approximately 6 months steps.
[0m[36;1m[1;3mCurrent response:  The Llama 2 model was trained for approximately 6 months steps.
[0m[38;5;200m[1;3mLookahead response: [Search(How many training steps did the Llama 2 model take?)]
[0m

[34m[1mwandb[0m: Logged trace tree to W&B.


[38;5;200m[1;3mUpdated lookahead response: The Llama 2 model took 2 trillion tokens of data for pretraining.
[0m[36;1m[1;3mCurrent response: The Llama 2 model was trained for approximately 6 months steps. The Llama 2 model took 2 trillion tokens of data for pretraining.
[0m[38;5;200m[1;3mLookahead response: It was then fine-tuned on [Search(What data was the Llama 2 model fine-tuned on?)] for an additional 2 million steps.
[0m

[34m[1mwandb[0m: Logged trace tree to W&B.


[38;5;200m[1;3mUpdated lookahead response: It was then fine-tuned on publicly available instruction datasets, as well as over one million new human-annotated examples for an additional 2 million steps.
[0m[36;1m[1;3mCurrent response: The Llama 2 model was trained for approximately 6 months steps. The Llama 2 model took 2 trillion tokens of data for pretraining. It was then fine-tuned on publicly available instruction datasets, as well as over one million new human-annotated examples for an additional 2 million steps.
[0m[38;5;200m[1;3mLookahead response: [Search(How many steps did the Llama 2 model take for fine-tuning?)]
[0m

[34m[1mwandb[0m: Logged trace tree to W&B.


[38;5;200m[1;3mUpdated lookahead response: The Llama 2 model took two steps for fine-tuning: supervised fine-tuning (SFT) and reinforcement learning with human feedback (RLHF).
[0m[36;1m[1;3mCurrent response: The Llama 2 model was trained for approximately 6 months steps. The Llama 2 model took 2 trillion tokens of data for pretraining. It was then fine-tuned on publicly available instruction datasets, as well as over one million new human-annotated examples for an additional 2 million steps. The Llama 2 model took two steps for fine-tuning: supervised fine-tuning (SFT) and reinforcement learning with human feedback (RLHF).
[0m[38;5;200m[1;3mLookahead response: The SFT step took 1 million steps and the RLHF step took 1 million steps.
[0m[38;5;200m[1;3mUpdated lookahead response: The SFT step took 1 million steps and the RLHF step took 1 million steps.
[0m[36;1m[1;3mCurrent response: The Llama 2 model was trained for approximately 6 months steps. The Llama 2 model took 2 t

[34m[1mwandb[0m: Logged trace tree to W&B.


[38;5;200m[1;3mUpdated lookahead response: The total number of steps for the Llama 2 model was 3 million.
[0m[36;1m[1;3mCurrent response: The Llama 2 model was trained for approximately 6 months steps. The Llama 2 model took 2 trillion tokens of data for pretraining. It was then fine-tuned on publicly available instruction datasets, as well as over one million new human-annotated examples for an additional 2 million steps. The Llama 2 model took two steps for fine-tuning: supervised fine-tuning (SFT) and reinforcement learning with human feedback (RLHF). The SFT step took 1 million steps and the RLHF step took 1 million steps. The total number of steps for the Llama 2 model was 3 million.
[0m[38;5;200m[1;3mLookahead response: The total number of steps for the Llama 2 model was [Search(How many steps did the Llama 2 model take?)].
[0m

[34m[1mwandb[0m: Logged trace tree to W&B.


[38;5;200m[1;3mUpdated lookahead response: The total number of steps for the Llama 2 model was 3 million.
[0mThe Llama 2 model was trained for approximately 6 months steps. The Llama 2 model took 2 trillion tokens of data for pretraining. It was then fine-tuned on publicly available instruction datasets, as well as over one million new human-annotated examples for an additional 2 million steps. The Llama 2 model took two steps for fine-tuning: supervised fine-tuning (SFT) and reinforcement learning with human feedback (RLHF). The SFT step took 1 million steps and the RLHF step took 1 million steps. The total number of steps for the Llama 2 model was 3 million. The total number of steps for the Llama 2 model was 3 million.


In [102]:
wandb.finish()

## Persisting Index

In [103]:
from llama_index import ServiceContext
from llama_index.callbacks import CallbackManager, WandbCallbackHandler

# initialise WandbCallbackHandler and pass any wandb.init args
wandb_args = {"project":"llama-index-report"}
wandb_callback = WandbCallbackHandler(run_args=wandb_args)

# pass wandb_callback to the service context
callback_manager = CallbackManager([wandb_callback])
service_context = ServiceContext.from_defaults(callback_manager=callback_manager)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016752635415953895, max=1.0…

[34m[1mwandb[0m: Streaming LlamaIndex events to W&B at https://wandb.ai/ayush-thakur/llama-index-report/runs/brl127tu
[34m[1mwandb[0m: `WandbCallbackHandler` is currently in beta.
[34m[1mwandb[0m: Please report any issues to https://github.com/wandb/wandb/issues with the tag `llamaindex`.


In [104]:
index = VectorStoreIndex.from_documents(documents, service_context=service_context)

[34m[1mwandb[0m: Logged trace tree to W&B.


In [105]:
wandb_callback.persist_index(index, index_name="simple_vector_store")

[34m[1mwandb[0m: Adding directory to artifact (/Users/ayushthakur/integrations/llamaindex/llama_index/wandb/run-20230725_141307-brl127tu/files/storage)... Done. 0.0s


In [107]:
from llama_index import load_index_from_storage, load_graph_from_storage

storage_context = wandb_callback.load_storage_context(
    artifact_url="ayush-thakur/llama-index-report/simple_vector_store:v0"
)

# Load the index and initialize a query engine
loaded_index = load_index_from_storage(storage_context, service_context=service_context)

[34m[1mwandb[0m:   4 of 4 files downloaded.  


In [108]:
loaded_index

<llama_index.indices.vector_store.base.VectorStoreIndex at 0x2f70dc2b0>

In [109]:
wandb.finish()