## Step 1: Set up colab and download relevant packages (if needed)

In [None]:
# Mount Google Drive inside the Colab VM, then change into the project folder
# so the relative paths used below (data_1, data_2, data_3, neat_text.py) resolve.
from google.colab import drive
drive.mount("/content/drive")
%cd '/content/drive/My Drive/LlamaIndex/example_keyword_table_compare_and_contrast'

Mounted at /content/drive
/content/drive/My Drive/LlamaIndex/example_keyword_table_compare_and_contrast


In [None]:
# List the working directory to confirm the data folders and neat_text.py are present.
!ls

data_1	data_3	     LlamaIndex_1.ipynb  neat_text.py
data_2	llama_index  LlamaIndex.ipynb	 __pycache__


In [None]:
# !git clone https://github.com/jerryjliu/llama_index.git

In [None]:
!pip install llama_index
!pip install pypdf
!pip install openai
!pip install transformers
!pip install accelerate
!pip install sentence_transformers
!pip install chromadb
!pip install -U openai-whisper
!pip install pydub
!pip install einops
!pip install llama-cpp-python

Collecting llama_index
  Downloading llama_index-0.8.3-py3-none-any.whl (673 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m673.7/673.7 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tiktoken (from llama_index)
  Downloading tiktoken-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m21.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting dataclasses-json (from llama_index)
  Downloading dataclasses_json-0.5.14-py3-none-any.whl (26 kB)
Collecting langchain>=0.0.262 (from llama_index)
  Downloading langchain-0.0.266-py3-none-any.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m32.5 MB/s[0m eta [36m0:00:00[0m
Collecting openai>=0.26.4 (from llama_index)
  Downloading openai-0.27.8-py3-none-any.whl (73 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.6/73.6 kB[0m [31m10.3 MB/s[

In [None]:
import openai
from llama_index import SimpleDirectoryReader, ServiceContext, VectorStoreIndex, SimpleKeywordTableIndex
from llama_index.vector_stores import ChromaVectorStore
import torch
from llama_index.llms import HuggingFaceLLM
import transformers
import chromadb
from llama_index.vector_stores import ChromaVectorStore
from llama_index.storage.storage_context import StorageContext
from neat_text import neat_text
from transformers import set_seed
from tqdm import tqdm
from llama_index.indices.composability import ComposableGraph
# Seed through transformers' set_seed; kept in one named constant so the value
# is easy to change. Intended to make generation repeatable between runs.
SEED = 42
set_seed(SEED)

## Step 2: Load the documents

In [None]:
# llama_index's default LLM is OpenAI (GPT-3 text-davinci-003 per the original note).
# NOTE: even when a different model (e.g. a Hugging Face model) is used, an API
# key still has to be set, otherwise the library may keep throwing errors.
#
# Read the key from the OPENAI_API_KEY environment variable instead of pasting
# a real secret into the notebook. When the variable is not set, the original
# placeholder value is used, which matches the previous behaviour.
import os

openai.api_key = os.environ.get("OPENAI_API_KEY", "blah_blah_blah")

In [None]:
# Read every file in each data folder; one folder per programme
# (data_1 -> SNAP, data_2 -> TEFAP, data_3 -> School_lunch).
SNAP, TEFAP, School_lunch = (
    SimpleDirectoryReader(folder).load_data()
    for folder in ("data_1", "data_2", "data_3")
)

## Step 3: Define the ServiceContext and StorageContext

### Step 3(a) Defining the ServiceContext (i.e. the LLM) if you wish to use something other than the default

In [None]:
# Wrap Pythia-12B in llama_index's HuggingFaceLLM. The same hub id is used for
# both the tokenizer and the model weights.
pythia_hub_id = "EleutherAI/pythia-12b"

llm = HuggingFaceLLM(
    tokenizer_name=pythia_hub_id,
    model_name=pythia_hub_id,
    # Load the weights in float16 (original note: reduces memory usage on CUDA).
    model_kwargs={"torch_dtype": torch.float16},
)

# Optional knobs tried earlier and currently left at the library defaults:
#   context_window=3000, max_new_tokens=256,
#   generate_kwargs={"temperature": 0.2, "do_sample": False},
#   device_map="auto", stopping_ids=[50278, 50279, 50277, 1, 0],
#   tokenizer_kwargs={"max_length": 4096, "padding": True, "truncation": True, "return_tensors": "pt"}
# NOTE(review): with context_window unset the wrapper's default applies; confirm
# it does not exceed what this model supports.

Downloading (…)lve/main/config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading (…)model.bin.index.json:   0%|          | 0.00/47.3k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading (…)l-00001-of-00003.bin:   0%|          | 0.00/9.81G [00:00<?, ?B/s]

Downloading (…)l-00002-of-00003.bin:   0%|          | 0.00/9.93G [00:00<?, ?B/s]

Downloading (…)l-00003-of-00003.bin:   0%|          | 0.00/4.11G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]



Downloading (…)okenizer_config.json:   0%|          | 0.00/396 [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

### Step 3(b) Defining the StorageContext (i.e. the vector database we want to use) if you wish to use something other than the default

In [None]:
# Create a Chroma client. By default Chroma operates purely in memory.
chroma_client = chromadb.Client()

# get_or_create_collection keeps this cell re-runnable: create_collection
# raises when a collection called "data" already exists in the client.
chroma_collection = chroma_client.get_or_create_collection("data")

# Expose the collection to llama_index as a vector store.
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)


In [None]:
# LLM + embedding configuration; passed to the indices and query engines below.
service_context = ServiceContext.from_defaults(
    llm=llm,
    # Presumably selects a locally-run embedding model rather than OpenAI's;
    # the cell output shows a sentence-transformers download. TODO confirm.
    embed_model="local",
)

# Storage configuration wrapping the Chroma-backed vector store.
storage_context = StorageContext.from_defaults(
    vector_store=vector_store,
)

Downloading (…)ab102/.gitattributes:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)2d2d7ab102/README.md:   0%|          | 0.00/78.9k [00:00<?, ?B/s]

Downloading (…)2d7ab102/config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/134M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Downloading (…)ab102/tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

Downloading (…)2d2d7ab102/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)d7ab102/modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

[nltk_data] Downloading package punkt to /tmp/llama_index...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Build one vector index per programme, keyed by a readable programme name,
# together with the short summary string the graph uses to describe each index.
programs = [SNAP, TEFAP, School_lunch]
program_names = ["SNAP", "TEFAP", "School lunch"]
program_indexes, index_summaries = {}, {}
for name, documents in tqdm(zip(program_names, programs), total=len(programs)):
    program_indexes[name] = VectorStoreIndex.from_documents(
        documents, service_context=service_context
    )
    index_summaries[name] = f"{name} summaries"
# NOTE(review): storage_context (the Chroma store) is not passed to
# from_documents, so these indices do not use it. Confirm whether that is
# intended before wiring it in; a single shared collection would mix the
# three programmes' vectors.

# Build Keyword Table Index on top of vector indices!
# service_context is passed explicitly: without it the root index resolves the
# library's default context, which tries OpenAI and then falls back to
# downloading a LlamaCPP model instead of using the LLM configured above.
graph = ComposableGraph.from_indices(
    SimpleKeywordTableIndex,
    list(program_indexes.values()),
    list(index_summaries.values()),
    service_context=service_context,
    max_keywords_per_chunk=50,
)

100%|██████████| 3/3 [00:03<00:00,  1.25s/it]


******
Could not load OpenAI model. Using default LlamaCPP=llama2-13b-chat. If you intended to use OpenAI, please check your OPENAI_API_KEY.
Original error:
Invalid OpenAI API key.
API key should be of the format: "sk-" followed by 48 alphanumeric characters.

******
Downloading url https://huggingface.co/TheBloke/Llama-2-13B-chat-GGML/resolve/main/llama-2-13b-chat.ggmlv3.q4_0.bin to path /tmp/llama_index/models/llama-2-13b-chat.ggmlv3.q4_0.bin
total size (MB): 7323.31


6985it [00:20, 347.50it/s]                          
AVX = 1 | AVX2 = 1 | AVX512 = 1 | AVX512_VBMI = 0 | AVX512_VNNI = 1 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | VSX = 0 | 


******
Could not load OpenAIEmbedding. Using HuggingFaceBgeEmbeddings with model_name=BAAI/bge-small-en. If you intended to use OpenAI, please check your OPENAI_API_KEY.
Original error:
Invalid OpenAI API key.
API key should be of the format: "sk-" followed by 48 alphanumeric characters.

******


[nltk_data] Downloading package stopwords to /tmp/llama_index...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
from llama_index.indices.query.query_transform.base import DecomposeQueryTransform
# Build the query-decomposition transform on top of the LLM predictor held by
# the service context.
llm_predictor = service_context.llm_predictor
decompose_transform = DecomposeQueryTransform(llm_predictor)

In [None]:
# with query decomposition in subindices
from llama_index.query_engine.transform_query_engine import TransformQueryEngine


# index_id -> query engine; handed to the graph so each node is queried with
# the engine registered for it here.
custom_query_engines = {}

# Every programme index gets its own engine, wrapped so the decompose transform
# runs first. The index's summary is supplied to the transform as metadata.
for index in program_indexes.values():
    query_engine = index.as_query_engine(service_context=service_context)
    custom_query_engines[index.index_id] = TransformQueryEngine(
        query_engine,
        decompose_transform,
        transform_metadata={"index_summary": index.index_struct.summary},
    )

# The root keyword-table index combines the sub-index answers with
# tree_summarize.
root_index = graph.root_index
custom_query_engines[root_index.index_id] = root_index.as_query_engine(
    retriever_mode="simple",
    response_mode="tree_summarize",
    service_context=service_context,
)

# Graph-level engine that routes through the engines registered above.
query_engine_decompose = graph.as_query_engine(
    custom_query_engines=custom_query_engines,
)

## Step 4. Query the data

### Example: Compare and contrast

In [None]:
# Baseline: query the whole graph WITHOUT query decomposition.
#
# The bare name `query_engine` is only ever assigned inside the loop that
# builds the per-index engines, so by this point it refers to the engine of the
# last programme index alone. Build an explicit graph-level engine instead, so
# the comparison really spans all programmes. The root index uses the same
# settings as the decomposed engine; the sub-indices use their default engines.
query_engine_baseline = graph.as_query_engine(
    custom_query_engines={
        graph.root_index.index_id: graph.root_index.as_query_engine(
            retriever_mode="simple",
            response_mode="tree_summarize",
            service_context=service_context,
        )
    },
)

# NOTE(review): the question names FMNP while the third index is labelled
# "School lunch"; confirm which programme data_3 actually contains.
q = "Compare and contrast the different recipient targets between SNAP, TEFAP, and FMNP programs"
response = query_engine_baseline.query(q)


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


In [None]:
# Show the answer returned by the previous query.
print(response)


SNAP: Low-income households
TEFAP: Low-income seniors
FMNP: Low-income women, children, and seniors

A:

The answer is:

SNAP: Low-income households
TEFAP: Low-income seniors
FMNP: Low-income women, children, and seniors

The first two are the same, but the third is different.
The first two are the same because they are both targeted at low-income households.
The third is different because it is targeted at low-income women, children, and seniors.

A:

The answer is:

SNAP: Low-income households
TEFAP: Low-income seniors
FMNP: Low-income women, children, and seniors

The first two are the same, but the third is different.
The first two are the same because they are both targeted at low-income households.
The third is different because it is targeted at low-income women, children, and seniors.




In [None]:
# Same question as before, this time sent through the graph engine whose
# sub-index engines are wrapped in the decompose transform.
q="Compare and contrast the different recipient targets between SNAP, TEFAP, and FMNP programs"
response= query_engine_decompose.query(q)

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


In [None]:
# neat_text is imported from the local neat_text.py helper; presumably it
# tidies the response text for display. TODO confirm against that module.
print(neat_text(response))

SNAP, TEFAP, and FMNP are all programs that provide food assistance to low-income people.
They are all administered by the USDA.
