In [8]:
!pip install -q llama-index
!pip install pypdf
!pip install docx2txt
!pip install transformers



In [9]:
!pip install google-generativeai

Collecting google-generativeai
  Using cached google_generativeai-0.2.2-py3-none-any.whl (133 kB)
Collecting google-ai-generativelanguage==0.3.3 (from google-generativeai)
  Using cached google_ai_generativelanguage-0.3.3-py3-none-any.whl (267 kB)
Installing collected packages: google-ai-generativelanguage, google-generativeai
Successfully installed google-ai-generativelanguage-0.3.3 google-generativeai-0.2.2


In [1]:
from llama_index import SimpleDirectoryReader, VectorStoreIndex
from llama_index.llms.palm import PaLM
from llama_index import ServiceContext
from llama_index import StorageContext, load_index_from_storage
import os

## Load Data

In [3]:
# Create the input folder for the documents. exist_ok=True keeps the cell
# re-runnable: a plain `mkdir` errors once the folder already exists.
os.makedirs("data", exist_ok=True)

In [3]:
# Read the files in the local "data" folder into Document objects.
documents = SimpleDirectoryReader(input_dir="data").load_data()

In [4]:
# Echo the loaded documents as the cell output to inspect what the reader returned.
documents

[Document(id_='8f116f6f-e26f-4531-a60a-ce72a6e51d31', embedding=None, metadata={'file_name': '04. LlamaIndex.docx', 'file_path': 'data/04. LlamaIndex.docx', 'creation_date': '2023-11-13', 'last_modified_date': '2023-11-13', 'last_accessed_date': '2023-11-13'}, excluded_embed_metadata_keys=['creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, hash='1b78ceb8a9386843bea0785fc6b5485d2292402f7bba1508ee0f3ce698a7b564', text="What is LlamaIndex?\n\n\n\nLlamaIndex is a data framework for building LLM applications. It provides a comprehensive toolkit for ingestion, management, and querying of your external data so that you can use it with your LLM app.  \n\n\n\n\n\nChatGPT is trained on huge amounts of data. But what if you wish to train ChatGPT on your private data. There are 3 ways in which you can achieve this.\n\n\n\n\n\n1.   Train an open-source LLM like Llama on your data. 

## Split the Text into Small Chunks and Build the Index

In [12]:
!pip install sentence_transformers

Collecting sentence_transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sentencepiece (from sentence_transformers)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m36.3 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: sentence_transformers
  Building wheel for sentence_transformers (setup.py) ... [?25l[?25hdone
  Created wheel for sentence_transformers: filename=sentence_transformers-2.2.2-py3-none-any.whl size=125923 sha256=dc591a2233665c4c22368304270da662c931509eb20b0a4998fda47e8049f466
  Stored in directory: /root/.cache/pip/wheels/62/f2/10/1e606fd5f02395388f74e7462910fe851042f97238cbbd902f
Successfully built sentence_tr

In [10]:
from langchain.embeddings.huggingface import HuggingFaceBgeEmbeddings

In [5]:
# Never commit API keys to a notebook. Reuse GOOGLE_API_KEY when the
# environment already provides it; otherwise prompt for it without echoing
# the typed value into the notebook.
# NOTE: the key that used to be hardcoded in this cell must be treated as
# leaked and revoked.
from getpass import getpass

if not os.environ.get('GOOGLE_API_KEY'):
    os.environ['GOOGLE_API_KEY'] = getpass('Google API key: ')

In [6]:
# Instantiate the PaLM LLM wrapper with its default settings.
# NOTE(review): presumably authenticates via the GOOGLE_API_KEY environment
# variable set earlier in the notebook - confirm against the PaLM wrapper.
llm = PaLM()

In [13]:
# Hugging Face BGE embedding model; it is passed to the ServiceContext below.
bge_model_name = "BAAI/bge-base-en"
embed_model = HuggingFaceBgeEmbeddings(model_name=bge_model_name)

Downloading (…)9a243/.gitattributes:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)1e3c49a243/README.md:   0%|          | 0.00/90.1k [00:00<?, ?B/s]

Downloading (…)3c49a243/config.json:   0%|          | 0.00/719 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Downloading (…)9a243/tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

Downloading (…)1e3c49a243/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)c49a243/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [14]:
# Bundle the LLM, the embedding model and the chunking parameters into one
# ServiceContext that is handed to the index when it is built.
chunking = {"chunk_size": 800, "chunk_overlap": 20}
service_context = ServiceContext.from_defaults(
    llm=llm,
    embed_model=embed_model,
    **chunking,
)

[nltk_data] Downloading package punkt to /tmp/llama_index...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [15]:
# Build the vector index from the loaded documents using the service context.
index = VectorStoreIndex.from_documents(
    documents,
    service_context=service_context,
)

## Storing and Loading the Index

In [16]:
# Persist the index's storage context using the library's default location.
index_storage = index.storage_context
index_storage.persist()

In [None]:
# Loading the index
# Kept commented out: when enabled, rebuilds `index` from the files under
# './storage' instead of re-indexing the documents.
# NOTE(review): no service_context is passed here, so the reloaded index would
# presumably fall back to the library's default LLM/embedding model rather than
# the PaLM + BGE setup configured above - confirm, and pass service_context if so.
# storage_context = StorageContext.from_defaults(persist_dir = './storage')
# index = load_index_from_storage(storage_context=storage_context)

## Question Answering (Q/A)

In [17]:
# Create a query engine from the index with its default settings.
query_engine = index.as_query_engine()

In [29]:
# Ask a question against the indexed documents; the trailing expression
# echoes the full Response object as the cell output.
question = "What is llamaindex?"
response = query_engine.query(question)
response

Response(response='LlamaIndex is a data framework for building LLM applications. It provides a comprehensive toolkit for ingestion, management, and querying of your external data so that you can use it with your LLM app.', source_nodes=[NodeWithScore(node=TextNode(id_='138b2e41-7dea-4c40-9f14-fdaa958ae90d', embedding=None, metadata={'file_name': '04. LlamaIndex.docx', 'file_path': 'data/04. LlamaIndex.docx', 'creation_date': '2023-11-13', 'last_modified_date': '2023-11-13', 'last_accessed_date': '2023-11-13'}, excluded_embed_metadata_keys=['creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='8f116f6f-e26f-4531-a60a-ce72a6e51d31', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'file_name': '04. LlamaIndex.docx', 'file_path': 'data/04. LlamaIndex.docx', 'creation_date': '2023-11-13', 'last_modified_date': '2023-11-13',

In [30]:
from IPython.display import Markdown, display

In [31]:
# Render the answer in bold; formatting `response` into the string yields its text.
answer_html = f"<b>{response}</b>"
display(Markdown(answer_html))

<b>LlamaIndex is a data framework for building LLM applications. It provides a comprehensive toolkit for ingestion, management, and querying of your external data so that you can use it with your LLM app.</b>