## Modules

In [9]:
from llama_index import SimpleDirectoryReader, VectorStoreIndex, ServiceContext
from llama_index.node_parser import SimpleNodeParser
from llama_index.text_splitter import TokenTextSplitter
from transformers import AutoTokenizer
from langchain.embeddings import HuggingFaceEmbeddings
from datasets import Dataset
import os
import json

In [10]:
os.environ['OPENAI_API_KEY'] = json.load(open("/home/shahul/openai-keys.json"))["ikka"]

## Pipeline

In [11]:
reader = SimpleDirectoryReader("./wikidata/")
data = reader.load_data()[:50]

In [12]:
tokenizer = AutoTokenizer.from_pretrained("BAAI/bge-small-en-v1.5",)

In [13]:
token_splitter = TokenTextSplitter(chunk_size=500
                                   , chunk_overlap=100, tokenizer=tokenizer.encode)

In [14]:
node_parser = SimpleNodeParser.from_defaults(text_splitter=token_splitter)

In [15]:
embed_model = HuggingFaceEmbeddings(model_name="BAAI/bge-small-en-v1.5")


In [16]:
# nodes = node_parser.get_nodes_from_documents(data)

In [18]:
service_context = ServiceContext.from_defaults(node_parser=node_parser, embed_model=embed_model)

In [19]:
index = VectorStoreIndex.from_documents(data) 

In [22]:
retreiver = index.as_retriever()

In [25]:
chunks = retreiver.retrieve("Who was the president of US in 1960?")

In [26]:
chunks

[NodeWithScore(node=TextNode(id_='40724cb5-677c-4775-9d38-c66b998040d7', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='760f2e6e-1bbc-4442-9cfa-6eac5a791845', node_type=<ObjectType.DOCUMENT: '4'>, metadata={}, hash='d7c0b27402b65a509d0efda4e748557689d3ce9e8f2fa072fb438d10c3843cdf'), <NodeRelationship.NEXT: '3'>: RelatedNodeInfo(node_id='d7fe4f5d-f3a5-4a53-8436-b0b314a28505', node_type=<ObjectType.TEXT: '1'>, metadata={}, hash='1090dacf7e240d0ceaf8ff9b945b2826c72730a8e02714b6d3d6bbb36aef816d')}, hash='daa018de51a18dcb4097def5cf8f1a082b6cbb3ccec021d3de144da0b13ca3cf', text='The 1976 United States presidential election was the 48th quadrennial presidential election, held on Tuesday, November 2, 1976. Democrat Jimmy Carter, former Governor of Georgia, defeated incumbent Republican president Gerald Ford in a narrow victory. \nFord ascended to the presidency when Richard Nixo