# Set credentials

Enter an OpenAI API key for running queries and a GitHub token for reading from GitHub repositories.

Point to the GitHub repository (owner, name, and branch) from which we will ingest data for indexing.

In [3]:
import os
import nest_asyncio
# NOTE(review): presumably patches asyncio so that code further down can start
# its own event loop inside the loop Jupyter is already running — confirm
# against the nest_asyncio documentation.
nest_asyncio.apply()

# Credentials: an OpenAI key for running queries and a GitHub token for reading
# the repository. setdefault() keeps any value that is already exported in the
# environment, so a real secret configured outside the notebook is not
# clobbered by the placeholder and never has to be saved into the notebook
# file. If the variables are not exported, replace the placeholders below.
os.environ.setdefault("OPENAI_API_KEY", 'YOUR-OPENAI-KEY')
os.environ.setdefault("GITHUB_TOKEN", 'YOUR-GITHUB-TOKEN')

# Source repository to ingest, identified by owner, repository name and branch.
github_repo_owner = "cpage-pivotal"  # account that owns the repository
github_repo_name = "ai-data"  # repository to read from
github_branch = "essay"  # branch whose contents are loaded

# Index the data

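We will load the documents from the repository branch configured above; the vector store index across the entire dataset is built from these documents in the next section.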

In [None]:
from llama_index import GPTVectorStoreIndex, GithubRepositoryReader
# Build a reader for the configured repository (parser and verbose output both
# switched off), then load the documents from the chosen branch.
github_reader = GithubRepositoryReader(
    owner=github_repo_owner,
    repo=github_repo_name,
    verbose=False,
    use_parser=False,
)
documents = github_reader.load_data(branch=github_branch)


# Store Indexes

We will persist the index in a Redis vector store, where it can be picked up by third-party applications for querying.

In [None]:
from llama_index.node_parser import SimpleNodeParser
from llama_index.storage.storage_context import StorageContext
from llama_index.vector_stores import RedisVectorStore

# Split the loaded documents into nodes.
# NOTE(review): `nodes` is only referenced by the disabled docstore snippet
# further down; the index itself is built directly from `documents`.
node_parser = SimpleNodeParser()
nodes = node_parser.get_nodes_from_documents(documents)

# Redis-backed vector store on the local instance.
# NOTE(review): overwrite=True presumably replaces an existing index of the
# same name on every run — confirm against the RedisVectorStore docs.
vector_store = RedisVectorStore(
    redis_url="redis://localhost:6379",
    index_name="pg_essays",
    index_prefix="llama",
    overwrite=True,
)

# Docstore population, left disabled by the original author:
# from llama_index.storage.docstore import SimpleDocumentStore
# docstore = SimpleDocumentStore()
# docstore.add_documents(nodes)

# Build the index from the documents, backed by the Redis vector store.
storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = GPTVectorStoreIndex.from_documents(
    documents,
    storage_context=storage_context,
)
# storage_context.persist()  # left disabled by the original author

# Run Queries

Run a completion query against the indexed data.

In [None]:
# Ask one question against the index and print the response.
query_string = "How did the author start Y Combinator?"
query_engine = index.as_query_engine()
response = query_engine.query(query_string)
print(response)