In [4]:
!pip install python-dotenv

Defaulting to user installation because normal site-packages is not writeable


In [9]:
!pip install llama-index llama_hub wikipedia

Defaulting to user installation because normal site-packages is not writeable
Collecting llama_hub
  Obtaining dependency information for llama_hub from https://files.pythonhosted.org/packages/5d/6d/3d23de219fec9394e4f96a6310df32ff1921ca79e83bfcf51e99875be23a/llama_hub-0.0.15-py3-none-any.whl.metadata
  Downloading llama_hub-0.0.15-py3-none-any.whl.metadata (8.8 kB)
Collecting wikipedia
  Downloading wikipedia-1.4.0.tar.gz (27 kB)
  Preparing metadata (setup.py) ... [?25ldone
Collecting atlassian-python-api (from llama_hub)
  Downloading atlassian-python-api-3.40.0.tar.gz (157 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m157.8/157.8 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hCollecting html2text (from llama_hub)
  Downloading html2text-2020.1.16-py3-none-any.whl (32 kB)
Collecting retry

In [1]:
import os 
from dotenv import load_dotenv
import openai
from IPython import display 


In [2]:
# Pull variables from a local .env file (if one is found) into the process environment.
load_dotenv()

# Read the key from the environment; this is None when OPENAI_API_KEY is not set.
openai_api_key = os.environ.get("OPENAI_API_KEY")


### LlamaHub connectors and basic queries

In [3]:
from llama_hub.wikipedia.base import WikipediaReader

# Fetch one Document per Wikipedia page title listed below.
loader = WikipediaReader()
documents = loader.load_data(
    pages=[
        'BioRxiv',
        'Bioinformatics',
        'Cheminformatics',
        'Large language model',
        'Transformer (machine learning model)',
    ]
)

In [7]:
# Display the first loaded Document to sanity-check the Wikipedia fetch.
documents[0]

Document(id_='37f8f88a-5d43-4dc5-899b-40a1a58f7e91', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, hash='a7402335edcbbb5760d816f26ec37a84ee5426c441aed2924c046b4546289608', text='bioRxiv (pronounced "bio-archive") is an open access preprint repository for the biological sciences co-founded by John Inglis and Richard Sever in November 2013. It is hosted by the Cold Spring Harbor Laboratory (CSHL).As preprints, papers hosted on bioRxiv are not peer-reviewed, but undergo basic screening and checked against plagiarism. However, peer reviews from other sources may be posted alongside preprints. Moreover, readers may post comments. \nIt has been measured that two thirds of the papers posted in bioRxiv are later published in peer-reviewed journals. BioRxiv, and its sister site, medRxiv, have been major sources for the dissemination of COVID-19 research.\n\n\n== History ==\nBioRxiv was inspired by and intends to complement the arX

In [8]:
from llama_index import VectorStoreIndex

# Build a vector-store index from the Wikipedia Document objects loaded above.
index = VectorStoreIndex.from_documents(documents)

# Ask a question through the index's default query engine (no custom settings).
query_engine = index.as_query_engine()
response = query_engine.query(
    "what is the purpose of bioinformaticians"
)

In [9]:
# Show the full Response object: the answer text plus the source nodes attached to it.
response

Response(response='\nThe purpose of bioinformaticians is to develop and implement computer programs to efficiently access, manage, and use various types of biological data, develop new mathematical algorithms and statistical measures to assess relationships among members of large data sets, and analyze and interpret biological data in order to increase the understanding of biological processes.', source_nodes=[NodeWithScore(node=TextNode(id_='45269cca-30a2-4aa4-8cb6-9b59a1296153', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='8f76e119-1f2c-4929-9dcd-538bc82b0334', node_type=None, metadata={}, hash='31e10b333619c21367b04caeb895873fed59d65e9f88f27d8d42de39a92e04b4'), <NodeRelationship.NEXT: '3'>: RelatedNodeInfo(node_id='b77aa9f6-792d-4a01-875c-b46f255de200', node_type=None, metadata={}, hash='3800b13fbceab0cb9e829522e1743f4ee6343094d93de1e6526bc7b278f08180')}, hash='91f

### Querying over multiple documents

In [10]:
import nest_asyncio
# Patch asyncio so an already-running event loop can be re-entered.
# NOTE(review): presumably needed because Jupyter runs its own loop and the
# SubQuestionQueryEngine imported below issues async calls — confirm.
nest_asyncio.apply()

In [11]:
from llama_index import SimpleDirectoryReader, LLMPredictor, ServiceContext, VectorStoreIndex
from llama_index.response.pprint_utils import pprint_response
from langchain import OpenAI

from llama_index.tools import QueryEngineTool, ToolMetadata
from llama_index.query_engine import SubQuestionQueryEngine
from langchain.chat_models import ChatOpenAI

In [12]:
# gpt-4 is a chat-completions model, so it must be wrapped with LangChain's
# ChatOpenAI rather than the completion-style OpenAI class (which only supports
# chat models through a deprecated fallback and emits a warning).
# max_tokens is left at its default (no explicit cap): the -1 sentinel is specific
# to the completion wrapper and is not a valid value for the chat API.
llm_predictor = LLMPredictor(
    llm=ChatOpenAI(temperature=0, model_name="gpt-4", streaming=True)
)

# Service context that routes LlamaIndex LLM calls through the predictor above.
service_context = ServiceContext.from_defaults(llm_predictor=llm_predictor)

