In [54]:
import warnings

# Install a blanket "ignore" filter so that warnings raised by the libraries
# used below do not clutter the cell outputs.
warnings.filterwarnings(action="ignore")

In [55]:
# https://github.com/chroma-core/chroma/blob/main/chromadb/__init__.py#L57
# Replace the `sqlite3` entry in sys.modules with the pysqlite3 module, so any
# later `import sqlite3` in this kernel resolves to pysqlite3 instead.
# NOTE(review): presumably required because the system SQLite is older than
# what Chroma accepts (see the linked chromadb source) — confirm for your environment.
import sys
__import__("pysqlite3")  # imported by name only, to register it in sys.modules
sys.modules["sqlite3"] = sys.modules.pop("pysqlite3")  # re-register it under the stdlib name

In [56]:
from langchain.llms import HuggingFacePipeline
from langchain import PromptTemplate, LLMChain

In [57]:
# Checkpoint and pipeline task behind the `model` object used by the chains below.
model_id = "lmsys/fastchat-t5-3b-v1.0"
task = "text2text-generation"

# the model will be downloaded on first use, if not cached in ~/.cache/huggingface/hub/
model = HuggingFacePipeline.from_model_id(
    task=task,
    model_id=model_id,
    model_kwargs=dict(temperature=0, max_length=1000),
)

In [58]:
# Bare prompt: the question is passed to the model with no extra instructions.
template_text = """
{question}
"""
template = PromptTemplate(
    input_variables=["question"],
    template=template_text,
)
llm_chain = LLMChain(llm=model, prompt=template)

In [59]:
# Closed-book question through the current prompt; display only the generated text.
llm_chain({"question": "Who is Sheryl Crow?"})["text"]

'<pad> Sheryl  Crow  is  an  American  singer,  songwriter,  and  actress.  She  is  best  known  for  her  role  as  the  lead  singer  and  lead  guitarist  of  the  rock  band  The  Band wagon,  and  for  her  role  as  the  lead  singer  and  lead  guitarist  of  the  alternative  rock  band  The  Mamas  and  the  Papas.  Crow  has  also  been  a  member  of  the  band  The  Mamas  and  the  Papas  since  its  formation  in  1995.\n'

In [60]:
# Rebuild the prompt with an instruction line ahead of the question, then
# rebind `llm_chain` so the following cells use this prompt.
template_text = """
Provide brief answers, use 10 words or less.
{question}
"""
template = PromptTemplate(
    input_variables=["question"],
    template=template_text,
)
llm_chain = LLMChain(llm=model, prompt=template)

In [61]:
# Ask through whichever prompt `llm_chain` currently wraps; show the generated text.
llm_chain({"question": "Who is Sheryl Crow?"})["text"]

'<pad> Singer-songwriter'

In [62]:
# Closed-book geography question; display only the generated text.
# The question previously read "Who is Poland located?" — corrected to "Where".
llm_chain("Where is Poland located?")["text"]

'<pad> Europe'

In [63]:
# Closed-book question about the forest (no retrieved documents are involved here).
llm_chain({"question": "What is Bialowieza Forest?"})["text"]

'<pad> Bialowieza Forest is a protected forest in Poland.'

In [64]:
# Closed-book question; the same question is put to the retrieval chain later on.
llm_chain({"question": "What does the name 'Białowieża' mean in English?"})["text"]

'<pad> "Bird of the Woods"'

In [65]:
# Closed-book question; the same question is put to the retrieval chain later on.
llm_chain({"question": "What's the length of the Tsar's Trail and where does it begin?"})["text"]

"<pad> The Tsar's Trail is a 900 mile long trail that begins in Moscow and ends in St. Petersburg."

In [66]:
# https://python.langchain.com/docs/integrations/document_loaders
from langchain.document_loaders import WikipediaLoader

# Load the Wikipedia results for the forest as LangChain documents.
loader = WikipediaLoader(query="Białowieża_Forest")
wiki_page = loader.load()

In [67]:
from langchain.document_loaders import WebBaseLoader

# Load one page from bpn.com.pl as a second document source.
loader = WebBaseLoader(
    web_path="https://bpn.com.pl/index.php?option=com_content&task=view&id=651&Itemid=297&lang=en"
)
bpn_page = loader.load()

In [68]:
# https://python.langchain.com/docs/use_cases/question_answering/#step-1-load

from langchain.text_splitter import RecursiveCharacterTextSplitter

# Split both sources into chunks of at most 500 characters with no overlap.
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=0, chunk_size=500)
all_splits = text_splitter.split_documents([*wiki_page, *bpn_page])

In [69]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma

# https://integrations.langchain.com/embeddings
# Sentence-transformers embedder, run on CPU, vectors left un-normalized.
hf_embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2",
    model_kwargs={"device": "cpu"},
    encode_kwargs={"normalize_embeddings": False},
)

# Embed every chunk and build the Chroma store that the retriever reads from.
vectorstore = Chroma.from_documents(embedding=hf_embeddings, documents=all_splits)

In [70]:
from langchain.chains import RetrievalQA
from langchain.llms import HuggingFacePipeline

# Question-answering chain that pairs the same LLM with a retriever over the Chroma store.
qa_chain = RetrievalQA.from_chain_type(
    llm=model,
    retriever=vectorstore.as_retriever(),
)

In [71]:
# Retrieval-backed version of the name-meaning question; the brevity
# instruction is part of the query string itself.
qa_chain(
    dict(query="Provide brief answers, use 10 words or less. What does the name 'Białowieża' mean in English?")
)["result"]

'<pad> "White  Tower"  in  English.\n'

In [72]:
# Retrieval-backed version of the Tsar's Trail question; the brevity
# instruction is part of the query string itself.
qa_chain(
    dict(query="Provide brief answers, use 10 words or less. What's the length of the Tsar's Trail and where does it begin?")
)["result"]

'<pad> 4  km  long,  starts  at  Przed  Kosym  Mostem  depot.\n'

In [73]:
# Retrieval-backed question sent without the brevity instruction.
qa_chain(
    dict(query="Are there any walking trails in the Białowieża Forest?")
)["result"]

'<pad> No,  there  are  no  walking  trails  in  the  Biaowiea  Forest.\n'