In [None]:
from dotenv import load_dotenv
from langchain.document_loaders import WikipediaLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Read key/value pairs from a local .env file into the process environment.
# NOTE(review): presumably this is where the OpenAI API key used by the
# embedding and chat cells comes from — confirm the .env contents.
load_dotenv()

In [None]:
# Pull the Wikipedia article for the tournament; load_max_docs=1 limits the
# result to a single document.
search_term = '2023 Wimbledon Championships'
wikipedia_loader = WikipediaLoader(
    query=search_term,
    load_max_docs=1,
)
docs = wikipedia_loader.load()

In [None]:
# Settings for breaking the loaded article into small, overlapping chunks.
splitter_settings = {
    'chunk_size': 100,
    'chunk_overlap': 20,
    'length_function': len,          # sizes are measured with the built-in len()
    'is_separator_regex': False,
}
text_splitter = RecursiveCharacterTextSplitter(**splitter_settings)

# Split every loaded document; `data` is what gets embedded further down.
data = text_splitter.split_documents(docs)

# Display the first chunk as a quick sanity check.
data[0]

In [None]:
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings

In [None]:
# Embedding model handed to the Chroma store when the chunks are indexed.
# Constructed with no arguments, so it relies entirely on the library defaults
# (and, presumably, on credentials taken from the environment — confirm).
embeddings = OpenAIEmbeddings()

In [None]:
# One id per chunk, built as "<source>-<position>" from the chunk's metadata
# and its index in `data`.
chunk_ids = [
    f'{chunk.metadata["source"]}-{position}'
    for position, chunk in enumerate(data)
]

# Embed the chunks and index them in a Chroma collection backed by the
# local 'db' directory.
store = Chroma.from_documents(
    data,
    embeddings,
    ids=chunk_ids,
    collection_name='Wimbledon-Embeddings',
    persist_directory='db',
)

# Explicitly persist the collection after building it.
store.persist()

In [None]:
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.chat_models import ChatOpenAI
import pprint

In [None]:
# Prompt text sent to the chat model. It has two placeholders:
#   {context}  - filled with the retrieved document text
#   {question} - filled with the user's question
# The instructions restrict the model to the supplied context and tell it to
# admit when it does not know the answer.
# Fix: the opening line previously read "You are a both that ..." (typo for
# "bot"), which was passed verbatim to the model.
template = """
You are a bot that answers questions about Wimbledon 2023, using only the context provided.
If you don't know the answer, simply state that you don't know.

Context:

{context}

Question:

{question}
"""
# Wrap the raw template string in a PromptTemplate, declaring the two
# placeholders it expects to be filled in.
prompt = PromptTemplate(
    input_variables=['context', 'question'],
    template=template,
)

In [None]:
# Chat model that produces the answers: GPT-4 with temperature set to 0.
llm = ChatOpenAI(
    model='gpt-4',
    temperature=0,
)

In [None]:
# Expose the Chroma store through the retriever interface the chain expects.
wimbledon_retriever = store.as_retriever()

# Build the retrieval-QA chain from the chat model, the retriever and the
# custom prompt; return_source_documents=True asks the chain to include the
# retrieved documents in its result.
qa_with_source = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=wimbledon_retriever,
    chain_type='stuff',
    chain_type_kwargs={'prompt': prompt},
    return_source_documents=True,
)

In [None]:
# Run one sample question through the chain and pretty-print the full result.
sample_question = 'When and where was Wimbledon 2023 held?'
qa_result = qa_with_source(sample_question)
pprint.pprint(qa_result)