In [103]:
import os
from pathlib import Path

import dotenv
from langchain import hub
from langchain_community.document_loaders import WikipediaLoader
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI, OpenAIEmbeddings,AzureChatOpenAI,AzureOpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Load variables from a local .env file into the process environment
# (presumably the Azure OpenAI endpoint/key used further down — verify).
dotenv.load_dotenv()

True

# RAG 
Retrieval-Augmented Generation (RAG): retrieve the passages most relevant to a question and let a language model generate the answer based on that context.
The code is based on LangChain and the LangChain community packages.

## Dataset
In this example we use the WikipediaLoader to load a few documents about Gothenburg, Sweden. Many other loaders are available in the LangChain community package,
for example to load data from PDFs, HTML, or other sources.
**The data needs to be cleaned!** (skipped in this example)

In [96]:
# Fetch at most two English Wikipedia articles matching the query.
loader = WikipediaLoader(
    query="Gothenburg,Sweden",
    lang="en",
    load_max_docs=2,
)
docs = loader.load()

In [97]:
# Show the origin, size and a 100-character preview of every loaded document.
# The dict key uses single quotes: reusing the enclosing double quotes inside
# an f-string is only valid syntax from Python 3.12 (PEP 701) onwards.
for i, doc in enumerate(docs):
    print(f"Source: {i} - Length:{len(doc.page_content)} - Url:{doc.metadata['source']} - {doc.page_content[:100]} ")

Source: 0 - Length:4000 - Url:https://en.wikipedia.org/wiki/Gothenburg - Gothenburg ( ; abbreviated Gbg; Swedish: Göteborg [jœtɛˈbɔrj] ) is the capital of Västra Götaland Co 
Source: 1 - Length:4000 - Url:https://en.wikipedia.org/wiki/University_of_Gothenburg - The University of Gothenburg (Swedish: Göteborgs universitet) is a university in Sweden's second lar 


## Split data
Split the data into smaller chunks to be able to retrieve and generate answers based on the relevant context.
Chunk size and overlap can be adjusted to fit the specific use case - but also where to split the text.

The *RecursiveCharacterTextSplitter* is a generic text splitter that tries to split the text into chunks of a specific size while keeping paragraphs, lines and words intact as far as possible, by trying the separators `["\n\n", "\n", " ", ""]` in that order.
There are also other text splitters, for example an HTML splitter, a JSON splitter, etc.

In [98]:
# chunk_size: how many characters go into each chunk.
# chunk_overlap: how many characters neighbouring chunks share.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
splits = text_splitter.split_documents(docs)

1

1

In [99]:
# Print the first three chunks in full, then the total number of chunks.
for idx, chunk in enumerate(splits[:3]):
    chunk_text = chunk.page_content
    print(f"Split: {idx} - Length:{len(chunk_text)} - {chunk_text} \n")

print(f"Total splits: {len(splits)}")

Split: 0 - Length:995 - Gothenburg ( ; abbreviated Gbg; Swedish: Göteborg [jœtɛˈbɔrj] ) is the capital of Västra Götaland County in Sweden. It is the second-largest city in Sweden, after the capital Stockholm, and the fifth-largest in the Nordic countries. It is situated by the Kattegat on the west coast of Sweden, with a population of approximately 600,000 in the city proper and about 1.1 million inhabitants in the metropolitan area.King Gustavus Adolphus founded Gothenburg by royal charter in 1621 as a heavily fortified, primarily Dutch, trading colony. In addition to the generous privileges given to his Dutch allies during the ongoing Thirty Years' War, e.g. tax relaxation, he also attracted significant numbers of his German and Scottish allies to populate his only town on the western coast; this trading status was furthered by the founding of the Swedish East India Company. At a key strategic location at the mouth of the Göta älv, where Scandinavia's largest drainage basin enters t

## Vectorize data & retriever
Vectorize the data using the AzureOpenAIEmbeddings model, and create a retriever to be able to retrieve the relevant context based on a question.

There are many alternatives: 
- LanceDB
- FAISS
- Chroma


In [124]:
embedding_model = AzureOpenAIEmbeddings(model="embeddings")
persist_directory = "./chroma_db"

# Build the index only when no persisted copy exists, otherwise reuse the one
# on disk. NOTE: an existing directory is loaded as-is and is not refreshed
# when `splits` changes; delete the directory to force a rebuild.
if not Path(persist_directory).is_dir():
    print("Creating Chroma")
    # `from_documents` expects the embedding model as `embedding`; it forwards
    # it to the constructor as `embedding_function` itself, so passing
    # `embedding_function=` here raises a duplicate-keyword TypeError.
    vectorstore = Chroma.from_documents(
        documents=splits,
        embedding=embedding_model,
        persist_directory=persist_directory,
    )
else:
    print("Loading Chroma")
    vectorstore = Chroma(persist_directory=persist_directory, embedding_function=embedding_model)

Loading Chroma


In [125]:
query = "How to travel in Gothenburg?"
# Retrieve the stored chunks whose embeddings are most similar to the query.
# (The stray argument-less `similarity_search()` call was removed: `query` is
# a required argument, so that call raised a TypeError.)
docs = vectorstore.similarity_search(query)
print(docs[0].page_content)

Gothenburg is served by Göteborg Landvetter Airport 25 km (16 mi) southeast of the city centre. The smaller Göteborg City Airport, 15 km (9.3 mi) from the city centre, was closed to regular airline traffic in 2015. The city hosts the Gothia Cup, the world's largest youth football tournament, and the Göteborg Basketball Festival, Europe's largest youth basketball tournament, alongside some of the largest annual events in Scandinavia. The Gothenburg Film Festival, held in January since 1979, is the leading Scandinavian film festival and attracts over 155,000 visitors each year. In summer, a wide variety of music festivals are held in the city, including the popular Way Out West Festival.


### Retrievers
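Wrap the vector store in a retriever, which packages the document lookup behind a common interface.
- Similarity search, optionally with a score cutoff and a number of results to return.
- MMR (Maximal Marginal Relevance), which balances relevance against diversity among the returned documents.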


In [110]:
# Default retriever settings; call as_retriever(search_type="mmr") to try MMR.
retriever = vectorstore.as_retriever()

# `invoke` is the Runnable entry point of a retriever and replaces the
# deprecated `get_relevant_documents`.
docs = retriever.invoke("What should I do when touristing in Gothenburg?")
for doc in docs:
    # Single quotes for the dict key keep this f-string valid before Python 3.12.
    print(f"Source: {doc.metadata['source']} - {doc.page_content} ")

Source: https://en.wikipedia.org/wiki/Gothenburg - Gothenburg is served by Göteborg Landvetter Airport 25 km (16 mi) southeast of the city centre. The smaller Göteborg City Airport, 15 km (9.3 mi) from the city centre, was closed to regular airline traffic in 2015. The city hosts the Gothia Cup, the world's largest youth football tournament, and the Göteborg Basketball Festival, Europe's largest youth basketball tournament, alongside some of the largest annual events in Scandinavia. The Gothenburg Film Festival, held in January since 1979, is the leading Scandinavian film festival and attracts over 155,000 visitors each year. In summer, a wide variety of music festivals are held in the city, including the popular Way Out West Festival. 
Source: https://en.wikipedia.org/wiki/Gothenburg - Gothenburg ( ; abbreviated Gbg; Swedish: Göteborg [jœtɛˈbɔrj] ) is the capital of Västra Götaland County in Sweden. It is the second-largest city in Sweden, after the capital Stockholm, and the fifth-la

## Prompt and model

In [None]:
# The same prompt is available on the LangChain hub:
#   prompt = hub.pull("rlm/rag-prompt")
# The text is wrapped in a PromptTemplate so that it is a Runnable and can be
# piped into a chain (`... | prompt | llm`); a plain str cannot be composed
# that way. `prompt.format(question=..., context=...)` still returns the
# filled-in string, so the explicit (non-chain) usage keeps working.
prompt = PromptTemplate.from_template("""You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.
    Question: {question} 
    Context: {context} 
    Answer:
    """)
# TODO: use a Hugging Face model to generate the text instead? (good at Swedish?)
llm = AzureChatOpenAI(model_name="gpt4",api_version="2023-12-01-preview")

def format_docs(docs):
    """Join the `page_content` of every document into one string.

    Consecutive documents are separated by a blank line ("\\n\\n").
    An empty iterable yields an empty string.
    """
    page_texts = [document.page_content for document in docs]
    return "\n\n".join(page_texts)


## Explicit chain


In [120]:
query = "What year was the university founded?"

# Look up the chunks most similar to the question; they are used as the
# context for the prompt in the following cells.
docs = vectorstore.similarity_search(query=query)



APIConnectionError: Connection error.

In [130]:
# Render the prompt with a dummy question and the first two retrieved chunks
# to inspect the exact text that would be sent to the model.
context_preview = format_docs(docs[:2])
print(prompt.format(question="test", context=context_preview))

You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.
    Question: test 
    Context: Gothenburg is served by Göteborg Landvetter Airport 25 km (16 mi) southeast of the city centre. The smaller Göteborg City Airport, 15 km (9.3 mi) from the city centre, was closed to regular airline traffic in 2015. The city hosts the Gothia Cup, the world's largest youth football tournament, and the Göteborg Basketball Festival, Europe's largest youth basketball tournament, alongside some of the largest annual events in Scandinavia. The Gothenburg Film Festival, held in January since 1979, is the leading Scandinavian film festival and attracts over 155,000 visitors each year. In summer, a wide variety of music festivals are held in the city, including the popular Way Out West Festival.

Gothenburg ( ; abbreviated Gbg; S

In [1]:
# Fill in the prompt by hand and send it straight to the model (no chain).
filled_prompt = prompt.format(question=query, context=format_docs(docs[:5]))
llm.invoke(filled_prompt)


NameError: name 'llm' is not defined

## Langchain chain

In [107]:
# "context" is produced by piping the retriever into format_docs;
# "question" is produced by RunnablePassthrough().
chain_inputs = {"context": retriever | format_docs, "question": RunnablePassthrough()}
rag_chain = chain_inputs | prompt | llm | StrOutputParser()

In [118]:
# Run the chain on the question and print its output.
answer = rag_chain.invoke(query)
print(answer)

The University of Gothenburg was founded in 1891.
