In [1]:
from langchain import hub
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI, OpenAIEmbeddings,AzureChatOpenAI,AzureOpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.prompts import ChatPromptTemplate

import os
from langchain_community.document_loaders import WikipediaLoader
from pathlib import Path
import dotenv

# Load environment variables from a local .env file, if one is found.
# NOTE(review): presumably this is where the Azure OpenAI endpoint and key used
# by the embedding/chat clients below come from — confirm.
dotenv.load_dotenv()

True

# RAG 
This notebook uses Retrieval-Augmented Generation (RAG) to answer questions based on context retrieved from a collection of documents.
The code is based on Langchain and the Langchain community packages.

## Dataset
In this example we will utilize the WikipediaLoader to load a few documents about Gothenburg, Sweden. There are plenty of other loaders available in the Langchain community package.
For example, there are loaders for PDFs, HTML pages, and other sources.
**The data needs to be cleaned!** (skipped in this example)

In [2]:
# Fetch at most five English Wikipedia pages matching the search query.
loader = WikipediaLoader(
    query="Gothenburg,Sweden",
    lang="en",
    load_max_docs=5,
)
docs = loader.load()

In [3]:
# One summary line per loaded page: index, text length, source URL and the
# first 100 characters of the page text.
for i, doc in enumerate(docs):
    # Single quotes inside the replacement field: reusing the f-string's own
    # double quotes is only valid syntax from Python 3.12 onwards (PEP 701).
    print(f"Source: {i} - Length:{len(doc.page_content)} - Url:{doc.metadata['source']} - {doc.page_content[:100]} ")

Source: 0 - Length:4000 - Url:https://en.wikipedia.org/wiki/Gothenburg - Gothenburg ( ; abbreviated Gbg; Swedish: Göteborg [jœtɛˈbɔrj] ) is the capital of Västra Götaland Co 
Source: 1 - Length:4000 - Url:https://en.wikipedia.org/wiki/University_of_Gothenburg - The University of Gothenburg (Swedish: Göteborgs universitet) is a university in Sweden's second lar 
Source: 2 - Length:3598 - Url:https://en.wikipedia.org/wiki/List_of_metropolitan_areas_in_Sweden - Sweden has three metropolitan areas consisting of the areas surrounding the three largest cities, St 
Source: 3 - Length:4000 - Url:https://en.wikipedia.org/wiki/Swedish_East_India_Company - The Swedish East India Company (Swedish: Svenska Ostindiska Companiet or SOIC) was founded in Gothen 
Source: 4 - Length:4000 - Url:https://en.wikipedia.org/wiki/Gothenburg,_Nebraska - Gothenburg is a city in Dawson County, Nebraska, United States. It is part of the Lexington, Nebrask 


## Split data
Split the data into smaller chunks to be able to retrieve and generate answers based on the relevant context.
Chunk size and overlap can be adjusted to fit the specific use case, and so can the separators that determine where the text is split.

The *RecursiveCharacterTextSplitter* is a generic text splitter that tries to split the text into chunks of at most a given size while keeping paragraphs and sentences intact as far as possible: it tries the separators `["\n\n", "\n", " ", ""]` in order, falling back to the next one only when a chunk is still too large.
There are also format-specific text splitters, for example for HTML and JSON.

In [4]:
# chunk_size: how many characters go into each chunk.
# chunk_overlap: how many characters consecutive chunks share.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
splits = text_splitter.split_documents(docs)

1

1

In [5]:
# Preview the first three chunks, then report how many chunks exist in total.
for i, split in enumerate(splits[:3]):
    text = split.page_content
    print(f"Split: {i} - Length:{len(text)} - {text} \n")

print(f"Total splits: {len(splits)}")

Split: 0 - Length:414 - Gothenburg ( ; abbreviated Gbg; Swedish: Göteborg [jœtɛˈbɔrj] ) is the capital of Västra Götaland County in Sweden. It is the second-largest city in Sweden, after the capital Stockholm, and the fifth-largest in the Nordic countries. It is situated by the Kattegat on the west coast of Sweden, with a population of approximately 600,000 in the city proper and about 1.1 million inhabitants in the metropolitan area. 

Split: 1 - Length:999 - King Gustavus Adolphus founded Gothenburg by royal charter in 1621 as a heavily fortified, primarily Dutch, trading colony. In addition to the generous privileges given to his Dutch allies during the ongoing Thirty Years' War, e.g. tax relaxation, he also attracted significant numbers of his German and Scottish allies to populate his only town on the western coast; this trading status was furthered by the founding of the Swedish East India Company. At a key strategic location at the mouth of the Göta älv, where Scandinavia's larg

## Vectorize data & retriever
Vectorize the data using the AzureOpenAIEmbeddings model, and create a retriever to be able to retrieve the relevant context based on a question.

There are many vector stores to choose from, for example:
- LanceDB
- FAISS
- Chroma


In [6]:
embedding_model = AzureOpenAIEmbeddings(model="embeddings")
persist_directory = "chroma_db"

# Reuse the on-disk store when its directory already exists; otherwise embed the
# chunks and persist them there.
# NOTE: when the directory exists, `splits` is not used at all, so changes to the
# loader or splitter only take effect after the directory has been removed.
if Path(persist_directory).is_dir():
    print("Loading Chroma")
    vectorstore = Chroma(
        persist_directory=persist_directory,
        embedding_function=embedding_model,
    )
else:
    print("Creating Chroma")
    vectorstore = Chroma.from_documents(
        documents=splits,
        embedding=embedding_model,
        persist_directory=persist_directory,
    )

Loading Chroma


In [7]:
query = "How old is the university?"

# Fetch the stored chunks whose embeddings are closest to the embedded query and
# show the text of the best match.
# NOTE(review): the distance metric is whatever the Chroma collection is configured
# with; Chroma's default is presumably squared L2 rather than cosine — confirm.
# NOTE: this rebinds `docs`, which until now held the loaded Wikipedia pages.
docs = vectorstore.similarity_search(query)
print(docs[0].page_content)

The University of Gothenburg (Swedish: Göteborgs universitet) is a university in Sweden's second largest city, Gothenburg. Founded in 1891, the university is the third-oldest of the current Swedish universities and, with 53,624 students and 6,707 staff members, it is one of the largest universities in the Nordic countries.


== About ==
With its eight faculties and 38 departments, the University of Gothenburg is one of the most wide-ranging and versatile universities in Sweden. Its eight faculties offer training in the Creative Arts, Social Sciences, Natural Sciences, Humanities, Education, Information Technology, Business, Economics and Law, and Health Sciences.
The University of Gothenburg has the highest number of applicants per study place in many of its subjects and courses, making it one of the most popular universities in Sweden.


== History ==


### Retrievers
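Wrap the vector store in a retriever, the standard interface for fetching the documents relevant to a query. Common search types:
- Similarity search, optionally with a score cutoff and a configurable number of returned results.
- MMR (Maximal Marginal Relevance), which balances relevance to the query against diversity among the returned documents.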


In [8]:
# Wrap the vector store in a retriever using the default search type.
# Pass search_type="mmr" to as_retriever() to try Maximal Marginal Relevance instead.
retriever = vectorstore.as_retriever()

# Retrievers are Runnables: invoke() replaces the deprecated
# get_relevant_documents() and matches how the prompt and model are called below.
docs = retriever.invoke(query)
for doc in docs:
    # Single quotes inside the replacement field keep this line valid on
    # Python versions before 3.12.
    print(f"Source: {doc.metadata['source']} - {doc.page_content} ")

Source: https://en.wikipedia.org/wiki/University_of_Gothenburg - The University of Gothenburg (Swedish: Göteborgs universitet) is a university in Sweden's second largest city, Gothenburg. Founded in 1891, the university is the third-oldest of the current Swedish universities and, with 53,624 students and 6,707 staff members, it is one of the largest universities in the Nordic countries.


== About ==
With its eight faculties and 38 departments, the University of Gothenburg is one of the most wide-ranging and versatile universities in Sweden. Its eight faculties offer training in the Creative Arts, Social Sciences, Natural Sciences, Humanities, Education, Information Technology, Business, Economics and Law, and Health Sciences.
The University of Gothenburg has the highest number of applicants per study place in many of its subjects and courses, making it one of the most popular universities in Sweden.


== History == 
Source: https://en.wikipedia.org/wiki/University_of_Gothenburg - The 

## Prompt and model

In [28]:
# Hand-written RAG prompt. According to the original note, the same prompt is
# available on LangChain Hub: hub.pull("rlm/rag-prompt").
SYSTEM_INSTRUCTIONS = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer the question. "
    "If you don't know the answer, just say that you don't know. "
    "Use three sentences maximum and keep the answer concise."
)
USER_TEMPLATE = "Question: {question} \nContext: {context} \nAnswer:"

prompt = ChatPromptTemplate.from_messages(
    [
        ("system", SYSTEM_INSTRUCTIONS),
        ("user", USER_TEMPLATE),
    ]
)

# TODO: consider a Hugging Face model for generation instead (is it good at Swedish?)
llm = AzureChatOpenAI(model_name="gpt4", api_version="2023-12-01-preview")

def format_docs(docs):
    """Concatenate the text of the given documents into one context string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string attribute.

    Returns:
        The ``page_content`` values in their original order, separated by a
        blank line; an empty string when ``docs`` is empty.
    """
    contents = [doc.page_content for doc in docs]
    return "\n\n".join(contents)


## Explicit chain


In [29]:
# Retrieve the chunks closest to the question straight from the vector store,
# so each step of the chain can be run and inspected by hand below.
docs = vectorstore.similarity_search(query)



In [30]:
# Render the prompt with the question and the two best-matching chunks to see
# exactly which messages would be sent to the model.
context_preview = format_docs(docs[:2])
print(prompt.invoke({"question": query, "context": context_preview}))

messages=[SystemMessage(content="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise."), HumanMessage(content='Question: How old is the university? \nContext: The University of Gothenburg (Swedish: Göteborgs universitet) is a university in Sweden\'s second largest city, Gothenburg. Founded in 1891, the university is the third-oldest of the current Swedish universities and, with 53,624 students and 6,707 staff members, it is one of the largest universities in the Nordic countries.\n\n\n== About ==\nWith its eight faculties and 38 departments, the University of Gothenburg is one of the most wide-ranging and versatile universities in Sweden. Its eight faculties offer training in the Creative Arts, Social Sciences, Natural Sciences, Humanities, Education, Information Technology, Business, Economics and Law, 

In [31]:
# Fill the prompt with up to five retrieved chunks and send it to the chat model.
# NOTE(review): similarity_search presumably returns four hits by default, so
# docs[:5] would pass all of them — confirm.
rendered_prompt = prompt.invoke({"question": query, "context": format_docs(docs[:5])})
llm.invoke(rendered_prompt)


AIMessage(content='The University of Gothenburg was founded in 1891. Therefore, as of 2022, the university is 131 years old.', response_metadata={'token_usage': {'completion_tokens': 29, 'prompt_tokens': 715, 'total_tokens': 744}, 'model_name': 'gpt-4', 'system_fingerprint': None, 'prompt_filter_results': [{'prompt_index': 0, 'content_filter_results': {'hate': {'filtered': False, 'severity': 'safe'}, 'self_harm': {'filtered': False, 'severity': 'safe'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': False, 'severity': 'safe'}}}], 'finish_reason': 'stop', 'logprobs': None, 'content_filter_results': {'hate': {'filtered': False, 'severity': 'safe'}, 'self_harm': {'filtered': False, 'severity': 'safe'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': False, 'severity': 'safe'}}})

## Langchain chain

In [32]:
# Pull the "rlm/rag-prompt" prompt from LangChain Hub. This rebinds `prompt`,
# replacing whatever template it held before.
# TODO: create the prompt object manually instead of pulling it from the hub.
prompt = hub.pull("rlm/rag-prompt")

# Input mapping: the incoming question goes to the retriever, whose hits are
# flattened to text by format_docs, while RunnablePassthrough forwards the
# question itself unchanged.
chain_inputs = {"context": retriever | format_docs, "question": RunnablePassthrough()}

# Fill the prompt, call the chat model and reduce the reply to a plain string.
rag_chain = chain_inputs | prompt | llm | StrOutputParser()

In [33]:
# Run the whole pipeline (retrieve -> format -> prompt -> model -> string) on the question.
rag_chain.invoke(query)

'The University of Gothenburg was founded in 1891.'