In [9]:
import os
import sys
# Turn on LangChain tracing (V2) and set the endpoint it reports to.
os.environ.update(
    {
        "LANGCHAIN_TRACING_V2": "true",
        "LANGCHAIN_ENDPOINT": "https://api.smith.langchain.com",
    }
)

# The credentials are read from a local `keys` module; its directory
# (`../../keys`, resolved against the current working directory) is added to
# the import path so the module can be imported.
keys_path = os.path.abspath('../../keys')
sys.path.append(keys_path)
from keys import LANGCHAIN_API_KEY, OPENAI_API_KEY

# Hand the credentials to the client libraries through the environment.
os.environ.update(
    LANGCHAIN_API_KEY=LANGCHAIN_API_KEY,
    OPENAI_API_KEY=OPENAI_API_KEY,
)

In [10]:
#### INDEXING ####

import bs4
from langchain_community.document_loaders import WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import SKLearnVectorStore

# Load the web pages that make up the knowledge base
urls = [
    "https://www.math-datascience.nat.fau.de/im-studium/masterstudiengaenge/master-data-science/",
    "https://www.fau.eu/studiengang/data-science-bsc/",
    "https://www.fau.eu/studiengang/data-science-msc/",
]

# One loader per URL: `docs` holds one list of documents per page,
# `docs_list` is the same content flattened into a single list.
docs = [WebBaseLoader(page_url).load() for page_url in urls]
docs_list = [page_doc for page_docs in docs for page_doc in page_docs]

# Split the pages into overlapping chunks; chunk_size / chunk_overlap are
# measured with the tiktoken encoder rather than in raw characters.
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=1000,
    chunk_overlap=200,
)
doc_splits = text_splitter.split_documents(docs_list)

# Index: embed every chunk and store the vectors in an SKLearnVectorStore
vectorstore = SKLearnVectorStore.from_documents(
    documents=doc_splits,
    embedding=OpenAIEmbeddings(),
)

# Retriever interface over the vector store, used by the chains below
retriever = vectorstore.as_retriever()

In [11]:
# Echo the flattened list of loaded documents as the cell output.
docs_list

[Document(metadata={'source': 'https://www.math-datascience.nat.fau.de/im-studium/masterstudiengaenge/master-data-science/', 'title': 'Master Data Science - Lehreinheit Mathematik – Data Science', 'description': 'Please send only questions related to the field of Data Science or the structure of this study course to the student advisory. Questions on the application…', 'language': 'de-DE'}, page_content='\n\n\n\n\nMaster Data Science - Lehreinheit Mathematik – Data Science\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n \n\n\n\nNavigation überspringen\nZur Navigation\nZum Seitenende\n\n\n\n\n\n\n\nOrganisationsmenü öffnen\n\n \n\n\nOrganisationsmenü schließen\n\n\n\n\n\nFriedrich-Alexander-Universität LE Mathematik - Data Science \n\n\n\nFAUZur zentralen FAU Website\n\nFriedrich-Alexander-UniversitätNaturwissenschaftliche Fakultät\n\n\n\nGeben Sie hier den Suchbegriff ein, um in diesem Webauftritt zu suchen:\n\n\n Suche öffnen\n\n\n\n\n\n\nEnglish\n\n\

In [12]:
from langchain.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI

# RAG-Fusion: Related
# Prompt asking the model for several search queries related to one question.
template = """You are a helpful assistant that generates multiple search queries based on a single input query. \n
Generate multiple search queries related to: {question} \n
Output (4 queries):"""
prompt_rag_fusion = ChatPromptTemplate.from_template(template)


def _split_nonblank_lines(text: str) -> list[str]:
    """Split the model reply into one query per line, dropping blank lines.

    Lines are returned unchanged (no stripping or renumbering); only lines that
    are empty or whitespace-only are removed, so that a reply containing blank
    separator lines does not turn into empty search queries downstream.
    """
    return [line for line in text.split("\n") if line.strip()]


# question -> prompt -> chat model -> plain string -> list of query strings
generate_queries = (
    prompt_rag_fusion
    | ChatOpenAI(temperature=0)
    | StrOutputParser()
    | _split_nonblank_lines
)

In [13]:
# Questions about the Data Science degree programs at FAU Erlangen.
# Each entry is later passed, one at a time, as the "question" input of the
# RAG-Fusion chain.
questions = [
    'What are the admission requirements of M.Sc. Data Science at FAU Erlangen?',
    'How can I apply for M.Sc. Data Science at FAU Erlangen?',
    'Is accommodation available at FAU Erlangen?',
    'How can I finance my studies at FAU Erlangen?',
    'What are the next steps if I get an admission at FAU Erlangen?',
    'What is the duration of M.Sc. Data Science degree program at FAU Erlangen?',
    'What is the teaching language of M.Sc. Data Science at FAU Erlangen?',
    'What is the structure of M.Sc. Data Science at FAU Erlangen?',
    'Is German required for M.Sc. Data Science at FAU Erlangen?',
    'What is the duration of B.Sc. Data Science degree program at FAU Erlangen?',
    'What is the teaching language of B.Sc. Data Science at FAU Erlangen?',
    'What is the structure of B.Sc. Data Science FAU Erlangen?',
    'What are the admission requirements of B.Sc. Data Science FAU Erlangen?'
]


In [14]:
from langchain.load import dumps, loads

def reciprocal_rank_fusion(results: list[list], k=60) -> list[tuple]:
    """Fuse several ranked document lists into one ranking (Reciprocal Rank Fusion).

    Every document earns ``1 / (rank + k)`` for each list it appears in, where
    ``rank`` is its 0-based position in that list. The contributions are summed
    over all lists, so documents that rank well in many lists end up on top.

    Args:
        results: Ranked lists of documents, best match first. Documents must be
            serializable with ``dumps``; two documents are treated as the same
            one when their serialized forms are equal.
        k: Constant added to the rank in the RRF formula. Larger values flatten
            the difference between top and lower positions.

    Returns:
        A list of ``(document, fused_score)`` tuples sorted by descending score.
        Documents are rebuilt from their serialized form with ``loads``.
        Documents with equal scores keep the order in which they were first seen.
    """
    # Serialized document -> accumulated RRF score. The serialized string is
    # the dictionary key, which is what de-duplicates documents across lists.
    fused_scores: dict[str, float] = {}

    for ranked_docs in results:
        for rank, doc in enumerate(ranked_docs):
            doc_str = dumps(doc)
            fused_scores[doc_str] = fused_scores.get(doc_str, 0) + 1 / (rank + k)

    # Highest fused score first; sorted() is stable, so ties keep insertion order.
    ranked = sorted(fused_scores.items(), key=lambda item: item[1], reverse=True)
    return [(loads(doc_str), score) for doc_str, score in ranked]

In [15]:
from operator import itemgetter
from langchain_core.runnables import RunnablePassthrough

llm = ChatOpenAI(temperature=0)

# Retrieval: generate related queries, run the retriever on each of them, then
# fuse the per-query rankings into a single list of (document, score) tuples.
retrieval_chain_rag_fusion = generate_queries | retriever.map() | reciprocal_rank_fusion

# RAG
# NOTE: the indentation inside this string is part of the prompt text sent to
# the model; it is kept exactly as it was so the prompt does not change.
template = """Answer the following question based on this context:

    {context}

    Question: {question}
    """

prompt = ChatPromptTemplate.from_template(template)

# Answer step on its own: expects "context" and "question" to be supplied.
answer_chain = prompt | llm | StrOutputParser()

# End-to-end chain (retrieval + answer) for a {"question": ...} input.
final_rag_chain = (
    {"context": retrieval_chain_rag_fusion,
     "question": itemgetter("question")}
    | answer_chain
)

# Everything above is independent of the question, so it is built once here
# rather than on every loop iteration.
answers = []
for question in questions:
    # Retrieve once and feed the fused documents straight into the answer
    # step; invoking final_rag_chain here instead would run query generation
    # and retrieval a second time for the same question.
    # NOTE: `docs` reuses the name of the per-page list from the indexing cell.
    docs = retrieval_chain_rag_fusion.invoke({"question": question})
    ans = answer_chain.invoke({"context": docs, "question": question})
    answers.append(ans)

In [16]:
# Display the collected answers (one per entry in `questions`, in the same order).
answers

['The admission requirements for the M.Sc. Data Science program at FAU Erlangen include:\n1. A completed B.Sc. degree in Mathematics, Industrial Mathematics, Mathematical Economy, Computer Science, Data Science, or Physics from FAU or an equivalent domestic or international degree.\n2. A Grade Point Average (GPA) of 2.5 or better with respect to the German grading system.\n3. English proficiency at level B2 CEFR or six years of English classes at a German secondary school.\n4. Application process involves online registration during specific intake periods.\n5. Further evaluation of competence profile by the admission committee after completing the application process.',
 "To apply for the M.Sc. Data Science program at FAU Erlangen, you can follow these steps:\n\n1. Check the admission requirements, which include having a completed B.Sc. degree in Mathematics, Industrial Mathematics, Mathematical Economy, Computer Science, Data Science, or Physics from FAU or an equivalent domestic or i