In [2]:
import os
from dotenv import load_dotenv

# Load environment variables from the .env file
load_dotenv(dotenv_path="../keys/.env")

True

In [3]:
langchain_api_key = os.getenv("LANGCHAIN_API_KEY")

In [4]:
import os

os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
langchain_api_key = os.getenv("LANGCHAIN_API_KEY")

In [5]:
openAI_api_key = os.getenv("OPENAI_API_KEY")

In [6]:
from langchain_community.document_loaders import PyPDFLoader, WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# List of PDF file paths
pdf_dir = "../data/pdfs"
pdf_files = [os.path.join(pdf_dir, file) for file in os.listdir(pdf_dir) if file.endswith(".pdf")]

# URLs to load
urls = [
    "https://www.math-datascience.nat.fau.de/im-studium/masterstudiengaenge/master-data-science/",
    "https://www.fau.eu/studiengang/data-science-bsc/",
    "https://www.fau.eu/studiengang/data-science-msc/",
]

# Initialize an empty list to hold all document splits
all_doc_splits = []

# Define the text splitter with a chunk size of 1000 characters and overlap of 200
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=1000, 
    chunk_overlap=200)

### Process PDF files ###
for pdf_file in pdf_files:
    # Load the PDF
    pdf_loader = PyPDFLoader(pdf_file)
    documents = pdf_loader.load()
    
    # Split the documents into chunks
    doc_splits = text_splitter.split_documents(documents)
    
    # Add the splits to the overall list
    all_doc_splits.extend(doc_splits)

### Process Web URLs ###
docs = [WebBaseLoader(url).load() for url in urls]
docs_list = [item for sublist in docs for item in sublist]

# Split the web documents into chunks
web_doc_splits = text_splitter.split_documents(docs_list)

# Add the web document splits to the overall list
all_doc_splits.extend(web_doc_splits)

# Now, all_doc_splits contains the splits from both the PDF files and the web documents.

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [7]:
# Index
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_community.vectorstores import SKLearnVectorStore
from langchain_community.vectorstores import FAISS


vectorstore = FAISS.from_documents(documents=all_doc_splits, 
                                    embedding=OpenAIEmbeddings())

retriever = vectorstore.as_retriever()

In [8]:
groq_api_key = os.getenv("GROQ_API_KEY")

In [9]:
from langchain.prompts import ChatPromptTemplate
from langchain_groq import ChatGroq

# RAG-Fusion: Related
template = """You are an AI language model assistant. Your task is to generate five 
different versions of the given user question to retrieve relevant documents from a vector 
database. By generating multiple perspectives on the user question, your goal is to help
the user overcome some of the limitations of the distance-based similarity search. 
Provide these alternative questions separated by newlines and give only questions not anything else, not even statements like Here are five search queries:. Original question: {question}"""
prompt_rag_fusion = ChatPromptTemplate.from_template(template)


from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI

generate_queries = (
    prompt_rag_fusion 
    | ChatGroq(model_name="llama-3.1-70b-versatile")
    | StrOutputParser() 
    | (lambda x: x.split("\n"))
)

In [10]:
questions = [
    # 'Will the language of instruction of my Bachelor’s degree suffice to demonstrate my proficiency in English for admission in Msc Data Science at FAU Erlangen?',
    # 'Is an APS certificate mandatory for Indian students during the application process?',
    # 'How long does it typically take to receive a decision regarding my application to the MSc Data Science program?',
    # 'Am I eligible to apply for the study course program with a degree from a technical university or a university of applied sciences?',
    # 'Can I apply for the study course program with a degree obtained through a dual study program?',
    # 'What could be the reasons for receiving a rejection for my application?',
    # 'Is it possible to defer my admission to a future semester?',
    # 'Where can I find accommodation while studying at FAU?',
    # 'Can I switch my major subject after being admitted?',
    # 'Is it allowed to combine modules from different application subjects?',
    # 'How can I transfer ECTS credits from one module group to another?',
    # 'Can modules from my major subject count towards my minor subject requirements?',
    # 'Can my werkstudent experience be used to earn ECTS as a technical qualification?',
    # 'Can I participate in an examination without prior registration?',
    # 'How do I register for a module’s examination in Campo within the correct specialisation area?',
    # 'If I fail an examination on the first attempt, is it mandatory to attend the next attempt?',
    # 'Is it possible to withdraw from an examination after registering?',
    # 'What should I do if I cannot attend an examination due to illness?',
    # 'What are the consequences of failing an examination three times?',
    # 'Can I retake a completed module to improve my grades?',
    # 'Is it possible to improve my grades by completing additional modules?',
    # 'How many ECTS credits are required before I can start writing my Master’s thesis?',
    # 'How can I find a topic for my Master’s thesis?',
    # 'Who is eligible to supervise my Master’s thesis?',
    # 'Can my Master’s thesis supervisor be from a different department?',
    # 'Is it possible to complete my Master’s thesis while working in a company?',
    # 'Is it necessary to have a university supervisor for a company-based thesis?',
    # 'What is the process for registering my Master’s thesis?',
    # 'What steps should I take after completing my Master’s thesis?',

    'What are the admission requirements of M.Sc. Data Science at FAU Erlangen?',
    'How can I apply for M.Sc. Data Science at FAU Erlangen?',
    'Is accommodation available at FAU Erlangen?',
    'How can I finance my studies at FAU Erlangen?',
    'What are the next steps if I get an admission at FAU Erlangen?',
    'What is the duration of M.Sc. Data Science degree program at FAU Erlangen?',
    'What is the teaching language of M.Sc. Data Science at FAU Erlangen?',
    'What is the structure of M.Sc. Data Science at FAU Erlangen?',
    'Is German required for M.Sc. Data Science at FAU Erlangen?',
    'What is the duration of B.Sc. Data Science degree program at FAU Erlangen?',
    'What is the teaching language of B.Sc. Data Science at FAU Erlangen?',
    'What is the structure of B.Sc. Data Science FAU Erlangen?',
    'What are the admission requirements of B.Sc. Data Science FAU Erlangen?'

]

In [11]:
from langchain.load import dumps, loads

def reciprocal_rank_fusion(results: list[list], k=60):
    """ Reciprocal_rank_fusion that takes multiple lists of ranked documents 
        and an optional parameter k used in the RRF formula """
    
    # Initialize a dictionary to hold fused scores for each unique document
    fused_scores = {}

    # Iterate through each list of ranked documents
    for docs in results:
        # Iterate through each document in the list, with its rank (position in the list)
        for rank, doc in enumerate(docs):
            # Convert the document to a string format to use as a key (assumes documents can be serialized to JSON)
            doc_str = dumps(doc)
            # If the document is not yet in the fused_scores dictionary, add it with an initial score of 0
            if doc_str not in fused_scores:
                fused_scores[doc_str] = 0
            # Retrieve the current score of the document, if any
            previous_score = fused_scores[doc_str]
            # Update the score of the document using the RRF formula: 1 / (rank + k)
            fused_scores[doc_str] += 1 / (rank + k)

    # Sort the documents based on their fused scores in descending order to get the final reranked results
    reranked_results = [
        (loads(doc), score)
        for doc, score in sorted(fused_scores.items(), key=lambda x: x[1], reverse=True)
    ]

    # Return the reranked results as a list of tuples, each containing the document and its fused score
    return reranked_results

In [12]:
from operator import itemgetter
answers = []
for question in questions:
    retrieval_chain_rag_fusion = generate_queries | retriever.map() | reciprocal_rank_fusion
    docs = retrieval_chain_rag_fusion.invoke({"question": question})

    from langchain_core.runnables import RunnablePassthrough

    # RAG
    template = """Answer the following question in detail and based on this context:

    {context}

    Question: {question}
    """

    prompt = ChatPromptTemplate.from_template(template)

    final_rag_chain = (
        {"context": retrieval_chain_rag_fusion, 
        "question": itemgetter("question")} 
        | prompt
        | ChatGroq(model_name="llama-3.1-70b-versatile")
        | StrOutputParser()
    )

    ans = final_rag_chain.invoke({"question":question})
    answers.append(ans)

  warn_beta(


In [14]:
answers

['The admission requirements for the M.Sc. Data Science program at FAU Erlangen-Nürnberg include:\n\n1. A completed B.Sc. degree in Mathematics, Industrial Mathematics, Mathematical Economy, Computer Science, Data Science, or Physics from FAU or another equivalent domestic or international degree.\n2. A Grade Point Average (GPA) of 2.5 or better with respect to the German grading system. Candidates with an admissible degree and a GPA between 2.6 and 2.8 are invited for a short online interview in which their knowledge in calculus, linear algebra, algorithms, and data structures is evaluated.\n3. English proficiency at level B2 CEFR (vantage or upper intermediate, not be older than 2 years) or six years of English classes at a German secondary school (Gymnasium). Applicants who have completed their university entrance qualifications or their first degree in English are not required to provide proof of proficiency in English.\n\nAdditionally, applicants need to register online using the 