In [1]:
import os
from dotenv import load_dotenv

# Load environment variables from the .env file
load_dotenv(dotenv_path="../../keys/.env")

True

In [2]:
langchain_api_key = os.getenv("LANGCHAIN_API_KEY")

In [3]:
import os

os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
langchain_api_key = os.getenv("LANGCHAIN_API_KEY")

In [4]:
from langchain_community.document_loaders import PyPDFLoader, WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# List of PDF file paths
pdf_dir = "../../data/pdfs"
pdf_files = [os.path.join(pdf_dir, file) for file in os.listdir(pdf_dir) if file.endswith(".pdf")]

# URLs to load
urls = [
    "https://www.math-datascience.nat.fau.de/im-studium/masterstudiengaenge/master-data-science/",
    "https://www.fau.eu/studiengang/data-science-bsc/",
    "https://www.fau.eu/studiengang/data-science-msc/",
]

# Initialize an empty list to hold all document splits
all_doc_splits = []

# Define the text splitter with a chunk size of 1000 characters and overlap of 200
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=1000, 
    chunk_overlap=200)

### Process PDF files ###
for pdf_file in pdf_files:
    # Load the PDF
    pdf_loader = PyPDFLoader(pdf_file)
    documents = pdf_loader.load()
    
    # Split the documents into chunks
    doc_splits = text_splitter.split_documents(documents)
    
    # Add the splits to the overall list
    all_doc_splits.extend(doc_splits)

### Process Web URLs ###
docs = [WebBaseLoader(url).load() for url in urls]
docs_list = [item for sublist in docs for item in sublist]

# Split the web documents into chunks
web_doc_splits = text_splitter.split_documents(docs_list)

# Add the web document splits to the overall list
all_doc_splits.extend(web_doc_splits)

# Now, all_doc_splits contains the splits from both the PDF files and the web documents.

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [5]:
print(len(all_doc_splits))

175


In [6]:
# Index
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_community.vectorstores import SKLearnVectorStore
from langchain_community.vectorstores import FAISS


vectorstore = FAISS.from_documents(documents=all_doc_splits, 
                                    embedding=OpenAIEmbeddings())

retriever = vectorstore.as_retriever()

In [7]:
groq_api_key = os.getenv("GROQ_API_KEY")

In [8]:
from langchain.prompts import ChatPromptTemplate
from langchain_groq import ChatGroq
from langchain_core.output_parsers import StrOutputParser

# Direct Query: Correct Spelling and Grammar Only
template = """You are an AI language model assistant. Your task is to correct any spelling or grammatical mistakes in the given user question while keeping the wording unchanged. 
Provide only the corrected question and nothing else.
Original question: {question}"""

prompt_perspectives = ChatPromptTemplate.from_template(template)

# Ensure ChatGroq is properly set up
generate_queries = (
    prompt_perspectives 
    | ChatGroq(model_name="llama-3.1-8b-instant") 
    | StrOutputParser()
)


In [9]:
questions = [
    "Will the language of instruction of my Bachelor’s degree suffice to demonstrate my proficiency in English for admission in MSc Data Science at FAU Erlangen?",
    "Is it possible to defer my admission to a future semester?",
    "What are the consequences of failing an examination three times?",
    "Who is eligible to supervise my Master’s thesis?",
    "How many ECTS credits are required before I can start writing my Master’s thesis?",
    "What are the admission requirements of MSc Data Science?",
    "What is the teaching language of MSc Data Science?",
    "What is the teaching language of BSc Data Science?",
    "What are the next steps if I get an admission?",
    "Is German required for MSc Data Science?",
    "As your application period will start on 15 February 2024 for the winter semester 2024/25, I have received an offer letter for the summer semester 2024. Do I have to reapply to defer my admission? Can I reapply with my existing account on the Campo portal, which I used for the summer semester, or do I have to open a new account to reapply for the winter 2024/25?",
    "Hello, I have applied to a Master's degree program in Data Science (both part-time and full-time) at FAU. Unfortunately, I have not added my previous offer letter (offer letter is attached below). However, the application portal is not letting me add this offer letter now as it says I cannot change it because my other applications are in process. How do I add this now? I have tried to contact the Masterbüro and admission office, but I have not received any response yet!",
    "I just wanted to know if the course 'Trustworthy Artificial Intelligence' can be taken as a Master Seminar in my course. On Campo, I find it mentioned as both a Master Seminar and an application subject (screenshot attached). Just to clarify before taking the subject.",
    "I am currently pursuing a Master's degree in Data Science. I have registered for the course 'Machine Learning for Engineers 1' for the upcoming examination. Is it eligible to be counted towards the requirements of my degree program? If it is, could you please inform me under which category it falls? I would greatly appreciate your guidance on this matter.",
    "What exactly defines the completion of a degree? Is it the transcript showing 120 ECTS? Let’s say I complete my thesis in September, but the transcript gets updated exactly on 15th October (for example). According to the university, will I be de-enrolled at the end of the Winter semester 2025? But if I apply for de-enrollment myself, can I be de-enrolled at any time during the winter semester? Hope I understood the information correctly.",
    "Sir/Madam, I am a prospective student of your esteemed university from India. I would like to join the Master's program in Data Science (English-taught) in 2024. I have some doubts regarding the full-time study program: 1) What is the duration of a single semester? 2) How many hours of class should I take in a week? 3) If I opt for the full-time study program, would I be able to do part-time jobs? Eagerly waiting for your reply.",
    "I'm a final-year Electrical Engineering student at NUST, Pakistan, interested in the Master's in Data Science program. Having taken relevant coursework in statistics and computer science, along with online courses, I'm curious about my eligibility. My transcript is attached for review.",
    "I have failed two courses: 'Mathematical Foundations of Artificial Intelligence' and 'Neural Networks and Data Analytics - Artificial Intelligence I'. Both of them have not been registered for the exam automatically. Last semester, my failed courses were registered automatically, but this time, it doesn't look like that. Please clarify!",
    "I am a Master's in Data Science student keen to enroll in a Computer Vision Seminar. This seminar is closely related to my academic and career goals. I also got enrolled in this course. However, I couldn't find it in the current course catalog for our program. I kindly request the addition of the Computer Vision Seminar to our course offerings. I believe it will greatly benefit students interested in this field. Your consideration is greatly appreciated.",
    "I am from Data Science, and I am writing to seek your guidance regarding the completion of my major module requirements. I have completed all my credits except for 2.5 ECTS in the major module. In light of this, I would like to ask whether I can fulfill this requirement by taking only a 2.5 ECTS subject, or if I can also consider enrolling in a 5 ECTS subject to meet this requirement. I would appreciate your advice on the best course of action to ensure I complete my credits in a timely and effective manner."
]

In [10]:
answers = []

retriever = vectorstore.as_retriever()

for question in questions:
    # Step 1: Rephrase the question
    rephrased_question = generate_queries.invoke(question)

    # Step 2: Retrieve relevant documents using the rephrased question
    docs = retriever.get_relevant_documents(rephrased_question)

    from operator import itemgetter
    from langchain_openai import ChatOpenAI
    from langchain_core.runnables import RunnablePassthrough

    # Step 3: Answer using retrieved documents
    template = """Answer the following question based on this context:

    {context}

    Question: {question}
    """

    prompt = ChatPromptTemplate.from_template(template)

    final_rag_chain = (
        {"context": itemgetter("context"), 
         "question": itemgetter("question")} 
        | prompt
        | ChatGroq(model_name="llama-3.1-8b-instant")
        | StrOutputParser()
    )

    # Invoke the final RAG pipeline
    ans = final_rag_chain.invoke({"context": docs, "question": rephrased_question})
    answers.append(ans)


Failed to multipart ingest runs: langsmith.utils.LangSmithRateLimitError: Rate limit exceeded for https://api.smith.langchain.com/runs/multipart. HTTPError('429 Client Error: Too Many Requests for url: https://api.smith.langchain.com/runs/multipart', '{"detail":"Monthly unique traces usage limit exceeded"}')trace=a2a2e2b6-b9c7-4f63-8186-2319d83d40d3,id=a2a2e2b6-b9c7-4f63-8186-2319d83d40d3; trace=a2a2e2b6-b9c7-4f63-8186-2319d83d40d3,id=128f24ae-c04f-4053-aafe-7a3d1bdc369c; trace=a2a2e2b6-b9c7-4f63-8186-2319d83d40d3,id=259826a7-9077-4652-9107-4f16f58c1a2c
Failed to multipart ingest runs: langsmith.utils.LangSmithRateLimitError: Rate limit exceeded for https://api.smith.langchain.com/runs/multipart. HTTPError('429 Client Error: Too Many Requests for url: https://api.smith.langchain.com/runs/multipart', '{"detail":"Monthly unique traces usage limit exceeded"}')trace=a2a2e2b6-b9c7-4f63-8186-2319d83d40d3,id=c6cca843-af44-4b78-b531-2941ac53ba28; trace=a2a2e2b6-b9c7-4f63-8186-2319d83d40d3,id=a

In [11]:
answers

["According to the document, the language of instruction of your Bachelor's degree will be sufficient to prove English language proficiency if your Bachelor's degree was taught in English. You will need to submit a certificate from your university stating that the language of instruction for your degree has been in English.",
 'Based on the provided context, it seems that the university has a set procedure for deferring admission. \n\nYou can refer to the question and answer from Subject: Re: About application period for Winter 2024/25:\n\nAs your application period will start 15 February ,2024 for winter semester 2024/25. I have received an offer letter for summer semester 2024, so I have to reapply to defer my admission. Can I reapply with my existing account of Campo portal which I used for summer semester? or do I have to open a new account for re-apply for the winter 2024/25?\n\nAnswer:\nYou can re-apply using your existing Campo account.\n\nHowever, for the specific case of defer

In [12]:
Ground_truth = [
    "Yes, if your Bachelor’s degree was taught in English then you can apply with a certificate from your university stating that the language of instruction for your degree has been in English.",
    "Yes, it is possible to defer your admission. Just re-apply for the next semester and upload the current admission letter you received. You will be granted admission provided that there are no significant changes to the regulations and selection criteria.",
    "After three unsuccessful attempts for an examination, the module will be counted as not being passed. This state cannot be changed, and you cannot pass this module in the future. If the module is an elective mandatory module (core elective module in Campo), you can choose another module and continue your studies. If the module is a core module of your study course (Mathematics of Learning, Selected Topics of Mathematics of Learning, or Deep Learning), you cannot continue your studies in this program at FAU.",
    "To start your Master’s thesis, you need an official supervisor from FAU. The person must be a professor or at least have completed a habilitation ('Dr. habil.' or 'PD') and must be a member of one of the following departments: Data Science, Mathematics, Computer Science, or Artificial Intelligence in Biomedical Engineering (AIBE). Professors who are secondary members in these departments can also supervise. Your official supervisor is also the first examiner of your thesis and should propose a second examiner from any department at FAU. Additional guidance from PhD students is possible, but they cannot be official supervisors.",
    "It is highly recommended that you complete 75 ECTS of your study curriculum before choosing a Master’s seminar topic. You should complete all core modules and major modules first. Your thesis topic should ideally align with your specialization and Master’s seminar, following §54 of the examination regulations ('Mentoring') and your individual study agreement.",
    "A completed B.Sc. degree in Mathematics, Industrial Mathematics, Mathematical Economy, Computer Science, Data Science, or Physics from FAU or an equivalent domestic or international degree that aligns with the competence profile of the program. A GPA of 2.5 or better (German grading system) is required. Applicants with a GPA between 2.6 and 2.8 will be invited for an online interview assessing their knowledge of calculus, linear algebra, algorithms, and data structures. English proficiency at B2 CEFR or proof of English-taught education is required.",
    "Completely in English.",
    "Completely in German.",
    "After receiving your admission letter, you must enroll for the next semester by sending certified documents via postal service to the Student Records Office. The enrollment fee should be transferred in advance, especially if transferring from abroad (recommended two weeks before). After enrollment, you gain access to online teaching resources and platforms like StudOn and Campo.",
    "No, but solid German knowledge (B1) is recommended for daily life, internships, and working student jobs. However, a certificate is not required for the application.",
    "You can reapply using your existing Campo account.",
    "Due to the abolition of the 60 ECTS regulation from the winter semester 2022/23, the examination regulations of the Data Science program changed. The letter of admission from winter semester 2021/22 will no longer be recognized. Your application will be completely re-examined and re-evaluated, so uploading the previous offer letter makes no difference.",
    "As you found on Campo, the seminar can be counted as a Master Seminar.",
    "This course is not recognized for the MSc Data Science study program. Therefore, even if you attend this course, you will not be able to accredit any ECTS.",
    "To complete your degree, you need 120 ECTS, including all core, major, minor, application modules, seminar, and thesis. If you complete 120 ECTS by September 2024 and your supervisor submits your thesis grade before the end of September, you will be deregistered at the end of the summer semester 2024. If you complete it in October 2024, you will be deregistered at the end of the winter semester 2024/25. You can also choose to de-register earlier if desired.",
    "1) The duration of one semester is 6 months. 2) There is no restriction on study hours per week. You can take subjects at your comfort, but 30 ECTS per semester is recommended (1 ECTS = 1 hour of study). 3) Yes, you can work part-time. A student can work up to 20 hours per week with a part-time student job.",
    "The admissions committee evaluates your profile only after you apply via the Campo portal.",
    "The automatic re-registration for second attempts in examinations is done manually by the examination office.",
    "I recommend sending a course description for the seminar, including details like the module number and exact name in Campo, to the examination committee and asking if the seminar can be included in one of the module groups of the MSc Data Science program.",
    "You can complete a 5 ECTS module instead of a 2.5 ECTS module. Just ensure that you meet at least the 2.5 ECTS requirement."
]

In [13]:
from datasets import Dataset 
from ragas.metrics import faithfulness, answer_correctness
from ragas import evaluate

Failed to multipart ingest runs: langsmith.utils.LangSmithRateLimitError: Rate limit exceeded for https://api.smith.langchain.com/runs/multipart. HTTPError('429 Client Error: Too Many Requests for url: https://api.smith.langchain.com/runs/multipart', '{"detail":"Monthly unique traces usage limit exceeded"}')trace=5da38c37-69e4-406b-a718-7cd706cd2f1f,id=d554d073-ab30-4ee9-b47c-00c37ea99798; trace=5da38c37-69e4-406b-a718-7cd706cd2f1f,id=5da38c37-69e4-406b-a718-7cd706cd2f1f; trace=5da38c37-69e4-406b-a718-7cd706cd2f1f,id=e1bc2e0d-6729-48a7-b93f-2008d117aa7e


In [14]:

data_samples = {
    
    'question': questions,
    
    "answer": answers,
    
    'ground_truth': Ground_truth
}

dataset = Dataset.from_dict(data_samples)
score = evaluate(dataset,metrics=[answer_correctness])
score.to_pandas()

Evaluating:   0%|          | 0/20 [00:00<?, ?it/s]

Failed to multipart ingest runs: langsmith.utils.LangSmithRateLimitError: Rate limit exceeded for https://api.smith.langchain.com/runs/multipart. HTTPError('429 Client Error: Too Many Requests for url: https://api.smith.langchain.com/runs/multipart', '{"detail":"Monthly unique traces usage limit exceeded"}')trace=75865240-084f-4663-9e4b-7a42204a7499,id=75865240-084f-4663-9e4b-7a42204a7499; trace=75865240-084f-4663-9e4b-7a42204a7499,id=696bf247-64c9-4ac8-b48b-f6f6a5285bce; trace=75865240-084f-4663-9e4b-7a42204a7499,id=9e2a025d-5e70-4f30-b104-ddc50dde1279; trace=75865240-084f-4663-9e4b-7a42204a7499,id=eadecaa1-a571-4d24-93e9-661be53180fd; trace=75865240-084f-4663-9e4b-7a42204a7499,id=f7810e75-c2c5-418a-b2e8-d0ccda75ab98; trace=75865240-084f-4663-9e4b-7a42204a7499,id=5df553a1-2970-4654-8122-18787c530ec0; trace=75865240-084f-4663-9e4b-7a42204a7499,id=0f812cff-0d43-435c-87eb-305d80d7da40; trace=75865240-084f-4663-9e4b-7a42204a7499,id=3553b4e2-83e9-4b76-953f-0ff096266eaa; trace=75865240-084f

Unnamed: 0,user_input,response,reference,answer_correctness
0,Will the language of instruction of my Bachelo...,"According to the document, the language of ins...","Yes, if your Bachelor’s degree was taught in E...",0.981305
1,Is it possible to defer my admission to a futu...,"Based on the provided context, it seems that t...","Yes, it is possible to defer your admission. J...",0.533315
2,What are the consequences of failing an examin...,"According to the document on page 7, if a stud...",After three unsuccessful attempts for an exami...,0.994075
3,Who is eligible to supervise my Master’s thesis?,"According to the provided documents, the eligi...","To start your Master’s thesis, you need an off...",0.505893
4,How many ECTS credits are required before I ca...,"According to the provided documents, it is hig...",It is highly recommended that you complete 75 ...,0.481452
5,What are the admission requirements of MSc Dat...,The admission requirements for MSc Data Scienc...,"A completed B.Sc. degree in Mathematics, Indus...",0.77009
6,What is the teaching language of MSc Data Scie...,The teaching language of MSc Data Science at F...,Completely in English.,0.701486
7,What is the teaching language of BSc Data Scie...,"According to the provided documents, the teach...",Completely in German.,0.423618
8,What are the next steps if I get an admission?,"Based on the provided documents, here are the ...","After receiving your admission letter, you mus...",0.217951
9,Is German required for MSc Data Science?,"According to the provided documents, the requi...","No, but solid German knowledge (B1) is recomme...",0.300464


In [15]:
print(sum(score.to_pandas()["answer_correctness"]) / len(score.to_pandas()["answer_correctness"]))

0.6504785852436133
