In [1]:
import os
import sys
# Set the two LangChain tracing variables in the process environment in one call.
os.environ.update(
    {
        "LANGCHAIN_TRACING_V2": "true",
        "LANGCHAIN_ENDPOINT": "https://api.smith.langchain.com",
    }
)

In [2]:
# Make the `keys` module importable. The relative path is resolved against the
# current working directory at the time this cell runs.
keys_path = os.path.abspath('../../keys')
if keys_path not in sys.path:
    # Guard keeps the cell idempotent: re-running it must not stack duplicate
    # entries on sys.path.
    sys.path.append(keys_path)
from keys import LANGCHAIN_API_KEY, OPENAI_API_KEY

# Publish the credentials as environment variables
# (presumably where the LangChain / OpenAI clients look them up - confirm).
os.environ["LANGCHAIN_API_KEY"] = LANGCHAIN_API_KEY
os.environ['OPENAI_API_KEY'] = OPENAI_API_KEY

# Absolute path of the folder holding the source PDFs (also relative to the cwd).
pdfs_path = os.path.abspath('../../public_data_pdfs')

In [3]:
import os
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import TokenTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Collect the PDF file names found directly under pdfs_path.
# os.listdir returns entries in arbitrary order, so the names are sorted to make
# the order of the resulting chunks reproducible from one run to the next.
pdf_files = sorted(file for file in os.listdir(pdfs_path) if file.endswith('.pdf'))

# Full path of every PDF (base folder joined with each file name)
pdf_paths = [os.path.join(pdfs_path, pdf_file) for pdf_file in pdf_files]

# Accumulates the chunks of every PDF
all_doc_splits = []

# Text splitter sized in tokens: chunks of 1000 tokens with 200 overlap
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=1000,
    chunk_overlap=200)

# Load each PDF, split it into chunks and append the chunks to the overall list
for pdf_path in pdf_paths:
    pdf_loader = PyPDFLoader(pdf_path)
    documents = pdf_loader.load()

    doc_splits = text_splitter.split_documents(documents)

    all_doc_splits.extend(doc_splits)

In [4]:
# Index
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import SKLearnVectorStore

# Build the vector index from all chunks, embedding them with OpenAIEmbeddings.
embedding_model = OpenAIEmbeddings()
vectorstore = SKLearnVectorStore.from_documents(
    documents=all_doc_splits,
    embedding=embedding_model,
)

# Retriever view of the index, used by the chains further down.
retriever = vectorstore.as_retriever()

In [5]:
from langchain.prompts import ChatPromptTemplate

# Multi Query: Different Perspectives
# The prompt text is assembled from adjacent string literals so that its line
# breaks and trailing spaces are explicit and cannot be silently stripped by an
# editor; the resulting string is unchanged.
template = (
    "You are an AI language model assistant. Your task is to generate five \n"
    "different versions of the given user question to retrieve relevant documents from a vector \n"
    "database. By generating multiple perspectives on the user question, your goal is to help\n"
    "the user overcome some of the limitations of the distance-based similarity search. \n"
    "Provide these alternative questions separated by newlines. Original question: {question}"
)
prompt_perspectives = ChatPromptTemplate.from_template(template)

from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI

def _split_queries(text):
    """Split the model reply into one query per non-blank line.

    The prompt asks for questions separated by newlines, but a reply may also
    contain empty lines between them. A plain ``split("\\n")`` would turn those
    into empty queries that are then sent to the retriever, so blank entries
    are dropped and surrounding whitespace is trimmed.
    """
    return [line.strip() for line in text.split("\n") if line.strip()]


# question -> prompt -> chat model (temperature 0) -> plain text -> list of queries
generate_queries = (
    prompt_perspectives
    | ChatOpenAI(temperature=0)
    | StrOutputParser()
    | _split_queries
)

In [6]:
# Evaluation questions fed one by one to the RAG pipeline.
questions = [
    # Application and admission
    "Will the language of instruction of my Bachelor’s degree suffice to demonstrate my proficiency in English for admission in Msc Data Science at FAU Erlangen?",
    "Is an APS certificate mandatory for Indian students during the application process?",
    "How long does it typically take to receive a decision regarding my application to the MSc Data Science program?",
    "Am I eligible to apply for the study course program with a degree from a technical university or a university of applied sciences?",
    "Can I apply for the study course program with a degree obtained through a dual study program?",
    "What could be the reasons for receiving a rejection for my application?",
    "Is it possible to defer my admission to a future semester?",
    "Where can I find accommodation while studying at FAU?",
    # Subjects, modules and ECTS credits
    "Can I switch my major subject after being admitted?",
    "Is it allowed to combine modules from different application subjects?",
    "How can I transfer ECTS credits from one module group to another?",
    "Can modules from my major subject count towards my minor subject requirements?",
    "Can my werkstudent experience be used to earn ECTS as a technical qualification?",
    # Examinations and grades
    "Can I participate in an examination without prior registration?",
    "How do I register for a module’s examination in Campo within the correct specialisation area?",
    "If I fail an examination on the first attempt, is it mandatory to attend the next attempt?",
    "Is it possible to withdraw from an examination after registering?",
    "What should I do if I cannot attend an examination due to illness?",
    "What are the consequences of failing an examination three times?",
    "Can I retake a completed module to improve my grades?",
    "Is it possible to improve my grades by completing additional modules?",
    # Master's thesis
    "How many ECTS credits are required before I can start writing my Master’s thesis?",
    "How can I find a topic for my Master’s thesis?",
    "Who is eligible to supervise my Master’s thesis?",
    "Can my Master’s thesis supervisor be from a different department?",
    "Is it possible to complete my Master’s thesis while working in a company?",
    "Is it necessary to have a university supervisor for a company-based thesis?",
    "What is the process for registering my Master’s thesis?",
    "What steps should I take after completing my Master’s thesis?",
]

In [7]:
from langchain.load import dumps, loads

def get_unique_union(documents: list[list]):
    """Return the unique union of retrieved docs, in first-seen order.

    Args:
        documents: A list of result lists (one inner list per generated query).

    Returns:
        A flat list with one entry per distinct document. Two documents count
        as the same when their ``dumps`` serialization is identical. The order
        follows the first occurrence in ``documents``.
    """
    # Flatten list of lists, and convert each Document to string so it is hashable
    flattened_docs = [dumps(doc) for sublist in documents for doc in sublist]
    # De-duplicate with dict.fromkeys instead of set(): a set of strings iterates
    # in an order that can change between interpreter runs, which would shuffle
    # the context handed to the LLM and make the answers irreproducible.
    unique_docs = list(dict.fromkeys(flattened_docs))
    # Deserialize back into document objects
    return [loads(doc) for doc in unique_docs]

In [8]:
from operator import itemgetter
from langchain_openai import ChatOpenAI
from langchain_core.runnables import RunnablePassthrough

# RAG
# NOTE: the four-space indentation inside the string is part of the prompt text
# and is kept on purpose so the prompt sent to the model does not change.
template = """Answer the following question based on this context:

    {context}

    Question: {question}
    """

prompt = ChatPromptTemplate.from_template(template)

llm = ChatOpenAI(temperature=0)

# Everything below is loop-invariant, so it is built once instead of once per question.
# question -> alternative queries -> retrieval per query -> de-duplicated docs
retrieval_chain = generate_queries | retriever.map() | get_unique_union
# {"context", "question"} -> prompt -> model -> answer text
answer_chain = prompt | llm | StrOutputParser()

# End-to-end chain (retrieval + answer); kept available for ad-hoc single-question use.
final_rag_chain = (
    {"context": retrieval_chain,
     "question": itemgetter("question")}
    | answer_chain
)

answers = []
for question in questions:
    # Retrieve once and hand the documents straight to the answer step. Invoking
    # final_rag_chain here would run query generation and retrieval a second time
    # for the same question.
    docs = retrieval_chain.invoke({"question": question})
    ans = answer_chain.invoke({"context": docs, "question": question})
    answers.append(ans)

  warn_beta(


In [10]:
# Display the collected answers; entries are in the same order as `questions`.
answers

["Based on the provided context, if your Bachelor's degree was taught in English, then you can apply with a certificate from your university stating that the language of instruction for your degree was in English. This should be sufficient to demonstrate your proficiency in English for admission to the MSc Data Science program at FAU Erlangen.",
 'Yes, an APS certificate is mandatory for Indian students during the application process at FAU as well as for applying for the study visa.',
 "It typically takes about 4 weeks after the application deadline to receive a decision regarding your application to the MSc Data Science program. If you have not received any answer after 4 weeks, it is recommended to be patient. If there is still no response after 8 weeks, it is advised to politely ask the Master's Office about the current state of your application.",
 'Yes, you are eligible to apply for the study course program with a degree from a technical university or a university of applied scie