In [13]:
import os
from dotenv import load_dotenv

# Location of the .env file holding the API keys, relative to the notebook's working directory
ENV_FILE = "../../keys/.env"

# Load the environment variables defined in that file
load_dotenv(dotenv_path=ENV_FILE)

True

In [14]:
# LangChain API key taken from the environment (None when the variable is unset)
langchain_api_key = os.environ.get("LANGCHAIN_API_KEY")

In [15]:
import os

# Tracing configuration, exported through environment variables in one step
os.environ.update(
    {
        "LANGCHAIN_TRACING_V2": "true",
        "LANGCHAIN_ENDPOINT": "https://api.smith.langchain.com",
    }
)

# API key for the endpoint above (None when the variable is unset)
langchain_api_key = os.environ.get("LANGCHAIN_API_KEY")

In [16]:
# OpenAI API key taken from the environment (None when the variable is unset)
openAI_api_key = os.environ.get("OPENAI_API_KEY")

In [17]:
from langchain_community.document_loaders import PyPDFLoader, WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Optional PDF sources (currently disabled). Uncomment these two lines together
# with the PDF loop further down to index local PDFs as well.
# pdf_dir = "../../data/pdfs"
# pdf_files = [os.path.join(pdf_dir, file) for file in os.listdir(pdf_dir) if file.endswith(".pdf")]

# Web pages that are fetched and indexed
urls = [
    "https://www.math-datascience.nat.fau.de/im-studium/masterstudiengaenge/master-data-science/",
    "https://www.fau.eu/studiengang/data-science-bsc/",
    "https://www.fau.eu/studiengang/data-science-msc/",
]

# Accumulator for the chunks of every source
all_doc_splits = []

# The splitter is built with from_tiktoken_encoder, so chunk_size and
# chunk_overlap are measured in tiktoken tokens rather than characters.
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_overlap=200,
    chunk_size=1000,
)

### PDF files (disabled) ###
# for pdf_file in pdf_files:
#     # Load the PDF
#     pdf_loader = PyPDFLoader(pdf_file)
#     documents = pdf_loader.load()
#
#     # Split the documents into chunks
#     doc_splits = text_splitter.split_documents(documents)
#
#     # Add the splits to the overall list
#     all_doc_splits.extend(doc_splits)

### Web URLs ###
# One list of loaded documents per URL ...
docs = [WebBaseLoader(url).load() for url in urls]

# ... flattened into a single list of documents
docs_list = []
for page_docs in docs:
    docs_list.extend(page_docs)

# Cut the web documents into chunks
web_doc_splits = text_splitter.split_documents(docs_list)

# While the PDF path stays disabled, all_doc_splits holds the web chunks only
all_doc_splits += web_doc_splits

In [18]:
# Sanity check: how many chunks the splitter produced
num_chunks = len(all_doc_splits)
print(num_chunks)

17


In [19]:
# Index
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_community.vectorstores import SKLearnVectorStore
from langchain_community.vectorstores import FAISS


# Embed every chunk with OpenAI embeddings and index the vectors in FAISS
embedding_model = OpenAIEmbeddings()
vectorstore = FAISS.from_documents(all_doc_splits, embedding_model)

# Retriever view of the index, consumed by the retrieval chain
retriever = vectorstore.as_retriever()

In [20]:
# Groq API key taken from the environment (None when the variable is unset)
groq_api_key = os.environ.get("GROQ_API_KEY")

In [21]:
from langchain.prompts import ChatPromptTemplate
from langchain_groq import ChatGroq

# Multi Query: Different Perspectives
# The prompt asks the model to rephrase the user question three times so that
# retrieval is attempted from several angles.
template = """You are an AI language model assistant. Your task is to generate three 
different versions of the given user question to retrieve relevant documents from a vector 
database. By generating multiple perspectives on the user question, your goal is to help
the user overcome some of the limitations of the distance-based similarity search. 
Provide these alternative questions separated by newlines, give only questions and nothing else. Original question: {question}"""

prompt_perspectives = ChatPromptTemplate.from_template(template)

from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI


def _split_queries(text: str) -> list[str]:
    """Split the model's reply into individual query strings.

    The prompt requests questions "separated by newlines", but a model may
    also put blank lines between them. Splitting on every line break and
    dropping empty lines handles both layouts; splitting on "\\n\\n" alone
    would return a single merged query whenever the model uses single
    newlines.

    Args:
        text: Raw text produced by the language model.

    Returns:
        The non-empty lines of ``text`` with surrounding whitespace removed.
    """
    return [line.strip() for line in text.splitlines() if line.strip()]


# Chain: question dict -> prompt -> LLM -> plain text -> list of query strings
generate_queries = (
    prompt_perspectives
    | ChatGroq(model_name="llama-3.1-8b-instant")
    | StrOutputParser()
    | _split_queries
)

In [22]:
# Evaluation questions about studying Data Science at FAU Erlangen;
# each one is answered independently by the RAG pipeline.
questions = [
    "What are the admission requirements of M.Sc. Data Science at FAU Erlangen?",
    "How can I apply for M.Sc. Data Science at FAU Erlangen?",
    "Is accommodation available at FAU Erlangen?",
    "How can I finance my studies at FAU Erlangen?",
    "What are the next steps if I get an admission at FAU Erlangen?",
    "What is the duration of M.Sc. Data Science degree program at FAU Erlangen?",
    "What is the teaching language of M.Sc. Data Science at FAU Erlangen?",
    "What is the structure of M.Sc. Data Science at FAU Erlangen?",
    "Is German required for M.Sc. Data Science at FAU Erlangen?",
    "What is the duration of B.Sc. Data Science degree program at FAU Erlangen?",
    "What is the teaching language of B.Sc. Data Science at FAU Erlangen?",
    "What is the structure of B.Sc. Data Science FAU Erlangen?",
    "What are the admission requirements of B.Sc. Data Science FAU Erlangen?",
]

In [23]:
from langchain.load import dumps, loads

def get_unique_union(documents: list[list]):
    """Return the de-duplicated union of several lists of retrieved documents.

    Args:
        documents: A list of result lists (one inner list per query).

    Returns:
        A flat list containing each distinct document once, in the order in
        which it was first encountered.
    """
    # Serialise every document to a string so that duplicates can be detected
    serialized = (dumps(doc) for sublist in documents for doc in sublist)
    # dict.fromkeys removes duplicates while keeping first-seen order. A set
    # would order the strings by their hashes, which are randomised per
    # process, so the resulting context would change between kernel restarts.
    unique_docs = dict.fromkeys(serialized)
    # Turn the surviving strings back into document objects
    return [loads(doc) for doc in unique_docs]

In [24]:
from operator import itemgetter
from langchain_openai import ChatOpenAI
from langchain_core.runnables import RunnablePassthrough

# RAG prompt. The indentation inside the literal is part of the text sent to
# the model and is kept exactly as before.
template = """Answer the following question based on this context:

    {context}

    Question: {question}

    Additionally, provide your confidence level using cosine similarity in your answer between 0-100% and please do not give explanation regarding this, give just confidence level

    """

prompt = ChatPromptTemplate.from_template(template)

# Multi-query retrieval: rephrase the question, retrieve for every variant,
# then merge the results without duplicates.
retrieval_chain = generate_queries | retriever.map() | get_unique_union

# Generation step: fill the prompt with context and question, then ask the model
answer_chain = (
    prompt
    | ChatGroq(model_name="llama-3.1-8b-instant")
    | StrOutputParser()
)

# End-to-end chain (retrieval + generation) for callers that pass only
# {"question": ...}; kept under its original name.
final_rag_chain = (
    {"context": retrieval_chain,
     "question": itemgetter("question")}
    | answer_chain
)

# The chains above do not depend on the question, so they are built once
# instead of on every iteration.
answers = []
for question in questions:
    # Retrieve once and reuse the result for generation. Invoking
    # final_rag_chain after a separate retrieval_chain.invoke would run the
    # query-generation LLM call and the vector search twice per question.
    docs = retrieval_chain.invoke({"question": question})
    ans = answer_chain.invoke({"context": docs, "question": question})
    answers.append(ans)

RateLimitError: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama-3.1-8b-instant` in organization `org_01j5z017bvfxwsmjmc5jn6mm4n` on : Limit 500000, Used 496459, Requested 5020. Please try again in 4m15.4294s. Visit https://console.groq.com/docs/rate-limits for more information.', 'type': '', 'code': 'rate_limit_exceeded'}}

In [25]:
# Show the answers collected by the loop (one string per question that completed)
answers

["The admission requirements for M.Sc. Data Science at FAU Erlangen include:\n\n1. A completed Bachelor's degree in Mathematics, Industrial Mathematics, Mathematical Economy, Computer Science, Data Science, or Physics from FAU or another equivalent German or international degree that is not significantly different with regard to the competence profile taught in the respective degree program.\n\n2. A Grade Point Average (GPA) of 2.5 or better with respect to the German grading system. Candidates with an admissible degree and a GPA between 2.6 and 2.8 are invited for a short online interview in which their knowledge in calculus, linear algebra, algorithms, and data structures is evaluated.\n\n3. English proficiency at level B2 CEFR (vantage or upper intermediate) or six years of English classes at a German secondary school (Gymnasium). Applicants who have completed their university entrance qualifications or their first degree in English are not required to provide proof of proficiency i