In [6]:
import pandas as pd

# Load the RAG input table; the first CSV column is used as the row index.
data = pd.read_csv(
    "rag_inputs.csv",
    index_col=0,
)
# Print a summary of the columns, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 13 entries, 0 to 12
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Title   13 non-null     object
 1   URL     13 non-null     object
 2   Text    13 non-null     object
dtypes: object(3)
memory usage: 416.0+ bytes


# DSI program data

In [2]:
# Count whitespace-separated tokens in each row's text (values are coerced
# to str first) and store the count in a new "length" column.
data["length"] = data["Text"].map(lambda text: len(str(text).split()))
# Show the frame including the new column.
data

Unnamed: 0,Title,URL,Text,length
0,Master’s Programs – DSI,https://datascience.uchicago.edu/education/mas...,The Data Science Institute supports masters-le...,564
1,Master’s in Applied Data Science – DSI,https://datascience.uchicago.edu/education/mas...,Elevate Your Expertise in Data Science The Uni...,314
2,In-Person Program – DSI,https://datascience.uchicago.edu/education/mas...,Your Career Success Take the next step to adva...,3931
3,Online Program – DSI,https://datascience.uchicago.edu/education/mas...,Rigor Meets Flexibility You will benefit from ...,4352
4,Capstone Projects – DSI,https://datascience.uchicago.edu/education/mas...,The culminating experience in the Masters in A...,675
5,How to Apply – DSI,https://datascience.uchicago.edu/education/mas...,Masters in Applied Data Science Application Re...,946
6,"Faculty, Instructors, Staff – DSI",https://datascience.uchicago.edu/education/mas...,"As a Masters in Applied Data Science student, ...",7498
7,Our Students – DSI,https://datascience.uchicago.edu/education/mas...,"Featured Graduates EJ Kang Graduate In-Person,...",2147
8,FAQs – DSI,https://datascience.uchicago.edu/education/mas...,Masters in Applied Data Science FAQs Learn mor...,2483
9,Events & Deadlines – DSI,https://datascience.uchicago.edu/education/mas...,Upcoming Events Virtual Information Session On...,724


# Calling the fine-tuned embedding model
- The embedding model was trained in a different notebook.

# RAG

data processing

In [None]:
# API keys: set these as environment variables; never commit real values.
import os
# os.environ["LANGCHAIN_API_KEY"] = 
# os.environ["OPENAI_API_KEY"] = 
# os.environ["COHERE_API_KEY"] =

from langchain_cohere import ChatCohere
from langchain_openai import ChatOpenAI
# llm = ChatCohere(model="command-r-plus")

# 2. Text splitter used to cut every page into overlapping chunks:
#    chunk_size=500 with chunk_overlap=150 shared between neighbours
#    (presumably measured in characters — TODO confirm the length function).
from langchain_text_splitters import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(chunk_overlap=150, chunk_size=500)

# 3. Build chunks
# Title and text of each row are joined with a newline and split together.
from langchain_core.documents import Document

all_chunks = []
for _, page in data.iterrows():
    titled_text = "\n".join((page["Title"], page["Text"]))
    # Every chunk gets its own metadata dict naming the page it came from.
    all_chunks.extend(
        Document(
            page_content=piece,
            metadata={"title": page["Title"], "url": page["URL"]},
        )
        for piece in splitter.split_text(titled_text)
    )
# 4. Embed the chunks and index them in a FAISS vector store for retrieval.
# NOTE(review): newer LangChain releases expose FAISS from
# langchain_community.vectorstores — confirm against the installed version.
from langchain.vectorstores import FAISS

# OpenAI embeddings
from langchain_openai import OpenAIEmbeddings

embedding = OpenAIEmbeddings()
vectorstore = FAISS.from_documents(documents=all_chunks, embedding=embedding)

# Alternative: Cohere embeddings
# from langchain_cohere import CohereEmbeddings
# embedding = CohereEmbeddings(model="embed-english-v3.0")
# vectorstore = FAISS.from_documents(all_chunks, embedding=embedding)

# Retriever over the index, configured with search_kwargs k=5.
retriever = vectorstore.as_retriever(search_kwargs=dict(k=5))

# RAG pipeline: prompt template and chat model.
# NOTE(review): `hub` is not used in the visible cells, and `langchain.hub`
# is a legacy path in recent LangChain releases — confirm it is still needed.
from langchain import hub
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
# PromptTemplate is imported from langchain_core (already a dependency of
# this notebook) instead of the legacy `langchain.prompts` path.
from langchain_core.prompts import PromptTemplate
from langchain_cohere import ChatCohere

# Prompt with two placeholders: {context} receives the formatted retrieved
# chunks and {question} receives the user's query.
prompt = PromptTemplate(
    input_variables=["context", "question"],
    template="""
You are an expert assistant helping answer questions about the MS in Applied Data Science program.
Use the context below to answer the question at the end.
If you don't know the answer, just return "I'm not sure" and do not invent facts.

Context:
{context}

Question:
{question}

Answer in a detailed, professional way:
""",
)

# Chat model used for generation, created with temperature=0.
llm = ChatOpenAI(model_name="gpt-4o", temperature=0) # gpt
# llm = ChatCohere(model="command-r-plus") # cohere

# Merge the retrieved chunks into a single context string.
def format_docs(docs):
    """Render documents as one string for the prompt's context slot.

    Each document becomes "[<title>] <page_content>", where the title is
    read from ``doc.metadata['title']``; entries are separated by a blank
    line. An empty iterable yields an empty string.
    """
    labelled = []
    for item in docs:
        heading = item.metadata['title']
        labelled.append(f"[{heading}] {item.page_content}")
    return "\n\n".join(labelled)


# Assemble the RAG chain. The leading dict produces the prompt's two inputs
# from the single value passed to the chain: "context" pipes it through
# `retriever` and then `format_docs`, while "question" forwards it via
# RunnablePassthrough(). That mapping feeds `prompt`, then `llm`, and the
# model output finally goes through StrOutputParser().
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)


calling LLM

In [41]:
# Evaluation questions sent through the RAG chain, one per row of `df`.
test_queries = [
    "What is the MS in Applied Data Science program at the University of Chicago?",
    "How many courses are required to complete the MSADS degree?",
    "What types of Capstone Projects have MSADS students worked on?",
    "What are the main differences between the Online and In-Person MSADS programs?",
    "When is the final application deadline for Autumn 2025 entry?",
    "Who are some of the instructors teaching in the MSADS program?",
    "What career services are available to MSADS students?",
    "Can students complete the MSADS program on a part-time schedule?",
    "Are there any prerequisite programming skills required before joining the MSADS program?",
    "Does the MSADS curriculum allow specialization in fields like machine learning or healthcare analytics?",
    "What is the Immersion Weekend experience for Online MSADS students?",
    "Are scholarships or financial aid available for MSADS applicants?",
    "Where are the in-person classes for the MSADS program held?",
    "What Foundational Courses are offered to help prepare students for the MSADS curriculum?",
    "What is the structure of the Career Seminar in the MSADS program?",
    "Can international students apply to the MSADS Online program?",
    "What programming languages are emphasized in the MSADS coursework?",
    "What is the focus of the Leadership and Consulting for Data Science course?",
    "What kinds of companies sponsor Capstone Projects in the MSADS program?",
    "How do the Online program’s live classes work each week?",
]

# Single-column frame holding the questions under "Query".
df = pd.DataFrame({"Query": test_queries})

from tqdm import tqdm

# Send every query through the chain. A failed call is recorded as an
# "Error: ..." string so the remaining queries still run.
responses = []
for question in tqdm(df["Query"], desc="Running RAG queries with LLM"):
    try:
        answer = rag_chain.invoke(question)
    except Exception as err:
        answer = "Error: " + str(err)
    responses.append(answer)

# Attach the answers next to their queries.
df["Response"] = responses

# Write queries and responses to CSV without the row index.
csv_path = "DSI_rag_responses.csv"
df.to_csv(csv_path, index=False)

Running RAG queries with LLM: 100%|██████████| 20/20 [01:09<00:00,  3.49s/it]
