In [17]:
from typing import Literal
from langchain_core.messages import HumanMessage
from langchain_openai.chat_models import ChatOpenAI
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_community.document_loaders import UnstructuredURLLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_core.tools import tool
from langgraph.checkpoint.memory import MemorySaver
from langgraph.graph import END, START, StateGraph, MessagesState
from langgraph.prebuilt import ToolNode
import os

In [18]:
import pandas as pd
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma

In [19]:
import pandas as pd
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings

In [32]:
import json, os

def load_profile(path: str):
    """Load a user profile from a JSON file.

    Args:
        path: Filesystem path of the JSON profile document.

    Returns:
        The decoded JSON value. Later cells read it with ``.get``, so the
        profile file is expected to hold a JSON object.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    # Decode explicitly as UTF-8: without `encoding`, open() falls back to the
    # platform locale, which can corrupt or reject non-ASCII profile values.
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)

# Load the active user's profile into the notebook-global PROFILE, which
# retrieve_context and the prompt-building cell read later.
# Example path: replace with the user's actual JSON file.
PROFILE = load_profile("profiles/user1.json")
# Echo which profile was loaded; prints "unknown" when the file has no "user_id" key.
print("Loaded profile for:", PROFILE.get("user_id", "unknown"))

Loaded profile for: abira


In [21]:
# Default scholarship CSV, resolved relative to the kernel's working directory.
# It is captured as the default argument of build_vectorstore_from_scholarships
# when that function is defined, so re-run that definition after changing it.
CSV_PATH = "scholarship_whizz_db.csv"  # adjust if needed
# Embedding model shared by the vector store built below.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

def normalize_level(degrees_value) -> str:
    """Map free-text degree information to ``"undergrad"``, ``"grad"`` or ``"both"``.

    Matching is done on whole words/abbreviations rather than raw substrings, so
    "undergraduate" is no longer treated as containing "grad", "programs" as
    containing "ms", or "global" as containing "ba".

    Args:
        degrees_value: Any value (typically a CSV cell); it is converted with
            ``str()`` before matching, so NaN/None are accepted.

    Returns:
        ``"undergrad"`` if only undergraduate terms are present, ``"grad"`` if
        only graduate terms are present, and ``"both"`` when both kinds or
        neither kind are found (unknown levels stay visible to every user).
    """
    import re  # function-scope import keeps this cell runnable on its own

    text = str(degrees_value).lower()

    ug_pattern = (
        r"\b(?:under[-\s]?grad(?:uate)?s?|bachelor(?:'?s)?"
        r"|b\.?sc|b\.?tech|b\.?a|b\.?com|b\.?engg?)\b"
    )
    grad_pattern = (
        r"\b(?:post[-\s]?grad(?:uate)?s?|grad(?:uate)?s?|master(?:'?s)?"
        r"|m\.?sc|m\.?s|m\.?tech|ph\.?d|doctora(?:l|te))\b"
    )

    is_ug = re.search(ug_pattern, text) is not None
    # Blank out undergraduate terms first so that forms such as "under-graduate"
    # cannot be picked up again by the bare "graduate" alternative.
    remainder = re.sub(ug_pattern, " ", text)
    is_grad = re.search(grad_pattern, remainder) is not None

    if is_grad and is_ug:
        return "both"
    if is_grad:
        return "grad"
    if is_ug:
        return "undergrad"
    # Nothing recognisable: do not exclude the scholarship for anyone.
    return "both"
    
def build_vectorstore_from_scholarships(csv_path=CSV_PATH):
    """Build a Chroma vector store holding one document per scholarship CSV row.

    Each row is rendered as a small "Field: value" text block (the embedded
    content) and carries metadata used later for filtering and display:
    ``row``, ``name``, ``provider``, ``closing_date``, ``degree``, ``level``
    (from ``normalize_level``) and ``link``.

    Args:
        csv_path: Path of the scholarship CSV. Defaults to ``CSV_PATH`` as it
            was when this function was defined.

    Returns:
        The Chroma vector store created from the (split) row documents.

    Raises:
        ValueError: If any required column is absent from the CSV.
    """
    df = pd.read_csv(csv_path)

    # Must match the CSV headers exactly
    required = ["Scholarship Name", "Provider", "Amount", "Closing Date", "Degree(s)", "Status", "Link"]
    missing = [c for c in required if c not in df.columns]
    if missing:
        raise ValueError(f"Missing columns in CSV: {missing}")

    placeholder = "Not specified"

    def _cell(value) -> str:
        """Render a CSV cell as text, substituting a placeholder for blank/NaN cells."""
        if pd.isna(value):
            return placeholder
        text = str(value).strip()
        return text or placeholder

    # Derive the normalized level from the raw cell, without adding a helper
    # column to the frame.
    levels = df["Degree(s)"].apply(normalize_level)

    docs = []
    for (i, row), level in zip(df.iterrows(), levels):
        name = _cell(row["Scholarship Name"])
        provider = _cell(row["Provider"])
        closing_date = _cell(row["Closing Date"])
        degree = _cell(row["Degree(s)"])
        link = _cell(row["Link"])
        content = (
            f"Scholarship Name: {name}\n"
            f"Provider: {provider}\n"
            f"Amount: {_cell(row['Amount'])}\n"
            f"Closing Date: {closing_date}\n"
            f"Degree(s): {degree}\n"
            f"Status: {_cell(row['Status'])}\n"
            f"Link: {link}"
        )
        docs.append(Document(
            page_content=content,
            # Plain str/int values only: blank cells would otherwise arrive as NaN floats.
            metadata={
                "row": int(i),
                "name": name,
                "provider": provider,
                "closing_date": closing_date,
                "degree": degree,
                "level": level,            # used for the UG/Grad filter
                "link": link,
            }
        ))

    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    chunks = splitter.split_documents(docs)

    # Deterministic ids (row number + chunk ordinal) so that rebuilding into the
    # same collection replaces entries instead of appending duplicates.
    # NOTE(review): relies on the Chroma wrapper upserting by id; confirm for the
    # installed version. Rows removed from the CSV are not purged by this.
    chunk_counts = {}
    ids = []
    for chunk in chunks:
        row_id = chunk.metadata["row"]
        ordinal = chunk_counts.get(row_id, 0)
        chunk_counts[row_id] = ordinal + 1
        ids.append(f"row-{row_id}-chunk-{ordinal}")

    vs = Chroma.from_documents(
        chunks,
        embedding=embeddings,
        ids=ids,
        collection_name="scholarship_whizz_db.csv",
        # persist_directory="./chroma_scholarships"
    )
    return vs

# Build once (or rebuild if the CSV changed). This reads the CSV at CSV_PATH
# and embeds every row, so it is the expensive step of the notebook.
vectorstore = build_vectorstore_from_scholarships()
# Retriever returning the 5 nearest chunks per query; retrieve_context uses it.
retriever = vectorstore.as_retriever(search_kwargs={"k": 5})

In [22]:
# Point the OpenAI-compatible client at the internal endpoint.
# Both assignments overwrite any OPENAI_API_KEY / OPENAI_BASE_URL already set
# in this process's environment.
# NOTE(review): "EMPTY" looks like a placeholder key for a server that does not
# check credentials; confirm before reusing this cell against another endpoint.
os.environ["OPENAI_API_KEY"] = "EMPTY"
os.environ["OPENAI_BASE_URL"] = "http://research-ai.tail69783d.ts.net:8001/v1"

# Notebook-global chat model. A later cell rebinds this same name to the
# tool-bound variant, so re-run this cell first to get a fresh, unbound client.
# temperature=0 and a 10000-token completion cap are passed to the client.
model = ChatOpenAI(
    base_url=os.environ["OPENAI_BASE_URL"],
    api_key=os.environ["OPENAI_API_KEY"],
    model="openai/gpt-oss-20b",
    temperature=0,
    max_completion_tokens=10000
)

In [33]:
# # Simple v1 user setting
# USER_LEVEL = "undergrad"   # or "grad". Later: set from Google Forms.

# def set_user_level(level: str):
#     global USER_LEVEL
#     level = level.strip().lower()
#     assert level in {"undergrad", "grad"}, "level must be 'undergrad' or 'grad'"
#     USER_LEVEL = level
#     print("User level set to:", USER_LEVEL)

# @tool
# def retrieve_context(query: str) -> str:
#     """
#     Retrieve relevant scholarships from the CSV index, filtered by USER_LEVEL.
#     """
#     results = retriever.invoke(query)

#     def match_level(md):
#         lvl = md.get("level", "both")
#         return (lvl == USER_LEVEL) or (lvl == "both")

#     filtered = [d for d in results if match_level(d.metadata)] or results

#     lines = []
#     for doc in filtered[:5]:
#         md = doc.metadata
#         # Extract Amount nicely
#         amt = "-"
#         if "Amount:" in doc.page_content:
#             try:
#                 amt = doc.page_content.split("Amount: ")[1].splitlines()[0]
#             except Exception:
#                 pass
#         lines.append(
#             f"- {md.get('name','(no name)')} | Level: {md.get('level','both')} | Provider: {md.get('provider','-')} | "
#             f"Amount: {amt} | Closing: {md.get('closing_date','-')} | Degree(s): {md.get('degree','-')} | Link: {md.get('link','-')}"
#         )
#     return "\n".join(lines) if lines else "No matching scholarships found."


In [34]:
from langchain_core.tools import tool

def _match_level(md_level: str, user_level: str) -> bool:
    md_level = (md_level or "both").lower()
    user_level = (user_level or "undergrad").lower()
    return md_level == "both" or md_level == user_level

def _profile_filter(md, profile: dict) -> bool:
    """Decide whether a document's metadata is compatible with the user profile.

    Args:
        md: Metadata mapping of a retrieved document; only ``"level"`` is read.
        profile: User profile mapping; only ``"level"`` is read.

    Returns:
        ``True`` when the document passes every active filter.
    """
    # The study level is the single active (and required) criterion.
    # Optional criteria (country_of_study, citizenship, field) can be AND-ed
    # into this result once the CSV carries matching metadata columns.
    level_ok = _match_level(md.get("level"), profile.get("level"))
    return True & level_ok

def _extract_amount(page_content: str) -> str:
    """Return the value of the first ``Amount:`` line of a document, or ``"-"`` if none."""
    for line in page_content.splitlines():
        if line.startswith("Amount:"):
            return line[len("Amount:"):].strip() or "-"
    return "-"


# The docstring below is left untouched on purpose: the @tool decorator uses it
# as the tool description, so editing it would change model-visible behavior.
#
# Reads the notebook globals PROFILE (user profile) and retriever (vector-store
# retriever). Returns one "- name | Level | Amount | Closing | Degree(s) | Link"
# line per scholarship (at most 5), or "No matching scholarships found.".
@tool
def retrieve_context(query: str) -> str:
    """Retrieve relevant scholarships personalized by the loaded PROFILE."""
    # Augment the query with profile keywords, if any. Tolerate a missing/null
    # entry and a single keyword stored as a plain string (joining a string
    # would otherwise split it into characters).
    keywords = PROFILE.get("keywords") or []
    if isinstance(keywords, str):
        keywords = [keywords]
    kw = " ".join(str(k) for k in keywords)
    composite_query = f"{query} {kw}".strip()

    candidates = retriever.invoke(composite_query)
    # Best-effort personalization: if the profile filter rejects everything,
    # fall back to the unfiltered candidates rather than returning nothing.
    filtered = [d for d in candidates if _profile_filter(d.metadata, PROFILE)] or candidates

    lines = []
    for doc in filtered[:5]:
        md = doc.metadata
        # The amount is only present in the page text, not in the metadata.
        amt = _extract_amount(doc.page_content)
        lines.append(
            f"- {md.get('name','(no name)')} | Level: {md.get('level','both')} | "
            f"Amount: {amt} | Closing: {md.get('closing_date','-')} | Degree(s): {md.get('degree','-')} | Link: {md.get('link','-')}"
        )
    return "\n".join(lines) if lines else "No matching scholarships found."


In [24]:
# Tools exposed to the agent, and the graph node that executes them.
tools = [retrieve_context]
tool_node = ToolNode(tools)

# Bind the tools to the chat model. This rebinds the global name `model`, so
# running the cell again calls bind_tools on the already-bound object; re-run
# the model-creation cell first for a clean start.
model = model.bind_tools(tools)

# Status message only: nothing in this cell invokes the model.
print("Internal vLLM model connected and tools bound successfully")

Internal vLLM model connected and tools bound successfully


In [25]:
def should_continue(state: MessagesState) -> Literal["tools", END]:
    """Route after the agent step: run tools if requested, otherwise finish.

    Args:
        state: Graph state; its ``"messages"`` list must be non-empty.

    Returns:
        ``"tools"`` when the most recent message has a truthy ``tool_calls``
        attribute, else ``END``.
    """
    latest = state["messages"][-1]
    # Messages without a `tool_calls` attribute are treated as "no tool calls".
    wants_tools = getattr(latest, "tool_calls", None)
    return "tools" if wants_tools else END

def call_model(state: MessagesState):
    """Agent step: send the conversation so far to the global ``model``.

    Args:
        state: Graph state whose ``"messages"`` entry is passed to the model.

    Returns:
        A state update ``{"messages": [response]}`` containing the model's reply.
    """
    # `model` is looked up at call time, so the most recent binding is used.
    reply = model.invoke(state["messages"])
    return {"messages": [reply]}


In [26]:
# Agent <-> tools loop over a message-list state.
workflow = StateGraph(MessagesState)
# "agent" calls the chat model; "tools" executes whatever tool calls it requested.
# The `tool_node` object existing at this point is the one registered in the graph.
workflow.add_node("agent", call_model)
workflow.add_node("tools", tool_node)

# Entry point is the agent; after each agent turn should_continue decides
# between running tools and ending; tool results always go back to the agent.
workflow.add_edge(START, "agent")
workflow.add_conditional_edges("agent", should_continue)
workflow.add_edge("tools", "agent")

<langgraph.graph.state.StateGraph at 0x147ab9880>

In [35]:
# Re-create the tool list and node after retrieve_context was redefined, and
# bind the tools to the model again.
# NOTE(review): `workflow` was built with the earlier `tool_node` object, and
# assigning a new ToolNode to the name here does not touch that graph. Rebuild
# the workflow if the graph should use the new tool.
tools = [retrieve_context]
tool_node = ToolNode(tools)
model = model.bind_tools(tools)  # `model` may already be tool-bound from the earlier binding cell

In [36]:
# checkpointer = MemorySaver()
# app = workflow.compile()

# # thread_id must be a string when using MemorySaver
# set_user_level("grad")  # choose session persona
# final_state = app.invoke(
#     {
#         "messages": [HumanMessage(content="print 5 scholarships in US")],
#         "configurable": {"thread_id": "42"}
#     }
# )
# print(final_state["messages"][-1].content)


In [37]:
from langchain_core.prompts import ChatPromptTemplate

# System prompt template. The {placeholders} are exactly the keys produced by
# build_prompt_inputs: level, country_of_study, citizenship, field,
# deadline_preference, keywords and retrieved.
SYSTEM_TMPL = """You are ScholarshipWhizz, a helpful assistant.

User profile:
- Level: {level}
- Country of study: {country_of_study}
- Citizenship: {citizenship}
- Field: {field}
- Deadline preference: {deadline_preference}
- Keywords: {keywords}

Use the retrieved scholarships below to propose the top options for this user.
Be concise and include scholarship name, amount, deadline, and link.
If something is missing, say so.
Retrieved candidates:
{retrieved}
"""

# Two-message chat prompt: the filled-in system template, then the user's
# question (the {question} key).
prompt = ChatPromptTemplate.from_messages([
    ("system", SYSTEM_TMPL),
    ("human", "{question}")
])

def build_prompt_inputs(question: str, retrieved_text: str, profile: dict):
    """Assemble the variable mapping expected by the chat prompt template.

    Args:
        question: The user's question (fills ``{question}``).
        retrieved_text: Pre-formatted scholarship candidates (fills ``{retrieved}``).
        profile: User profile mapping. Missing or null fields become ``""``;
            ``keywords`` may be a list, a single string, or missing/null.

    Returns:
        A dict with the keys ``level``, ``country_of_study``, ``citizenship``,
        ``field``, ``deadline_preference``, ``keywords`` (comma-separated
        string), ``retrieved`` and ``question``.
    """
    def _field(key: str):
        """Profile value for `key`, with missing/null rendered as an empty string."""
        value = profile.get(key, "")
        # A JSON null would otherwise show up as the text "None" in the prompt.
        return "" if value is None else value

    keywords = profile.get("keywords") or []
    if isinstance(keywords, str):
        # Joining a bare string would insert ", " between its characters.
        keywords = [keywords]

    return {
        "level": _field("level"),
        "country_of_study": _field("country_of_study"),
        "citizenship": _field("citizenship"),
        "field": _field("field"),
        "deadline_preference": _field("deadline_preference"),
        "keywords": ", ".join(str(k) for k in keywords),
        "retrieved": retrieved_text,
        "question": question,
    }


In [38]:
# End-to-end demo without the graph: retrieve, fill the prompt, ask the model.
# 1) Call the retrieval tool directly; it returns the formatted candidate lines.
retrieved = retrieve_context.invoke("Find scholarships about data/AI")
# 2) Merge the question, the candidates and the profile into the template variables.
inputs = build_prompt_inputs("Recommend the best 3 for me", retrieved, PROFILE)
msg = prompt.invoke(inputs)
# 3) Single model call. `model` is whatever the name is bound to at this point.
# NOTE(review): if it is the tool-bound model, the reply could be a tool call
# with empty content; confirm.
resp = model.invoke(msg.to_messages())
print(resp.content)

**Top 2 scholarships that match your profile (undergrad, Canada, international, CS, women‑in‑tech focus, 2025 deadline)**  

| Scholarship | Amount | Deadline | Link |
|-------------|--------|----------|------|
| **Canadian Women in STEM Scholarship** | **$10,000** | **30 Jan 2025** | [Apply here](https://universitystudy.ca/scholarship/14586/) |
| **Clare E. and Anne Winterbottom Scholarship** | **$2,600** (2 × $1,300 terms) | **30 Apr 2025** | [Apply here](https://universitystudy.ca/scholarship/14169/) |

**Why these are the best fit**

* **Canadian Women in STEM Scholarship** – specifically targets women in STEM fields, offers a substantial award, and has a 2025 deadline that fits your timeline.  
* **Clare E. and Anne Winterbottom Scholarship** – while not explicitly tech‑focused, it is open to all undergraduates, provides a decent award, and meets your deadline preference.

**Note:** The other retrieved scholarships either lack a specified amount, have a later deadline (2026), or a