In [29]:
from langsmith import traceable, utils
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from dotenv import load_dotenv
import re
import umd_rag
import pineconing
import umd_webscraper
import my_utils

import importlib

In [34]:
# Re-import the project-local modules so edits saved to their source files
# take effect in this kernel without a restart.
# NOTE(review): objects created before a reload (e.g. an existing vector_db or
# rag) presumably keep using the old class definitions -- re-create them after
# reloading.
importlib.reload(umd_rag)
importlib.reload(pineconing)
importlib.reload(umd_webscraper)
importlib.reload(my_utils)

<module 'my_utils' from 'c:\\Users\\ethan\\Desktop\\umd\\s4 spring 2025\\HDCC209B\\UMD-Sustainability-Chatbot\\my_utils.py'>

Load environment variables from .env

In [2]:
# override=True: values from .env replace variables already set in the process environment.
load_dotenv(override=True)

True

#### Web Scraping ####

In [3]:
# Root URLs to scrape. Each must have the form https://<name>.umd.edu/
# (trailing slash included), otherwise get_site_name() rejects it.
sites = ["https://sustainability.umd.edu/",
         "https://sustainingprogress.umd.edu/"]

In [4]:
def get_site_name(site):
  """Return the subdomain of a UMD site URL.

  The URL must have exactly the form ``https://<name>.umd.edu/`` where
  ``<name>`` is one or more ASCII letters or digits, e.g.
  ``"https://sustainability.umd.edu/"`` -> ``"sustainability"``.

  Args:
    site: URL string to parse.

  Returns:
    The ``<name>`` part of the URL.

  Raises:
    ValueError: if ``site`` does not have the expected form. ``ValueError``
      is a subclass of ``Exception``, so existing ``except Exception``
      handlers keep working.
  """
  # The dots are escaped so they only match a literal "."; left unescaped they
  # match any character and a URL such as "https://evilXumd.edu/" is accepted.
  # "+" rejects an empty name, and fullmatch (unlike "$") does not tolerate a
  # trailing newline.
  matched = re.fullmatch(r'https://([A-Za-z0-9]+)\.umd\.edu/', site)
  if matched is None:
    raise ValueError("Not good site name (not umd)")
  return matched.group(1)

Scrape all sites

In [None]:
# Scrape each configured site in turn, report how many entries ended up in
# scraper.data, and save every site's data to its own JSON file.
# NOTE(review): get_site_name() runs after the scrape, so a URL it rejects is
# only reported once that site's scraping has already finished.
data_count = 0  # running total of len(scraper.data) across all sites

for site in sites:
  scraper = umd_webscraper.UMDWebScraper(site)
  scraper.scrape(site)

  site_name = get_site_name(site)
  output_file = f"umd_{site_name}_data.json"

  print(f"Total for {site_name} site: {len(scraper.data)}")
  data_count = data_count + len(scraper.data)
  scraper.save_data(output_file)

In [None]:
# Sum of len(scraper.data) over every site scraped by the loop above.
data_count

### Embedding into vector storage ###

In [13]:
# Create the vector-store wrapper from the local pineconing module.
# NOTE(review): the recorded output "Index exists already" suggests the
# constructor reuses an existing index instead of creating one -- confirm in
# pineconing.VectorDB.
vector_db = pineconing.VectorDB()

Index exists already


In [14]:
# Sanity check: show the index statistics (dimension, metric, per-namespace vector counts).
vector_db.index.describe_index_stats()

{'dimension': 768,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'file_data': {'vector_count': 1620}},
 'total_vector_count': 1620,
 'vector_type': 'dense'}

Embedding model options (the Google model is the one loaded below):

- SentenceTransformer (all-MiniLM-L6-v2)
- GoogleGenerativeAIEmbeddings (models/embedding-001)

In [5]:
# Instantiate the Google Generative AI embedding model.
# NOTE(review): embedding_model is not referenced by any other cell visible in
# this notebook -- confirm whether this cell is still needed.
embedding_model = GoogleGenerativeAIEmbeddings(model='models/embedding-001')

#### Adding Data ####
We only need to do this once; the upserted data persists in the vector index.

From the scraped JSON files

In [None]:
# JSON data files to upsert into the vector store.
# NOTE(review): the scraping cell calls save_data("umd_<site>_data.json")
# without a "datafiles/" prefix -- confirm that save_data() writes into
# datafiles/ or that the files were moved there by hand.
files = ["datafiles/umd_sustainability_data.json",
         "datafiles/umd_sustainingprogress_data.json"]

vector_db.upsert_files(files)

From our own text

In [15]:
# Upsert a hand-written fact as a plain string, in addition to the file data.
our_data = "The shortened link for our water refilling stations is ter.ps/heartthetap"
vector_db.upsert_own_data(our_data)

### Retrieving and Generating (RAG) with langchain/langsmith tracing ###

In [16]:
# Confirm that LangSmith tracing is switched on before running the pipeline.
utils.tracing_is_enabled()

True

### Using LangChain Google Generative AI ###

In [21]:
# Gemini chat model, wrapped by LangChain's Google Generative AI integration.
google_model = "gemini-2.0-flash-lite"
llm = ChatGoogleGenerativeAI(model=google_model)

# RAG pipeline from the local umd_rag module, built from the vector store and the chat model.
rag = umd_rag.UMDRAG(vector_db, llm)

In [26]:
# Send one question through the RAG pipeline, then print the model's answer
# followed by the retrieval metadata returned with it.
question = "What are some of the challenges of sustainability at UMD?"
piped = rag.pipe(question, include_metadata=True)
meta = piped['metadata']  # kept as a notebook-level name for the next cell

print(f"AI:\n{piped['answer'].content}")
print()
print(f"Metadata:\n{meta}")

AI:
UMD is committed to addressing the grand challenges of climate change and sustainability. The university recognizes that healthy water, air, and ecosystems are foundational to a thriving society. UMD aims to address sustainability issues by utilizing the campus as a living lab to develop leadership, innovation, and sustainability solutions. Students are concerned about major environmental issues and feel that the university should continue to advance efforts to address these issues.

Metadata:
[{
    "namespace": "file_data",
    "score": 0.77050668,
    "id": "sustainability_119",
    "values": [],
    "metadata": {
        "Content": "Campus programs you can take part in University initiatives to address sustainability in these areas UMD's key goals, achievements, and commitments ",
        "Header": "Explore Sustainability Topics",
        "Link": "https://sustainability.umd.edu/topics",
        "Site_Title": "Sustainability Impact Areas | SustainableUMD"
    }
}, {
    "namespa

In [35]:
# Turn the retrieval metadata from the previous cell into a readable summary and print it.
a = my_utils.organize_retrieval(meta)
print(a)

Top 3 retrieved documents:
1. sustainability_119
- Text: Campus programs you can take part in University initiatives to address sustainability in these areas UMD's key goals, achievements, and commitments 
- Link: https://sustainability.umd.edu/topics
- Score: 0.77050668

2. sustainingprogress_220
- Text: All Schools and Colleges at UMD include academic and research opportunities relating to sustainability. Staff and administrative teams throughout the university incorporate sustainability into the university operations. We're committed to addressing the grand challenges of climate change and sustainability. The SustainableUMD Network connects students, staff, faculty, and other community members to sustainability-related opportunities at UMD. This section curates a collection of resources -- reports, articles, newsletters, social media accounts, and other resources -- created by many sustainability-affiliated units across campus. 
- Link: https://sustainingprogress.umd.edu/celebrating