In [1]:
from langsmith import traceable, utils
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from sentence_transformers import SentenceTransformer
from dotenv import load_dotenv
import re
import umd_rag
import pineconing
import umd_webscraper
import my_utils

import importlib

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Reload the helper modules imported above so that edits to their source
# files are picked up without restarting the kernel.
importlib.reload(umd_rag)
importlib.reload(pineconing)
importlib.reload(umd_webscraper)
importlib.reload(my_utils)

<module 'my_utils' from 'c:\\Users\\ethan\\Desktop\\umd\\s4 spring 2025\\HDCC209B\\UMD-Sustainability-Chatbot\\my_utils.py'>

Load environment variables from .env

In [3]:
# override=True: values found in .env replace variables that are already set
# in the process environment.
load_dotenv(override=True)

True

#### Web Scraping ####

In [11]:
# Root URLs to scrape. Each one has to look like https://<name>.umd.edu/
# (trailing slash included), because get_site_name derives the output file
# name from that pattern.
sites = ["https://sustainability.umd.edu/",
         "https://sustainingprogress.umd.edu/"]

In [12]:
def get_site_name(site):
  """Return the subdomain of a UMD root URL.

  Args:
    site: A URL of the exact form ``https://<name>.umd.edu/`` where
      ``<name>`` is one or more ASCII letters or digits.

  Returns:
    The ``<name>`` part, e.g. ``"sustainability"`` for
    ``"https://sustainability.umd.edu/"``.

  Raises:
    ValueError: If ``site`` does not have that form.
  """
  # The dots are escaped so they only match a literal '.', and '+' (not '*')
  # rejects an empty subdomain such as "https://.umd.edu/". fullmatch anchors
  # the pattern to the whole string.
  matched = re.fullmatch(r'https://([A-Za-z0-9]+)\.umd\.edu/', site)
  if matched is None:
    raise ValueError("Not good site name (not umd)")
  return matched.group(1)

Scrape all sites

In [None]:
# Running total of scraped records across every site.
data_count = 0

for site in sites:
  # Validate the URL before crawling: a malformed entry now fails immediately
  # instead of after the whole site has been scraped and before it is saved.
  site_name = get_site_name(site)

  scraper = umd_webscraper.UMDWebScraper(site)
  scraper.scrape(site)

  print(f"Total for {site_name} site: {len(scraper.data)}")
  data_count += len(scraper.data)
  # One JSON file per site; these paths are what the embedding step reads.
  scraper.save_data(f"datafiles/umd_{site_name}_data.json")

Scraping: https://sustainability.umd.edu/
Scraping: https://sustainability.umd.edu/transportation
Scraping: https://sustainability.umd.edu/about/contact-us
Scraping: https://sustainability.umd.edu/OS
Scraping: https://sustainability.umd.edu/about/office-sustainability/trisha-raghuram
Scraping: https://sustainability.umd.edu/news
Scraping: https://sustainability.umd.edu/sustainability-grants/umd-sustainability-mini-grant
Scraping: https://sustainability.umd.edu/education-and-research/majors-minors-graduate-programs
Scraping: https://sustainability.umd.edu/sustainability-education-research
Scraping: https://sustainability.umd.edu/progress/reports-other-resources
Scraping: https://sustainability.umd.edu/about/sustainability-council
Scraping: https://sustainability.umd.edu/node/155
Scraping: https://sustainability.umd.edu/education-and-research/living-learning-programs
Scraping: https://sustainability.umd.edu/about
Scraping: https://sustainability.umd.edu/waste
Scraping: https://sustainabi

In [19]:
# `scraper` is whichever instance the scraping loop assigned last, so this
# shows the first record of the final entry in `sites`.
print(scraper.data[0]['Content'])

Welcome to the Office of Sustainability's web portal for reporting UMD's measurable steps toward achieving campus sustainability goals. This portal also makes connections between UMD's campus-based progress and support for global sustainable development. Explore campus data 
Site Title: SustainableUMD Progress Hub | Office of Sustainability
Header: SustainableUMD ProgressHub
Link: https://sustainingprogress.umd.edu/


In [14]:
data_count

1590

### Embedding into vector storage ###

In [4]:
# Project wrapper around the vector index; later cells use its `index`
# attribute and its upsert_files / upsert_own_data helpers.
vector_db = pineconing.VectorDB()

Index exists already


In [5]:
vector_db.index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'file_data': {'vector_count': 1590},
                'own_data': {'vector_count': 2}},
 'total_vector_count': 1592,
 'vector_type': 'dense'}

#### Adding Data ####
We only need to do this once; skip these cells if the index is already populated.

By files

In [53]:
# JSON files written by the scraping loop (datafiles/umd_<site name>_data.json).
files = ["datafiles/umd_sustainability_data.json",
         "datafiles/umd_sustainingprogress_data.json"]

vector_db.upsert_files(files)

Embedding everything from sustainability data
Embedding everything from sustainingprogress data
Upserting batch 0
Upserting batch 1
Upserting batch 2
Upserting batch 3
Upserting batch 4
Upserting batch 5
Upserting batch 6
Upserting batch 7


By own data

In [10]:
# A hand-written fact to add alongside the scraped pages.
# NOTE(review): presumably stored in the 'own_data' namespace — confirm in pineconing.py.
our_data = "The shortened link for the water refilling stations website is https://ter.ps/heartthetap"
vector_db.upsert_own_data(our_data)

Delete file data

In [71]:
# Destructive: removes every vector in the 'file_data' namespace.
vector_db.index.delete(delete_all=True, namespace='file_data')

{}

Delete own data and reset count

In [8]:
# Destructive: removes every vector in the 'own_data' namespace, then resets
# the counter file to "0".
vector_db.index.delete(delete_all=True, namespace='own_data')
with open("datafiles/own_data_id_count.txt", 'r+') as file:
    file.seek(0)
    file.write('0')
    # 'r+' keeps the old contents, so without truncating, a previous
    # multi-digit value such as "12" would be left as "02" rather than "0".
    file.truncate()

In [25]:
# Remove a single vector, by ID, from the 'own_data' namespace.
vector_db.index.delete(['own_data_1'], namespace='own_data')

{}

In [28]:
vector_db.index.fetch(['own_data_0'], namespace='own_data')

FetchResponse(namespace='own_data', vectors={'own_data_0': Vector(id='own_data_0', values=[-0.0458585434, -0.0130797494, -0.00269041327, 0.0263503268, 0.00474185823, 0.0276646279, -0.0591854192, 0.00725139072, -0.0548413694, -0.0541994199, -0.0456622243, 0.0325723812, 0.0184200816, 0.00550129125, -0.0646630228, -0.0807948932, -0.0691601, 0.0392280743, 0.0301762037, -0.0140081309, 0.0420468301, -0.00557948509, -0.0549777299, -0.0220120344, 0.00502859708, 0.0374645256, -0.0760904327, 0.0330352224, 0.0163584054, 0.0245779436, -0.0579745099, 0.0352621749, 0.0915637463, -0.118237056, 0.060236, 0.0660546571, 0.020591991, 0.0096854642, -0.00692258, 0.0519558154, 0.0114249783, -0.0768825784, 0.00756506249, 0.127229154, -0.0109763751, 0.0473845899, -0.0579140484, -0.0459274128, -0.0147999637, 0.0512433574, -0.00297062239, -0.0547073521, -0.0396010801, 0.082407631, -0.0190310683, -0.0936329663, -0.0327521078, -0.0637850165, -0.0172274932, 0.0314972289, 0.0465377457, -0.00642039767, -0.100367449,

In [26]:
vector_db.index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'file_data': {'vector_count': 1590},
                'own_data': {'vector_count': 2}},
 'total_vector_count': 1592,
 'vector_type': 'dense'}

### Retrieving and Generating (RAG) with langchain/langsmith tracing ###

In [6]:
utils.tracing_is_enabled()

True

### Using langchain google generative ai ###

In [17]:
# Model name handed to the Gemini chat client; the commented-out line below
# is an alternative model to switch to.
google_model = "gemini-2.5-flash"
# google_model = "gemini-2.0-flash-lite"
llm = ChatGoogleGenerativeAI(model=google_model)

# The RAG pipeline is built from the vector store and the chat model.
rag = umd_rag.UMDRAG(vector_db, llm)

In [12]:
query = "How to dispose of batteries at UMD?"

In [18]:
# pipe() returns a mapping; its 'answer' entry carries the reply text in
# the .content attribute.
piped = rag.pipe(query, include_metadata=True)
answer = piped['answer'].content
print(answer)

The University of Maryland encourages the responsible recycling of batteries, even though common alkaline batteries can legally be disposed of in the trash.

Here's how you can dispose of batteries at UMD:

*   **Use Campus Recycling Bins:** Batteries can be recycled in special bins located across campus. If you are in a lab setting, you can find a bin near your lab.
*   **UMD's Recycling Program:** Through a Sustainability Fund Mini-Grant, UMD has a program in place to safely recycle alkaline batteries, diverting thousands of pounds from landfills. The Office of Environmental Affairs (OEA) works with one of the largest battery recyclers in the US to collect and process used batteries.
*   **For Assistance:** If you have any questions about battery recycling or need help locating a bin, you can contact `recycle@umd.edu` for assistance.
