In [56]:
from langsmith import traceable, utils
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from sentence_transformers import SentenceTransformer
from dotenv import load_dotenv
import re
import umd_rag
import pineconing
import umd_webscraper
import my_utils

import importlib

In [71]:
# Re-import the project modules so that edits made to their source files take
# effect without restarting the kernel. The value of the last call (the
# reloaded my_utils module) is what the cell displays as its output.
importlib.reload(umd_rag)
importlib.reload(pineconing)
importlib.reload(umd_webscraper)
importlib.reload(my_utils)

<module 'my_utils' from 'c:\\Users\\ethan\\Desktop\\umd\\s4 spring 2025\\HDCC209B\\UMD-Sustainability-Chatbot\\my_utils.py'>

Load environment variables from .env

In [45]:
# override=True: values found in .env replace variables that are already set
# in the process environment (the default would keep the existing values).
load_dotenv(override=True)

True

#### Web Scraping ####

In [11]:
# Root URLs to crawl. Each one must look like https://<name>.umd.edu/
# (trailing slash included); get_site_name() raises for anything else.
sites = ["https://sustainability.umd.edu/",
         "https://sustainingprogress.umd.edu/"]

In [12]:
def get_site_name(site):
  """Return the subdomain of a UMD site root URL.

  For example, "https://sustainability.umd.edu/" gives "sustainability".

  Args:
    site: Root URL of the form "https://<name>.umd.edu/", where <name> is one
      or more ASCII letters or digits and the trailing slash is required.

  Returns:
    The <name> part of the URL as a string.

  Raises:
    ValueError: If `site` does not have exactly that form. ValueError is a
      subclass of Exception, so callers catching Exception are unaffected.
  """
  # The dots are escaped so they match only a literal "." (an unescaped "."
  # matches any character, which let strings like "https://fooXumdYedu/"
  # through), and "+" rejects an empty subdomain. fullmatch anchors both ends.
  matched = re.fullmatch(r'https://([A-Za-z0-9]+)\.umd\.edu/', site)
  if matched is None:
    raise ValueError("Not good site name (not umd)")
  return matched.group(1)

Scrape all sites

In [None]:
# Resolve every site's short name before crawling anything, so that a URL
# get_site_name() rejects fails here rather than after a full crawl whose
# results would then never be saved.
named_sites = [(site, get_site_name(site)) for site in sites]

# Running total of scraped entries across all sites.
data_count = 0

for site, site_name in named_sites:
  scraper = umd_webscraper.UMDWebScraper(site)
  scraper.scrape(site)

  entry_count = len(scraper.data)
  print(f"Total for {site_name} site: {entry_count}")
  data_count += entry_count
  # One JSON file per site, named after its subdomain.
  scraper.save_data(f"datafiles/umd_{site_name}_data.json")

Scraping: https://sustainability.umd.edu/
Scraping: https://sustainability.umd.edu/transportation
Scraping: https://sustainability.umd.edu/about/contact-us
Scraping: https://sustainability.umd.edu/OS
Scraping: https://sustainability.umd.edu/about/office-sustainability/trisha-raghuram
Scraping: https://sustainability.umd.edu/news
Scraping: https://sustainability.umd.edu/sustainability-grants/umd-sustainability-mini-grant
Scraping: https://sustainability.umd.edu/education-and-research/majors-minors-graduate-programs
Scraping: https://sustainability.umd.edu/sustainability-education-research
Scraping: https://sustainability.umd.edu/progress/reports-other-resources
Scraping: https://sustainability.umd.edu/about/sustainability-council
Scraping: https://sustainability.umd.edu/node/155
Scraping: https://sustainability.umd.edu/education-and-research/living-learning-programs
Scraping: https://sustainability.umd.edu/about
Scraping: https://sustainability.umd.edu/waste
Scraping: https://sustainabi

In [19]:
# Spot-check the 'Content' field of the first scraped entry. `scraper` here is
# the variable left behind by the scrape loop, i.e. the scraper for the last
# URL in `sites`.
print(scraper.data[0]['Content'])

Welcome to the Office of Sustainability's web portal for reporting UMD's measurable steps toward achieving campus sustainability goals. This portal also makes connections between UMD's campus-based progress and support for global sustainable development. Explore campus data 
Site Title: SustainableUMD Progress Hub | Office of Sustainability
Header: SustainableUMD ProgressHub
Link: https://sustainingprogress.umd.edu/


In [14]:
# Total number of scraped entries, summed over all sites by the scrape loop.
data_count

1590

### Embedding into vector storage ###

In [58]:
# Create the project's vector-store wrapper; its `.index` attribute is used
# below for stats and deletes.
# NOTE(review): presumably this connects to (or creates) the Pinecone index —
# confirm in pineconing.py.
vector_db = pineconing.VectorDB()

Index exists already


In [59]:
# Show the index statistics (dimension, metric, vector count per namespace)
# to see what is already stored before adding anything.
vector_db.index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'file_data': {'vector_count': 1590}},
 'total_vector_count': 1590,
 'vector_type': 'dense'}

#### Adding Data ####
Run the cells in this section only once, when first populating the index.

By files

In [53]:
# The JSON files written by the scrape loop, one per site
# (datafiles/umd_<site name>_data.json).
files = ["datafiles/umd_sustainability_data.json",
         "datafiles/umd_sustainingprogress_data.json"]

# NOTE(review): presumably embeds each file's entries and upserts them in
# batches, as the printed output suggests — confirm in pineconing.py.
vector_db.upsert_files(files)

Embedding everything from sustainability data
Embedding everything from sustainingprogress data
Upserting batch 0
Upserting batch 1
Upserting batch 2
Upserting batch 3
Upserting batch 4
Upserting batch 5
Upserting batch 6
Upserting batch 7


By own data

In [79]:
# A hand-written fact to add to the index alongside the scraped site data.
our_data = "The shortened link for the water refilling stations website is https://ter.ps/heartthetap"
# NOTE(review): presumably stored in the 'own_data' namespace (the index stats
# show one vector there) — confirm in pineconing.py.
vector_db.upsert_own_data(our_data)

Delete file data (every vector in the `file_data` namespace)

In [71]:
# Destructive: removes every vector in the 'file_data' namespace of the index.
# Run only when the scraped data is going to be re-embedded.
vector_db.index.delete(delete_all=True, namespace='file_data')

{}

Delete own data (every vector in the `own_data` namespace) and reset the ID count in `datafiles/own_data_id_count.txt` to 0

In [31]:
# Destructive: removes every vector in the 'own_data' namespace of the index,
# then resets the count stored in datafiles/own_data_id_count.txt to 0.
vector_db.index.delete(delete_all=True, namespace='own_data')
# 'r+' requires the counter file to exist already; it is not created here.
with open("datafiles/own_data_id_count.txt", 'r+') as file:
    file.seek(0)
    file.write('0')
    # Cut the file off right after the "0". Without this, a previous
    # multi-digit count such as "12" would be left as "02" rather than "0".
    file.truncate()

In [80]:
# Re-check the index statistics to confirm the per-namespace vector counts
# after the upserts/deletes above.
vector_db.index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'file_data': {'vector_count': 1590},
                'own_data': {'vector_count': 1}},
 'total_vector_count': 1591,
 'vector_type': 'dense'}

### Retrieving and Generating (RAG) with LangChain/LangSmith tracing ###

In [81]:
# Confirm that LangSmith tracing is switched on before running the pipeline.
# NOTE(review): presumably controlled by variables loaded from .env — confirm.
utils.tracing_is_enabled()

True

### Using LangChain's Google Generative AI integration ###

In [75]:
# Name of the Gemini chat model passed to the LangChain wrapper.
google_model = "gemini-2.5-flash"
llm = ChatGoogleGenerativeAI(model=google_model)

# Build the project's RAG object from the vector store and the chat model.
rag = umd_rag.UMDRAG(vector_db, llm)

In [78]:
# NOTE(review): `query` is not assigned in any cell visible above this one, so
# this cell raises NameError on a fresh kernel unless it is defined elsewhere —
# define the question string in a cell before this one.
piped = rag.pipe(query, include_metadata=True)
# The 'answer' entry is a chat message object; `.content` is its text.
answer = piped['answer'].content
print(answer)

The provided information highlights UMD's progress and commitment to sustainability, including addressing "society's Grand Challenges" and the "grand challenges of climate change and sustainability." However, it does not explicitly detail specific challenges that UMD faces in its sustainability efforts on campus.

The context focuses on:
*   Interactive summaries of UMD's progress (2020-2021).
*   Articles highlighting individuals and departments driving sustainability.
*   The role of the Sustainability Council in integrating sustainability into teaching, research, and service.
*   The Sustainability Advisors program, which aims to inspire students to get involved with "ending poverty, fighting inequality, enhancing public health, stopping climate change, and more," describing this as a "bold agenda" and supporting "Climate Action goals and President Pines' call to address society's Grand Challenges."
*   Academic and research opportunities, and administrative teams incorporating sust