In [13]:
from langsmith import traceable, utils
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from dotenv import load_dotenv
import re
import umd_rag
import pineconing
import umd_webscraper
import my_utils

import importlib

In [14]:
# Reload the local project modules so that edits made to their source files
# take effect in this kernel without a restart.
importlib.reload(umd_rag)
importlib.reload(pineconing)
importlib.reload(umd_webscraper)
importlib.reload(my_utils)

<module 'my_utils' from 'c:\\Users\\ethan\\Desktop\\umd\\s4 spring 2025\\HDCC209B\\UMD-Sustainability-Chatbot\\my_utils.py'>

Load environment variables from .env

In [15]:
# override=True: values from .env replace variables already set in the environment.
load_dotenv(override=True)

True

#### Web Scraping ####

In [4]:
# Root URLs of the UMD sites to crawl.
sites = [
    "https://sustainability.umd.edu/",
    "https://sustainingprogress.umd.edu/",
]

In [5]:
def get_site_name(site):
  """Return the subdomain of a UMD root URL.

  Args:
    site: URL of the exact form ``https://<subdomain>.umd.edu/``, where
      ``<subdomain>`` is one or more ASCII letters or digits.

  Returns:
    The ``<subdomain>`` part, e.g. ``"sustainability"`` for
    ``"https://sustainability.umd.edu/"``.

  Raises:
    ValueError: if ``site`` does not have that form.
  """
  # The dots are escaped so they match only a literal '.', and '+' requires a
  # non-empty subdomain. fullmatch anchors both ends of the string.
  matched = re.fullmatch(r'https://([A-Za-z0-9]+)\.umd\.edu/', site)
  if matched is None:
    # ValueError subclasses Exception, so broad `except Exception` handlers still catch it.
    raise ValueError("Not good site name (not umd)")
  return matched.group(1)

Scrape all sites

In [6]:
data_count = 0  # running total of len(scraper.data) across every site

for site in sites:
  # Resolve the name first: a malformed URL should fail before the slow crawl starts.
  site_name = get_site_name(site)

  scraper = umd_webscraper.UMDWebScraper(site)
  scraper.scrape(site)

  site_total = len(scraper.data)
  print(f"Total for {site_name} site: {site_total}")
  data_count += site_total
  # NOTE(review): the upsert cell reads "datafiles/umd_<name>_data.json" --
  # confirm that save_data writes into datafiles/.
  scraper.save_data(f"umd_{site_name}_data.json")

Scraping: https://sustainability.umd.edu/
Scraping: https://sustainability.umd.edu/education-and-research/majors-minors-graduate-programs
Scraping: https://sustainability.umd.edu/food
Scraping: https://sustainability.umd.edu/transportation
Scraping: https://sustainability.umd.edu/topics
Scraping: https://sustainability.umd.edu/sustainability-education-research
Scraping: https://sustainability.umd.edu/OS
Scraping: https://sustainability.umd.edu/about/office-sustainability/quinn-lugenbeel
Scraping: https://sustainability.umd.edu/energy
Scraping: https://sustainability.umd.edu/sustainability-grants/umd-sustainability-mini-grant
Scraping: https://sustainability.umd.edu/about
Scraping: https://sustainability.umd.edu/advisors
Scraping: https://sustainability.umd.edu/about/contact-us
Scraping: https://sustainability.umd.edu/sustainability-grants/umd-sustainability-fund-grant
Scraping: https://sustainability.umd.edu/sustainability-grants/other-funding-sources
Scraping: https://sustainability.u

KeyboardInterrupt: 

In [None]:
# Display the total accumulated by the scraping loop.
data_count

### Embedding into vector storage ###

In [16]:
# Create the project's vector database wrapper; later cells use its `index`
# attribute and its upsert helpers.
vector_db = pineconing.VectorDB()

Index exists already


In [17]:
# Show index statistics (dimension, metric, per-namespace vector counts).
vector_db.index.describe_index_stats()

{'dimension': 768,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'file_data': {'vector_count': 1620},
                'own_data': {'vector_count': 1}},
 'total_vector_count': 1621,
 'vector_type': 'dense'}

Loading the embedding model. Candidates (the cell below uses GoogleGenerativeAIEmbeddings):

- SentenceTransformer (all-MiniLM-L6-v2)
- GoogleGenerativeAIEmbeddings (models/embedding-001)

In [32]:
# Google Generative AI embedding model.
# NOTE(review): not referenced by any other cell visible here -- confirm it is still needed.
embedding_model = GoogleGenerativeAIEmbeddings(model='models/embedding-001')

#### Adding Data ####
We only need to do this once

From the scraped data files

In [None]:
# JSON files to load into the vector index.
files = [
    "datafiles/umd_sustainability_data.json",
    "datafiles/umd_sustainingprogress_data.json",
]

vector_db.upsert_files(files)

From our own data

In [8]:
# Hand-written fact added to the index through the project's upsert_own_data helper
# (presumably stored in the 'own_data' namespace -- confirm in pineconing.py).
our_data = "The shortened link for the water refilling stations website is https://ter.ps/heartthetap"
vector_db.upsert_own_data(our_data)

Delete everything in the `own_data` namespace and reset its ID count file

In [18]:
# Remove every vector stored in the 'own_data' namespace of the index.
vector_db.index.delete(delete_all=True, namespace='own_data')
# Reset the stored ID count to 0. Mode 'w' truncates the file before writing;
# overwriting in place with 'r+' would leave trailing digits behind
# (e.g. "12" -> "02"), so the count would not actually be reset.
with open("datafiles/own_data_id_count.txt", 'w') as file:
    file.write('0')

### Retrieving and Generating (RAG) with LangChain/LangSmith tracing ###

In [16]:
# Check whether LangSmith tracing is enabled for this environment.
utils.tracing_is_enabled()

True

### Using LangChain Google Generative AI ###

In [10]:
# Gemini model name passed to the LangChain chat wrapper.
google_model = "gemini-2.0-flash-lite"
llm = ChatGoogleGenerativeAI(model=google_model)

# Build the project's RAG object from the vector store and the chat model.
rag = umd_rag.UMDRAG(vector_db, llm)

In [12]:
# Ask a sample question, requesting metadata, then display the result's
# 'metadata' entry (the retrieved matches).
piped = rag.pipe("What are some of the challenges of sustainability at UMD?", include_metadata=True)
piped['metadata']

[{
     "namespace": "file_data",
     "score": 0.77050668,
     "id": "sustainability_119",
     "values": [],
     "metadata": {
         "Content": "Campus programs you can take part in University initiatives to address sustainability in these areas UMD's key goals, achievements, and commitments ",
         "Header": "Explore Sustainability Topics",
         "Link": "https://sustainability.umd.edu/topics",
         "Site_Title": "Sustainability Impact Areas | SustainableUMD"
     }
 },
 {
     "namespace": "file_data",
     "score": 0.770001054,
     "id": "sustainingprogress_220",
     "values": [],
     "metadata": {
         "Content": "All Schools and Colleges at UMD include academic and research opportunities relating to sustainability. Staff and administrative teams throughout the university incorporate sustainability into the university operations. We're committed to addressing the grand challenges of climate change and sustainability. The SustainableUMD Network connects stud

In [35]:
# Format the retrieved matches from the pipeline result as a readable summary.
# Uses piped['metadata'] directly: `meta` is not defined by any cell in this
# notebook, so the cell would raise NameError on a fresh kernel.
a = my_utils.organize_retrieval(piped['metadata'])
print(a)

Top 3 retrieved documents:
1. sustainability_119
- Text: Campus programs you can take part in University initiatives to address sustainability in these areas UMD's key goals, achievements, and commitments 
- Link: https://sustainability.umd.edu/topics
- Score: 0.77050668

2. sustainingprogress_220
- Text: All Schools and Colleges at UMD include academic and research opportunities relating to sustainability. Staff and administrative teams throughout the university incorporate sustainability into the university operations. We're committed to addressing the grand challenges of climate change and sustainability. The SustainableUMD Network connects students, staff, faculty, and other community members to sustainability-related opportunities at UMD. This section curates a collection of resources -- reports, articles, newsletters, social media accounts, and other resources -- created by many sustainability-affiliated units across campus. 
- Link: https://sustainingprogress.umd.edu/celebrating