In [1]:
from langsmith import traceable, utils
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from dotenv import load_dotenv
import re
import umd_rag
import pineconing
import umd_webscraper

# Reload the helper modules imported above so that edits made to their source
# files are picked up without restarting the kernel.
import importlib
importlib.reload(umd_rag)
importlib.reload(pineconing)
importlib.reload(umd_webscraper)

  from .autonotebook import tqdm as notebook_tqdm


<module 'umd_webscraper' from 'c:\\Users\\ethan\\Desktop\\umd\\s4 spring 2025\\HDCC209B\\UMD-Sustainability-Chatbot\\umd_webscraper.py'>

Load environment variables from .env

In [2]:
# override=True lets values from .env replace variables that are already set in
# the process environment. The call returns True (shown below) when at least
# one variable was set.
load_dotenv(override=True)

True

### Web Scraping ###

In [3]:
# Root URLs to scrape. Each entry must have the form https://<name>.umd.edu/
# because get_site_name derives the output file name from <name>.
sites = ["https://sustainability.umd.edu/",
         "https://sustainingprogress.umd.edu/"]

In [4]:
# Compiled once at cell execution. The dots are escaped so they match only a
# literal ".", and "+" requires a non-empty subdomain.
_UMD_SITE_RE = re.compile(r'https://([A-Za-z0-9]+)\.umd\.edu/')


def get_site_name(site):
  """Return the subdomain of a UMD site URL.

  For example, "https://sustainability.umd.edu/" gives "sustainability".

  Args:
    site: URL of the exact form https://<name>.umd.edu/ where <name> is one
      or more ASCII letters or digits.

  Returns:
    The <name> part of the URL as a string.

  Raises:
    ValueError: if the URL does not have that form. ValueError is a subclass
      of Exception, so callers catching Exception are unaffected.
  """
  # fullmatch anchors both ends of the string, so nothing (not even a trailing
  # newline) may precede or follow the expected URL.
  matched = _UMD_SITE_RE.fullmatch(site)
  if matched is None:
    raise ValueError("Not good site name (not umd)")
  return matched.group(1)

Scrape all sites

In [None]:
# Running total of scraped entries across every site.
data_count = 0

for site in sites:
  # Resolve the name before crawling so that an unsupported URL raises
  # immediately instead of after a full scrape has already been done.
  site_name = get_site_name(site)

  scraper = umd_webscraper.UMDWebScraper(site)
  scraper.scrape(site)

  site_total = len(scraper.data)
  print(f"Total for {site_name} site: {site_total}")
  data_count += site_total

  # One JSON file per site; the upsert step further down reads these names.
  scraper.save_data(f"umd_{site_name}_data.json")

In [None]:
# Display the combined number of scraped entries from all sites.
data_count

### Embedding into vector storage ###

In [5]:
# Create the vector-store wrapper from the local pineconing module.
# NOTE(review): the "Index exists already" line below is presumably printed by
# this constructor when it finds an existing index; confirm in pineconing.py.
vector_db = pineconing.VectorDB()

Index exists already


In [6]:
# Sanity check: show the index dimension, metric and per-namespace vector counts.
vector_db.index.describe_index_stats()

{'dimension': 768,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'file_data': {'vector_count': 1620},
                'own_data': {'vector_count': 1}},
 'total_vector_count': 1621,
 'vector_type': 'dense'}

Loading the embedding model. Options:

- SentenceTransformer (all-MiniLM-L6-v2)
- GoogleGenerativeAIEmbeddings (models/embedding-001), the one loaded in the next cell

In [7]:
# NOTE(review): embedding_model is not passed to VectorDB or UMDRAG in any of
# the cells shown here; presumably those modules build their own embedder.
# Confirm whether this cell is still needed.
embedding_model = GoogleGenerativeAIEmbeddings(model='models/embedding-001')

#### Adding Data ####
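We only need to do this once: the index already contains these vectors (see the stats above), so the cells below can be skipped on later runs.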

By files

In [None]:
# These names must match the files written by the scraping loop, which saves
# each site as umd_<site name>_data.json.
files = ["umd_sustainability_data.json",
         "umd_sustainingprogress_data.json"]

vector_db.upsert_files(files)

By own data

In [None]:
# A hand-written fact that is not taken from the scraped files.
# NOTE(review): presumably stored in the 'own_data' namespace listed in the
# index stats; confirm in pineconing.py.
our_data = "The shortened link for our water refilling stations is ter.ps/heartthetap"
vector_db.upsert_own_data(our_data)

### Retrieving and Generating (RAG) with LangChain/LangSmith tracing ###

In [8]:
# Check that LangSmith tracing is switched on before running the pipeline;
# the expected result is True.
utils.tracing_is_enabled()

True

### Using LangChain Google Generative AI ###

In [9]:
# Chat model that produces the final answer.
google_model = "gemini-2.0-flash-lite"
llm = ChatGoogleGenerativeAI(model=google_model)

# Combine the vector store and the chat model in the local umd_rag pipeline.
rag = umd_rag.UMDRAG(vector_db, llm)

In [10]:
# pipe returns an AIMessage here (see the output below); the answer text itself
# is in its .content attribute, and the rest is response and usage metadata.
rag.pipe("What are some of the challenges of sustainability at UMD?")


AIMessage(content='UMD is committed to addressing the grand challenges of climate change and sustainability. The university recognizes the importance of healthy water, air, and ecosystems for a thriving society. UMD strives to empower students, faculty, and staff to take sustainable steps by bringing local and global systems into balance, redesigning processes to conserve resources, and experimenting with novel sustainable practices.', additional_kwargs={}, response_metadata={'prompt_feedback': {'block_reason': 0, 'safety_ratings': []}, 'finish_reason': 'STOP', 'model_name': 'gemini-2.0-flash-lite', 'safety_ratings': []}, id='run-e88d329a-f153-44e1-a03e-ca17c937a61d-0', usage_metadata={'input_tokens': 1432, 'output_tokens': 72, 'total_tokens': 1504, 'input_token_details': {'cache_read': 0}})