In [28]:
import os
from pinecone import Pinecone, ServerlessSpec

import json
import re
from langsmith import traceable, utils
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from dotenv import load_dotenv

Load environment variables from .env

In [55]:
load_dotenv(override=True)


True

Index for SentenceTransformer embeddings: "umdsustainabilitychatbot"

Index for GoogleGenerativeAI embeddings (from langchain_google_genai): "umdsustainabilitychatbot2" - use this

### Create Pinecone / Get Index ###

In [56]:
pc = Pinecone()

In [57]:
index_name = "umdsustainabilitychatbot2"

existing_indexes = [index["name"] for index in pc.list_indexes()]

# create only if it doesn't exist already
if index_name not in existing_indexes:
  print("Index doesn't exist. Creating...")
  pc.create_index(
      name=index_name,
      dimension= 384, # Add embedding dimensions
      metric= "cosine",   # Add your similarity metric
      spec=ServerlessSpec(
          cloud="aws",
          region="us-east-1"
      )
  )
else:
  print("Index exists already")

Index exists already


In [58]:
index = pc.Index(index_name)
index

<pinecone.data.index.Index at 0x23bc6bf6ed0>

In [59]:
index.describe_index_stats()

{'dimension': 768,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'': {'vector_count': 1620}},
 'total_vector_count': 1620,
 'vector_type': 'dense'}

Loading embedding model:

- SentenceTransformer (all-MiniLM-L6-v2)
- GoogleGenerativeAIEmbeddings (models/embedding-001)

In [60]:
embedding_model = GoogleGenerativeAIEmbeddings(model='models/embedding-001')

### Get data to add ###

In [None]:
files = ["umd_sustainability_data.json",
         "umd_sustainingprogress_data.json"]

In [None]:
def get_file_name(fname):
  file_re = re.compile(r'^umd_([A-Za-z0-9]*)_data.json$')

  matched = re.match(file_re, fname)
  if matched:
    return matched.group(1)
  else:
    raise Exception("Not good file name")

In [None]:
total_to_upsert = []

for file in files:
  with open(file, 'r') as f:
    data = json.load(f)

  fname = get_file_name(file)

  # Embed in batches because it's faster
  content = []
  for d in data:
    content.append(d['Content'])
  embedded_content = embedding_model.embed_documents(content)
  print(f"Embedding everything from {fname} data")

  # Create pinecone type of data
  batch = []
  for i, d in enumerate(data):
    pinecone_form = {}
    pinecone_form["id"] = f"{fname}_{i}"
    pinecone_form["values"] = embedded_content[i]
    pinecone_form["metadata"] = d

    batch.append(pinecone_form)

  total_to_upsert.extend(batch)

Embedding everything from sustainability data
Embedding everything from sustainingprogress data


In [None]:
len(total_to_upsert)

1620

In [None]:
batch_size = 200

for i in range(len(total_to_upsert) // batch_size + 1):
  start = i*batch_size
  stop = start + batch_size if len(total_to_upsert) - start > 200 else len(total_to_upsert)
  index.upsert(total_to_upsert[start:stop])
  print(f"Upserting batch {i}")


Upserting batch 0
Upserting batch 1
Upserting batch 2
Upserting batch 3
Upserting batch 4
Upserting batch 5
Upserting batch 6
Upserting batch 7
Upserting batch 8


In [None]:
index.describe_index_stats()

#### Upsert random data by ourselves ####

In [None]:
our_data = "The shortened link for our water refilling stations is ter.ps/heartthetap"
embedded = embed_text(our_data)
index.upsert([("our_data", embedded, {"text": our_data})])

{'upserted_count': 1}

### Retrieving and Generating (RAG) with langchain/langsmith tracing ###

In [None]:
utils.tracing_is_enabled()

In [62]:
@traceable
def retrieve_relevant_docs(query, top_k=10):
  # embed the query
  query_embedding = embedding_model.embed_query(query)

  # Query Pinecone for the top_k most relevant chunks
  # <code to query index>
  search_results = index.query(vector=query_embedding,
                                top_k=top_k,
                                include_metadata=True)

  # Return the relevant text chunks
  result = []
  for matched in search_results['matches']:
    result.append(matched['metadata'])

  return result

### Using langchain google generative ai ###

In [63]:
google_model = "gemini-2.0-flash-lite"
system_instruction = """
You are a helpful and thoughtful AI assistant designed to support the University of Maryland's
(UMD) sustainability-related inquiries.
You use only the provided context to generate responses.
If no context is provided, just say you don't have the capabilities for it.
Your tone should be clear, respectful, and supportive.
Avoid making assumptions if the context is missing or ambiguous.
Do not mention that you are getting context from somewhere.
Always aim to empower the user with useful and easy-to-understand guidance,
especially when supporting decision-making or understanding sustainability concepts.
Keep responses informative and avoid using technical terms unless requested.
If the context provided matches very well, use as much of the context as you can.
"""

llm = ChatGoogleGenerativeAI(model=google_model)

In [65]:
def generate_answer_langchain(query):
    retrieved_info = retrieve_relevant_docs(query)
    context = ""
    links = []
    for ri in retrieved_info:
      context += f"Site Title: {ri['Site_Title']}\n"
      context += f"Header: {ri['Header']}\n"
      context += f"Text: {ri['Content']}\n\n"

      links.append(ri['Link'])

    rag_prompt = f"Here are your instructions: \n {system_instruction} \n \
                   Here is the context provided to answer the query.\n \
                   Context: {context}\n\nQuestion: {query}\n\nAnswer:"
    generated = llm.invoke(rag_prompt)
    return generated, links

In [None]:
@traceable (
  run_type="llm",
  metadata={"ls_provider": "google_genai", "ls_model_name": google_model}
)
def run_pipeline(query):
  rag_generated, links = generate_answer_langchain(query)
  output = rag_generated.content + " Here are some links if you want to navigate: \n" + "\n".join(links)
  return output

Test

In [67]:
query = "What are some student groups on campus?"
answer = run_pipeline(query)
print(answer)

There are many student organizations on campus that focus on sustainability. You can find a highlighted list on the Office of Sustainability website and a comprehensive list on Terplink. Student groups such as the SGA Sustainability Committees and Sustainable Oceans Alliance are examples.Here are some links if you want to navigate: 
https://sustainingprogress.umd.edu/climate-change-resources/climate-emotions
https://sustainingprogress.umd.edu/guiding-commitments/climate-action-plan/climate-emotions
https://sustainability.umd.edu/get-involved/students/become-green-terp
https://sustainability.umd.edu/get-involved/students/become-green-terp
https://sustainability.umd.edu/sustainability-education-research
https://sustainingprogress.umd.edu/celebrating-stories/umds-journey-curbing-waste
https://sustainingprogress.umd.edu/celebrating-stories/umds-journey-curbing-waste
https://sustainability.umd.edu/topics
https://sustainability.umd.edu/greenworkspace
https://sustainability.umd.edu/get-involv

### Using Gradio ###

In [13]:
import gradio as gr

In [18]:
def gradio_response(input, history=""):
  return run_pipeline(input)

In [None]:
demo = gr.ChatInterface(fn=gradio_response,
                        title="UMD Sustainability Chatbot",
                        description="This is my DCC Capstone Project",
                        theme="ocean",
                        autofocus=True,
                        autoscroll=True)

demo.launch(debug=True)

  self.chatbot = Chatbot(


Keyboard interruption in main thread... closing server.




In [None]:
demo.close()

In [None]:
retrieve_relevant_docs("What is the 2030 Agenda for Sustainable Development?")

['"The 2030 Agenda for Sustainable Development, adopted by all United Nations Member States in 2015, provides a shared blueprint for peace and prosperity for people and the planet, now and into the future. At its heart are the17 Sustainable Development Goals (SDGs), which are an urgent call for action by all countries - developed and developing - in a global partnership. They recognize that ending poverty and other deprivations must go hand-in-hand with strategies that improve health and education, reduce inequality, and spur economic growth – all while tackling climate change and working to preserve our oceans and forests." -United Nations  ',
 'Encourage sustainable behaviors; and',
 'SustainableUMD refers to the campus-wide commitment to environmental responsibility. Any student, staff, or faculty can contribute to the SustainableUMD Network through research, academics, operations, or individual actions. Together we can tackle some of humanity’s grand challenges. SustainableUMD refe