In [1]:
# imports

import os
import glob
from dotenv import load_dotenv
import gradio as gr
import numpy as np


In [2]:
# imports for langchain and Chroma and plotly

from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema import Document
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_chroma import Chroma
from sklearn.manifold import TSNE
import plotly.graph_objects as go
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain



In [3]:
# Notebook-wide configuration.
# Price is a factor for our company, so we're going to use a low cost model.

MODEL = "gpt-4o-mini"
# Directory name handed to Chroma as `persist_directory` in the vector store cells.
db_name = "vector_db"

In [4]:
# Pull settings from a local .env file into the process environment.

load_dotenv(override=True)
# Leave an existing OPENAI_API_KEY untouched; only when none is defined does the
# placeholder string get stored (replace it with a real key if not using .env).
os.environ.setdefault('OPENAI_API_KEY', 'your-key-if-not-using-env')

In [5]:
# Read in documents using LangChain's loaders.
# Every sub-folder of the knowledge base is treated as one document type: the
# folder name is written into each loaded document's metadata as "doc_type".

# Keep directories only -- a stray top-level file (README, .DS_Store, ...) would
# otherwise be handed to DirectoryLoader as if it were a folder.
# Sorting fixes the load order, so document and chunk indices are stable across runs.
folders = sorted(path for path in glob.glob("knowledge base/*") if os.path.isdir(path))

text_loader_kwargs = {'encoding': 'utf-8'}
# Alternative when the files are not all UTF-8:
# text_loader_kwargs={'autodetect_encoding': True}

documents = []
for folder in folders:
    doc_type = os.path.basename(folder)
    # Recursively load every Markdown file below this folder as plain text
    loader = DirectoryLoader(folder, glob="**/*.md", loader_cls=TextLoader, loader_kwargs=text_loader_kwargs)
    folder_docs = loader.load()
    for doc in folder_docs:
        doc.metadata["doc_type"] = doc_type
        documents.append(doc)

In [6]:
# Peek at the first two loaded documents, then display how many were loaded in total
print(documents[:2])
len(documents)

[Document(metadata={'source': 'knowledge base\\Almora\\Kumaon_Hills_Characteristics.md', 'doc_type': 'Almora'}, page_content='## Kumaon Hills Characteristics\n\nAlmora district represents the heart of the Kumaon Hills region, characterized by a complex ridge-valley system that creates a dendritic pattern of drainage and settlement. The geological foundation of the district consists primarily of rocks from the Lesser Himalayan sequence, which are generally more stable than the highly metamorphosed rocks found in the higher elevation districts of the Garhwal region.The ridge-valley system creates a natural pattern of settlement where communities are typically located on ridge tops or upper slopes, providing natural protection from valley floor hazards while maintaining access to water sources and agricultural land. This settlement pattern has evolved over centuries and reflects the accumulated wisdom of local communities in managing natural hazards in mountain environments. <br><br>\nSlo

50

In [7]:
# Split the loaded documents into overlapping chunks for embedding.
# chunk_size=800 / chunk_overlap=200 are the targets given to the splitter; it can
# still emit a longer chunk and logs a warning when it does.
text_splitter = CharacterTextSplitter(chunk_size=800, chunk_overlap=200)
chunks = text_splitter.split_documents(documents)

Created a chunk of size 1058, which is longer than the specified 800


In [8]:
# Number of chunks produced by the splitter
len(chunks)

135

In [9]:
# Inspect a single chunk (metadata plus page_content) as a spot check
chunks[3]

Document(metadata={'source': 'knowledge base\\Almora\\Soil_Conservation_Practices.md', 'doc_type': 'Almora'}, page_content='Traditional soil conservation methods in Almora district include the chaur system of terrace farming, which has been refined over centuries to provide effective erosion control while maintaining agricultural productivity. These traditional methods are based on detailed understanding of local topographic, geological, and climatic conditions and represent sophisticated approaches to sustainable land management in mountain environments. Modern interventions in soil conservation include the construction of check dams to control erosion in drainage channels, contour trenching to reduce surface runoff and increase infiltration, and the establishment of vegetation barriers along contour lines to provide physical and biological soil protection. These interventions have been implemented through various government and non-governmental programs aimed at reducing erosion and 

In [10]:
# Distinct doc_type labels carried by the chunks
doc_types = {chunk.metadata['doc_type'] for chunk in chunks}
# A set has no defined iteration order; sort so the printed list is identical on every run
print(f"Document types found: {', '.join(sorted(doc_types))}")

Document types found: Pithoragarh, Major_Historical_Events, Bageshwar, Udham Singh Nagar, Nainital, Rudraprayag, Pauri Garhwal, Tehri Garhwal, Champawat, Chamoli, Dehradun, Haridwar, Almora, Uttarkashi


In [11]:
# Show every chunk whose text contains the substring 'disaster' (case-sensitive),
# each followed by a separator line
for chunk in chunks:
    if 'disaster' not in chunk.page_content:
        continue
    print(chunk, "_________", sep="\n")

page_content='##  Advanced Risk Zonation

Risk zonation within Chamoli district reflects the complex interaction between glacial hazards, topographic vulnerability, and population distribution. Extremely high-risk areas include the Alaknanda valley between Badrinath and Joshimath, where narrow valley geometry, extensive glaciation, and concentrated infrastructure create maximum vulnerability conditions. The Dhauliganga valley, site of the 2021 disaster, remains under extremely high risk due to continued glacial instability and the potential for additional rock-ice avalanches.' metadata={'source': 'knowledge base\\Chamoli\\Advanced_Risk_Zonation.md', 'doc_type': 'Chamoli'}
_________
page_content='The 2021 Chamoli flash flood represents the most recent example of the complex interaction between glacial processes and extreme weather events. While the exact trigger mechanism remains under investigation, the event demonstrated how rapidly glacial hazards can develop and how devastating thei

In [12]:
# Embedding client used to vectorize the chunks; constructed with the library defaults
embeddings = OpenAIEmbeddings()

In [13]:
# Start from scratch: when a persisted Chroma datastore is already on disk,
# open it and delete its collection before the store is rebuilt

if os.path.exists(db_name):
    existing_store = Chroma(persist_directory=db_name, embedding_function=embeddings)
    existing_store.delete_collection()

In [14]:
# Build the Chroma vectorstore: embed every chunk and persist the result under db_name

vectorstore = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings,
    persist_directory=db_name,
)
# Report how many entries the underlying collection now holds
print(f"Vectorstore created with {vectorstore._collection.count()} documents")

Vectorstore created with 135 documents


In [15]:
# Fetch a single stored record (embeddings included) and measure its vector length

collection = vectorstore._collection
first_record = collection.get(limit=1, include=["embeddings"])
sample_embedding = first_record["embeddings"][0]
dimensions = len(sample_embedding)
print(f"The vectors have {dimensions:,} dimensions")

The vectors have 1,536 dimensions


In [16]:
# Display the raw sample vector fetched in the previous cell
sample_embedding

array([ 0.01624026,  0.02546609,  0.00811309, ..., -0.01238091,
       -0.01204287, -0.01495851], shape=(1536,))

In [17]:
# Prework: pull vectors, texts and metadata back out of the collection for plotting

result = collection.get(include=['embeddings', 'documents', 'metadatas'])
vectors = np.array(result['embeddings'])
# NOTE: from here on `documents` holds the stored chunk texts returned by the
# collection, replacing the list of loaded Document objects built earlier.
documents = result['documents']
doc_types = [metadata['doc_type'] for metadata in result['metadatas']]
print(doc_types)

# Fixed colors for selected districts. The keys must match the doc_type labels
# exactly ('Nainital', 'Tehri Garhwal'), otherwise those chunks get no color.
preferred_colors = {
    'Almora': 'blue',
    'Dehradun': 'green',
    'Haridwar': 'red',
    'Nainital': 'orange',
    'Uttarkashi': 'black',
    'Tehri Garhwal': 'purple',
    'Rudraprayag': 'brown',
}
# Every other doc type gets a color from this palette (cycled if it runs out), so
# no chunk is dropped and `colors` stays aligned one-to-one with `vectors`.
fallback_palette = ['cyan', 'magenta', 'olive', 'gray', 'pink',
                    'teal', 'gold', 'navy', 'lime', 'maroon']
remaining_types = sorted(set(doc_types) - set(preferred_colors))
color_by_type = {
    **preferred_colors,
    **{t: fallback_palette[i % len(fallback_palette)] for i, t in enumerate(remaining_types)},
}

colors = [color_by_type[t] for t in doc_types]
# The scatter plots pair colors with points by position, so the lengths must agree
assert len(colors) == len(vectors), "each plotted vector needs exactly one color"

# Sanity check: number of chunks colored green (the 'Dehradun' doc type)
print(colors.count('green'))




['Almora', 'Almora', 'Almora', 'Almora', 'Bageshwar', 'Bageshwar', 'Bageshwar', 'Bageshwar', 'Chamoli', 'Chamoli', 'Chamoli', 'Chamoli', 'Chamoli', 'Chamoli', 'Chamoli', 'Chamoli', 'Chamoli', 'Chamoli', 'Chamoli', 'Chamoli', 'Champawat', 'Champawat', 'Champawat', 'Champawat', 'Champawat', 'Champawat', 'Champawat', 'Dehradun', 'Dehradun', 'Dehradun', 'Dehradun', 'Dehradun', 'Dehradun', 'Dehradun', 'Dehradun', 'Dehradun', 'Dehradun', 'Dehradun', 'Dehradun', 'Dehradun', 'Dehradun', 'Dehradun', 'Dehradun', 'Dehradun', 'Dehradun', 'Dehradun', 'Haridwar', 'Haridwar', 'Haridwar', 'Haridwar', 'Haridwar', 'Haridwar', 'Haridwar', 'Haridwar', 'Haridwar', 'Haridwar', 'Haridwar', 'Haridwar', 'Haridwar', 'Major_Historical_Events', 'Major_Historical_Events', 'Major_Historical_Events', 'Major_Historical_Events', 'Major_Historical_Events', 'Major_Historical_Events', 'Major_Historical_Events', 'Major_Historical_Events', 'Major_Historical_Events', 'Major_Historical_Events', 'Nainital', 'Nainital', 'Naini

In [18]:
# Reduce the dimensionality of the vectors to 2D using t-SNE
# (t-distributed stochastic neighbor embedding)

# random_state is pinned so the projection is repeatable between runs
tsne = TSNE(n_components=2, random_state=42)
reduced_vectors = tsne.fit_transform(vectors)

# Create the 2D scatter plot: one marker per vector, colored from `colors`,
# with the doc type and the first 100 characters of the text shown on hover
fig = go.Figure(data=[go.Scatter(
    x=reduced_vectors[:, 0],
    y=reduced_vectors[:, 1],
    mode='markers',
    marker=dict(size=5, color=colors, opacity=0.8),
    text=[f"Type: {t}<br>Text: {d[:100]}..." for t, d in zip(doc_types, documents)],
    hoverinfo='text'
)])

# A 2D figure takes its axis titles from xaxis_title / yaxis_title;
# `scene` only configures 3D plots, so titles placed there never appear here.
fig.update_layout(
    title='2D Chroma Vector Store Visualization',
    xaxis_title='x',
    yaxis_title='y',
    width=800,
    height=600,
    margin=dict(r=20, b=10, l=10, t=40)
)

fig.show()

In [19]:
# Let's try 3D!

# Same t-SNE projection as before, this time onto three components
tsne = TSNE(n_components=3, random_state=42)
reduced_vectors = tsne.fit_transform(vectors)

# Hover label per point: the doc type plus the first 100 characters of the text
hover_labels = [f"Type: {t}<br>Text: {d[:100]}..." for t, d in zip(doc_types, documents)]

# Create the 3D scatter plot
points_3d = go.Scatter3d(
    x=reduced_vectors[:, 0],
    y=reduced_vectors[:, 1],
    z=reduced_vectors[:, 2],
    mode='markers',
    marker={'size': 5, 'color': colors, 'opacity': 0.8},
    text=hover_labels,
    hoverinfo='text',
)
fig = go.Figure(data=[points_3d])

# Titles, axis labels and canvas size for the 3D scene
fig.update_layout(
    title='3D Chroma Vector Store Visualization',
    scene={'xaxis_title': 'x', 'yaxis_title': 'y', 'zaxis_title': 'z'},
    width=900,
    height=700,
    margin={'r': 20, 'b': 10, 'l': 10, 't': 40},
)

fig.show()