In [1]:
!pip install langchain langchain-community langchain-google-genai chromadb
!pip install pypdf pandas streamlit python-dotenv




In [1]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma

from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_google_genai import ChatGoogleGenerativeAI

from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser

from dotenv import load_dotenv
import os


In [None]:
import tempfile 
import pandas as pd
from dotenv import load_dotenv
import streamlit as st

load_dotenv()  # Load environment variables from .env file

True

In [3]:
Google_api_key = os.environ.get("GOOGLE_API_KEY")

In [4]:
llm = ChatGoogleGenerativeAI(temperature=0, model="gemini-2.5-flash", api_key=Google_api_key)
llm.invoke("tell me a joke")

AIMessage(content="Why don't scientists trust atoms?\n\nBecause they make up everything!", additional_kwargs={}, response_metadata={'prompt_feedback': {'block_reason': 0, 'safety_ratings': []}, 'finish_reason': 'STOP', 'model_name': 'gemini-2.5-flash', 'safety_ratings': [], 'model_provider': 'google_genai'}, id='lc_run--981a0eb8-a600-49a5-b192-dff34790746a-0', usage_metadata={'input_tokens': 5, 'output_tokens': 637, 'total_tokens': 642, 'input_token_details': {'cache_read': 0}, 'output_token_details': {'reasoning': 622}})

In [5]:
### load
loader = PyPDFLoader("deepak_seminar.pdf")
pages = loader.load()
pages

[Document(metadata={'producer': 'Microsoft® Word for Microsoft 365', 'creator': 'Microsoft® Word for Microsoft 365', 'creationdate': '2025-11-06T11:49:19+05:30', 'author': 'HODIT', 'moddate': '2025-11-06T11:49:19+05:30', 'source': 'deepak_seminar.pdf', 'total_pages': 20, 'page': 0, 'page_label': '1'}, page_content='1 \n \nA Seminar Report \n on \n \n“Automatic Weather Observation System (AWOS)” \n \nSubmitted to the  \nSavitribai Phule Pune University  \nIn partial fulfillment for the award of the Degree of \nBachelor of Engineering  \nin \nInformation Technology \nby \nDeepak Kumar \nRoll No: 4318 \nClass: TE-IT(A)  \n \n \nUnder the guidance of \n \nDr. Ashwini Sapkal \n \nDepartment of Information Technology \nArmy Institute of Technology, \nDighi Hills, Pune - 411 015. \n \n2025-2026'),
 Document(metadata={'producer': 'Microsoft® Word for Microsoft 365', 'creator': 'Microsoft® Word for Microsoft 365', 'creationdate': '2025-11-06T11:49:19+05:30', 'author': 'HODIT', 'moddate': '2025-

In [6]:
# split documents into chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=200 , separators=["\n\n", "\n", " ", ""] , length_function=len)
chunks = text_splitter.split_documents(pages)
chunks[0]

Document(metadata={'producer': 'Microsoft® Word for Microsoft 365', 'creator': 'Microsoft® Word for Microsoft 365', 'creationdate': '2025-11-06T11:49:19+05:30', 'author': 'HODIT', 'moddate': '2025-11-06T11:49:19+05:30', 'source': 'deepak_seminar.pdf', 'total_pages': 20, 'page': 0, 'page_label': '1'}, page_content='1 \n \nA Seminar Report \n on \n \n“Automatic Weather Observation System (AWOS)” \n \nSubmitted to the  \nSavitribai Phule Pune University  \nIn partial fulfillment for the award of the Degree of \nBachelor of Engineering  \nin \nInformation Technology \nby \nDeepak Kumar \nRoll No: 4318 \nClass: TE-IT(A)  \n \n \nUnder the guidance of \n \nDr. Ashwini Sapkal \n \nDepartment of Information Technology \nArmy Institute of Technology, \nDighi Hills, Pune - 411 015. \n \n2025-2026')

In [7]:
def get_embedding():
    embeddings = GoogleGenerativeAIEmbeddings(
        model="gemini-embedding-001",
        api_key=Google_api_key
    )
    return embeddings
embedding_function = get_embedding()



In [16]:
import uuid
import hashlib
from langchain_community.vectorstores import Chroma

def create_vectorstore(chunks, embedding_function, vectorstore_path="chroma_db"):
    
    def hash_text(text):
        return hashlib.md5(text.encode()).hexdigest()

    # Create unique IDs
    ids = [hash_text(doc.page_content) for doc in chunks]

    unique_chunks = []
    unique_ids = []
    seen = set()

    for chunk, id_ in zip(chunks, ids):
        if id_ not in seen:
            seen.add(id_)
            unique_chunks.append(chunk)
            unique_ids.append(id_)

    # ❗❗ FIXED: use embedding= , NOT embedding_function=
    vectorstore = Chroma.from_documents(
        documents=unique_chunks,
        embedding=embedding_function,     
        ids=unique_ids,
        collection_name="ai_notes",
        persist_directory=vectorstore_path
    )

    vectorstore.persist()
    return vectorstore


In [17]:
vectorstore = Chroma(
    collection_name="ai_notes",
    persist_directory="chroma_db",
    embedding_function=embedding_function
)


In [19]:
vectorstore = create_vectorstore(
    chunks=chunks,  
    embedding_function=embedding_function   # ✅ FIXED
)


  vectorstore.persist()


In [21]:
vectorstore = Chroma(
    collection_name="ai_notes",
    persist_directory="chroma_db",
    embedding_function=embedding_function   # ← use this here
)


In [24]:
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 4}
)

query = "topic of seminar "

relevant_docs = retriever.invoke(query)

for i, doc in enumerate(relevant_docs, start=1):
    print(f"\n---- Result {i} ----")
    print(doc.page_content)




---- Result 1 ----
1 
 
A Seminar Report 
 on 
 
“Automatic Weather Observation System (AWOS)” 
 
Submitted to the  
Savitribai Phule Pune University  
In partial fulfillment for the award of the Degree of 
Bachelor of Engineering  
in 
Information Technology 
by 
Deepak Kumar 
Roll No: 4318 
Class: TE-IT(A)  
 
 
Under the guidance of 
 
Dr. Ashwini Sapkal 
 
Department of Information Technology 
Army Institute of Technology, 
Dighi Hills, Pune - 411 015. 
 
2025-2026

---- Result 2 ----
11 
 
3.MOTIVATION, PURPOSE , SCOPE AND 
OBJECTIVE OF SEMINAR  
 
 
MOTIVATION AND PURPOSE (PROBLEM STATEMENT) 
Accurate and continuous weather observation is essential for effective 
forecasting, marine safety, climate research, and disaster preparedness. However, 
traditional observation methods and ground-based systems face major limitations 
in coverage, data consistency, and real-time accessibility 
• Limited Spatial and Temporal Coverage: Conventional observation stations 
are often restricted 

In [27]:
PROMPT_TEMPLATE = """
You are an assistant for a question-answering task.
Use the following pieces of retrieved context to answer the question.
If you do not know the answer, say "I don't know" — do not make up an answer.
If the question is not related to the context, politely say that you can answer only context-related questions.

=== Retrieved Context ===
{context}

---

Answer the question: {question}
"""


In [28]:
content_text = "\n\n".join([doc.page_content for doc in relevant_docs])

prompt_template = PromptTemplate(
    input_variables=["context", "question"],
    template=PROMPT_TEMPLATE
)

prompt = prompt_template.format(
    context=content_text,
    question=query
)


In [29]:
response = llm.invoke(prompt)
print(response)


content='The topic of the seminar is "Automatic Weather Observation System (AWOS)".' additional_kwargs={} response_metadata={'prompt_feedback': {'block_reason': 0, 'safety_ratings': []}, 'finish_reason': 'STOP', 'model_name': 'gemini-2.5-flash', 'safety_ratings': [], 'model_provider': 'google_genai'} id='lc_run--e374e76f-1f99-4deb-93f0-c96644ab8f8e-0' usage_metadata={'input_tokens': 1027, 'output_tokens': 436, 'total_tokens': 1463, 'input_token_details': {'cache_read': 0}, 'output_token_details': {'reasoning': 421}}
