In [13]:
!pip install langchain-core langchain-community langgraph transformers accelerate sentencepiece



In [23]:
!pip install -U langchain-huggingface

Collecting langchain-huggingface
  Downloading langchain_huggingface-1.2.0-py3-none-any.whl.metadata (2.8 kB)
Downloading langchain_huggingface-1.2.0-py3-none-any.whl (30 kB)
Installing collected packages: langchain-huggingface
Successfully installed langchain-huggingface-1.2.0


In [24]:
import sqlite3
from datetime import datetime

from transformers import pipeline

from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_huggingface import HuggingFacePipeline

In [25]:
# Model configuration, kept in one place so it is easy to swap.
MODEL_ID = "google/flan-t5-base"
MAX_OUTPUT_LENGTH = 256  # forwarded to the pipeline as its max_length setting

# Build the Hugging Face text2text pipeline once; loading the model is the
# expensive step of this notebook.
hf_pipeline = pipeline(
    "text2text-generation",
    model=MODEL_ID,
    max_length=MAX_OUTPUT_LENGTH,
)

# Wrap the pipeline so it can be composed with LangChain prompts and parsers.
llm = HuggingFacePipeline(pipeline=hf_pipeline)

Device set to use cpu


In [26]:
class FrameIndex:
    """SQLite-backed log of frame observations.

    Each row stores the capture time, the location, the textual description
    of the frame and the detected object type. The instance owns a single
    connection (``self.conn``); release it with :meth:`close` or by using the
    instance as a context manager::

        with FrameIndex(":memory:") as idx:
            idx.insert("00:01", "Main Gate", "Person standing near gate", "person")
    """

    def __init__(self, db="frames.db"):
        """Open (or create) the database and make sure the table exists.

        Args:
            db: Path handed to ``sqlite3.connect``. The default is an on-disk
                file, so rows survive across runs; pass ``":memory:"`` for a
                throwaway store.
        """
        self.conn = sqlite3.connect(db)
        self.conn.execute("""
        CREATE TABLE IF NOT EXISTS frames (
            id INTEGER PRIMARY KEY,
            time TEXT,
            location TEXT,
            description TEXT,
            object_type TEXT
        )
        """)
        self.conn.commit()

    def insert(self, time, location, desc, obj):
        """Append one observation.

        Args:
            time: Capture time of the frame (stored as TEXT).
            location: Where the frame was captured.
            desc: Free-text description of the frame.
            obj: Detected object type.
        """
        # The connection context manager commits on success and rolls back if
        # the statement raises, so a failed insert never leaves an open
        # transaction behind.
        with self.conn:
            self.conn.execute(
                # Name the columns explicitly so the statement keeps working
                # if the table ever gains or reorders columns.
                "INSERT INTO frames (time, location, description, object_type) "
                "VALUES (?, ?, ?, ?)",
                (time, location, desc, obj),
            )

    def all_events(self):
        """Return every observation in insertion order.

        Returns:
            A list of ``(time, location, description, object_type)`` tuples.
        """
        # Without ORDER BY, SQLite does not guarantee the order of the rows.
        return self.conn.execute(
            "SELECT time, location, description, object_type "
            "FROM frames ORDER BY id"
        ).fetchall()

    def close(self):
        """Close the underlying SQLite connection."""
        self.conn.close()

    def __enter__(self):
        """Return the index itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the connection on leaving the ``with`` block; never swallow errors."""
        self.close()
        return False

In [27]:
def detect_object(frame_text):
    """Classify a frame description by case-insensitive keyword lookup.

    Args:
        frame_text: Textual description of a frame.

    Returns:
        ``"vehicle"`` if the text contains "truck", otherwise ``"person"`` if
        it contains "person", otherwise ``"unknown"``.
    """
    lowered = frame_text.lower()
    # Order matters: "truck" wins when both keywords are present.
    keyword_labels = (
        ("truck", "vehicle"),
        ("person", "person"),
    )
    return next(
        (label for keyword, label in keyword_labels if keyword in lowered),
        "unknown",
    )

In [33]:
# Prompt for the question-answering step. It has two placeholders that must be
# supplied when the chain is invoked: `events` (the recorded observations) and
# `question` (the user's question). Any chain built from this object has to be
# rebuilt after this cell is edited and re-run.
qa_prompt = ChatPromptTemplate.from_template("""
You are a drone security analyst AI.

You are given raw visual observations collected over time.
Each observation is evidence, NOT an answer.

Your task:
- Answer the question precisely
- Aggregate information across time
- Do NOT list individual observations
- Do NOT include timestamps unless the question explicitly asks for time
- Ignore irrelevant observations
- Do NOT mention people if the question is about vehicles (and vice versa)
- Produce a concise, human-readable answer

Observations:
{events}

Question:
{question}

Final Answer:
""")

In [29]:
# Prompt -> local LLM -> plain string.
# The chain captures the `qa_prompt` and `llm` objects that exist when this
# cell runs, so re-run this cell whenever either of them is redefined.
qa_chain = qa_prompt | llm | StrOutputParser()

In [34]:
# Shared event store used by the monitoring loop and by ask_question.
# FrameIndex() falls back to its default database path ("frames.db"), an
# on-disk SQLite file, so rows inserted in earlier runs are still present.
index = FrameIndex()

def ask_question(question):
    """Answer a natural-language question from the recorded observations.

    Every stored event is rendered as one bullet containing its time,
    location, description and detected object type, so the model has the
    evidence it needs for location- or time-based questions.

    Args:
        question: The user's question as plain text.

    Returns:
        The model's answer as a string, or ``"No events recorded."`` when the
        index is empty.
    """
    events = index.all_events()
    if not events:
        return "No events recorded."

    # Rows are (time, location, description, object_type). Keep all four:
    # passing only the description would hide where and when it was seen.
    context = "\n".join(
        f"- [{time}] {location}: {desc} (type: {obj})"
        for time, location, desc, obj in events
    )

    return qa_chain.invoke({
        "events": context,
        "question": question
    })

In [37]:
print("---- LIVE MONITORING ----")

# Simulated feed: each record holds a capture time, a location and the text
# description of the frame.
frames = [
    {"time": "00:01", "location": "Main Gate", "frame": "Person standing near gate"},
    {"time": "12:00", "location": "Garage", "frame": "Blue truck entering garage"},
    {"time": "15:30", "location": "House", "frame": "Blue box of jewels"},
]

# Classify each frame, store it in the index and echo a log line.
# Note: every run of this cell appends the same three rows to `index` again.
for record in frames:
    obj = detect_object(record["frame"])
    index.insert(record["time"], record["location"], record["frame"], obj)
    print(f"LOG: {obj} detected at {record['location']} {record['time']}")

---- LIVE MONITORING ----
LOG: person detected at Main Gate 00:01
LOG: vehicle detected at Garage 12:00
LOG: unknown detected at House 15:30


In [39]:
print("\n---- NATURAL LANGUAGE Q&A ----\n")

# Ask one sample question against everything recorded so far.
question = "What do you see in the house?"
print("USER:", question)
answer = ask_question(question)
print("AGENT:", answer)


---- NATURAL LANGUAGE Q&A ----

USER: What do you see in the house?
AGENT: Blue box of jewels
