In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(root_dir, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/agents-intensive-capstone-project/Hackathon dataset.txt


# **Notee Insight Agent**
**A Cognitive AI System for Knowledge Graph Reasoning & Insight Generation**

This notebook implements a *second brain* agent that:
1. Generates a synthetic dataset of personal notes and interactions
2. Uses Gemini embeddings to embed notes and build a semantic memory space
3. Builds a personal knowledge graph over notes + tags + time
4. Runs an insight engine that detects:
       - Novel connections between distant notes
       - Revived old notes
       - Topic drift over time
5. Uses Gemini as:
       - A reflection agent that explains the insights and suggests next steps.
       - A RAG Q&A Layer over notes

In [5]:
# Uninstall jupyterlab and kfp quietly; stderr is discarded, so a missing package is not reported.
# NOTE(review): presumably removed to avoid dependency conflicts with google-genai — confirm.
!pip uninstall -qqy jupyterlab kfp 2>/dev/null
# Packages imported below: faker (synthetic text), umap-learn (2-D projection), networkx (graph).
!pip install -q faker umap-learn networkx
# Gemini SDK, pinned to an exact version.
!pip install -U -q "google-genai==1.7.0"

In [8]:
import random
import uuid
import math
from datetime import datetime, timedelta
from collections import Counter

from faker import Faker
from dateutil.relativedelta import relativedelta

import networkx as nx
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
import umap
import matplotlib.pyplot as plt

from tqdm import tqdm
import json

# Seed Python's `random`, NumPy and Faker with one shared value.
RANDOM_SEED = 42
for _seed_fn in (random.seed, np.random.seed, Faker.seed):
    _seed_fn(RANDOM_SEED)

# Faker instance used for all synthetic text in this notebook.
fake = Faker()

# Default figure size for every matplotlib plot.
plt.rcParams["figure.figsize"] = (10, 7)

print("Imports ready")

Imports ready


In [11]:
##Gemini configuration
from google import genai
from google.genai import types

##Retry
from google.api_core import retry


def is_retriable(e):
    """Return True when `e` is a Gemini API error with HTTP code 429 or 503."""
    return isinstance(e, genai.errors.APIError) and e.code in {429, 503}


# Wrap generate_content with automatic retries exactly once.
# Retry's decorator sets `__wrapped__` on the wrapped callable, so the guard keeps
# a re-run of this cell from stacking a second Retry layer on top of the first.
if not hasattr(genai.models.Models.generate_content, "__wrapped__"):
    genai.models.Models.generate_content = retry.Retry(
        predicate=is_retriable)(genai.models.Models.generate_content)



from kaggle_secrets import UserSecretsClient

# Read the API key from Kaggle's secret store rather than hard-coding it in the notebook.
_secrets = UserSecretsClient()
GEMINI_API_KEY = _secrets.get_secret("GEMINI_API_KEY")

# Shared Gemini client used by every API call below.
client = genai.Client(api_key=GEMINI_API_KEY)

# Model identifiers: one for embeddings, one for text generation.
EMBED_MODEL, GEN_MODEL = "models/text-embedding-004", "gemini-2.0-flash"

print("Gemini configured")

Gemini configured


**Generate Synthetic Notes & Interactions**
 
We create:
 
- `notes_df` – personal notes with:
  - title, content, tags, topic, context, timestamp
- `interactions_df` – events like opening, editing, and revisiting notes

In [12]:
# ==== Synthetic Data Generation ====
 
# Size of the synthetic corpus and the number of months it spans.
N_NOTES = 400
MONTHS_SPAN = 9

# Earliest possible note timestamp: MONTHS_SPAN calendar months before the current UTC time.
START_DATE = datetime.utcnow() - relativedelta(months=MONTHS_SPAN)

# Vocabularies sampled when generating notes.
TAG_POOL = [
    "ai", "product", "research", "design",
    "meeting", "personal", "health", "finance",
    "career", "reading", "idea", "experiment",
    "ml", "data", "ops", "writing",
]

TOPIC_POOL = [
    "agent-architecture", "user-modeling",
    "product-vision", "growth",
    "habit-tracking", "literature-review",
    "tutorials", "experiments",
    "design-patterns", "notes-org",
]

CONTEXT_POOL = [
    "meeting", "journaling", "reading", "idea",
    "task-note", "clip", "email",
]
SOURCE_POOL = [
    "manual", "web-clip", "pdf",
    "auto-generated", "imported",
]

# Order matters: the interaction generator passes positional weights to random.choices.
INTERACTION_TYPES = [
    "open_note",
    "edit_note",
    "revisit_old_note",
    "add_new_note",
    "search",
    "link_note",
    "merge_notes",
]
 
def random_timestamp(start=START_DATE, months_span=MONTHS_SPAN):
    """Pick a random moment in the window that begins at `start`.

    The window is `months_span` months long, with every month counted as
    30 days. The offset is a whole number of seconds drawn with
    `random.randint`, so both ends of the window are possible.
    """
    window_seconds = months_span * 30 * 24 * 60 * 60
    return start + timedelta(seconds=random.randint(0, window_seconds))
 
def sample_tags(k=2):
    """Draw up to `k` distinct tags from TAG_POOL and return them sorted.

    `k` is capped at the size of TAG_POOL, so asking for more tags than
    exist returns the whole pool.
    """
    picked = random.sample(TAG_POOL, min(k, len(TAG_POOL)))
    picked.sort()
    return picked
 
def generate_paragraph(min_sent=2, max_sent=6):
    """Build one paragraph out of Faker sentences.

    The sentence count is drawn with `random.randint(min_sent, max_sent)`.
    Each sentence has its trailing period stripped, and the pieces are
    joined with single spaces.
    """
    sentence_count = random.randint(min_sent, max_sent)
    pieces = []
    for _ in range(sentence_count):
        pieces.append(fake.sentence().rstrip('.'))
    return " ".join(pieces)
 
 
# ---- Notes ----
notes = []
note_ids = []

for _ in tqdm(range(N_NOTES), desc="Generating notes"):
    # The random draws below stay in a fixed order so the seeded stream
    # produces the same notes on every run.
    record = {"note_id": str(uuid.uuid4())}
    note_ids.append(record["note_id"])

    record["title"] = fake.sentence(nb_words=random.randint(3, 8)).rstrip('.')
    record["content"] = "\n\n".join(
        [generate_paragraph() for _ in range(random.randint(1, 3))]
    )

    # One tag is the most likely outcome, three the least.
    tag_count = random.choices([1, 2, 3], weights=[0.5, 0.35, 0.15])[0]
    record["tags"] = ",".join(sample_tags(k=tag_count))
    record["topic"] = random.choice(TOPIC_POOL)
    record["context"] = random.choice(CONTEXT_POOL)
    record["timestamp"] = random_timestamp()
    record["source"] = random.choice(SOURCE_POOL)

    # With probability 0.08, move the note into the first half of the time span
    # so that there are older notes for the resurrection logic.
    if random.random() < 0.08:
        max_offset_days = int(MONTHS_SPAN * 30 / 2)
        record["timestamp"] = START_DATE + timedelta(days=random.randint(0, max_offset_days))

    notes.append(record)

# One row per note, in chronological order with a fresh 0..n-1 index.
notes_df = pd.DataFrame(notes)
notes_df = notes_df.sort_values("timestamp").reset_index(drop=True)
 
 
# ---- Interactions ----
interactions = []
interaction_id = 0

# Sessions are anchored inside the time range covered by the notes.
if notes_df.empty:
    start_ts, end_ts = START_DATE, datetime.utcnow()
else:
    start_ts, end_ts = notes_df["timestamp"].min(), notes_df["timestamp"].max()

num_sessions = max(30, N_NOTES // 4)

# Every interaction type except "search" points at a note.
NOTE_TARGETED_TYPES = (
    "open_note", "edit_note", "revisit_old_note",
    "add_new_note", "link_note", "merge_notes",
)

span_seconds = int((end_ts - start_ts).total_seconds())

for _ in range(num_sessions):
    session_anchor = start_ts + timedelta(seconds=random.randint(0, span_seconds))

    # Each session holds 3-8 interactions spread over up to 90 minutes after the anchor.
    for _ in range(random.randint(3, 8)):
        interaction_id += 1
        event_ts = session_anchor + timedelta(minutes=random.randint(0, 90))
        (event_type,) = random.choices(
            INTERACTION_TYPES,
            weights=[0.3, 0.15, 0.1, 0.25, 0.08, 0.06, 0.06],
            k=1,
        )

        if event_type in NOTE_TARGETED_TYPES:
            event_target = random.choice(note_ids)
        else:
            event_target = ""

        interactions.append({
            "interaction_id": f"int_{interaction_id}",
            "type": event_type,
            "target_id": event_target,
            "timestamp": event_ts
        })

interactions_df = pd.DataFrame(interactions)
interactions_df = interactions_df.sort_values("timestamp").reset_index(drop=True)

print("notes_df:", notes_df.shape)
print("interactions_df:", interactions_df.shape)

notes_df.head()

Generating notes: 100%|██████████| 400/400 [00:00<00:00, 4630.52it/s]

notes_df: (400, 8)
interactions_df: (570, 4)





Unnamed: 0,note_id,title,content,tags,topic,context,timestamp,source
0,b3fe0633-4805-4015-bf14-8976682a6296,High tough hundred bar effect international,Movie audience run yet Research nor positive m...,meeting,habit-tracking,idea,2025-03-01 09:55:48.467780,imported
1,175e9de5-6b57-4dee-97cf-21ad7a0a2ad3,Many most green,Center build happy near Trouble news five deci...,reading,product-vision,journaling,2025-03-01 12:06:25.467780,imported
2,49c02499-5c5f-4cb9-b7c9-0be3776143aa,Former bed use,Data character defense subject guy training To...,"idea,writing",experiments,meeting,2025-03-03 08:58:51.467780,imported
3,b20750e6-1114-4df4-af5e-a7bf3729ddef,Question evening imagine,Site price ever Out wish fish determine adult ...,"ops,personal",habit-tracking,meeting,2025-03-04 09:43:32.467780,pdf
4,886a5879-e932-40de-b0ae-d6b2d948797b,Record couple,Owner company expert table reality stock site ...,"idea,personal,product",literature-review,idea,2025-03-05 05:33:51.467780,pdf


**Gemini Embeddings & Personal Knowledge Graph**
 
Now we:
 
1. Use Gemini embeddings to embed each note.
2. Build a Personal Knowledge Graph (PKG) with:
   - note nodes
   - tag nodes
   - semantic edges (similar notes)
   - temporal edges (notes created near each other)
   - interaction-based metadata (revisits, last interaction)
3. Create a UMAP projection for visualization.

In [23]:
# ==== Embeddings with Gemini + Graph Construction ====
 
# Make sure timestamps are real datetimes before any time arithmetic below.
notes_df["timestamp"] = pd.to_datetime(notes_df["timestamp"])

# Text that gets embedded: "<title>. <content>", with missing values treated as empty strings.
_titles = notes_df["title"].fillna('')
_bodies = notes_df["content"].fillna('')
notes_df["content_short"] = _titles + ". " + _bodies

texts = notes_df["content_short"].tolist()
 
def embed_texts_with_gemini(text_list, model_name=EMBED_MODEL, batch_size=32):
    """Embed a list of texts with the Gemini embeddings API.

    Args:
        text_list: List of strings to embed.
        model_name: Embedding model identifier passed to the API.
        batch_size: Number of texts sent per `embed_content` request.

    Returns:
        A float32 NumPy array with one row per embedding returned by the API
        (expected: one per input text, in input order). An empty `text_list`
        yields an empty array.
    """
    all_embeddings = []
    for i in tqdm(range(0, len(text_list), batch_size), desc="Embedding notes with Gemini"):
        batch = text_list[i:i+batch_size]
        result = client.models.embed_content(
            model=model_name,
            contents=batch,
            config=types.EmbedContentConfig(
                task_type="retrieval_document",
            ),
        )
        # result.embeddings is a list of embedding objects, not raw vectors;
        # the numeric vector of each one lives in its `.values` attribute.
        all_embeddings.extend(embedding.values for embedding in result.embeddings)
    return np.array(all_embeddings, dtype="float32")
 
# Embed every note, then rescale the vectors with scikit-learn's `normalize` (default settings).
emb_matrix = normalize(embed_texts_with_gemini(texts))
print("Embedding matrix shape:", emb_matrix.shape)

# Two-way lookup between a note_id and its row position in the embedding matrix.
note_id_to_idx = {}
for _position, _note_id in enumerate(notes_df["note_id"]):
    note_id_to_idx[_note_id] = _position
idx_to_note_id = {position: note_id for note_id, position in note_id_to_idx.items()}

# Pairwise cosine similarity between all notes.
sim_matrix = cosine_similarity(emb_matrix)
print("sim_matrix shape:", sim_matrix.shape)
 
# Build the knowledge graph: an undirected NetworkX graph.
G = nx.Graph()

# One node per note, keyed "note:<note_id>", carrying the note's metadata as attributes.
NOTE_NODE_FIELDS = ("title", "content", "tags", "topic", "context", "timestamp", "source")

for _, note_row in notes_df.iterrows():
    node_attrs = {"node_type": "note"}
    for field in NOTE_NODE_FIELDS:
        node_attrs[field] = note_row[field]
    G.add_node(f"note:{note_row['note_id']}", **node_attrs)
 
# Semantic edges: link each note to its most similar notes, provided the
# similarity reaches SIM_THRESHOLD.
TOP_K = 6
SIM_THRESHOLD = 0.55

for i in range(len(notes_df)):
    src_key = f"note:{notes_df.loc[i, 'note_id']}"
    row_sims = sim_matrix[i]
    # Take TOP_K + 1 candidates because the note itself is normally its own best match.
    candidates = np.argsort(-row_sims)[: TOP_K + 1]

    for j in candidates:
        if j == i:
            continue
        similarity = float(row_sims[j])
        if similarity < SIM_THRESHOLD:
            continue

        dst_key = f"note:{idx_to_note_id[j]}"

        if not G.has_edge(src_key, dst_key):
            G.add_edge(src_key, dst_key, type="semantic", weight=similarity)
        elif similarity > G[src_key][dst_key].get("weight", 0):
            # An edge already exists: keep the stronger weight and mark it semantic.
            G[src_key][dst_key].update(weight=similarity, type="semantic")
 
# Tag nodes & edges: one "tag:<name>" node per distinct tag, linked to every note that carries it.
for _, note_row in notes_df.iterrows():
    raw_tags = note_row.get("tags", "")
    if pd.isna(raw_tags):
        continue

    note_key = f"note:{note_row['note_id']}"
    # Split the comma-separated tag string; blank pieces are dropped.
    cleaned_tags = (piece.strip() for piece in str(raw_tags).split(","))
    for tag_name in filter(None, cleaned_tags):
        tag_key = f"tag:{tag_name}"
        if tag_key not in G:
            G.add_node(tag_key, node_type="tag", tag=tag_name)
        G.add_edge(note_key, tag_key, type="shares_tag")
 
# Temporal edges: link notes created at most 3 days apart.
# Each note is only compared with the next 3 notes in chronological order
# (sliding window), so this is a sparse approximation rather than all pairs.
TEMPORAL_WINDOW = pd.Timedelta(days=3)
TEMPORAL_NEIGHBORS = 3

notes_ts = notes_df[["note_id", "timestamp"]].dropna().sort_values("timestamp").reset_index(drop=True)

_ordered_ids = notes_ts["note_id"].tolist()
_ordered_ts = notes_ts["timestamp"].tolist()

for i in range(len(notes_ts)):
    for j in range(i + 1, min(i + 1 + TEMPORAL_NEIGHBORS, len(notes_ts))):
        # Compare the full time difference: `.days` truncates, which would let
        # gaps of up to almost 4 days through a "<= 3" check.
        if abs(_ordered_ts[j] - _ordered_ts[i]) <= TEMPORAL_WINDOW:
            n1 = f"note:{_ordered_ids[i]}"
            n2 = f"note:{_ordered_ids[j]}"
            # Never overwrite an existing (e.g. semantic) edge between the two notes.
            if not G.has_edge(n1, n2):
                G.add_edge(n1, n2, type="created_near", weight=0.2)
 
# Fold the interaction log into note-node metadata: a revisit counter and a last-interaction time.
interactions_df["timestamp"] = pd.to_datetime(interactions_df["timestamp"])

for _, event in interactions_df.iterrows():
    target = event["target_id"]
    # Skip events with no target or with a target that is not a known note.
    if not target or target not in note_id_to_idx:
        continue

    node_key = f"note:{target}"
    if node_key not in G:
        continue

    node_attrs = G.nodes[node_key]
    event_type = event["type"]
    if event_type == "revisit_old_note":
        node_attrs["revisited"] = node_attrs.get("revisited", 0) + 1
    if event_type in ("open_note", "edit_note", "add_new_note"):
        # Rows are processed in DataFrame order, so the last matching row wins.
        node_attrs["last_interaction"] = event["timestamp"]

print("Graph nodes:", G.number_of_nodes())
print("Graph edges:", G.number_of_edges())
 
# Project the embeddings to 2-D with UMAP (seeded) and store the coordinates on notes_df.
reducer = umap.UMAP(n_components=2, random_state=RANDOM_SEED)
emb_2d = reducer.fit_transform(emb_matrix)
notes_df["umap_x"], notes_df["umap_y"] = emb_2d[:, 0], emb_2d[:, 1]

# Scatter plot of the projection, one point per note.
plt.scatter(notes_df["umap_x"], notes_df["umap_y"], s=10)
plt.title("UMAP projection of note embeddings (Gemini)")
plt.show()

# Compact per-note metadata table indexed by note_id.
_meta_columns = ["timestamp", "tags", "topic", "umap_x", "umap_y"]
notes_meta = notes_df.set_index("note_id")[_meta_columns]
notes_meta.head()

Embedding notes with Gemini:   0%|          | 0/13 [00:00<?, ?it/s]


AttributeError: 'list' object has no attribute 'values'