## imports

In [21]:
import json
import pandas as pd
from bs4 import BeautifulSoup
import chromadb
import ollama
from sentence_transformers import SentenceTransformer
from datetime import datetime
from sentence_transformers import CrossEncoder

## load the JSON

In [2]:
# Load the conversations data first
import json

json_file_path = "/Users/michaeltrang/Documents/Coding Projects/AI Builders Bootcamp/hw3_chatgpt_history_chat/chatgpt_data/conversations.json"

with open(json_file_path, 'r', encoding='utf-8') as f:
    conversations_data = json.load(f)

print(f"✅ Loaded {len(conversations_data)} conversations")


✅ Loaded 437 conversations


## JSON extraction

In [3]:
# Fixed message extraction function
def get_conversation_messages(conversation):
    """Extract messages from a conversation using the correct ChatGPT structure"""
    messages = []
    mapping = conversation.get("mapping", {})
    
    # Get all nodes that have messages
    for node_id, node_data in mapping.items():
        message = node_data.get("message")
        if not message:
            continue
            
        # Get the content
        content = message.get("content")
        if not content:
            continue
            
        # Check if it's text content
        if content.get("content_type") == "text":
            parts = content.get("parts", [])
            if parts and len(parts) > 0:
                # Get the author role
                author_role = message.get("author", {}).get("role", "")
                
                # Skip system messages unless they're user system messages
                if author_role == "system":
                    metadata = message.get("metadata", {})
                    if not metadata.get("is_user_system_message", False):
                        continue
                
                # Clean up author names
                if author_role == "assistant":
                    author = "ChatGPT"
                elif author_role == "user":
                    author = "Michael"
                elif author_role == "system":
                    author = "System"
                else:
                    author = author_role
                
                # Add the message
                messages.append({
                    "author": author,
                    "text": parts[0],
                    "timestamp": message.get("create_time"),
                    "message_id": message.get("id"),
                    "node_id": node_id
                })
    
    # Sort by timestamp to get chronological order
    messages.sort(key=lambda x: x["timestamp"] if x["timestamp"] else 0)
    
    return messages

## Test conversation

In [4]:
# Test the extraction on the first conversation
if conversations_data:
    print("Testing message extraction on first conversation...")
    print("-" * 50)
    
    first_conv = conversations_data[0]
    title = first_conv.get('title', 'No title')
    print(f"Conversation: {title}")
    
    messages = get_conversation_messages(first_conv)
    print(f"Found {len(messages)} messages")
    
    # Show first few messages
    print("\nFirst few messages:")
    for i, msg in enumerate(messages[:5]):
        timestamp = msg['timestamp']
        if timestamp:
            from datetime import datetime
            dt = datetime.fromtimestamp(timestamp)
            time_str = dt.strftime("%Y-%m-%d %H:%M:%S")
        else:
            time_str = "No timestamp"
            
        print(f"\n{i+1}. {msg['author']} ({time_str}):")
        print(f"   {msg['text'][:150]}...")

Testing message extraction on first conversation...
--------------------------------------------------
Conversation: Montessori for self-growth
Found 21 messages

First few messages:

1. Michael (2025-09-03 14:34:30):
   how do I incorporate concepts of ideas of Montessori method into my own development and growth...

2. ChatGPT (2025-09-03 14:34:31):
   ...

3. ChatGPT (2025-09-03 14:34:31):
   That’s a really thoughtful question, Michael. Montessori is usually associated with children, but its principles can be surprisingly powerful for adul...

4. Michael (2025-09-03 14:37:17):
   can you go deeper here?Self-Directed Learning → Self-Agency in Growth

Montessori idea: Children choose work, set their pace, and self-correct.

Adult...

5. ChatGPT (2025-09-03 14:37:18):
   Got it — let’s really zoom in on **Self-Directed Learning → Self-Agency in Growth**, because this is where Montessori becomes especially powerful for ...


## process and chunk the JSON

In [11]:
# Process all conversations and create chunks
print("Processing all conversations...")
print("-" * 50)

all_messages = []
total_conversations = len(conversations_data)

for conv_idx, conversation in enumerate(conversations_data):
  title = conversation.get('title', f'Conversation {conv_idx}')
  messages = get_conversation_messages(conversation)

  # Only print for first 3 and last 3 conversations
  if conv_idx < 3 or conv_idx >= total_conversations - 3:
    print(f"Processing {conv_idx + 1}/{total_conversations}: '{title}' - {len(messages)} messages")
  elif conv_idx == 3:
      print("...")  # Show ellipsis after first 3

  for msg_idx, msg in enumerate(messages):
      message_info = {
          'conversation_id': conv_idx,
          'conversation_title': title,
          'message_id': f"{conv_idx}_{msg_idx}",
          'author': msg['author'],
          'content': msg['text'],
          'timestamp': msg['timestamp'],
          'source': 'ChatGPT',
          'chunk_type': 'message'
      }
      all_messages.append(message_info)

print(f"\n✅ Total messages extracted: {len(all_messages)}")

# Show some statistics
authors = [msg['author'] for msg in all_messages]
from collections import Counter
author_counts = Counter(authors)
print(f"\nMessage breakdown:")
for author, count in author_counts.items():
  print(f"  {author}: {count} messages")

# Show sample of different types of messages
print(f"\nSample messages by author:")
for author in ['Michael', 'ChatGPT']:
  sample_msg = next((msg for msg in all_messages if msg['author'] == author), None)
  if sample_msg:
      print(f"\n{author}:")
      print(f"  {sample_msg['content'][:200]}...")

Processing all conversations...
--------------------------------------------------
Processing 1/437: 'Montessori for self-growth' - 21 messages
Processing 2/437: 'Ilama Index explanation' - 41 messages
Processing 3/437: 'Okonomiyaki cooking styles' - 3 messages
...
Processing 435/437: 'Test Assistance Available' - 2 messages
Processing 436/437: 'New chat' - 1 messages
Processing 437/437: 'Social Hobbies for Gamers' - 11 messages

✅ Total messages extracted: 4424

Message breakdown:
  Michael: 1841 messages
  ChatGPT: 2445 messages
  tool: 138 messages

Sample messages by author:

Michael:
  how do I incorporate concepts of ideas of Montessori method into my own development and growth...

ChatGPT:
  ...


## Test script for conversations with metadata to see if they match spec

In [12]:
# Test a different conversation with metadata
conversation_index = 2  # Change this number to test different conversations (0-436)

if conversations_data and conversation_index < len(conversations_data):
    print(f"Testing conversation {conversation_index}...")
    print("-" * 50)
    
    conv = conversations_data[conversation_index]
    title = conv.get('title', 'No title')
    print(f"Conversation: {title}")
    
    messages = get_conversation_messages(conv)
    print(f"Found {len(messages)} messages")
    
    # Show first few messages with metadata
    print("\nFirst few messages with metadata:")
    for i, msg in enumerate(messages[:3]):  # Show first 3 for brevity
        timestamp = msg['timestamp']
        if timestamp:
            from datetime import datetime
            dt = datetime.fromtimestamp(timestamp)
            time_str = dt.strftime("%Y-%m-%d %H:%M:%S")
        else:
            time_str = "No timestamp"
        
        # Create metadata according to PRD schema
        creation_date = "Unknown"
        if msg['timestamp']:
            dt = datetime.fromtimestamp(msg['timestamp'])
            creation_date = dt.isoformat()
        
        metadata = {
            'chunk_id': msg['message_id'],
            'chunk_type': msg['author'].lower(),
            'creation_date': creation_date,
            'source_id': f"chatgpt_conv_{conversation_index}",
            'source_name': f"ChatGPT - {title}",
            'tags': [],
            'token_estimate': int(len(msg['text'].split()) * 1.3),
            'author': msg['author']
        }
        
        print(f"\n--- Message {i+1} ---")
        print(f"Author: {msg['author']} ({time_str})")
        print(f"Content: {msg['text'][:100]}...")
        print(f"Content length: {len(msg['text'])} chars, {len(msg['text'].split())} words")
        
        print(f"\nMetadata:")
        for key, value in metadata.items():
            print(f"  {key}: {value}")
        
    # Show conversation stats
    authors = [msg['author'] for msg in messages]
    from collections import Counter
    author_counts = Counter(authors)
    print(f"\nMessage breakdown:")
    for author, count in author_counts.items():
        print(f"  {author}: {count} messages")
        
else:
    print(f"❌ Conversation {conversation_index} not found. Available: 0-{len(conversations_data)-1}")

Testing conversation 2...
--------------------------------------------------
Conversation: Okonomiyaki cooking styles
Found 3 messages

First few messages with metadata:

--- Message 1 ---
Author: Michael (2025-09-04 09:14:40)
Content: In Japan, do they serve okonomiyaki mostly precooked? I went to a spot and they gave me a mostly coo...
Content length: 162 chars, 32 words

Metadata:
  chunk_id: ffad621a-0a25-4db5-8278-b9ff8c4cdf8e
  chunk_type: michael
  creation_date: 2025-09-04T09:14:40.441990
  source_id: chatgpt_conv_2
  source_name: ChatGPT - Okonomiyaki cooking styles
  tags: []
  token_estimate: 41
  author: Michael

--- Message 2 ---
Author: ChatGPT (2025-09-04 09:14:40)
Content: ...
Content length: 0 chars, 0 words

Metadata:
  chunk_id: eb5690b8-f711-4bca-95d3-79edae02f200
  chunk_type: chatgpt
  creation_date: 2025-09-04T09:14:40.686570
  source_id: chatgpt_conv_2
  source_name: ChatGPT - Okonomiyaki cooking styles
  tags: []
  token_estimate: 0
  author: ChatGPT

--- Messa

## create embeddings and store in ChromaDB locally

In [13]:
# Create embeddings and store in ChromaDB

print("Setting up embeddings and ChromaDB...")

# Initialize the embedding model
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
print("✅ Embedding model loaded")

# Initialize ChromaDB
client = chromadb.PersistentClient(path="./chroma_db")
collection = client.get_or_create_collection(name="chat_history")
print("✅ ChromaDB ready")

# Process messages in batches
print(f"Creating embeddings for {len(all_messages)} messages...")
batch_size = 100

for batch_idx in range(0, len(all_messages), batch_size):
    batch = all_messages[batch_idx:batch_idx + batch_size]
    batch_num = (batch_idx // batch_size) + 1
    
    print(f"Batch {batch_num} ({len(batch)} messages)...")
    
    texts = []
    ids = []
    metadatas = []
    
    for msg in batch:
        # Create metadata according to schema
        creation_date = "Unknown"
        if msg['timestamp']:
            dt = datetime.fromtimestamp(msg['timestamp'])
            creation_date = dt.isoformat()
        
        metadata = {
            'chunk_id': msg['message_id'],
            'chunk_type': msg['author'].lower(),
            'creation_date': creation_date,
            'source_id': f"chatgpt_conv_{msg['conversation_id']}",
            'source_name': f"ChatGPT - {msg['conversation_title']}",
            'tags': "",
            'token_estimate': int(len(msg['content'].split()) * 1.3),
            'author': msg['author']
        }
        
        texts.append(msg['content'])
        ids.append(msg['message_id'])
        metadatas.append(metadata)
    
    # Create embeddings and store
    embeddings = embedding_model.encode(texts).tolist()
    collection.add(embeddings=embeddings, documents=texts, metadatas=metadatas, ids=ids)

print(f"✅ Stored {len(all_messages)} messages in ChromaDB!")

# Quick test
test_results = collection.query(query_texts=["Montessori method"], n_results=2)
print("\nTest search results:")
for i, (doc, metadata) in enumerate(zip(test_results['documents'][0], test_results['metadatas'][0])):
    print(f"{i+1}. {metadata['author']}: {doc[:100]}...")

Setting up embeddings and ChromaDB...
✅ Embedding model loaded
✅ ChromaDB ready
Creating embeddings for 4424 messages...
Batch 1 (100 messages)...
Batch 2 (100 messages)...
Batch 3 (100 messages)...
Batch 4 (100 messages)...
Batch 5 (100 messages)...
Batch 6 (100 messages)...
Batch 7 (100 messages)...
Batch 8 (100 messages)...
Batch 9 (100 messages)...
Batch 10 (100 messages)...
Batch 11 (100 messages)...
Batch 12 (100 messages)...
Batch 13 (100 messages)...
Batch 14 (100 messages)...
Batch 15 (100 messages)...
Batch 16 (100 messages)...
Batch 17 (100 messages)...
Batch 18 (100 messages)...
Batch 19 (100 messages)...
Batch 20 (100 messages)...
Batch 21 (100 messages)...
Batch 22 (100 messages)...
Batch 23 (100 messages)...
Batch 24 (100 messages)...
Batch 25 (100 messages)...
Batch 26 (100 messages)...
Batch 27 (100 messages)...
Batch 28 (100 messages)...
Batch 29 (100 messages)...
Batch 30 (100 messages)...
Batch 31 (100 messages)...
Batch 32 (100 messages)...
Batch 33 (100 messages).

## Test ChromaDB storage and embeddings

In [15]:
# Test ChromaDB storage and embeddings
print("Testing ChromaDB storage...")
print("-" * 40)

# 1. Check collection info
print(f"Collection name: {collection.name}")
print(f"Collection count: {collection.count()}")

# 2. Test semantic search with different queries
test_queries = [
    "Montessori method",
    "coding and programming", 
    "yoga and health",
]

print(f"\nTesting semantic search with {len(test_queries)} queries:")
for query in test_queries:
    print(f"\n--- Query: '{query}' ---")
    results = collection.query(
        query_texts=[query],
        n_results=3
    )
    
    print(f"Found {len(results['documents'][0])} results:")
    for i, (doc, metadata) in enumerate(zip(results['documents'][0], results['metadatas'][0])):
        print(f"  {i+1}. {metadata['author']} ({metadata['chunk_type']})")
        print(f"     Source: {metadata['source_name']}")
        print(f"     Content: {doc[:80]}...")

# 3. Test with a specific conversation
print(f"\n--- Testing specific conversation search ---")
results = collection.query(
    query_texts=["Montessori"],
    n_results=5,
    where={"source_id": "chatgpt_conv_0"}  # First conversation
)
print(f"Found {len(results['documents'][0])} results from conversation 0:")
for i, (doc, metadata) in enumerate(zip(results['documents'][0], results['metadatas'][0])):
    print(f"  {i+1}. {metadata['author']}: {doc[:60]}...")

# 4. Check metadata fields
print(f"\n--- Checking metadata structure ---")
sample_result = collection.query(query_texts=["test"], n_results=1)
if sample_result['metadatas'][0]:
    sample_metadata = sample_result['metadatas'][0][0]
    print("Sample metadata fields:")
    for key, value in sample_metadata.items():
        print(f"  {key}: {value} ({type(value).__name__})")

print(f"\n✅ ChromaDB test complete!")

Testing ChromaDB storage...
----------------------------------------
Collection name: chat_history
Collection count: 4424

Testing semantic search with 3 queries:

--- Query: 'Montessori method' ---
Found 3 results:
  1. Michael (michael)
     Source: ChatGPT - Montessori for self-growth
     Content: how do I incorporate concepts of ideas of Montessori method into my own developm...
  2. ChatGPT (chatgpt)
     Source: ChatGPT - Montessori for self-growth
     Content: Got it — let’s really zoom in on **Self-Directed Learning → Self-Agency in Growt...
  3. Michael (michael)
     Source: ChatGPT - Montessori for self-growth
     Content: can you go deeper here?Self-Directed Learning → Self-Agency in Growth

Montessor...

--- Query: 'coding and programming' ---
Found 3 results:
  1. Michael (michael)
     Source: ChatGPT - Virtual environment in coding
     Content: what is a virtual environment in the context of coding...
  2. Michael (michael)
     Source: ChatGPT - Happy 1 Year!
     

Integrate LlamaIndex and setup Ollama

In [16]:
## LlamaIndex Integration with ChromaDB

print("Setting up LlamaIndex with existing ChromaDB...")

# Import LlamaIndex components
from llama_index.core import VectorStoreIndex, Settings
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.ollama import Ollama
from llama_index.core import StorageContext

# Set up the embedding model (same as before)
Settings.embed_model = HuggingFaceEmbedding(model_name="all-MiniLM-L6-v2")

# Set up Ollama LLM
Settings.llm = Ollama(model="gemma3:1b", request_timeout=60.0)

# Higher quality synthesis
# Settings.llm = Ollama(model="gemma3:4b", request_timeout=90.0)

print("✅ LlamaIndex settings configured")

# Test the connection
try:
    # Test a simple query
    test_response = Settings.llm.complete("Hello, this is a test.")
    print(f"✅ Ollama connection working!")
    print(f"Test response: {test_response.text[:100]}...")
    
except Exception as e:
    print(f"❌ Still having issues: {e}")

Setting up LlamaIndex with existing ChromaDB...
✅ LlamaIndex settings configured
✅ Ollama connection working!
Test response: Okay, sounds good! How can I help you with your test? Do you have any questions for me, or would you...


In [17]:
## Create LlamaIndex Vector Store from existing ChromaDB

# Create ChromaVectorStore from existing collection
vector_store = ChromaVectorStore(chroma_collection=collection)

# Create storage context
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# Create the index
index = VectorStoreIndex.from_vector_store(
    vector_store=vector_store,
    storage_context=storage_context
)

print("✅ LlamaIndex index created from existing ChromaDB")
print(f"Index ready for queries")

✅ LlamaIndex index created from existing ChromaDB
Index ready for queries


In [18]:
## Create RAG Query Engine

# Create a query engine with advanced settings
query_engine = index.as_query_engine(
    response_mode="tree_summarize",  # Better for synthesizing multiple sources
    similarity_top_k=5,              # Get top 5 most relevant chunks
    verbose=True                     # Show what it's doing
)

print("✅ RAG Query Engine created")
print("Response mode: tree_summarize (synthesizes multiple conversations)")
print("Similarity top-k: 5 (gets 5 most relevant chunks)")

✅ RAG Query Engine created
Response mode: tree_summarize (synthesizes multiple conversations)
Similarity top-k: 5 (gets 5 most relevant chunks)


## Full RAG Test Script

In [25]:
## Full RAG with manual CrossEncoder rerank (no extra deps)
from llama_index.core import PromptTemplate
from sentence_transformers import CrossEncoder
from time import perf_counter

Settings.llm = Ollama(model="gemma3:1b", request_timeout=60.0, temperature=0.1)

qa_tmpl = PromptTemplate(
    "You are a concise, grounded assistant.\n"
    "Only use the provided context. If an answer is not supported by the context, say you don't know.\n"
    "Do not include any external links or sources; cite only the provided source titles.\n"
    "Return a short synthesis.\n\n"
    "Context:\n{context_str}\n\n"
    "Question: {query_str}\n\n"
    "Answer:"
)

# Cross-encoder for reranking
ce = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

def rag_query_with_rerank(q: str, retrieve_k=10, top_n=5):
    retriever = index.as_retriever(similarity_top_k=retrieve_k)
    nodes = retriever.retrieve(q)

    # Cross-encode (query, passage) pairs
    pairs = [(q, n.text or "") for n in nodes]
    scores = ce.predict(pairs)

    # Pick top_n by CE score
    ranked = sorted(zip(nodes, scores), key=lambda x: x[1], reverse=True)[:top_n]
    top_nodes = [n for n, _ in ranked]

    # Build grounded context
    ctx = "\n\n".join(n.text for n in top_nodes if n.text)

    # Synthesize with LLM using the grounded prompt
    prompt = qa_tmpl.format(context_str=ctx, query_str=q)
    resp = Settings.llm.complete(prompt)

    return resp.text, ranked

# Test
test_queries = [
    "What have I learned about AI?",
]

for q in test_queries:
    print(f"\n=== RAG Query ===\n{q}")
    t0 = perf_counter()
    answer, ranked = rag_query_with_rerank(q, retrieve_k=12, top_n=5)
    dt = perf_counter() - t0

    print("\n--- Synthesized Answer ---")
    print(answer)

    print("\n--- Top Sources ---")
    for i, (n, ce_score) in enumerate(ranked, 1):
        md = n.metadata or {}
        title = md.get("source_name", "Unknown")
        author = md.get("author", "Unknown")
        snip = (n.text or "").replace("\n", " ")[:160]
        print(f"{i}. ce_score={ce_score:.3f} | {author} — {title}")
        print(f"   {snip}...")

    print(f"\n(latency: {dt:.2f}s) {'='*60}")


=== RAG Query ===
What have I learned about AI?

--- Synthesized Answer ---
You’ve learned that AI is a broad field encompassing machine learning, deep learning, natural language processing, and computer vision. You understand that it’s rooted in statistical concepts, programming (primarily Python), and the ability to build and evaluate models. You’ve grasped the core concepts of supervised and unsupervised learning, and the importance of understanding the ethical considerations surrounding AI development. You’ve also recognized the need for a combination of technical knowledge, practical experience, and a strong understanding of business strategy and product management principles.

--- Top Sources ---
1. ce_score=4.979 | Michael — ChatGPT - Pursuing Authentic Dreams
   Are there any in-person courses at University of Pennsylvania I can take to learn about AI and business or maybe some kind of AI coding boot camp of some sort o...
2. ce_score=4.928 | Michael — ChatGPT - New chat
   As