# Search Analytics with DuckDB

This notebook analyzes your Search Analytics data from the pre-processed DuckDB database.

**Prerequisites:** Run `python process_search_analytics.py` first to create the database.

---
# QUICK START

**Just run this cell to get started!**

In [None]:
# ===== CONNECT TO DATABASE =====
# This connects to the database created by process_search_analytics.py

import duckdb
import pandas as pd
from pathlib import Path

# Plotting optional: the notebook still works without matplotlib.
try:
    import matplotlib.pyplot as plt
    import matplotlib.dates as mdates
except ImportError:
    PLOTTING_AVAILABLE = False
else:
    PLOTTING_AVAILABLE = True
    # The seaborn style sheets were renamed to 'seaborn-v0_8-*' in newer
    # Matplotlib releases. Pick whichever name this installation offers so an
    # unknown style name cannot abort the setup cell (plt.style.use raises
    # OSError, which the ImportError handler above would not catch).
    for _style_name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid'):
        if _style_name in plt.style.available:
            plt.style.use(_style_name)
            break

# Pandas display options: show every column, cap printed rows at 100
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', None)

# Connect to database (fail early with instructions when the file is missing)
db_path = Path('../data/searchanalytics.db')
if db_path.exists():
    # Opened read-only: this notebook only runs SELECT-style analyses
    con = duckdb.connect(str(db_path), read_only=True)
else:
    raise FileNotFoundError(
        f"Database not found at {db_path}\n"
        "Please run: python process_search_analytics.py\n"
        "to create the database first."
    )

# Helper functions
def query(sql, params=None):
    """Execute SQL on the notebook's DuckDB connection and return a DataFrame.

    Args:
        sql: SQL text to run on the module-level connection ``con``.
        params: Optional values to bind to placeholders in ``sql`` (for
            example a list for ``?`` placeholders). Prefer this over
            formatting values into the SQL string. When omitted, ``sql`` is
            executed as-is.

    Returns:
        The result set converted to a pandas DataFrame via ``.df()``.
    """
    if params is None:
        return con.execute(sql).df()
    return con.execute(sql, params).df()

# Show what's available
print("="*60)
print("DATABASE CONNECTED")
print("="*60)
print(f"Database: {db_path}")

# Show tables
tables = query("SHOW TABLES")
table_names = tables['name'].tolist()
print(f"\nTables: {', '.join(table_names)}")

# Check if 'searches' table exists
if 'searches' not in table_names:
    print("\n" + "!"*60)
    print("WARNING: 'searches' table not found!")
    print("Please run: python process_search_analytics.py --full-refresh")
    print("!"*60)
else:
    # Show row counts
    for table in table_names:
        # Quote the identifier (doubling embedded quotes) so table names that
        # contain special characters or reserved words do not break the SQL.
        quoted_table = '"' + table.replace('"', '""') + '"'
        count = query(f"SELECT COUNT(*) as n FROM {quoted_table}")['n'].iloc[0]
        print(f"  {table}: {count:,} rows")

    # Date range
    date_range = query("""
        SELECT 
            MIN(session_date) as first_date,
            MAX(session_date) as last_date
        FROM searches
    """)
    print(f"\nDate range: {date_range['first_date'].iloc[0]} to {date_range['last_date'].iloc[0]}")

    # Only announce readiness when the table the analyses depend on exists;
    # otherwise the warning above would be contradicted.
    print("\n" + "="*60)
    print("Ready! Run the analysis cells below.")
    print("="*60)

---
# ANALYSES

From here you can run the cells that interest you.

**Available tables:**
- `searches` - All event-level data with calculated columns
- `searches_raw` - Original imported data

In [None]:
# Overview: What do we have?
# DESCRIBE returns the column list of the `searches` table, one row per column.
query("DESCRIBE searches")

In [None]:
# Event Types Distribution - check what events we have and their sequence
# This cell shows two result tables, so each one is passed to display()
# explicitly (presumably the IPython/Jupyter built-in; it is not defined in
# this notebook).
print("=== Event Types ===")
display(query("""
    SELECT name as event_type, COUNT(*) as count
    FROM searches
    GROUP BY name
    ORDER BY count DESC
"""))

# Rows whose prev_event is NULL are labelled '(first event)' by the COALESCE.
print("\n=== Previous Event for SEARCH_RESULT_COUNT ===")
print("(This shows what event typically precedes result events)")
display(query("""
    SELECT 
        COALESCE(prev_event, '(first event)') as previous_event, 
        COUNT(*) as count
    FROM searches
    WHERE name = 'SEARCH_RESULT_COUNT'
    GROUP BY prev_event
    ORDER BY count DESC
"""))

In [None]:
# First and last entries
# One summary row: total row count plus the earliest and latest `timestamp`
# found in the `searches` table.
query("""
    SELECT
        COUNT(*) as total_rows,
        MIN(timestamp) as first_entry,
        MAX(timestamp) as last_entry
    FROM searches
""")

In [None]:
# View sample data
# There is no ORDER BY, so which 20 rows are returned is not guaranteed.
query("SELECT * FROM searches LIMIT 20")

---
## Time Distribution

In [None]:
# Entries per day
# Rows are bucketed by the calendar day of `timestamp`; the 30 most recent
# days are returned, newest first.
query("""
    SELECT
        DATE_TRUNC('day', timestamp)::DATE as date,
        COUNT(*) as count
    FROM searches
    GROUP BY 1
    ORDER BY 1 DESC
    LIMIT 30
""")

In [None]:
# Distribution by hour (using pre-calculated event_hour)
# SUM(COUNT(*)) OVER() is the grand total over all groups, so `percent` is
# each hour's share of all rows.
query("""
    SELECT
        event_hour as hour,
        COUNT(*) as count,
        ROUND(100.0 * COUNT(*) / SUM(COUNT(*)) OVER(), 2) as percent
    FROM searches
    GROUP BY 1
    ORDER BY 1
""")

In [None]:
# Distribution by weekday (using pre-calculated event_weekday)
# Grouped by both weekday columns and sorted by event_weekday_num (column 2)
# rather than by the weekday label.
query("""
    SELECT
        event_weekday as weekday,
        event_weekday_num as day_nr,
        COUNT(*) as count
    FROM searches
    GROUP BY 1, 2
    ORDER BY 2
""")

---
## Session Analyses

Analyses based on user sessions (grouped by session_key = day + user_id + session_id)

In [None]:
# Session overview: How many sessions, how many searches per session?
# NOTE(review): COUNT(*) counts every row in `searches` (there is no filter
# on `name`), so `total_searches` and the per-session average include all
# event types - confirm that this is intended.
query("""
    SELECT
        COUNT(DISTINCT session_key) as total_sessions,
        COUNT(*) as total_searches,
        ROUND(COUNT(*) * 1.0 / COUNT(DISTINCT session_key), 1) as avg_searches_per_session,
        COUNT(DISTINCT user_id) as unique_users
    FROM searches
""")

In [None]:
# Distribution: Number of searches per session
# The inner query counts rows per session_key; the outer query counts how many
# sessions share each row count. LIMIT 20 after ORDER BY 1 keeps the 20
# smallest per-session counts; `percent` is still computed over all groups.
query("""
    SELECT
        searches_in_session,
        COUNT(*) as session_count,
        ROUND(100.0 * COUNT(*) / SUM(COUNT(*)) OVER(), 1) as percent
    FROM (
        SELECT session_key, COUNT(*) as searches_in_session
        FROM searches
        GROUP BY session_key
    )
    GROUP BY 1
    ORDER BY 1
    LIMIT 20
""")

In [None]:
# Sessions with most searches (power users / problems?)
# Returns the 20 sessions with the most rows in `searches`, together with the
# first/last timestamp and the span between them in minutes.
# The sort must reference the alias `search_count_in_session`; a bare
# `search_count` is not a column of this result and makes the query fail.
query("""
    SELECT
        session_key,
        session_date,
        user_id,
        session_id,
        COUNT(*) as search_count_in_session,
        MIN(timestamp) as first_search,
        MAX(timestamp) as last_search,
        DATEDIFF('minute', MIN(timestamp), MAX(timestamp)) as duration_minutes
    FROM searches
    GROUP BY session_key, session_date, user_id, session_id
    ORDER BY search_count_in_session DESC
    LIMIT 20
""")

In [None]:
# User activity: How many sessions per user?
# One row per user_id, limited to the 20 users with the most distinct sessions.
# `total_searches` is COUNT(*) over all of the user's rows in `searches`.
query("""
    SELECT
        user_id,
        COUNT(DISTINCT session_key) as session_count,
        COUNT(*) as total_searches,
        ROUND(COUNT(*) * 1.0 / COUNT(DISTINCT session_key), 1) as searches_per_session,
        MIN(session_date) as first_session,
        MAX(session_date) as last_session
    FROM searches
    GROUP BY user_id
    ORDER BY session_count DESC
    LIMIT 20
""")

In [None]:
# Session timeline: Show all activities for a specific session
# Replace session_key with a value from the query above

SESSION_KEY = 'ENTER_SESSION_KEY_HERE'  # <-- Change this

# The key is bound as a query parameter (the `?` placeholder) instead of being
# formatted into the SQL text, so keys containing quotes cannot break or alter
# the statement.
con.execute("""
    SELECT *
    FROM searches
    WHERE session_key = ?
    ORDER BY timestamp
""", [SESSION_KEY]).df()

In [None]:
# Sessions per day
# One row per session_date for the 30 most recent dates (newest first), with
# distinct sessions, distinct users and the total row count for that day.
query("""
    SELECT
        session_date as date,
        COUNT(DISTINCT session_key) as sessions,
        COUNT(DISTINCT user_id) as unique_users,
        COUNT(*) as total_searches,
        ROUND(COUNT(*) * 1.0 / COUNT(DISTINCT session_key), 1) as searches_per_session
    FROM searches
    GROUP BY 1
    ORDER BY 1 DESC
    LIMIT 30
""")

---
## Search Journey Analysis

Analyzes the complete search flow per session: Search → Results → Click?

**Event types:**
- `SEARCH_TRIGGERED` - User starts search
- `SEARCH_RESULT_COUNT` - Results are displayed
- `SEARCH_TAB_CLICK`, `SEARCH_ALL_TAB_PAGE_CLICK`, `SEARCH_NEWS_TAB_PAGE_CLICK`, `SEARCH_GOTO_TAB_PAGE_CLICK` - User clicks on result

**Success = Search leads to click**

In [None]:
# Overview: What event types do we have?
# Counts rows per event name, most frequent first; `percent` is the share of
# all rows (SUM(COUNT(*)) OVER() is the grand total).
query("""
    SELECT 
        name as event_type,
        COUNT(*) as count,
        ROUND(100.0 * COUNT(*) / SUM(COUNT(*)) OVER(), 1) as percent
    FROM searches
    GROUP BY 1
    ORDER BY 2 DESC
""")

In [None]:
# Search Funnel: How many searches → results → clicks?
# Each figure counts distinct sessions containing at least one event of the
# given kind. The click-through rate is sessions-with-click divided by
# sessions-with-search; NULLIF(..., 0) yields NULL instead of a division by
# zero when no session has a search.
query("""
    SELECT
        COUNT(DISTINCT CASE WHEN name = 'SEARCH_TRIGGERED' THEN session_key END) as sessions_with_search,
        COUNT(DISTINCT CASE WHEN name = 'SEARCH_RESULT_COUNT' THEN session_key END) as sessions_with_results,
        COUNT(DISTINCT CASE WHEN name IN ('SEARCH_TAB_CLICK', 'SEARCH_ALL_TAB_PAGE_CLICK', 'SEARCH_NEWS_TAB_PAGE_CLICK', 'SEARCH_GOTO_TAB_PAGE_CLICK') THEN session_key END) as sessions_with_click,
        ROUND(100.0 * COUNT(DISTINCT CASE WHEN name IN ('SEARCH_TAB_CLICK', 'SEARCH_ALL_TAB_PAGE_CLICK', 'SEARCH_NEWS_TAB_PAGE_CLICK', 'SEARCH_GOTO_TAB_PAGE_CLICK') THEN session_key END) 
            / NULLIF(COUNT(DISTINCT CASE WHEN name = 'SEARCH_TRIGGERED' THEN session_key END), 0), 1) as click_through_rate_pct
    FROM searches
""")

In [None]:
# Null-result searches: Which search terms return 0 results?
# Using is_null_result and search_term_normalized
# Only SEARCH_RESULT_COUNT events are considered, and only terms with at
# least 3 such events (HAVING); sorted by the absolute number of null results.
query("""
    SELECT 
        search_term_normalized as search_term,
        COUNT(*) as count,
        AVG(CAST(CP_totalResultCount AS INTEGER)) as avg_results,
        SUM(CASE WHEN is_null_result = true THEN 1 ELSE 0 END) as null_result_count,
        ROUND(100.0 * SUM(CASE WHEN is_null_result = true THEN 1 ELSE 0 END) / COUNT(*), 1) as null_rate_pct
    FROM searches
    WHERE name = 'SEARCH_RESULT_COUNT'
      AND search_term_normalized IS NOT NULL
    GROUP BY 1
    HAVING COUNT(*) >= 3
    ORDER BY null_result_count DESC
    LIMIT 30
""")

In [None]:
# Top search terms with success rate (click after search)
# `search_events` holds one row per (session, search term); a term counts as
# successful in a session when that session contains any click event, because
# the join is on session_key only.
# NOTE(review): `click_events` is read from `searches_with_query`, which keeps
# only rows that carry a search term. Click rows without a term are therefore
# ignored here - confirm that this is intended.
# The sort references the alias `search_count_in_session`; a bare
# `search_count` is not a column of this result and makes the query fail.
query("""
    WITH searches_with_query AS (
        SELECT 
            session_key,
            COALESCE(CP_searchQuery, searchQuery, query) as search_term,
            name,
            timestamp
        FROM searches
        WHERE COALESCE(CP_searchQuery, searchQuery, query) IS NOT NULL
    ),
    search_events AS (
        SELECT DISTINCT session_key, search_term
        FROM searches_with_query 
        WHERE name IN ('SEARCH_TRIGGERED', 'SEARCH_RESULT_COUNT')
    ),
    click_events AS (
        SELECT DISTINCT session_key
        FROM searches_with_query
        WHERE name IN ('SEARCH_TAB_CLICK', 'SEARCH_ALL_TAB_PAGE_CLICK', 'SEARCH_NEWS_TAB_PAGE_CLICK', 'SEARCH_GOTO_TAB_PAGE_CLICK')
    )
    SELECT 
        s.search_term,
        COUNT(*) as search_count_in_session,
        SUM(CASE WHEN c.session_key IS NOT NULL THEN 1 ELSE 0 END) as with_click,
        ROUND(100.0 * SUM(CASE WHEN c.session_key IS NOT NULL THEN 1 ELSE 0 END) / COUNT(*), 1) as success_rate_pct
    FROM search_events s
    LEFT JOIN click_events c ON s.session_key = c.session_key
    GROUP BY 1
    HAVING COUNT(*) >= 5
    ORDER BY search_count_in_session DESC
    LIMIT 30
""")

In [None]:
# Problematic searches: Many results but no clicks (user doesn't find what they need)
# `search_results` keeps SEARCH_RESULT_COUNT events with more than 0 results.
# A result event counts as "without click" when its session has no click event
# at all (the LEFT JOIN matches on session_key only). Only terms with at least
# 5 result events and at least one click-less one are listed.
query("""
    WITH search_results AS (
        SELECT 
            session_key,
            COALESCE(CP_searchQuery, searchQuery, query) as search_term,
            CAST(CP_totalResultCount AS INTEGER) as total_results
        FROM searches
        WHERE name = 'SEARCH_RESULT_COUNT'
          AND CAST(CP_totalResultCount AS INTEGER) > 0
    ),
    click_events AS (
        SELECT DISTINCT session_key
        FROM searches
        WHERE name IN ('SEARCH_TAB_CLICK', 'SEARCH_ALL_TAB_PAGE_CLICK', 'SEARCH_NEWS_TAB_PAGE_CLICK', 'SEARCH_GOTO_TAB_PAGE_CLICK')
    )
    SELECT 
        sr.search_term,
        COUNT(*) as search_count_in_session,
        ROUND(AVG(sr.total_results), 0) as avg_results,
        SUM(CASE WHEN c.session_key IS NULL THEN 1 ELSE 0 END) as without_click,
        ROUND(100.0 * SUM(CASE WHEN c.session_key IS NULL THEN 1 ELSE 0 END) / COUNT(*), 1) as abandon_rate_pct
    FROM search_results sr
    LEFT JOIN click_events c ON sr.session_key = c.session_key
    GROUP BY 1
    HAVING COUNT(*) >= 5 AND SUM(CASE WHEN c.session_key IS NULL THEN 1 ELSE 0 END) > 0
    ORDER BY without_click DESC
    LIMIT 30
""")

In [None]:
# Search reformulations: Sessions with multiple searches (user needs to adjust query)
# The CTE keeps only sessions with more than one distinct search term among
# their SEARCH_TRIGGERED events; the outer query shows how many sessions used
# 2, 3, ... different terms. `percent` is relative to these sessions only,
# not to all sessions.
query("""
    WITH session_searches AS (
        SELECT 
            session_key,
            COUNT(DISTINCT COALESCE(CP_searchQuery, searchQuery, query)) as unique_search_terms,
            COUNT(*) as total_search_events
        FROM searches
        WHERE name = 'SEARCH_TRIGGERED'
          AND COALESCE(CP_searchQuery, searchQuery, query) IS NOT NULL
        GROUP BY 1
        HAVING COUNT(DISTINCT COALESCE(CP_searchQuery, searchQuery, query)) > 1
    )
    SELECT 
        unique_search_terms as different_search_count,
        COUNT(*) as sessions,
        ROUND(100.0 * COUNT(*) / SUM(COUNT(*)) OVER(), 1) as percent
    FROM session_searches
    GROUP BY 1
    ORDER BY 1
""")

In [None]:
# Typical journey patterns: What are the most common event sequences?
# Shows the full journey path for each session and counts occurrences
# STRING_AGG joins the event names of a session in timestamp order with ' → ';
# sessions with the same path are then counted and the 20 most common paths
# are returned.
query("""
    WITH session_journeys AS (
        SELECT 
            session_key,
            STRING_AGG(name, ' → ' ORDER BY timestamp) as journey_path,
            COUNT(*) as journey_length
        FROM searches
        GROUP BY session_key
    )
    SELECT 
        journey_path as Journey_Pattern,
        journey_length as Steps,
        COUNT(*) as Sessions,
        ROUND(100.0 * COUNT(*) / SUM(COUNT(*)) OVER(), 1) as Percent
    FROM session_journeys
    GROUP BY journey_path, journey_length
    ORDER BY Sessions DESC
    LIMIT 20
""")

In [None]:
# Simplified journey patterns: Categorized by search → result → click flow
# Per session, the CTE counts searches, result events, clicks (rows with a
# click_category) and null results. The CASE branches are evaluated top to
# bottom: sessions with a click first, then click-less sessions with at least
# one null result, then other click-less sessions with results, then searches
# without any result event; everything else is 'Other'. The counts are part
# of the label, so e.g. '1 Search ...' and '2 Search ...' are separate rows.
query("""
    WITH session_summary AS (
        SELECT 
            session_key,
            COUNT(CASE WHEN name = 'SEARCH_TRIGGERED' THEN 1 END) as searches,
            COUNT(CASE WHEN name = 'SEARCH_RESULT_COUNT' THEN 1 END) as results,
            COUNT(CASE WHEN click_category IS NOT NULL THEN 1 END) as clicks,
            SUM(CASE WHEN is_null_result = true THEN 1 ELSE 0 END) as null_results
        FROM searches
        GROUP BY session_key
    )
    SELECT 
        CASE 
            WHEN searches > 0 AND results > 0 AND clicks > 0 THEN 
                searches || ' Search → ' || results || ' Result → ' || clicks || ' Click'
            WHEN searches > 0 AND results > 0 AND null_results > 0 AND clicks = 0 THEN 
                searches || ' Search → ' || results || ' Result (incl. ' || null_results || ' null) → No Click'
            WHEN searches > 0 AND results > 0 AND clicks = 0 THEN 
                searches || ' Search → ' || results || ' Result → Abandoned'
            WHEN searches > 0 AND results = 0 THEN 
                searches || ' Search → No Result'
            ELSE 'Other'
        END as Journey_Type,
        COUNT(*) as Sessions,
        ROUND(100.0 * COUNT(*) / SUM(COUNT(*)) OVER(), 1) as Percent
    FROM session_summary
    GROUP BY 1
    ORDER BY Sessions DESC
    LIMIT 15
""")

In [None]:
# Complete journey of a session
# Choose a session_key from above

SESSION_KEY = 'ENTER_SESSION_KEY_HERE'  # <-- Change this

# The key is bound as a query parameter (the `?` placeholder) instead of being
# formatted into the SQL text, so keys containing quotes cannot break or alter
# the statement.
con.execute("""
    SELECT 
        timestamp,
        name as event,
        COALESCE(CP_searchQuery, searchQuery, query) as search_term,
        CP_totalResultCount as results,
        CP_peopleResultCount as people,
        CP_newsResultCount as news,
        CP_gotoResultCount as goto
    FROM searches
    WHERE session_key = ?
    ORDER BY timestamp
""", [SESSION_KEY]).df()

In [None]:
# Result distribution by category: Where do results come from?
# One row per category, stacked with UNION ALL. Each branch looks at
# SEARCH_RESULT_COUNT events; the per-category branches additionally skip rows
# where that category's count column is NULL, so the row counts can differ
# between categories. `null_results` is the number of events whose count is 0.
query("""
    SELECT
        'Total' as category,
        COUNT(*) as searches_with_result,
        ROUND(AVG(CAST(CP_totalResultCount AS FLOAT)), 1) as avg_count,
        SUM(CASE WHEN CAST(CP_totalResultCount AS INTEGER) = 0 THEN 1 ELSE 0 END) as null_results
    FROM searches WHERE name = 'SEARCH_RESULT_COUNT'
    
    UNION ALL
    
    SELECT 'People', COUNT(*), ROUND(AVG(CAST(CP_peopleResultCount AS FLOAT)), 1),
        SUM(CASE WHEN CAST(CP_peopleResultCount AS INTEGER) = 0 THEN 1 ELSE 0 END)
    FROM searches WHERE name = 'SEARCH_RESULT_COUNT' AND CP_peopleResultCount IS NOT NULL
    
    UNION ALL
    
    SELECT 'News', COUNT(*), ROUND(AVG(CAST(CP_newsResultCount AS FLOAT)), 1),
        SUM(CASE WHEN CAST(CP_newsResultCount AS INTEGER) = 0 THEN 1 ELSE 0 END)
    FROM searches WHERE name = 'SEARCH_RESULT_COUNT' AND CP_newsResultCount IS NOT NULL
    
    UNION ALL
    
    SELECT 'Intranet News', COUNT(*), ROUND(AVG(CAST(CP_intranetNewsResultCount AS FLOAT)), 1),
        SUM(CASE WHEN CAST(CP_intranetNewsResultCount AS INTEGER) = 0 THEN 1 ELSE 0 END)
    FROM searches WHERE name = 'SEARCH_RESULT_COUNT' AND CP_intranetNewsResultCount IS NOT NULL
    
    UNION ALL
    
    SELECT 'GoTo', COUNT(*), ROUND(AVG(CAST(CP_gotoResultCount AS FLOAT)), 1),
        SUM(CASE WHEN CAST(CP_gotoResultCount AS INTEGER) = 0 THEN 1 ELSE 0 END)
    FROM searches WHERE name = 'SEARCH_RESULT_COUNT' AND CP_gotoResultCount IS NOT NULL
""")

In [None]:
# Click distribution: Which result categories are clicked? (using click_category)
# Only rows with a non-NULL click_category are counted; `percent` is each
# (category, event name) pair's share of those rows.
query("""
    SELECT 
        click_category,
        name as click_type,
        COUNT(*) as count,
        ROUND(100.0 * COUNT(*) / SUM(COUNT(*)) OVER(), 1) as percent
    FROM searches
    WHERE click_category IS NOT NULL
    GROUP BY 1, 2
    ORDER BY 3 DESC
""")

---
## Time Interval Analysis

Analyzes the time between events within sessions. These columns are auto-generated during setup.

In [None]:
# Time from Search to Results: How fast does the search return results?
# Looks at SEARCH_RESULT_COUNT events whose previous event was SEARCH_TRIGGERED
# and counts them per time_since_prev_bucket. The CASE in ORDER BY sorts the
# bucket labels by duration instead of alphabetically; any label not listed
# there sorts last.
query("""
    SELECT 
        time_since_prev_bucket as time_bucket,
        COUNT(*) as count,
        ROUND(100.0 * COUNT(*) / SUM(COUNT(*)) OVER(), 1) as percent
    FROM searches
    WHERE name = 'SEARCH_RESULT_COUNT' 
      AND prev_event = 'SEARCH_TRIGGERED'
    GROUP BY 1
    ORDER BY 
        CASE time_since_prev_bucket
            WHEN '< 0.5s' THEN 1
            WHEN '0.5-1s' THEN 2
            WHEN '1-2s' THEN 3
            WHEN '2-5s' THEN 4
            WHEN '5-10s' THEN 5
            WHEN '10-30s' THEN 6
            WHEN '30-60s' THEN 7
            WHEN '> 60s' THEN 8
            ELSE 9
        END
""")

In [None]:
# Time from Results to Click: How long do users take to click after seeing results?
# Looks at click events whose previous event was SEARCH_RESULT_COUNT and counts
# them per time_since_prev_bucket. The CASE in ORDER BY sorts the bucket labels
# by duration instead of alphabetically; any label not listed there sorts last.
query("""
    SELECT 
        time_since_prev_bucket as time_bucket,
        COUNT(*) as count,
        ROUND(100.0 * COUNT(*) / SUM(COUNT(*)) OVER(), 1) as percent
    FROM searches
    WHERE name IN ('SEARCH_TAB_CLICK', 'SEARCH_ALL_TAB_PAGE_CLICK', 'SEARCH_NEWS_TAB_PAGE_CLICK', 'SEARCH_GOTO_TAB_PAGE_CLICK')
      AND prev_event = 'SEARCH_RESULT_COUNT'
    GROUP BY 1
    ORDER BY 
        CASE time_since_prev_bucket
            WHEN '< 0.5s' THEN 1
            WHEN '0.5-1s' THEN 2
            WHEN '1-2s' THEN 3
            WHEN '2-5s' THEN 4
            WHEN '5-10s' THEN 5
            WHEN '10-30s' THEN 6
            WHEN '30-60s' THEN 7
            WHEN '> 60s' THEN 8
            ELSE 9
        END
""")

In [None]:
# Event transition times: Average time between different event type pairs
# One row per (prev_event, name) pair with at least 10 occurrences, showing
# average, median (PERCENTILE_CONT(0.5)), minimum and maximum of
# sec_since_prev_event; the 20 most frequent transitions are returned.
query("""
    SELECT 
        prev_event || ' → ' || name as transition,
        COUNT(*) as count,
        ROUND(AVG(sec_since_prev_event), 2) as avg_seconds,
        ROUND(PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY sec_since_prev_event), 2) as median_seconds,
        ROUND(MIN(sec_since_prev_event), 2) as min_seconds,
        ROUND(MAX(sec_since_prev_event), 2) as max_seconds
    FROM searches
    WHERE prev_event IS NOT NULL
      AND sec_since_prev_event IS NOT NULL
    GROUP BY prev_event, name
    HAVING COUNT(*) >= 10
    ORDER BY count DESC
    LIMIT 20
""")

In [None]:
# Time interval distribution by click type: Do different click types have different response times?
# For click events that directly follow a SEARCH_RESULT_COUNT event: count,
# average and median of sec_since_prev_event per click event name.
query("""
    SELECT 
        name as click_type,
        COUNT(*) as count,
        ROUND(AVG(sec_since_prev_event), 2) as avg_seconds,
        ROUND(PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY sec_since_prev_event), 2) as median_seconds
    FROM searches
    WHERE name IN ('SEARCH_TAB_CLICK', 'SEARCH_ALL_TAB_PAGE_CLICK', 'SEARCH_NEWS_TAB_PAGE_CLICK', 'SEARCH_GOTO_TAB_PAGE_CLICK')
      AND prev_event = 'SEARCH_RESULT_COUNT'
    GROUP BY 1
    ORDER BY count DESC
""")

In [None]:
# Sample session with time intervals: See the full journey with timing
# The subquery picks one session that has between 3 and 6 rows. It uses
# LIMIT 1 without ORDER BY, so which session is picked is not guaranteed and
# may change between runs.
query("""
    SELECT 
        session_key,
        event_order,
        timestamp,
        name as event,
        prev_event,
        sec_since_prev_event,
        time_since_prev_bucket,
        COALESCE(CP_searchQuery, searchQuery, query) as search_term,
        CP_totalResultCount as results
    FROM searches
    WHERE session_key IN (
        SELECT session_key FROM searches GROUP BY 1 HAVING COUNT(*) BETWEEN 3 AND 6 LIMIT 1
    )
    ORDER BY event_order
""")

---
## Top Search Terms

In [None]:
# Top 20 most frequent search terms (using normalized search term)
# NOTE(review): there is no filter on `name`, so every row that carries a
# non-empty search_term_normalized is counted, whatever its event type -
# confirm that this is intended.
query("""
    SELECT
        search_term_normalized as search_term,
        COUNT(*) as count,
        ROUND(100.0 * COUNT(*) / SUM(COUNT(*)) OVER(), 2) as percent,
        ROUND(AVG(search_term_length), 1) as avg_length,
        ROUND(AVG(search_term_word_count), 1) as avg_words
    FROM searches
    WHERE search_term_normalized IS NOT NULL
      AND search_term_normalized != ''
    GROUP BY 1
    ORDER BY 2 DESC
    LIMIT 20
""")

---
## Null-Result Analysis

Based on `SEARCH_RESULT_COUNT` events with `CP_totalResultCount = 0`

In [None]:
# Overall null rate (using is_null_result)
# Share of SEARCH_RESULT_COUNT events flagged is_null_result, as a single row.
query("""
    SELECT
        COUNT(*) as total_searches,
        SUM(CASE WHEN is_null_result = true THEN 1 ELSE 0 END) as null_results,
        ROUND(100.0 * SUM(CASE WHEN is_null_result = true THEN 1 ELSE 0 END) / COUNT(*), 2) as null_rate_pct
    FROM searches
    WHERE name = 'SEARCH_RESULT_COUNT'
""")

In [None]:
# Search terms with most null results (using search_term_normalized and is_null_result)
# Only terms with at least 5 SEARCH_RESULT_COUNT events are listed, sorted by
# the absolute number of null results (not by the rate).
query("""
    SELECT
        search_term_normalized as search_term,
        COUNT(*) as count,
        SUM(CASE WHEN is_null_result = true THEN 1 ELSE 0 END) as null_results,
        ROUND(100.0 * SUM(CASE WHEN is_null_result = true THEN 1 ELSE 0 END) / COUNT(*), 1) as null_rate_pct
    FROM searches
    WHERE name = 'SEARCH_RESULT_COUNT'
      AND search_term_normalized IS NOT NULL
    GROUP BY 1
    HAVING COUNT(*) >= 5
    ORDER BY null_results DESC
    LIMIT 20
""")

In [None]:
# Null rate per day (using is_null_result)
# One row per session_date for the 30 most recent dates (newest first), based
# on SEARCH_RESULT_COUNT events only.
query("""
    SELECT
        session_date as date,
        COUNT(*) as total,
        SUM(CASE WHEN is_null_result = true THEN 1 ELSE 0 END) as null_results,
        ROUND(100.0 * SUM(CASE WHEN is_null_result = true THEN 1 ELSE 0 END) / COUNT(*), 2) as null_rate_pct
    FROM searches
    WHERE name = 'SEARCH_RESULT_COUNT'
    GROUP BY 1
    ORDER BY 1 DESC
    LIMIT 30
""")

---
## Export Search Journey Analysis (Excel)

Exports all journey analyses to an Excel file with separate tabs.

In [None]:
# Search Journey Analysis → Excel Export
from datetime import datetime
from pathlib import Path

# Requires openpyxl: conda install openpyxl
try:
    from openpyxl import Workbook
    from openpyxl.worksheet.table import Table, TableStyleInfo
    from openpyxl.utils.dataframe import dataframe_to_rows
    from openpyxl.utils import get_column_letter
except ImportError:
    print("openpyxl not installed. Run: conda install openpyxl")
    raise

# Define all queries
journey_queries = {
    "Raw-Data": """
        SELECT *
        FROM searches
        ORDER BY timestamp
    """,
    
    "Daily-Metrics": """
        SELECT 
            session_date as Date,
            COUNT(*) as Total_Events,
            COUNT(DISTINCT session_key) as Unique_Sessions,
            COUNT(DISTINCT user_Id) as Unique_Users,
            COUNT(CASE WHEN name = 'SEARCH_TRIGGERED' THEN 1 END) as Searches,
            COUNT(CASE WHEN name = 'SEARCH_RESULT_COUNT' THEN 1 END) as Results_Shown,
            COUNT(CASE WHEN click_category IS NOT NULL THEN 1 END) as Clicks,
            SUM(CASE WHEN is_null_result = true THEN 1 ELSE 0 END) as Null_Results,
            COUNT(CASE WHEN name = 'SEARCH_RESULT_COUNT' AND is_null_result = false THEN 1 END) 
                - COUNT(CASE WHEN click_category IS NOT NULL THEN 1 END) as Abandoned_Searches,
            ROUND(100.0 * COUNT(CASE WHEN click_category IS NOT NULL THEN 1 END) 
                / NULLIF(COUNT(CASE WHEN name = 'SEARCH_TRIGGERED' THEN 1 END), 0), 1) as Click_Rate_Pct,
            ROUND(100.0 * SUM(CASE WHEN is_null_result = true THEN 1 ELSE 0 END) 
                / NULLIF(COUNT(CASE WHEN name = 'SEARCH_RESULT_COUNT' THEN 1 END), 0), 1) as Null_Rate_Pct,
            ROUND(100.0 * (COUNT(CASE WHEN name = 'SEARCH_RESULT_COUNT' AND is_null_result = false THEN 1 END) - COUNT(CASE WHEN click_category IS NOT NULL THEN 1 END))
                / NULLIF(COUNT(CASE WHEN name = 'SEARCH_RESULT_COUNT' AND is_null_result = false THEN 1 END), 0), 1) as Abandon_Rate_Pct,
            ROUND(1.0 * COUNT(CASE WHEN name = 'SEARCH_TRIGGERED' THEN 1 END) 
                / NULLIF(COUNT(DISTINCT session_key), 0), 2) as Avg_Searches_Per_Session,
            COUNT(DISTINCT search_term_normalized) as Unique_Search_Terms
        FROM searches
        GROUP BY 1
        ORDER BY 1 DESC
    """,
    
    "Event-Overview": """
        SELECT 
            name as Event_Type,
            COUNT(*) as Count,
            ROUND(100.0 * COUNT(*) / SUM(COUNT(*)) OVER(), 1) as Percent
        FROM searches
        GROUP BY 1
        ORDER BY 2 DESC
    """,
    
    "Search-Funnel": """
        SELECT
            COUNT(DISTINCT CASE WHEN name = 'SEARCH_TRIGGERED' THEN session_key END) as Sessions_With_Search,
            COUNT(DISTINCT CASE WHEN name = 'SEARCH_RESULT_COUNT' THEN session_key END) as Sessions_With_Results,
            COUNT(DISTINCT CASE WHEN click_category IS NOT NULL THEN session_key END) as Sessions_With_Click,
            ROUND(100.0 * COUNT(DISTINCT CASE WHEN click_category IS NOT NULL THEN session_key END) 
                / NULLIF(COUNT(DISTINCT CASE WHEN name = 'SEARCH_TRIGGERED' THEN session_key END), 0), 1) as Click_Through_Rate_Pct
        FROM searches
    """,
    
    "Null-Results": """
        SELECT 
            search_term_normalized as Search_Term,
            COUNT(*) as Count,
            ROUND(AVG(CAST(CP_totalResultCount AS FLOAT)), 1) as Avg_Results,
            SUM(CASE WHEN is_null_result = true THEN 1 ELSE 0 END) as Null_Result_Count,
            ROUND(100.0 * SUM(CASE WHEN is_null_result = true THEN 1 ELSE 0 END) / COUNT(*), 1) as Null_Rate_Pct
        FROM searches
        WHERE name = 'SEARCH_RESULT_COUNT'
          AND search_term_normalized IS NOT NULL
        GROUP BY 1
        HAVING COUNT(*) >= 3
        ORDER BY Null_Result_Count DESC
        LIMIT 100
    """,
    
    "Success-Rate": """
        WITH search_events AS (
            SELECT DISTINCT session_key, search_term_normalized as search_term
            FROM searches 
            WHERE name IN ('SEARCH_TRIGGERED', 'SEARCH_TRIGGERED', 'SEARCH_RESULT_COUNT')
              AND search_term_normalized IS NOT NULL
        ),
        click_events AS (
            SELECT DISTINCT session_key
            FROM searches
            WHERE click_category IS NOT NULL
        )
        SELECT 
            s.search_term as Search_Term,
            COUNT(*) as Search_Count,
            SUM(CASE WHEN c.session_key IS NOT NULL THEN 1 ELSE 0 END) as With_Click,
            ROUND(100.0 * SUM(CASE WHEN c.session_key IS NOT NULL THEN 1 ELSE 0 END) / COUNT(*), 1) as Success_Rate_Pct
        FROM search_events s
        LEFT JOIN click_events c ON s.session_key = c.session_key
        GROUP BY 1
        HAVING COUNT(*) >= 5
        ORDER BY Search_Count DESC
        LIMIT 100
    """,
    
    "Abandoned-Searches": """
        WITH search_results AS (
            SELECT 
                session_key,
                search_term_normalized as search_term,
                CAST(CP_totalResultCount AS INTEGER) as total_results
            FROM searches
            WHERE name = 'SEARCH_RESULT_COUNT'
              AND is_null_result = false
        ),
        click_events AS (
            SELECT DISTINCT session_key
            FROM searches
            WHERE click_category IS NOT NULL
        )
        SELECT 
            sr.search_term as Search_Term,
            COUNT(*) as Search_Count,
            ROUND(AVG(sr.total_results), 0) as Avg_Results,
            SUM(CASE WHEN c.session_key IS NULL THEN 1 ELSE 0 END) as Without_Click,
            ROUND(100.0 * SUM(CASE WHEN c.session_key IS NULL THEN 1 ELSE 0 END) / COUNT(*), 1) as Abandon_Rate_Pct
        FROM search_results sr
        LEFT JOIN click_events c ON sr.session_key = c.session_key
        GROUP BY 1
        HAVING COUNT(*) >= 5 AND SUM(CASE WHEN c.session_key IS NULL THEN 1 ELSE 0 END) > 0
        ORDER BY Without_Click DESC
        LIMIT 100
    """,
    
    "Reformulations": """
        WITH session_searches AS (
            SELECT 
                session_key,
                COUNT(DISTINCT search_term_normalized) as unique_search_terms
            FROM searches
            WHERE name = 'SEARCH_TRIGGERED'
              AND search_term_normalized IS NOT NULL
            GROUP BY 1
            HAVING COUNT(DISTINCT search_term_normalized) > 1
        )
        SELECT 
            unique_search_terms as Different_Search_Count,
            COUNT(*) as Sessions,
            ROUND(100.0 * COUNT(*) / SUM(COUNT(*)) OVER(), 1) as Percent
        FROM session_searches
        GROUP BY 1
        ORDER BY 1
    """,
    
    "Result-Categories": """
        SELECT
            'Total' as Category,
            COUNT(*) as Searches,
            ROUND(AVG(CAST(CP_totalResultCount AS FLOAT)), 1) as Avg_Count,
            SUM(CASE WHEN is_null_result = true THEN 1 ELSE 0 END) as Null_Results
        FROM searches WHERE name = 'SEARCH_RESULT_COUNT'
        
        UNION ALL
        
        SELECT 'People', COUNT(*), ROUND(AVG(CAST(CP_peopleResultCount AS FLOAT)), 1),
            SUM(CASE WHEN CAST(CP_peopleResultCount AS INTEGER) = 0 THEN 1 ELSE 0 END)
        FROM searches WHERE name = 'SEARCH_RESULT_COUNT' AND CP_peopleResultCount IS NOT NULL
        
        UNION ALL
        
        SELECT 'News', COUNT(*), ROUND(AVG(CAST(CP_newsResultCount AS FLOAT)), 1),
            SUM(CASE WHEN CAST(CP_newsResultCount AS INTEGER) = 0 THEN 1 ELSE 0 END)
        FROM searches WHERE name = 'SEARCH_RESULT_COUNT' AND CP_newsResultCount IS NOT NULL
        
        UNION ALL
        
        SELECT 'Intranet News', COUNT(*), ROUND(AVG(CAST(CP_intranetNewsResultCount AS FLOAT)), 1),
            SUM(CASE WHEN CAST(CP_intranetNewsResultCount AS INTEGER) = 0 THEN 1 ELSE 0 END)
        FROM searches WHERE name = 'SEARCH_RESULT_COUNT' AND CP_intranetNewsResultCount IS NOT NULL
        
        UNION ALL
        
        SELECT 'GoTo', COUNT(*), ROUND(AVG(CAST(CP_gotoResultCount AS FLOAT)), 1),
            SUM(CASE WHEN CAST(CP_gotoResultCount AS INTEGER) = 0 THEN 1 ELSE 0 END)
        FROM searches WHERE name = 'SEARCH_RESULT_COUNT' AND CP_gotoResultCount IS NOT NULL
    """,
    
    "Click-Distribution": """
        SELECT 
            click_category as Click_Category,
            name as Click_Type,
            COUNT(*) as Count,
            ROUND(100.0 * COUNT(*) / SUM(COUNT(*)) OVER(), 1) as Percent
        FROM searches
        WHERE click_category IS NOT NULL
        GROUP BY 1, 2
        ORDER BY 3 DESC
    """,
    
    "Top-Search-Terms": """
        SELECT 
            search_term_normalized as Search_Term,
            COUNT(*) as Count,
            ROUND(100.0 * COUNT(*) / SUM(COUNT(*)) OVER(), 2) as Percent,
            ROUND(AVG(search_term_length), 1) as Avg_Length,
            ROUND(AVG(search_term_word_count), 1) as Avg_Words
        FROM searches
        WHERE search_term_normalized IS NOT NULL
          AND search_term_normalized != ''
          AND name IN ('SEARCH_TRIGGERED', 'SEARCH_TRIGGERED', 'SEARCH_RESULT_COUNT')
        GROUP BY 1
        ORDER BY 2 DESC
        LIMIT 100
    """,
    
    "Time-Search-to-Result": """
        SELECT 
            time_since_prev_bucket as Time_Bucket,
            COUNT(*) as Count,
            ROUND(100.0 * COUNT(*) / SUM(COUNT(*)) OVER(), 1) as Percent
        FROM searches
        WHERE name = 'SEARCH_RESULT_COUNT' 
          AND prev_event = 'SEARCH_TRIGGERED'
        GROUP BY 1
        ORDER BY 
            CASE Time_Bucket
                WHEN '< 0.5s' THEN 1
                WHEN '0.5-1s' THEN 2
                WHEN '1-2s' THEN 3
                WHEN '2-5s' THEN 4
                WHEN '5-10s' THEN 5
                WHEN '10-30s' THEN 6
                WHEN '30-60s' THEN 7
                WHEN '> 60s' THEN 8
                ELSE 9
            END
    """,
    
    "Time-Result-to-Click": """
        SELECT 
            time_since_prev_bucket as Time_Bucket,
            COUNT(*) as Count,
            ROUND(100.0 * COUNT(*) / SUM(COUNT(*)) OVER(), 1) as Percent
        FROM searches
        WHERE click_category IS NOT NULL
          AND prev_event = 'SEARCH_RESULT_COUNT'
        GROUP BY 1
        ORDER BY 
            CASE Time_Bucket
                WHEN '< 0.5s' THEN 1
                WHEN '0.5-1s' THEN 2
                WHEN '1-2s' THEN 3
                WHEN '2-5s' THEN 4
                WHEN '5-10s' THEN 5
                WHEN '10-30s' THEN 6
                WHEN '30-60s' THEN 7
                WHEN '> 60s' THEN 8
                ELSE 9
            END
    """,
    
    "Event-Transitions": """
        SELECT 
            prev_event || ' → ' || name as Transition,
            COUNT(*) as Count,
            ROUND(AVG(sec_since_prev_event), 2) as Avg_Seconds,
            ROUND(PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY sec_since_prev_event), 2) as Median_Seconds,
            ROUND(MIN(sec_since_prev_event), 2) as Min_Seconds,
            ROUND(MAX(sec_since_prev_event), 2) as Max_Seconds
        FROM searches
        WHERE prev_event IS NOT NULL
          AND sec_since_prev_event IS NOT NULL
        GROUP BY prev_event, name
        HAVING COUNT(*) >= 10
        ORDER BY Count DESC
        LIMIT 50
    """,
    
    "Journey-Outcomes": """
        WITH session_summary AS (
            SELECT 
                session_key,
                COUNT(*) as total_events,
                COUNT(CASE WHEN name = 'SEARCH_TRIGGERED' THEN 1 END) as search_count_in_session,
                COUNT(CASE WHEN name = 'SEARCH_RESULT_COUNT' THEN 1 END) as result_count,
                COUNT(CASE WHEN click_category IS NOT NULL THEN 1 END) as click_count,
                SUM(CASE WHEN is_null_result = true THEN 1 ELSE 0 END) as null_result_count
            FROM searches
            GROUP BY session_key
        )
        SELECT 
            CASE 
                WHEN click_count > 0 THEN 'Success'
                WHEN null_result_count > 0 AND click_count = 0 THEN 'No Results'
                WHEN result_count > 0 AND click_count = 0 THEN 'Abandoned'
                ELSE 'Unknown'
            END as Journey_Outcome,
            COUNT(*) as Sessions,
            ROUND(100.0 * COUNT(*) / SUM(COUNT(*)) OVER(), 1) as Percent
        FROM session_summary
        GROUP BY 1
        ORDER BY Sessions DESC
    """,
    
    "Journey-Complexity": """
        WITH session_summary AS (
            SELECT 
                session_key,
                COUNT(*) as total_events,
                COUNT(DISTINCT search_term_normalized) as unique_search_terms
            FROM searches
            GROUP BY session_key
        )
        SELECT 
            CASE 
                WHEN total_events = 1 THEN 'Single Event'
                WHEN total_events <= 3 THEN 'Simple (2-3)'
                WHEN total_events <= 10 THEN 'Medium (4-10)'
                ELSE 'Complex (>10)'
            END as Session_Complexity,
            COUNT(*) as Sessions,
            ROUND(100.0 * COUNT(*) / SUM(COUNT(*)) OVER(), 1) as Percent,
            ROUND(AVG(unique_search_terms), 1) as Avg_Unique_Queries
        FROM session_summary
        GROUP BY 1
        ORDER BY 
            CASE Session_Complexity
                WHEN 'Single Event' THEN 1
                WHEN 'Simple (2-3)' THEN 2
                WHEN 'Medium (4-10)' THEN 3
                ELSE 4
            END
    """,
    
    "Journey-Timing-Summary": """
        WITH session_timings AS (
            SELECT 
                session_key,
                MIN(CASE WHEN name = 'SEARCH_RESULT_COUNT' AND prev_event = 'SEARCH_TRIGGERED' THEN ms_since_prev_event END) as ms_search_to_result,
                MIN(CASE WHEN click_category IS NOT NULL AND prev_event = 'SEARCH_RESULT_COUNT' THEN ms_since_prev_event END) as ms_result_to_click,
                DATEDIFF('millisecond', MIN(timestamp), MAX(timestamp)) as total_duration_ms
            FROM searches
            GROUP BY session_key
        )
        SELECT 
            'Search to Result' as Metric,
            COUNT(ms_search_to_result) as Sessions_With_Data,
            ROUND(AVG(ms_search_to_result) / 1000.0, 2) as Avg_Seconds,
            ROUND(PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY ms_search_to_result) / 1000.0, 2) as Median_Seconds
        FROM session_timings
        WHERE ms_search_to_result IS NOT NULL
        
        UNION ALL
        
        SELECT 
            'Result to Click',
            COUNT(ms_result_to_click),
            ROUND(AVG(ms_result_to_click) / 1000.0, 2),
            ROUND(PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY ms_result_to_click) / 1000.0, 2)
        FROM session_timings
        WHERE ms_result_to_click IS NOT NULL
        
        UNION ALL
        
        SELECT 
            'Total Session Duration',
            COUNT(total_duration_ms),
            ROUND(AVG(total_duration_ms) / 1000.0, 2),
            ROUND(PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY total_duration_ms) / 1000.0, 2)
        FROM session_timings
        WHERE total_duration_ms > 0
    """,
    
    "Sessions-Detail": """
        WITH session_timings AS (
            SELECT 
                session_key,
                session_date,
                MIN(timestamp) as session_start,
                COUNT(*) as total_events,
                COUNT(CASE WHEN name = 'SEARCH_TRIGGERED' THEN 1 END) as search_count_in_session,
                COUNT(CASE WHEN name = 'SEARCH_RESULT_COUNT' THEN 1 END) as result_count,
                COUNT(CASE WHEN click_category IS NOT NULL THEN 1 END) as click_count,
                COUNT(DISTINCT search_term_normalized) as unique_search_terms,
                SUM(CASE WHEN is_null_result = true THEN 1 ELSE 0 END) as null_results,
                ROUND(AVG(CASE WHEN name = 'SEARCH_RESULT_COUNT' THEN CAST(CP_totalResultCount AS FLOAT) END), 1) as avg_results,
                DATEDIFF('second', MIN(timestamp), MAX(timestamp)) as duration_seconds,
                MIN(CASE WHEN name = 'SEARCH_RESULT_COUNT' AND prev_event = 'SEARCH_TRIGGERED' THEN ms_since_prev_event END) as ms_search_to_result,
                MIN(CASE WHEN click_category IS NOT NULL AND prev_event = 'SEARCH_RESULT_COUNT' THEN ms_since_prev_event END) as ms_result_to_click
            FROM searches
            GROUP BY session_key, session_date
        )
        SELECT 
            session_date as Date,
            total_events as Events,
            search_count as Searches,
            result_count as Results,
            click_count as Clicks,
            unique_search_terms as Unique_Queries,
            null_results as Null_Results,
            avg_results as Avg_Result_Count,
            duration_seconds as Duration_Sec,
            ROUND(ms_search_to_result / 1000.0, 2) as Sec_Search_to_Result,
            ROUND(ms_result_to_click / 1000.0, 2) as Sec_Result_to_Click,
            CASE 
                WHEN click_count > 0 THEN 'Success'
                WHEN null_results > 0 AND click_count = 0 THEN 'No Results'
                WHEN result_count > 0 AND click_count = 0 THEN 'Abandoned'
                ELSE 'Unknown'
            END as Outcome,
            CASE WHEN unique_search_terms > 1 THEN 'Yes' ELSE 'No' END as Reformulated,
            CASE 
                WHEN ms_search_to_result IS NULL THEN 'No Result'
                WHEN ms_search_to_result < 500 THEN '< 0.5s'
                WHEN ms_search_to_result < 1000 THEN '0.5-1s'
                WHEN ms_search_to_result < 2000 THEN '1-2s'
                WHEN ms_search_to_result < 5000 THEN '2-5s'
                ELSE '> 5s'
            END as Search_to_Result_Bucket,
            CASE 
                WHEN ms_result_to_click IS NULL THEN 'No Click'
                WHEN ms_result_to_click < 2000 THEN '< 2s (quick)'
                WHEN ms_result_to_click < 5000 THEN '2-5s'
                WHEN ms_result_to_click < 10000 THEN '5-10s'
                WHEN ms_result_to_click < 30000 THEN '10-30s'
                ELSE '> 30s'
            END as Result_to_Click_Bucket
        FROM session_timings
        ORDER BY session_date DESC, total_events DESC
        LIMIT 1000
    """,
    
    "Hourly-Distribution": """
        SELECT 
            event_hour as Hour,
            COUNT(*) as Total_Events,
            COUNT(CASE WHEN name = 'SEARCH_TRIGGERED' THEN 1 END) as Searches,
            COUNT(CASE WHEN click_category IS NOT NULL THEN 1 END) as Clicks,
            ROUND(100.0 * COUNT(*) / SUM(COUNT(*)) OVER(), 1) as Percent
        FROM searches
        GROUP BY 1
        ORDER BY 1
    """,
    
    "Weekday-Distribution": """
        SELECT 
            event_weekday as Weekday,
            event_weekday_num as Day_Num,
            COUNT(*) as Total_Events,
            COUNT(CASE WHEN name = 'SEARCH_TRIGGERED' THEN 1 END) as Searches,
            COUNT(CASE WHEN click_category IS NOT NULL THEN 1 END) as Clicks,
            ROUND(100.0 * COUNT(*) / SUM(COUNT(*)) OVER(), 1) as Percent
        FROM searches
        GROUP BY 1, 2
        ORDER BY 2
    """,
    
    "Search-Term-Length": """
        SELECT 
            CASE 
                WHEN search_term_word_count = 1 THEN '1 word'
                WHEN search_term_word_count = 2 THEN '2 words'
                WHEN search_term_word_count = 3 THEN '3 words'
                WHEN search_term_word_count <= 5 THEN '4-5 words'
                ELSE '6+ words'
            END as Word_Count_Bucket,
            COUNT(*) as Count,
            ROUND(100.0 * COUNT(*) / SUM(COUNT(*)) OVER(), 1) as Percent,
            ROUND(AVG(search_term_length), 1) as Avg_Char_Length
        FROM searches
        WHERE search_term_normalized IS NOT NULL
          AND search_term_normalized != ''
          AND name IN ('SEARCH_TRIGGERED', 'SEARCH_TRIGGERED', 'SEARCH_RESULT_COUNT')
        GROUP BY 1
        ORDER BY 
            CASE Word_Count_Bucket
                WHEN '1 word' THEN 1
                WHEN '2 words' THEN 2
                WHEN '3 words' THEN 3
                WHEN '4-5 words' THEN 4
                ELSE 5
            END
    """,
    
    "First-Search-Analysis": """
        SELECT 
            CASE WHEN is_first_search_of_day = true THEN 'First Search of Day' ELSE 'Subsequent Search' END as Search_Type,
            COUNT(*) as Count,
            ROUND(100.0 * COUNT(*) / SUM(COUNT(*)) OVER(), 1) as Percent
        FROM searches
        WHERE is_first_search_of_day IS NOT NULL
        GROUP BY 1
        ORDER BY 1
    """,
    
    "Click-Category-Analysis": """
        SELECT 
            click_category as Category,
            COUNT(*) as Click_Count,
            ROUND(100.0 * COUNT(*) / SUM(COUNT(*)) OVER(), 1) as Percent,
            ROUND(AVG(sec_since_prev_event), 2) as Avg_Sec_to_Click,
            ROUND(PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY sec_since_prev_event), 2) as Median_Sec_to_Click
        FROM searches
        WHERE click_category IS NOT NULL
          AND prev_event = 'SEARCH_RESULT_COUNT'
        GROUP BY 1
        ORDER BY 2 DESC
    """,
    
    "Journey-Patterns": """
        WITH session_journeys AS (
            SELECT 
                session_key,
                STRING_AGG(name, ' → ' ORDER BY timestamp) as journey_path,
                COUNT(*) as journey_length
            FROM searches
            GROUP BY session_key
        )
        SELECT 
            journey_path as Journey_Pattern,
            journey_length as Steps,
            COUNT(*) as Sessions,
            ROUND(100.0 * COUNT(*) / SUM(COUNT(*)) OVER(), 1) as Percent
        FROM session_journeys
        GROUP BY journey_path, journey_length
        ORDER BY Sessions DESC
        LIMIT 50
    """,
    
    "Journey-Types": """
        WITH session_summary AS (
            SELECT 
                session_key,
                COUNT(CASE WHEN name = 'SEARCH_TRIGGERED' THEN 1 END) as searches,
                COUNT(CASE WHEN name = 'SEARCH_RESULT_COUNT' THEN 1 END) as results,
                COUNT(CASE WHEN click_category IS NOT NULL THEN 1 END) as clicks,
                SUM(CASE WHEN is_null_result = true THEN 1 ELSE 0 END) as null_results
            FROM searches
            GROUP BY session_key
        )
        SELECT 
            CASE 
                WHEN searches > 0 AND results > 0 AND clicks > 0 THEN 
                    searches || ' Search → ' || results || ' Result → ' || clicks || ' Click'
                WHEN searches > 0 AND results > 0 AND null_results > 0 AND clicks = 0 THEN 
                    searches || ' Search → ' || results || ' Result (incl. ' || null_results || ' null) → No Click'
                WHEN searches > 0 AND results > 0 AND clicks = 0 THEN 
                    searches || ' Search → ' || results || ' Result → Abandoned'
                WHEN searches > 0 AND results = 0 THEN 
                    searches || ' Search → No Result'
                ELSE 'Other'
            END as Journey_Type,
            COUNT(*) as Sessions,
            ROUND(100.0 * COUNT(*) / SUM(COUNT(*)) OVER(), 1) as Percent
        FROM session_summary
        GROUP BY 1
        ORDER BY Sessions DESC
        LIMIT 30
    """
}

# Target workbook: a date-stamped .xlsx file (YYYYMMDD) inside ../output
output_file = '../output/search_journey_analysis_{:%Y%m%d}.xlsx'.format(datetime.now())
output_path = Path(output_file)
# Create the output folder (and any missing parents) before writing to it
output_path.parent.mkdir(exist_ok=True, parents=True)

# Table look shared by every sheet: striped rows, nothing else highlighted
table_style = TableStyleInfo(
    name="TableStyleMedium9",
    showRowStripes=True,
    showColumnStripes=False,
    showFirstColumn=False,
    showLastColumn=False,
)

# Start from a workbook without the sheet that Workbook() creates by default
wb = Workbook()
wb.remove(wb.active)

# Explanations for each tab, keyed by sheet name.
# Paragraphs are separated by "\n\n"; the export loop below splits each text on
# "\n" and writes one line per worksheet cell.
# The event names mentioned here are kept in line with the SQL filters
# (SEARCH_TRIGGERED / SEARCH_RESULT_COUNT).
tab_explanations = {
    'Event-Overview': (
        "What: Distribution of all event types in the dataset.\n\n"
        "How: Counts every event by type (SEARCH_TRIGGERED, SEARCH_RESULT_COUNT, SEARCH_RESULT_CLICK_*, etc.).\n\n"
        "Example: If there were 500 searches and 200 clicks, you'd see SEARCH_TRIGGERED=500 and various click types totaling 200."
    ),
    'Search-Funnel': (
        'What: High-level conversion funnel from search to click.\n\n'
        'How: Counts UNIQUE SESSIONS at each stage (not individual events). A session with 5 searches still counts as 1.\n\n'
        'Example: 100 sessions started a search, 90 got results, 40 clicked something -> 40% click-through rate.'
    ),
    'Null-Results': (
        'What: Search terms that frequently return zero results.\n\n'
        'How: Counts SEARCH_RESULT_COUNT events (every time results are shown). Same term searched 10 times = Count of 10.\n\n'
        'Filter: Only SEARCH_RESULT_COUNT events, minimum 3 occurrences.\n\n'
        'Example: "xyz123" searched 50 times, 45 returned no results -> Null_Rate = 90%.'
    ),
    'Success-Rate': (
        'What: Which search terms lead to clicks, measured per unique session.\n\n'
        'How: Counts UNIQUE session+term pairs. If same user searches "budget" 3 times in one session, it counts as 1.\n\n'
        'Filter: Events SEARCH_TRIGGERED or SEARCH_RESULT_COUNT. Minimum 5 occurrences.\n\n'
        'Example: "budget report" appears in 20 unique sessions, 15 had a click -> Success_Rate = 75%.\n\n'
        'Note: Count here differs from Null-Results because this counts unique sessions, not every event.'
    ),
    'Abandoned-Searches': (
        'What: Searches that showed results but user did not click anything.\n\n'
        'How: Counts SEARCH_RESULT_COUNT events where results > 0 (excludes null results).\n\n'
        'Filter: Only non-null SEARCH_RESULT_COUNT events. Minimum 5 occurrences.\n\n'
        'Example: "quarterly report" had 100 result events, 60 had no click -> Abandon_Rate = 60%.\n\n'
        'Note: Count = Null-Results Count minus Null_Result_Count (events with results only).'
    ),
    'Reformulations': (
        'What: How often users refine their search within a session.\n\n'
        'How: Groups sessions by how many DIFFERENT search terms were used.\n\n'
        'Filter: Only sessions with 2+ unique search terms.\n\n'
        'Example: User searches "budget" then "budget 2024" then "Q4 budget" = 3 different terms in one session.'
    ),
    'Result-Categories': (
        'What: Distribution of result category types shown to users.\n\n'
        'How: Counts result category events from search results.'
    ),
    'Click-Distribution': (
        'What: Breakdown of what users click on.\n\n'
        'How: Counts ALL click events, grouped by category (General, All, News, GoTo, People) and specific type.\n\n'
        'Example: 500 total clicks: 300 General, 100 All, 50 News, 30 GoTo, 20 People.'
    ),
    'Top-Search-Terms': (
        'What: Most frequently searched terms with length statistics.\n\n'
        'How: Counts search events (SEARCH_TRIGGERED, SEARCH_RESULT_COUNT) per term.\n\n'
        'Filter: Non-empty search terms only.\n\n'
        'Example: "budget" has Count=150 (total events), Avg_Length=6 chars, Avg_Words=1.'
    ),
    'Time-Search-to-Result': (
        'What: How fast results appear after user initiates search.\n\n'
        'How: Measures time between SEARCH_TRIGGERED and SEARCH_RESULT_COUNT events.\n\n'
        'Filter: Only SEARCH_RESULT_COUNT events that followed a search trigger.\n\n'
        'Example: 60% of searches show results in < 0.5 seconds, 25% take 0.5-1 second.'
    ),
    'Time-Result-to-Click': (
        'What: How long users take to decide and click after seeing results.\n\n'
        'How: Measures time between SEARCH_RESULT_COUNT and the click event.\n\n'
        'Filter: Only click events that followed a SEARCH_RESULT_COUNT.\n\n'
        'Example: 30% click within 2 seconds (quick decision), 40% take 2-10 seconds (scanning results).'
    ),
    'Event-Transitions': (
        'What: Common event sequences and timing between consecutive events.\n\n'
        'How: Pairs each event with its previous event, shows frequency and timing statistics.\n\n'
        'Filter: Minimum 10 occurrences per transition.\n\n'
        'Example: "SEARCH_RESULT_COUNT -> SEARCH_RESULT_CLICK_GENERAL" occurs 200 times, avg 3.5 seconds apart.'
    ),
    'Journey-Outcomes': (
        'What: How search sessions ultimately end.\n\n'
        'How: Classifies each SESSION (not event) based on what happened:\n'
        '- Success: Had at least one click\n'
        '- No Results: Had null results, no clicks\n'
        '- Abandoned: Got results but never clicked\n'
        '- Unknown: Other patterns\n\n'
        'Example: 1000 sessions: 400 Success, 100 No Results, 450 Abandoned, 50 Unknown.'
    ),
    'Journey-Complexity': (
        'What: How complex search sessions are by total event count.\n\n'
        'How: Counts ALL events per session, groups into complexity buckets.\n\n'
        'Categories: Single Event (1), Simple (2-3), Medium (4-10), Complex (>10).\n\n'
        'Example: Session with search -> results -> click -> click = 4 events = "Medium" complexity.'
    ),
    'Journey-Timing-Summary': (
        'What: Summary timing statistics across all sessions.\n\n'
        'How: Calculates average and median times for key intervals.\n\n'
        'Metrics: Search-to-Result, Result-to-Click, Total Session Duration.\n\n'
        'Example: Median search-to-result = 0.4s, Median result-to-click = 4.2s.'
    ),
    'Sessions-Detail': (
        'What: Detailed breakdown of individual sessions (up to 1000 most recent).\n\n'
        'How: One row per session with counts, timing, and classifications.\n\n'
        'Includes: Event counts, unique queries, null results, duration, timing buckets, outcome.\n\n'
        'Example: Session on Jan 5: 6 events, 2 searches, 1 click, 12 second duration, "Success" outcome.'
    ),
    'Hourly-Distribution': (
        'What: When searches happen throughout the day.\n\n'
        'How: Groups ALL events by hour of day (0-23).\n\n'
        'Example: Hour 9 (9-10 AM) has highest activity with 15% of all events.'
    ),
    'Weekday-Distribution': (
        'What: Search activity by day of week.\n\n'
        'How: Groups ALL events by weekday.\n\n'
        'Example: Monday has 20% of events, Saturday only 5%.'
    ),
    'Search-Term-Length': (
        'What: How complex/specific are users search queries.\n\n'
        'How: Groups search events by WORD COUNT of the search term.\n\n'
        'Filter: Events SEARCH_TRIGGERED, SEARCH_RESULT_COUNT with non-empty terms.\n\n'
        'Example: 45% are 1-word searches ("budget"), 30% are 2-word ("budget report").'
    ),
    'First-Search-Analysis': (
        'What: Comparing first search of the day vs. subsequent searches.\n\n'
        'How: Uses pre-calculated is_first_search_of_day flag.\n\n'
        'Example: First searches of day = 500 (25%), Subsequent = 1500 (75%).'
    ),
    'Click-Category-Analysis': (
        'What: Performance comparison across click categories.\n\n'
        'How: Groups clicks by category, shows timing to click.\n\n'
        'Filter: Clicks that followed a SEARCH_RESULT_COUNT event.\n\n'
        'Example: "General" clicks avg 3.2s to click, "News" clicks avg 5.1s to click.'
    ),
    'Journey-Patterns': (
        'What: Most common exact event sequences.\n\n'
        'How: Concatenates ALL events per session in order, groups by full pattern.\n\n'
        'Example: "SEARCH_TRIGGERED -> SEARCH_RESULT_COUNT -> SEARCH_RESULT_CLICK_GENERAL" = 150 sessions.'
    ),
    'Journey-Types': (
        'What: Session classification by search/result/click counts.\n\n'
        'How: Categorizes each session by its funnel metrics.\n\n'
        'Example: "1 Search -> 1 Result -> 1 Click" = 200 sessions (simple successful searches).'
    ),
    'Daily-Metrics': (
        'What: Key performance metrics aggregated by day for trend analysis.\n\n'
        'Metrics per day:\n'
        '- Unique_Sessions: Distinct search sessions\n'
        '- Unique_Users: Distinct users (if user_Id available)\n'
        '- Searches: Search trigger events\n'
        '- Clicks: Total click events\n'
        '- Null_Results: Searches returning 0 results\n'
        '- Abandoned_Searches: Results shown but no click\n\n'
        'Rates:\n'
        '- Click_Rate_Pct: Clicks / Searches\n'
        '- Null_Rate_Pct: Null results / Results shown\n'
        '- Abandon_Rate_Pct: (Results - Clicks) / Results\n\n'
        'Use: Create trend charts in Excel or Power BI.'
    ),
    'Raw-Data': (
        'What: Complete raw data from the searches table.\n\n'
        'How: Exports ALL columns and ALL rows from the DuckDB searches table.\n\n'
        'Use: For custom analysis in Excel, pivot tables, or importing into other tools.\n\n'
        'Note: This may be a large dataset depending on your data volume.'
    ),
}

# pandas supplies the NA check and Font styles the explanation heading; both
# are imported once here instead of on every loop iteration.
import pandas as pd
from openpyxl.styles import Font

# Build one worksheet per entry in journey_queries: run the SQL, write the
# result starting at A1, format it as an Excel table and put the tab's
# explanation to the right of it.
for sheet_name, sql in journey_queries.items():
    # Execute query
    df = query(sql)

    # Convert pd.NA to None for Excel compatibility
    for col in df.columns:
        df[col] = df[col].apply(lambda x: None if pd.isna(x) else x)

    # Create sheet (max 31 characters for sheet name)
    ws = wb.create_sheet(title=sheet_name[:31])

    # Add explanation to the right of the table
    if sheet_name in tab_explanations:
        # Leave two empty columns after the last data column. The offset is
        # derived from the column count even when the result has no rows:
        # the header row is still written below, and a fixed column would be
        # overwritten by it.
        exp_col = len(df.columns) + 3
        heading = ws.cell(row=1, column=exp_col, value="About this tab:")
        heading.font = Font(bold=True)
        # One explanation line per row, starting directly under the heading
        for idx, line in enumerate(tab_explanations[sheet_name].split("\n"), start=2):
            ws.cell(row=idx, column=exp_col, value=line)
        ws.column_dimensions[get_column_letter(exp_col)].width = 70

    # Write DataFrame to sheet (header in row 1, data from row 2)
    for r_idx, row in enumerate(dataframe_to_rows(df, index=False, header=True), 1):
        for c_idx, value in enumerate(row, 1):
            ws.cell(row=r_idx, column=c_idx, value=value)

    # Auto-adjust column width to the longest header/value, capped at 50
    for col_idx, column in enumerate(df.columns, 1):
        max_length = max(
            len(str(column)),
            df[column].astype(str).str.len().max() if len(df) > 0 else 0
        )
        ws.column_dimensions[get_column_letter(col_idx)].width = min(max_length + 2, 50)

    # Format as table (only when there is at least one data row)
    if len(df) > 0:
        table_ref = f"A1:{get_column_letter(len(df.columns))}{len(df) + 1}"
        # Table names are derived from the sheet name with "-" and " " replaced
        table = Table(displayName=sheet_name.replace("-", "_").replace(" ", "_"), ref=table_ref)
        table.tableStyleInfo = table_style
        ws.add_table(table)

# Save
wb.save(output_file)

print(f"Excel exported: {output_file}")
print("\nIncluded tabs:")
for name in journey_queries:
    print(f"  • {name}")

---
## Visualizations

The cells below draw charts only if Matplotlib is installed (`conda install matplotlib`).

In [None]:
if not PLOTTING_AVAILABLE:
    # Matplotlib could not be imported: show the install hint instead of plotting
    print("Matplotlib not installed. Run: conda install matplotlib")
else:
    # Number of rows in the searches table per calendar day, oldest day first
    daily = query("""
        SELECT
            DATE_TRUNC('day', timestamp)::DATE as date,
            COUNT(*) as count
        FROM searches
        GROUP BY 1
        ORDER BY 1
    """)

    # Line chart of the daily counts with a translucent fill underneath
    fig, ax = plt.subplots(figsize=(14, 5))
    ax.plot(daily['date'], daily['count'], color='steelblue', linewidth=2)
    ax.fill_between(daily['date'], daily['count'], color='steelblue', alpha=0.3)

    # Title and axis labels
    ax.set_title('Entries per Day', fontsize=14, fontweight='bold')
    ax.set(xlabel='Date', ylabel='Count')

    # Tilt the date labels, then lay out and render the figure
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

In [None]:
if PLOTTING_AVAILABLE:
    # Distribution by hour (using pre-calculated event_hour)
    hourly = query("""
        SELECT
            event_hour as hour,
            COUNT(*) as count
        FROM searches
        GROUP BY 1
        ORDER BY 1
    """)
    
    # Bar chart with one tick per hour of the day (0-23)
    fig, ax = plt.subplots(figsize=(12, 5))
    ax.bar(hourly['hour'], hourly['count'], color='steelblue')
    ax.set_title('Distribution by Time of Day', fontsize=14, fontweight='bold')
    ax.set_xlabel('Hour')
    ax.set_ylabel('Count')
    ax.set_xticks(range(0, 24))
    plt.tight_layout()
    plt.show()
else:
    # Same hint as the "Entries per Day" cell, so this cell does not silently
    # produce nothing when Matplotlib is missing
    print("Matplotlib not installed. Run: conda install matplotlib")

In [None]:
if PLOTTING_AVAILABLE:
    # Top 10 search terms (using normalized search term)
    top = query("""
        SELECT search_term_normalized as search_term, COUNT(*) as count
        FROM searches
        WHERE search_term_normalized IS NOT NULL 
          AND search_term_normalized != ''
        GROUP BY 1
        ORDER BY 2 DESC
        LIMIT 10
    """)
    
    # Horizontal bars, one per term; the y-axis is inverted so the most
    # frequent term (first row of the result) ends up at the top
    fig, ax = plt.subplots(figsize=(10, 6))
    y_pos = range(len(top))
    ax.barh(y_pos, top['count'], color='steelblue')
    ax.set_yticks(y_pos)
    ax.set_yticklabels(top['search_term'])
    ax.invert_yaxis()
    ax.set_title('Top 10 Search Terms', fontsize=14, fontweight='bold')
    ax.set_xlabel('Count')
    plt.tight_layout()
    plt.show()
else:
    # Same hint as the "Entries per Day" cell, so this cell does not silently
    # produce nothing when Matplotlib is missing
    print("Matplotlib not installed. Run: conda install matplotlib")

---
## Custom Queries

Write your own SQL queries here:

In [None]:
# Example query: preview the first 10 rows of the searches table.
# Replace the SQL below with your own; query() returns the result as a
# pandas DataFrame, which the notebook displays as the cell output.
query("""
    SELECT *
    FROM searches
    LIMIT 10
""")

---
## Export for Power BI

Parquet files are ideal for Power BI: smaller, faster, data types preserved.

In [None]:
# Raw data export (all data)
# Writes the complete `searches` table to one parquet file, then reads the
# file back to confirm it is usable before reporting success.
from datetime import datetime
from pathlib import Path
import os

# Fixed filename for Power BI folder refresh
output_file = '../output/searches_raw.parquet'
output_path = Path(output_file)

# Make sure the target folder exists so the export does not fail on a fresh
# checkout where ../output has not been created yet
output_path.parent.mkdir(parents=True, exist_ok=True)

# Delete old file if exists (avoids corrupt files)
if output_path.exists():
    output_path.unlink()

# Export
execute(f"COPY searches TO '{output_file}' (FORMAT PARQUET)")

# Verify file is valid
try:
    test_read = query(f"SELECT COUNT(*) as n FROM read_parquet('{output_file}')")
    row_count = test_read['n'][0]
    size_mb = os.path.getsize(output_file) / (1024 * 1024)
    
    print(f"Exported: {output_file}")
    print(f"Rows:     {row_count:,}")
    print(f"Size:     {size_mb:.1f} MB")
    print(f"Status:   Parquet file validated")
    print(f"\nIn Power BI: Get Data → Folder → {Path(output_file).parent}")
except Exception as e:
    print(f"ERROR: Parquet file is invalid: {e}")

In [None]:
# Aggregated daily data (for trend dashboards)
# One row per session_date with event counts, rate metrics and click-category
# breakdowns, written to a fixed parquet file and then read back as a check.
output_file = '../output/searches_daily.parquet'
output_path = Path(output_file)

# Make sure the target folder exists so the export does not fail on a fresh
# checkout where ../output has not been created yet
output_path.parent.mkdir(parents=True, exist_ok=True)

# Delete old file if exists
if output_path.exists():
    output_path.unlink()

# Note: abandonment_rate_pct is computed at event level per day
# (non-null result events minus click events, divided by non-null result
# events). There is no per-session subquery, despite the inline SQL comment.
execute(f"""
    COPY (
        SELECT 
            session_date as date,
            COUNT(*) as total_events,
            COUNT(DISTINCT session_key) as unique_sessions,
            COUNT(DISTINCT user_Id) as unique_users,
            COUNT(DISTINCT search_term_normalized) as unique_search_terms,
            COUNT(CASE WHEN name = 'SEARCH_TRIGGERED' THEN 1 END) as search_starts,
            COUNT(CASE WHEN name = 'SEARCH_RESULT_COUNT' THEN 1 END) as result_events,
            COUNT(CASE WHEN click_category IS NOT NULL THEN 1 END) as click_events,
            SUM(CASE WHEN is_null_result = true THEN 1 ELSE 0 END) as null_results,
            SUM(CASE WHEN is_clickable_result = true THEN 1 ELSE 0 END) as clickable_results,
            -- Rate metrics
            ROUND(100.0 * COUNT(CASE WHEN click_category IS NOT NULL THEN 1 END) 
                / NULLIF(COUNT(CASE WHEN name = 'SEARCH_TRIGGERED' THEN 1 END), 0), 2) as click_through_rate_pct,
            ROUND(100.0 * SUM(CASE WHEN is_null_result = true THEN 1 ELSE 0 END) 
                / NULLIF(COUNT(CASE WHEN name = 'SEARCH_RESULT_COUNT' THEN 1 END), 0), 2) as null_rate_pct,
            -- Abandonment: results shown but no click in session (calculated via subquery)
            ROUND(100.0 * (COUNT(CASE WHEN name = 'SEARCH_RESULT_COUNT' AND is_null_result = false THEN 1 END) - COUNT(CASE WHEN click_category IS NOT NULL THEN 1 END))
                / NULLIF(COUNT(CASE WHEN name = 'SEARCH_RESULT_COUNT' AND is_null_result = false THEN 1 END), 0), 2) as abandonment_rate_pct,
            -- Session metrics
            ROUND(1.0 * COUNT(CASE WHEN name = 'SEARCH_TRIGGERED' THEN 1 END) 
                / NULLIF(COUNT(DISTINCT session_key), 0), 2) as avg_searches_per_session,
            -- Search term metrics (includes SUM columns for weighted DAX calculations)
            ROUND(AVG(search_term_length), 1) as avg_search_term_length,
            ROUND(AVG(search_term_word_count), 1) as avg_search_term_words,
            SUM(search_term_length) as sum_search_term_length,
            SUM(search_term_word_count) as sum_search_term_words,
            COUNT(CASE WHEN search_term_length IS NOT NULL THEN 1 END) as search_term_count,
            COUNT(CASE WHEN is_first_search_of_day = true THEN 1 END) as first_searches_of_day,
            -- Click category breakdown
            COUNT(CASE WHEN click_category = 'General' THEN 1 END) as clicks_general,
            COUNT(CASE WHEN click_category = 'All' THEN 1 END) as clicks_all,
            COUNT(CASE WHEN click_category = 'News' THEN 1 END) as clicks_news,
            COUNT(CASE WHEN click_category = 'GoTo' THEN 1 END) as clicks_goto,
            COUNT(CASE WHEN click_category = 'People' THEN 1 END) as clicks_people
        FROM searches
        GROUP BY 1
        ORDER BY 1
    ) TO '{output_file}' (FORMAT PARQUET)
""")

# Verify
try:
    test_read = query(f"SELECT COUNT(*) as n FROM read_parquet('{output_file}')")
    days = test_read['n'][0]
    # Use the Path object so this cell does not rely on `import os` from the
    # raw-export cell having been run first
    size_mb = output_path.stat().st_size / (1024 * 1024)
    
    print(f"Exported: {output_file}")
    print(f"Days:     {days}")
    print(f"Size:     {size_mb:.2f} MB")
    print(f"Status:   Parquet file validated")
except Exception as e:
    print(f"ERROR: Parquet file is invalid: {e}")

In [None]:
# Session-level journey data (anonymized, for journey pattern analysis)
# One row per session with journey metrics - no user identifiers
output_file = '../output/searches_journeys.parquet'
output_path = Path(output_file)

# Make sure the target folder exists so the export does not fail on a fresh
# checkout where ../output has not been created yet
output_path.parent.mkdir(parents=True, exist_ok=True)

# Delete old file if exists
if output_path.exists():
    output_path.unlink()

# The CTE names the per-session search count `search_count_in_session`; the
# final SELECT must reference that name (aliased to `search_count` for the
# export), otherwise the query fails because `search_count` does not exist
# in session_events.
execute(f"""
    COPY (
        WITH session_events AS (
            SELECT 
                session_key,
                session_date,
                MIN(timestamp) as session_start,
                MAX(timestamp) as session_end,
                DATEDIFF('second', MIN(timestamp), MAX(timestamp)) as duration_seconds,
                
                -- Event counts
                COUNT(*) as total_events,
                COUNT(CASE WHEN name = 'SEARCH_TRIGGERED' THEN 1 END) as search_count_in_session,
                COUNT(CASE WHEN name = 'SEARCH_RESULT_COUNT' THEN 1 END) as result_count,
                COUNT(CASE WHEN click_category IS NOT NULL THEN 1 END) as click_count,
                
                -- Unique queries in session (using normalized term)
                COUNT(DISTINCT search_term_normalized) as unique_search_terms,
                
                -- Result metrics
                MAX(CASE WHEN name = 'SEARCH_RESULT_COUNT' THEN CAST(CP_totalResultCount AS INTEGER) END) as max_total_results,
                SUM(CASE WHEN is_null_result = true THEN 1 ELSE 0 END) as null_result_searches,
                
                -- Click types (using click_category)
                COUNT(CASE WHEN click_category = 'General' THEN 1 END) as general_clicks,
                COUNT(CASE WHEN click_category = 'All' THEN 1 END) as all_tab_clicks,
                COUNT(CASE WHEN click_category = 'News' THEN 1 END) as news_clicks,
                COUNT(CASE WHEN click_category = 'GoTo' THEN 1 END) as goto_clicks,
                COUNT(CASE WHEN click_category = 'People' THEN 1 END) as people_clicks,
                
                
                -- Time metrics
                MIN(event_hour) as first_event_hour,
                MAX(event_hour) as last_event_hour,
                
                -- First search flag
                MAX(CASE WHEN is_first_search_of_day = true THEN 1 ELSE 0 END) as includes_first_search_of_day
                
            FROM searches
            GROUP BY session_key, session_date
        )
        SELECT 
            -- No user_id or session_id - anonymized
            session_date,
            session_start,
            duration_seconds,
            total_events,
            search_count_in_session AS search_count,
            result_count,
            click_count,
            unique_search_terms,
            max_total_results,
            null_result_searches,
            general_clicks,
            all_tab_clicks,
            news_clicks,
            goto_clicks,
            people_clicks,
            first_event_hour,
            last_event_hour,
            CASE WHEN includes_first_search_of_day = 1 THEN true ELSE false END as includes_first_search_of_day,
            
            -- Journey classification
            CASE 
                WHEN click_count > 0 THEN 'Success'
                WHEN null_result_searches > 0 AND click_count = 0 THEN 'No Results'
                WHEN result_count > 0 AND click_count = 0 THEN 'Abandoned'
                ELSE 'Unknown'
            END as journey_outcome,
            
            -- Reformulation flag
            CASE WHEN unique_search_terms > 1 THEN true ELSE false END as had_reformulation,
            
            -- Session complexity
            CASE 
                WHEN total_events = 1 THEN 'Single Event'
                WHEN total_events <= 3 THEN 'Simple'
                WHEN total_events <= 10 THEN 'Medium'
                ELSE 'Complex'
            END as session_complexity
            
        FROM session_events
        ORDER BY session_date, session_start
    ) TO '{output_file}' (FORMAT PARQUET)
""")

# Verify
try:
    test_read = query(f"SELECT COUNT(*) as n FROM read_parquet('{output_file}')")
    sessions = test_read['n'][0]
    # Use the Path object so this cell does not rely on `import os` from the
    # raw-export cell having been run first
    size_mb = output_path.stat().st_size / (1024 * 1024)
    
    # Show journey outcome distribution
    outcomes = query(f"""
        SELECT journey_outcome, COUNT(*) as count, 
               ROUND(100.0 * COUNT(*) / SUM(COUNT(*)) OVER(), 1) as pct
        FROM read_parquet('{output_file}')
        GROUP BY 1
        ORDER BY 2 DESC
    """)
    
    print(f"Exported: {output_file}")
    print(f"Sessions: {sessions:,}")
    print(f"Size:     {size_mb:.2f} MB")
    print(f"Status:   Parquet file validated")
    print(f"\nJourney Outcomes:")
    for _, row in outcomes.iterrows():
        print(f"  {row['journey_outcome']:12} {row['count']:>6,} ({row['pct']}%)")
except Exception as e:
    print(f"ERROR: Parquet file is invalid: {e}")

In [None]:
# Session journey with time intervals between events
# Calculates time spans between event types within each session
output_file = '../output/searches_journeys_timed.parquet'
output_path = Path(output_file)

# Make sure the target folder exists so the export does not fail on a fresh
# checkout where ../output has not been created yet
output_path.parent.mkdir(parents=True, exist_ok=True)

# Delete old file if exists
if output_path.exists():
    output_path.unlink()

# The CTE names the per-session search count `search_count_in_session`; the
# final SELECT must reference that name (aliased to `search_count` for the
# export), otherwise the query fails because `search_count` does not exist
# in session_timings.
# Note: ms_search_to_result / ms_result_to_click are the MIN over all matching
# events in the session, i.e. the shortest such gap.
execute(f"""
    COPY (
        WITH session_timings AS (
            -- Aggregate timing metrics per session (using pre-calculated columns)
            SELECT 
                session_key,
                session_date,
                MIN(timestamp) as session_start,
                MAX(timestamp) as session_end,
                COUNT(*) as total_events,
                
                -- Time to first result (from first search to first result)
                MIN(CASE 
                    WHEN name = 'SEARCH_RESULT_COUNT' AND prev_event = 'SEARCH_TRIGGERED'
                    THEN ms_since_prev_event 
                END) as ms_search_to_result,
                
                -- Time to first click (from first result to first click)
                MIN(CASE 
                    WHEN click_category IS NOT NULL AND prev_event = 'SEARCH_RESULT_COUNT'
                    THEN ms_since_prev_event 
                END) as ms_result_to_click,
                
                
                -- Total session duration
                DATEDIFF('millisecond', MIN(timestamp), MAX(timestamp)) as total_duration_ms,
                
                -- Event counts
                COUNT(CASE WHEN name = 'SEARCH_TRIGGERED' THEN 1 END) as search_count_in_session,
                COUNT(CASE WHEN name = 'SEARCH_RESULT_COUNT' THEN 1 END) as result_count,
                COUNT(CASE WHEN click_category IS NOT NULL THEN 1 END) as click_count,
                COUNT(DISTINCT search_term_normalized) as unique_search_terms,
                SUM(CASE WHEN is_null_result = true THEN 1 ELSE 0 END) as null_result_count,
                
                
                -- Time of day
                MIN(event_hour) as first_event_hour,
                MAX(event_hour) as last_event_hour,
                
                -- Click category breakdown
                COUNT(CASE WHEN click_category = 'General' THEN 1 END) as general_clicks,
                COUNT(CASE WHEN click_category = 'All' THEN 1 END) as all_tab_clicks,
                COUNT(CASE WHEN click_category = 'News' THEN 1 END) as news_clicks,
                COUNT(CASE WHEN click_category = 'GoTo' THEN 1 END) as goto_clicks,
                COUNT(CASE WHEN click_category = 'People' THEN 1 END) as people_clicks,
                
                -- First search flag
                MAX(CASE WHEN is_first_search_of_day = true THEN 1 ELSE 0 END) as includes_first_search_of_day
                
            FROM searches
            GROUP BY session_key, session_date
        )
        SELECT 
            session_date,
            session_start,
            total_events,
            search_count_in_session AS search_count,
            result_count,
            click_count,
            unique_search_terms,
            null_result_count,
            
            -- Time metrics in seconds
            ROUND(ms_search_to_result / 1000.0, 2) as sec_search_to_result,
            ROUND(ms_result_to_click / 1000.0, 2) as sec_result_to_click,
            ROUND(total_duration_ms / 1000.0, 2) as total_duration_sec,
            
            
            -- Time of day
            first_event_hour,
            last_event_hour,
            
            -- Click category breakdown
            general_clicks,
            all_tab_clicks,
            news_clicks,
            goto_clicks,
            people_clicks,
            
            -- First search indicator
            CASE WHEN includes_first_search_of_day = 1 THEN true ELSE false END as includes_first_search_of_day,
            
            -- Time interval buckets for search-to-result
            CASE 
                WHEN ms_search_to_result IS NULL THEN 'No Result'
                WHEN ms_search_to_result < 500 THEN '< 0.5s'
                WHEN ms_search_to_result < 1000 THEN '0.5-1s'
                WHEN ms_search_to_result < 2000 THEN '1-2s'
                WHEN ms_search_to_result < 5000 THEN '2-5s'
                ELSE '> 5s'
            END as search_to_result_bucket,
            
            -- Time interval buckets for result-to-click
            CASE 
                WHEN ms_result_to_click IS NULL THEN 'No Click'
                WHEN ms_result_to_click < 2000 THEN '< 2s (quick)'
                WHEN ms_result_to_click < 5000 THEN '2-5s'
                WHEN ms_result_to_click < 10000 THEN '5-10s'
                WHEN ms_result_to_click < 30000 THEN '10-30s'
                WHEN ms_result_to_click < 60000 THEN '30-60s'
                ELSE '> 60s (browsing)'
            END as result_to_click_bucket,
            
            -- Session duration buckets
            CASE 
                WHEN total_duration_ms < 5000 THEN '< 5s (quick)'
                WHEN total_duration_ms < 30000 THEN '5-30s'
                WHEN total_duration_ms < 60000 THEN '30-60s'
                WHEN total_duration_ms < 180000 THEN '1-3 min'
                WHEN total_duration_ms < 300000 THEN '3-5 min'
                ELSE '> 5 min (extended)'
            END as session_duration_bucket,
            
            -- Journey outcome
            CASE 
                WHEN click_count > 0 THEN 'Success'
                WHEN null_result_count > 0 AND click_count = 0 THEN 'No Results'
                WHEN result_count > 0 AND click_count = 0 THEN 'Abandoned'
                ELSE 'Unknown'
            END as journey_outcome,
            
            CASE WHEN unique_search_terms > 1 THEN true ELSE false END as had_reformulation
            
        FROM session_timings
        ORDER BY session_date, session_start
    ) TO '{output_file}' (FORMAT PARQUET)
""")

# Verify and show distribution
try:
    test_read = query(f"SELECT COUNT(*) as n FROM read_parquet('{output_file}')")
    sessions = test_read['n'][0]
    # Use the Path object so this cell does not rely on `import os` from the
    # raw-export cell having been run first
    size_mb = output_path.stat().st_size / (1024 * 1024)
    
    print(f"Exported: {output_file}")
    print(f"Sessions: {sessions:,}")
    print(f"Size:     {size_mb:.2f} MB")
    
    # Show timing distributions
    print(f"\n--- Search to Result Time ---")
    dist1 = query(f"""
        SELECT search_to_result_bucket as bucket, COUNT(*) as count,
               ROUND(100.0 * COUNT(*) / SUM(COUNT(*)) OVER(), 1) as pct
        FROM read_parquet('{output_file}')
        GROUP BY 1 ORDER BY 2 DESC
    """)
    for _, row in dist1.iterrows():
        print(f"  {row['bucket']:15} {row['count']:>6,} ({row['pct']}%)")
    
    print(f"\n--- Result to Click Time ---")
    dist2 = query(f"""
        SELECT result_to_click_bucket as bucket, COUNT(*) as count,
               ROUND(100.0 * COUNT(*) / SUM(COUNT(*)) OVER(), 1) as pct
        FROM read_parquet('{output_file}')
        GROUP BY 1 ORDER BY 2 DESC
    """)
    for _, row in dist2.iterrows():
        print(f"  {row['bucket']:20} {row['count']:>6,} ({row['pct']}%)")
        
    print(f"\n--- Session Duration ---")
    dist3 = query(f"""
        SELECT session_duration_bucket as bucket, COUNT(*) as count,
               ROUND(100.0 * COUNT(*) / SUM(COUNT(*)) OVER(), 1) as pct
        FROM read_parquet('{output_file}')
        GROUP BY 1 ORDER BY 2 DESC
    """)
    for _, row in dist3.iterrows():
        print(f"  {row['bucket']:20} {row['count']:>6,} ({row['pct']}%)")
        
except Exception as e:
    print(f"ERROR: {e}")

---
## Add New Data

If you want to import additional CSV files later:

In [None]:
# Append new CSV to existing table
# NOTE: the connection opened in the first cell of this notebook uses
# read_only=True, so the INSERT below cannot run on that connection as-is.
# The notebook's documented way to (re)build the database is
# `python process_search_analytics.py`.
# NOTE(review): `searches` appears to hold pre-calculated columns (e.g.
# event_hour, search_term_normalized); a raw CSV loaded with SELECT * may not
# match that schema - verify before using this template.
# NEW_CSV = '../data/new_data.csv'

# execute(f"""
#     INSERT INTO searches
#     SELECT * FROM read_csv('{NEW_CSV}', auto_detect=true)
# """)

# print(f"New data added. Total: {query('SELECT COUNT(*) FROM searches')['count_star()'][0]:,} rows")

---
## Cleanup

In [None]:
# Close connection (at end of session)
# `query` runs its SQL through `con`, so after this cell it will fail until the
# connect cell at the top of the notebook is run again.
con.close()
print("Connection closed")

---
## Notes

**My columns:**
- ...

**Findings:**
- ...

**Open questions:**
- ...