# Intranet Analytics Analysis

This notebook analyzes the Intranet Analytics data including:
- **UV (Unique Visitors)** - Calculated via `COUNT(DISTINCT viewingcontactid)` (the SQL equivalent of DAX `DISTINCTCOUNT`)
- **Likes** - Counted when `marketingpageidliked` has a value (contains the PageID that was liked)
- **Engagement** - Likes + Comments (`comments`)
- **Views, Visits, Duration** - Standard traffic metrics

Data source (star schema):
- `fact` - Page views/visits data (last 90 days)
- `page_inventory` - Page metadata (join via `marketingpageid`)
- `dim_date` - Date dimension (join via `visitdatekey = datekey`)

In [None]:
import duckdb
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

# Locations. NOTE(review): taking the parent of the working directory assumes
# the notebook is launched from a sub-folder of the project - confirm.
PROJECT_ROOT = Path.cwd().parent
DB_PATH = PROJECT_ROOT.joinpath("output", "db", "analytics.duckdb")

# Shared matplotlib defaults for every chart in the notebook
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams.update({
    'figure.figsize': (12, 6),
    'font.size': 10,
})

# Open the DuckDB file read-only; all cells below query through `con`
con = duckdb.connect(str(DB_PATH), read_only=True)
print(f"Connected to: {DB_PATH}")

---
## Filter Configuration

Set these filters to narrow down your analysis. Leave as `None` or empty string to include all data.

In [None]:
# ============================================================
# FILTER CONFIGURATION - Modify these values as needed
# ============================================================

# Website filter: case-insensitive substring match on websitename
# (build_filter_clause renders it as LOWER(websitename) LIKE LOWER('%value%')).
# Examples: "intranet", "hr", "sales", None for all
FILTER_WEBSITE = None  # e.g., "intranet"

# Date range filter (format: YYYYMMDD as string). Both bounds are inclusive:
# they are applied as visitdatekey >= FROM and visitdatekey <= TO.
# Examples: "20251001", "20251231", None for all
FILTER_DATE_FROM = None  # e.g., "20251001"
FILTER_DATE_TO = None    # e.g., "20251031"

# Full page URL filter: case-insensitive substring match on fullpageurl.
# '%' and '_' inside the value behave as LIKE wildcards.
# Examples: "/news/", "/hr/policies", None for all
FILTER_PAGEURL = None  # e.g., "/news/"

# ============================================================

# Build WHERE clause dynamically
def build_filter_clause(use_f_alias=True, use_p_alias=True):
    """Build the SQL WHERE clause for the notebook's active filters.

    Reads the module-level settings FILTER_DATE_FROM, FILTER_DATE_TO,
    FILTER_WEBSITE and FILTER_PAGEURL. A setting that is None or an empty
    string contributes no condition.

    Args:
        use_f_alias: When True, qualify the fact column (visitdatekey)
            with the ``f.`` table alias.
        use_p_alias: When True, qualify the page_inventory columns
            (websitename, fullpageurl) with the ``p.`` table alias.

    Returns:
        A string of the form ``WHERE cond1 AND cond2 ...``, or an empty
        string when no filter is set.
    """
    def quote(value):
        # Values are spliced into SQL string literals, so an embedded single
        # quote (e.g. "o'brien") must be doubled; otherwise it terminates the
        # literal early and breaks - or rewrites - the query.
        return str(value).replace("'", "''")

    f = "f." if use_f_alias else ""
    p = "p." if use_p_alias else ""

    conditions = []
    if FILTER_DATE_FROM:
        conditions.append(f"{f}visitdatekey >= '{quote(FILTER_DATE_FROM)}'")
    if FILTER_DATE_TO:
        conditions.append(f"{f}visitdatekey <= '{quote(FILTER_DATE_TO)}'")
    if FILTER_WEBSITE:
        # LOWER() on both sides makes the substring match case-insensitive.
        conditions.append(f"LOWER({p}websitename) LIKE LOWER('%{quote(FILTER_WEBSITE)}%')")
    if FILTER_PAGEURL:
        conditions.append(f"LOWER({p}fullpageurl) LIKE LOWER('%{quote(FILTER_PAGEURL)}%')")

    if not conditions:
        return ""
    return "WHERE " + " AND ".join(conditions)

# Echo the current filter settings and the WHERE clause they translate to
print("\n".join([
    "=" * 50,
    "ACTIVE FILTERS",
    "=" * 50,
    f"Website contains:    {FILTER_WEBSITE or '(all)'}",
    f"Date from:           {FILTER_DATE_FROM or '(all)'}",
    f"Date to:             {FILTER_DATE_TO or '(all)'}",
    f"Page URL contains:   {FILTER_PAGEURL or '(all)'}",
    "=" * 50,
    f"\nGenerated WHERE clause: {build_filter_clause() or '(none)'}",
]))

---
## 1. Data Overview

In [None]:
# Headline totals for the filtered data set (single-row result)
filter_clause = build_filter_clause()

summary = con.execute(f"""
    SELECT
        COUNT(*) as total_rows,
        COUNT(DISTINCT f.visitdatekey) as days_covered,
        MIN(f.visitdatekey) as first_date,
        MAX(f.visitdatekey) as last_date,
        COUNT(DISTINCT f.marketingpageid) as unique_pages,
        COUNT(DISTINCT f.viewingcontactid) as total_unique_visitors,
        SUM(f.views) as total_views,
        SUM(f.visits) as total_visits,
        SUM(CASE WHEN f.marketingpageidliked IS NOT NULL AND f.marketingpageidliked != '' THEN 1 ELSE 0 END) as total_likes,
        SUM(f.comments) as total_comments,
        SUM(CASE WHEN f.marketingpageidliked IS NOT NULL AND f.marketingpageidliked != '' THEN 1 ELSE 0 END) + SUM(f.comments) as total_engagements
    FROM fact f
    LEFT JOIN page_inventory p ON f.marketingpageid = p.marketingpageid
    {filter_clause}
""").df()

# Build the whole report first, then emit it with a single print
print("\n".join([
    "=" * 50,
    "DATASET SUMMARY",
    "=" * 50,
    f"Total rows:           {summary['total_rows'].iloc[0]:,}",
    f"Date range:           {summary['first_date'].iloc[0]} to {summary['last_date'].iloc[0]}",
    f"Days covered:         {summary['days_covered'].iloc[0]:,}",
    f"Unique pages:         {summary['unique_pages'].iloc[0]:,}",
    f"Total Unique Visitors:{summary['total_unique_visitors'].iloc[0]:,}",
    f"Total views:          {summary['total_views'].iloc[0]:,}",
    f"Total visits:         {summary['total_visits'].iloc[0]:,}",
    f"Total likes:          {summary['total_likes'].iloc[0]:,}",
    f"Total comments:       {summary['total_comments'].iloc[0]:,}",
    f"Total engagements:    {summary['total_engagements'].iloc[0]:,}",
    "=" * 50,
]))

---
## 2. Unique Visitors (UV) Analysis

### 2.1 Daily UV Trend

In [None]:
# Daily unique visitors
# One row per calendar day: distinct viewingcontactid values plus view, visit,
# like and comment totals for that day.
filter_clause = build_filter_clause()

# page_inventory is joined although no p.* column is selected: the generated
# filter clause may reference p.websitename / p.fullpageurl.
daily_uv = con.execute(f"""
    SELECT
        d.date,
        d.day_name,
        d.is_weekend,
        COUNT(DISTINCT f.viewingcontactid) as unique_visitors,
        SUM(f.views) as total_views,
        SUM(f.visits) as total_visits,
        SUM(CASE WHEN f.marketingpageidliked IS NOT NULL AND f.marketingpageidliked != '' THEN 1 ELSE 0 END) as likes,
        SUM(f.comments) as comments,
        SUM(CASE WHEN f.marketingpageidliked IS NOT NULL AND f.marketingpageidliked != '' THEN 1 ELSE 0 END) + SUM(f.comments) as engagements
    FROM fact f
    JOIN dim_date d ON f.visitdatekey = d.datekey
    LEFT JOIN page_inventory p ON f.marketingpageid = p.marketingpageid
    {filter_clause}
    GROUP BY d.date, d.day_name, d.is_weekend
    ORDER BY d.date
""").df()

# Mean / min / max are taken over the per-day rows returned above
print(f"Daily UV statistics:")
print(f"  Average daily UV: {daily_uv['unique_visitors'].mean():,.0f}")
print(f"  Min daily UV:     {daily_uv['unique_visitors'].min():,}")
print(f"  Max daily UV:     {daily_uv['unique_visitors'].max():,}")
print()
# Show only the most recent 30 days to keep the output short
daily_uv.tail(30)

### 2.2 Weekly UV Trend

In [None]:
# Weekly unique visitors (UV calculated correctly across the week)
# The distinct count is taken over all rows of a year_week, so a visitor who
# is active on several days counts once; weekly UV is therefore not the sum
# of the daily UV figures.
filter_clause = build_filter_clause()

# page_inventory is joined only so the filter clause can reference p.* columns.
# days_in_week exposes partial weeks at the edges of the data.
weekly_uv = con.execute(f"""
    SELECT
        MIN(d.date) as week_start,
        d.year_week,
        COUNT(DISTINCT f.viewingcontactid) as unique_visitors,
        SUM(f.views) as total_views,
        SUM(f.visits) as total_visits,
        SUM(CASE WHEN f.marketingpageidliked IS NOT NULL AND f.marketingpageidliked != '' THEN 1 ELSE 0 END) as likes,
        SUM(f.comments) as comments,
        SUM(CASE WHEN f.marketingpageidliked IS NOT NULL AND f.marketingpageidliked != '' THEN 1 ELSE 0 END) + SUM(f.comments) as engagements,
        COUNT(DISTINCT d.date) as days_in_week
    FROM fact f
    JOIN dim_date d ON f.visitdatekey = d.datekey
    LEFT JOIN page_inventory p ON f.marketingpageid = p.marketingpageid
    {filter_clause}
    GROUP BY d.year_week
    ORDER BY week_start
""").df()

weekly_uv

### 2.3 Monthly UV Trend

In [None]:
# Monthly unique visitors (UV calculated correctly across the month)
# Distinct visitors are counted over the whole month, so the figure is not
# the sum of daily or weekly UV.
filter_clause = build_filter_clause()

# page_inventory is joined only so the filter clause can reference p.* columns.
# days_in_month shows how many distinct dates actually contributed, which
# exposes partially covered months.
monthly_uv = con.execute(f"""
    SELECT
        MIN(d.date) as month_start,
        d.year_month,
        d.month_name,
        COUNT(DISTINCT f.viewingcontactid) as unique_visitors,
        SUM(f.views) as total_views,
        SUM(f.visits) as total_visits,
        SUM(CASE WHEN f.marketingpageidliked IS NOT NULL AND f.marketingpageidliked != '' THEN 1 ELSE 0 END) as likes,
        SUM(f.comments) as comments,
        SUM(CASE WHEN f.marketingpageidliked IS NOT NULL AND f.marketingpageidliked != '' THEN 1 ELSE 0 END) + SUM(f.comments) as engagements,
        COUNT(DISTINCT d.date) as days_in_month
    FROM fact f
    JOIN dim_date d ON f.visitdatekey = d.datekey
    LEFT JOIN page_inventory p ON f.marketingpageid = p.marketingpageid
    {filter_clause}
    GROUP BY d.year_month, d.month_name
    ORDER BY month_start
""").df()

# Statistics are taken over the per-month rows (partial months included)
print(f"Monthly UV statistics:")
print(f"  Average monthly UV: {monthly_uv['unique_visitors'].mean():,.0f}")
print(f"  Min monthly UV:     {monthly_uv['unique_visitors'].min():,}")
print(f"  Max monthly UV:     {monthly_uv['unique_visitors'].max():,}")
print()
monthly_uv

### 2.4 Quarterly UV Trend

In [None]:
# Quarterly unique visitors (UV calculated correctly across the quarter)
# Distinct visitors are counted over the whole quarter; days_in_quarter shows
# how many distinct dates contributed to each row.
filter_clause = build_filter_clause()

# page_inventory is joined only so the filter clause can reference p.* columns.
quarterly_uv = con.execute(f"""
    SELECT
        MIN(d.date) as quarter_start,
        d.year_quarter,
        COUNT(DISTINCT f.viewingcontactid) as unique_visitors,
        SUM(f.views) as total_views,
        SUM(f.visits) as total_visits,
        SUM(CASE WHEN f.marketingpageidliked IS NOT NULL AND f.marketingpageidliked != '' THEN 1 ELSE 0 END) as likes,
        SUM(f.comments) as comments,
        SUM(CASE WHEN f.marketingpageidliked IS NOT NULL AND f.marketingpageidliked != '' THEN 1 ELSE 0 END) + SUM(f.comments) as engagements,
        COUNT(DISTINCT d.date) as days_in_quarter
    FROM fact f
    JOIN dim_date d ON f.visitdatekey = d.datekey
    LEFT JOIN page_inventory p ON f.marketingpageid = p.marketingpageid
    {filter_clause}
    GROUP BY d.year_quarter
    ORDER BY quarter_start
""").df()

# Unweighted mean over the quarter rows returned above
print(f"Quarterly UV statistics:")
print(f"  Average quarterly UV: {quarterly_uv['unique_visitors'].mean():,.0f}")
print()
quarterly_uv

### 2.5 UV by Website

In [None]:
# Unique visitors by website
# Fact rows whose page has no match in page_inventory (or whose websitename is
# NULL) are grouped under 'Unknown'. A visitor seen on several websites is
# counted once per website, so the rows do not add up to the overall UV.
filter_clause = build_filter_clause()

uv_by_website = con.execute(f"""
    SELECT
        COALESCE(p.websitename, 'Unknown') as website,
        COUNT(DISTINCT f.viewingcontactid) as unique_visitors,
        SUM(f.views) as total_views,
        SUM(f.visits) as total_visits,
        SUM(CASE WHEN f.marketingpageidliked IS NOT NULL AND f.marketingpageidliked != '' THEN 1 ELSE 0 END) as likes,
        SUM(f.comments) as comments,
        SUM(CASE WHEN f.marketingpageidliked IS NOT NULL AND f.marketingpageidliked != '' THEN 1 ELSE 0 END) + SUM(f.comments) as engagements,
        COUNT(DISTINCT f.marketingpageid) as pages_viewed
    FROM fact f
    LEFT JOIN page_inventory p ON f.marketingpageid = p.marketingpageid
    {filter_clause}
    GROUP BY COALESCE(p.websitename, 'Unknown')
    ORDER BY unique_visitors DESC
""").df()

uv_by_website

### 2.6 UV by Theme

In [None]:
# Unique visitors by theme
# Pages without a page_inventory match (or with a NULL theme) fall under
# 'Unknown'. Only the 20 themes with the most unique visitors are returned.
filter_clause = build_filter_clause()

uv_by_theme = con.execute(f"""
    SELECT
        COALESCE(p.theme, 'Unknown') as theme,
        COUNT(DISTINCT f.viewingcontactid) as unique_visitors,
        SUM(f.views) as total_views,
        SUM(CASE WHEN f.marketingpageidliked IS NOT NULL AND f.marketingpageidliked != '' THEN 1 ELSE 0 END) as likes,
        SUM(f.comments) as comments,
        SUM(CASE WHEN f.marketingpageidliked IS NOT NULL AND f.marketingpageidliked != '' THEN 1 ELSE 0 END) + SUM(f.comments) as engagements,
        COUNT(DISTINCT f.marketingpageid) as pages_in_theme
    FROM fact f
    LEFT JOIN page_inventory p ON f.marketingpageid = p.marketingpageid
    {filter_clause}
    GROUP BY COALESCE(p.theme, 'Unknown')
    ORDER BY unique_visitors DESC
    LIMIT 20
""").df()

uv_by_theme

---
## 3. Engagement Analysis (Likes & Comments)

Engagement = Likes + Comments

### 3.1 Daily Engagement Trend

In [None]:
# Daily engagement (likes + comments)
# engagement_rate_pct = (likes + comments) * 100 / views per day; NULLIF turns
# a day with zero views into a NULL rate instead of a division by zero.
filter_clause = build_filter_clause()

# page_inventory is joined only so the filter clause can reference p.* columns.
daily_engagement = con.execute(f"""
    SELECT
        d.date,
        d.day_name,
        d.is_weekend,
        SUM(CASE WHEN f.marketingpageidliked IS NOT NULL AND f.marketingpageidliked != '' THEN 1 ELSE 0 END) as likes,
        SUM(f.comments) as comments,
        SUM(CASE WHEN f.marketingpageidliked IS NOT NULL AND f.marketingpageidliked != '' THEN 1 ELSE 0 END) + SUM(f.comments) as total_engagements,
        SUM(f.views) as views,
        ROUND((SUM(CASE WHEN f.marketingpageidliked IS NOT NULL AND f.marketingpageidliked != '' THEN 1 ELSE 0 END) + SUM(f.comments)) * 100.0 / NULLIF(SUM(f.views), 0), 3) as engagement_rate_pct
    FROM fact f
    JOIN dim_date d ON f.visitdatekey = d.datekey
    LEFT JOIN page_inventory p ON f.marketingpageid = p.marketingpageid
    {filter_clause}
    GROUP BY d.date, d.day_name, d.is_weekend
    ORDER BY d.date
""").df()

# Totals are sums over the daily rows. The "average engagement rate" is the
# unweighted mean of the daily rates, not total engagements / total views.
print(f"Engagement statistics:")
print(f"  Total likes:            {daily_engagement['likes'].sum():,}")
print(f"  Total comments:         {daily_engagement['comments'].sum():,}")
print(f"  Total engagements:      {daily_engagement['total_engagements'].sum():,}")
print(f"  Average daily likes:    {daily_engagement['likes'].mean():,.0f}")
print(f"  Average daily comments: {daily_engagement['comments'].mean():,.0f}")
print(f"  Average engagement rate:{daily_engagement['engagement_rate_pct'].mean():.3f}%")
print()
# Show only the most recent 30 days to keep the output short
daily_engagement.tail(30)

### 3.2 Most Engaged Pages (by Total Engagements)

In [None]:
# Top pages by total engagement (likes + comments)
# The HAVING clause drops pages with no likes and no comments; the 25 pages
# with the highest likes + comments are returned.
filter_clause = build_filter_clause()

top_engaged_pages = con.execute(f"""
    SELECT
        f.marketingpageid,
        p.pagename,
        p.websitename,
        p.theme,
        SUM(CASE WHEN f.marketingpageidliked IS NOT NULL AND f.marketingpageidliked != '' THEN 1 ELSE 0 END) as likes,
        SUM(f.comments) as comments,
        SUM(CASE WHEN f.marketingpageidliked IS NOT NULL AND f.marketingpageidliked != '' THEN 1 ELSE 0 END) + SUM(f.comments) as total_engagements,
        SUM(f.views) as total_views,
        COUNT(DISTINCT f.viewingcontactid) as unique_visitors,
        ROUND((SUM(CASE WHEN f.marketingpageidliked IS NOT NULL AND f.marketingpageidliked != '' THEN 1 ELSE 0 END) + SUM(f.comments)) * 100.0 / NULLIF(SUM(f.views), 0), 2) as engagement_rate_pct
    FROM fact f
    LEFT JOIN page_inventory p ON f.marketingpageid = p.marketingpageid
    {filter_clause}
    GROUP BY f.marketingpageid, p.pagename, p.websitename, p.theme
    HAVING (SUM(CASE WHEN f.marketingpageidliked IS NOT NULL AND f.marketingpageidliked != '' THEN 1 ELSE 0 END) + SUM(f.comments)) > 0
    ORDER BY total_engagements DESC
    LIMIT 25
""").df()

top_engaged_pages

### 3.3 Engagement by Content Type

In [None]:
# Engagement by content type
# Pages without a page_inventory match (or with a NULL contenttype) are
# grouped under 'Unknown'. The rate is (likes + comments) * 100 / views, NULL
# when a content type has zero views.
filter_clause = build_filter_clause()

engagement_by_contenttype = con.execute(f"""
    SELECT
        COALESCE(p.contenttype, 'Unknown') as content_type,
        SUM(CASE WHEN f.marketingpageidliked IS NOT NULL AND f.marketingpageidliked != '' THEN 1 ELSE 0 END) as likes,
        SUM(f.comments) as comments,
        SUM(CASE WHEN f.marketingpageidliked IS NOT NULL AND f.marketingpageidliked != '' THEN 1 ELSE 0 END) + SUM(f.comments) as total_engagements,
        SUM(f.views) as total_views,
        COUNT(DISTINCT f.viewingcontactid) as unique_visitors,
        ROUND((SUM(CASE WHEN f.marketingpageidliked IS NOT NULL AND f.marketingpageidliked != '' THEN 1 ELSE 0 END) + SUM(f.comments)) * 100.0 / NULLIF(SUM(f.views), 0), 3) as engagement_rate_pct
    FROM fact f
    LEFT JOIN page_inventory p ON f.marketingpageid = p.marketingpageid
    {filter_clause}
    GROUP BY COALESCE(p.contenttype, 'Unknown')
    ORDER BY total_engagements DESC
""").df()

engagement_by_contenttype

---
## 4. Top Pages Analysis

### 4.1 Top Pages by Views

In [None]:
# Top pages by views
# Returns the 25 pages with the highest view totals, with visit, visitor,
# like and comment counts plus the average duration per visit.
filter_clause = build_filter_clause()

# avg_duration_sec is weighted by visits (total duration / total visits), the
# same formula the "Overall Engagement Metrics" cell uses. Averaging the
# per-row durationavg values would give a row with one visit the same weight
# as a row with many. NOTE(review): assumes durationsum holds the summed
# duration of the row's visits - confirm against the fact table definition.
top_pages_views = con.execute(f"""
    SELECT
        f.marketingpageid,
        p.pagename,
        p.websitename,
        p.fullpageurl,
        SUM(f.views) as total_views,
        SUM(f.visits) as total_visits,
        COUNT(DISTINCT f.viewingcontactid) as unique_visitors,
        SUM(CASE WHEN f.marketingpageidliked IS NOT NULL AND f.marketingpageidliked != '' THEN 1 ELSE 0 END) as likes,
        SUM(f.comments) as comments,
        ROUND(SUM(f.durationsum) * 1.0 / NULLIF(SUM(f.visits), 0), 1) as avg_duration_sec
    FROM fact f
    LEFT JOIN page_inventory p ON f.marketingpageid = p.marketingpageid
    {filter_clause}
    GROUP BY f.marketingpageid, p.pagename, p.websitename, p.fullpageurl
    ORDER BY total_views DESC
    LIMIT 25
""").df()

top_pages_views

### 4.2 Top Pages by Unique Visitors

In [None]:
# Top pages by unique visitors
# Returns the 25 pages with the most distinct visitors, plus views per visitor.
filter_clause = build_filter_clause()

# COUNT(DISTINCT ...) ignores NULLs, so a page whose rows all have a NULL
# viewingcontactid has a visitor count of 0. NULLIF turns that into a NULL
# ratio instead of a division by zero, matching the guarded ratios used
# elsewhere in this notebook.
top_pages_uv = con.execute(f"""
    SELECT
        f.marketingpageid,
        p.pagename,
        p.websitename,
        p.fullpageurl,
        COUNT(DISTINCT f.viewingcontactid) as unique_visitors,
        SUM(f.views) as total_views,
        ROUND(SUM(f.views) * 1.0 / NULLIF(COUNT(DISTINCT f.viewingcontactid), 0), 2) as views_per_visitor,
        SUM(CASE WHEN f.marketingpageidliked IS NOT NULL AND f.marketingpageidliked != '' THEN 1 ELSE 0 END) as likes,
        SUM(f.comments) as comments
    FROM fact f
    LEFT JOIN page_inventory p ON f.marketingpageid = p.marketingpageid
    {filter_clause}
    GROUP BY f.marketingpageid, p.pagename, p.websitename, p.fullpageurl
    ORDER BY unique_visitors DESC
    LIMIT 25
""").df()

top_pages_uv

---
## 5. Referrer Analysis

In [None]:
# Traffic by referrer application
# One row per referrerapplicationid, ordered by distinct visitors. A visitor
# arriving through several referrers is counted once per referrer.
filter_clause = build_filter_clause()

# page_inventory is joined although no p.* column is selected: the generated
# filter clause may reference p.websitename / p.fullpageurl.
referrer_analysis = con.execute(f"""
    SELECT
        f.referrerapplicationid,
        COUNT(DISTINCT f.viewingcontactid) as unique_visitors,
        SUM(f.views) as total_views,
        SUM(f.visits) as total_visits,
        SUM(CASE WHEN f.marketingpageidliked IS NOT NULL AND f.marketingpageidliked != '' THEN 1 ELSE 0 END) as likes,
        SUM(f.comments) as comments,
        SUM(CASE WHEN f.marketingpageidliked IS NOT NULL AND f.marketingpageidliked != '' THEN 1 ELSE 0 END) + SUM(f.comments) as engagements,
        COUNT(DISTINCT f.marketingpageid) as pages_visited
    FROM fact f
    LEFT JOIN page_inventory p ON f.marketingpageid = p.marketingpageid
    {filter_clause}
    GROUP BY f.referrerapplicationid
    ORDER BY unique_visitors DESC
""").df()

referrer_analysis

---
## 6. Overall Engagement Metrics

In [None]:
# Overall engagement metrics
# Single-row result: totals, per-visitor ratios, engagement rate
# ((likes + comments) * 100 / views) and average duration per visit.
filter_clause = build_filter_clause()

# Every ratio guards its denominator with NULLIF so that a filter matching no
# identified visitors (or no views / visits) yields NULL rather than a
# division by zero. Previously only the last two ratios were guarded.
engagement = con.execute(f"""
    SELECT
        COUNT(DISTINCT f.viewingcontactid) as total_uv,
        SUM(f.views) as total_views,
        SUM(f.visits) as total_visits,
        SUM(CASE WHEN f.marketingpageidliked IS NOT NULL AND f.marketingpageidliked != '' THEN 1 ELSE 0 END) as total_likes,
        SUM(f.comments) as total_comments,
        SUM(CASE WHEN f.marketingpageidliked IS NOT NULL AND f.marketingpageidliked != '' THEN 1 ELSE 0 END) + SUM(f.comments) as total_engagements,
        ROUND(SUM(f.views) * 1.0 / NULLIF(COUNT(DISTINCT f.viewingcontactid), 0), 2) as views_per_visitor,
        ROUND(SUM(f.visits) * 1.0 / NULLIF(COUNT(DISTINCT f.viewingcontactid), 0), 2) as visits_per_visitor,
        ROUND(SUM(CASE WHEN f.marketingpageidliked IS NOT NULL AND f.marketingpageidliked != '' THEN 1 ELSE 0 END) * 1.0 / NULLIF(COUNT(DISTINCT f.viewingcontactid), 0), 4) as likes_per_visitor,
        ROUND(SUM(f.comments) * 1.0 / NULLIF(COUNT(DISTINCT f.viewingcontactid), 0), 4) as comments_per_visitor,
        ROUND((SUM(CASE WHEN f.marketingpageidliked IS NOT NULL AND f.marketingpageidliked != '' THEN 1 ELSE 0 END) + SUM(f.comments)) * 1.0 / NULLIF(COUNT(DISTINCT f.viewingcontactid), 0), 4) as engagements_per_visitor,
        ROUND((SUM(CASE WHEN f.marketingpageidliked IS NOT NULL AND f.marketingpageidliked != '' THEN 1 ELSE 0 END) + SUM(f.comments)) * 100.0 / NULLIF(SUM(f.views), 0), 3) as engagement_rate_pct,
        ROUND(SUM(f.durationsum) * 1.0 / NULLIF(SUM(f.visits), 0), 1) as avg_duration_per_visit
    FROM fact f
    LEFT JOIN page_inventory p ON f.marketingpageid = p.marketingpageid
    {filter_clause}
""").df()

print("=" * 50)
print("OVERALL ENGAGEMENT METRICS")
print("=" * 50)
print(f"Total Unique Visitors:   {engagement['total_uv'].iloc[0]:,}")
print(f"Total Views:             {engagement['total_views'].iloc[0]:,}")
print(f"Total Visits:            {engagement['total_visits'].iloc[0]:,}")
print("")
print(f"Total Likes:             {engagement['total_likes'].iloc[0]:,}")
print(f"Total Comments:          {engagement['total_comments'].iloc[0]:,}")
print(f"Total Engagements:       {engagement['total_engagements'].iloc[0]:,}")
print("")
print(f"Views per Visitor:       {engagement['views_per_visitor'].iloc[0]}")
print(f"Visits per Visitor:      {engagement['visits_per_visitor'].iloc[0]}")
print(f"Likes per Visitor:       {engagement['likes_per_visitor'].iloc[0]}")
print(f"Comments per Visitor:    {engagement['comments_per_visitor'].iloc[0]}")
print(f"Engagements per Visitor: {engagement['engagements_per_visitor'].iloc[0]}")
print("")
print(f"Engagement Rate:         {engagement['engagement_rate_pct'].iloc[0]}%")
print(f"Avg Duration/Visit:      {engagement['avg_duration_per_visit'].iloc[0]} sec")
print("=" * 50)

---
## 7. Quick Filter Examples

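These cells help you find valid filter values. Set `search_keyword` or `url_pattern` inside the cell to look up matching website names and page URLs, then copy a value into the filter variables in the **Filter Configuration** section and re-run the analysis cells.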

In [None]:
# Quick search: Find websites matching a keyword
# Case-insensitive substring lookup against page_inventory.websitename.
search_keyword = ""  # Enter keyword to search, e.g., "hr", "sales"

if search_keyword:
    # The keyword is passed as a bound parameter rather than spliced into the
    # SQL text, so quotes in the keyword cannot break or alter the query.
    matching_websites = con.execute(
        """
        SELECT DISTINCT websitename
        FROM page_inventory
        WHERE LOWER(websitename) LIKE LOWER(CAST(? AS VARCHAR))
        ORDER BY websitename
        """,
        [f"%{search_keyword}%"],
    ).df()
    print(f"Websites matching '{search_keyword}':")
    display(matching_websites)
else:
    print("Enter a search_keyword to find matching websites")

In [None]:
# Quick search: Find pages matching a URL pattern
# Case-insensitive substring lookup against page_inventory.fullpageurl,
# limited to the first 50 matches (ordered by website, then page name).
url_pattern = ""  # Enter URL pattern to search, e.g., "/news/", "/hr/"

if url_pattern:
    # The pattern is passed as a bound parameter rather than spliced into the
    # SQL text, so quotes in the pattern cannot break or alter the query.
    matching_pages = con.execute(
        """
        SELECT
            pagename,
            websitename,
            fullpageurl
        FROM page_inventory
        WHERE LOWER(fullpageurl) LIKE LOWER(CAST(? AS VARCHAR))
        ORDER BY websitename, pagename
        LIMIT 50
        """,
        [f"%{url_pattern}%"],
    ).df()
    print(f"Pages matching URL pattern '{url_pattern}':")
    display(matching_pages)
else:
    print("Enter a url_pattern to find matching pages")

In [None]:
# Span of visit dates present in the fact table (notebook filters not applied)
date_range = con.execute("""
    SELECT 
        MIN(visitdatekey) as earliest_date,
        MAX(visitdatekey) as latest_date,
        COUNT(DISTINCT visitdatekey) as total_days
    FROM fact
""").df()

# Assemble the report, then emit it with a single print
print("\n".join([
    "Available date range in data:",
    f"  Earliest: {date_range['earliest_date'].iloc[0]}",
    f"  Latest:   {date_range['latest_date'].iloc[0]}",
    f"  Days:     {date_range['total_days'].iloc[0]}",
]))

---
## 8. Visualizations

Key charts for Intranet Analytics insights.

### 8.1 Rows per Website (Data Volume Distribution)

In [None]:
# Rows per website - shows data volume distribution
# Pages without a page_inventory match are grouped under 'Unknown'.
filter_clause = build_filter_clause()

rows_per_website = con.execute(f"""
    SELECT
        COALESCE(p.websitename, 'Unknown') as website,
        COUNT(*) as row_count,
        SUM(f.views) as total_views,
        COUNT(DISTINCT f.viewingcontactid) as unique_visitors
    FROM fact f
    LEFT JOIN page_inventory p ON f.marketingpageid = p.marketingpageid
    {filter_clause}
    GROUP BY COALESCE(p.websitename, 'Unknown')
    ORDER BY row_count DESC
""").df()

# Plot horizontal bar chart (better for many categories with long names);
# the figure grows by 0.4 inch per website with a minimum height of 6.
fig, ax = plt.subplots(figsize=(12, max(6, len(rows_per_website) * 0.4)))
bars = ax.barh(rows_per_website['website'], rows_per_website['row_count'], color='steelblue')
ax.set_xlabel('Number of Rows')
ax.set_ylabel('Website')
ax.set_title('Data Volume: Rows per Website')
ax.invert_yaxis()  # Largest at top

# Add value labels. The gap between bar end and label is 1% of the longest
# bar; it is the same for every bar, so compute it once instead of per bar.
label_offset = rows_per_website['row_count'].max() * 0.01
for bar, val in zip(bars, rows_per_website['row_count']):
    ax.text(val + label_offset, bar.get_y() + bar.get_height() / 2,
            f'{val:,}', va='center', fontsize=9)

plt.tight_layout()
plt.show()

# Summary table
print(f"\nTotal rows: {rows_per_website['row_count'].sum():,}")
rows_per_website

### 8.2 Daily UV Trend (Line Chart)

In [None]:
# Daily UV trend line chart
# One point per calendar day, with a 7-day moving average overlaid when at
# least seven days of data are available.
filter_clause = build_filter_clause()

# page_inventory is joined only so the filter clause can reference p.* columns.
daily_uv_chart = con.execute(f"""
    SELECT
        d.date,
        COUNT(DISTINCT f.viewingcontactid) as unique_visitors,
        SUM(f.views) as views
    FROM fact f
    JOIN dim_date d ON f.visitdatekey = d.datekey
    LEFT JOIN page_inventory p ON f.marketingpageid = p.marketingpageid
    {filter_clause}
    GROUP BY d.date
    ORDER BY d.date
""").df()

fig, ax1 = plt.subplots(figsize=(14, 6))

# UV line
color_uv = 'steelblue'
ax1.plot(daily_uv_chart['date'], daily_uv_chart['unique_visitors'], color=color_uv, linewidth=1.5, label='Unique Visitors')
ax1.fill_between(daily_uv_chart['date'], daily_uv_chart['unique_visitors'], alpha=0.3, color=color_uv)
ax1.set_xlabel('Date')
ax1.set_ylabel('Unique Visitors', color=color_uv)
ax1.tick_params(axis='y', labelcolor=color_uv)

# Add trend line (7-day moving average); the first six values are NaN because
# the rolling window is not yet full.
if len(daily_uv_chart) >= 7:
    daily_uv_chart['uv_ma7'] = daily_uv_chart['unique_visitors'].rolling(window=7).mean()
    ax1.plot(daily_uv_chart['date'], daily_uv_chart['uv_ma7'], color='darkblue',
             linewidth=2, linestyle='--', label='7-day Moving Avg')

ax1.set_title('Daily Unique Visitors Trend')
ax1.legend(loc='upper left')

# Rotate x-axis labels for better readability
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Summary stats. idxmax() raises ValueError on an empty Series, which happens
# whenever the active filters match no rows, so guard that case explicitly.
if daily_uv_chart.empty:
    print("No daily data for the active filters.")
else:
    peak_idx = daily_uv_chart['unique_visitors'].idxmax()
    print(f"Average daily UV: {daily_uv_chart['unique_visitors'].mean():,.0f}")
    print(f"Peak UV: {daily_uv_chart['unique_visitors'].max():,} on {daily_uv_chart.loc[peak_idx, 'date']}")

### 8.3 Top 15 Pages by Unique Visitors

In [None]:
# Top 15 pages by unique visitors
# Pages are labelled by pagename, falling back to marketingpageid when the
# page has no page_inventory match. Rows are grouped by that label, so pages
# sharing a name are combined into one bar.
filter_clause = build_filter_clause()

top_pages_chart = con.execute(f"""
    SELECT
        COALESCE(p.pagename, f.marketingpageid) as page,
        COUNT(DISTINCT f.viewingcontactid) as unique_visitors,
        SUM(f.views) as total_views
    FROM fact f
    LEFT JOIN page_inventory p ON f.marketingpageid = p.marketingpageid
    {filter_clause}
    GROUP BY COALESCE(p.pagename, f.marketingpageid)
    ORDER BY unique_visitors DESC
    LIMIT 15
""").df()

# Truncate long page names for display: labels longer than 43 characters are
# cut to their first 40 characters plus '...'. Converting to text first keeps
# the length check and the slice consistent for non-string labels (an id or a
# missing value), which the previous per-element lambda sliced without
# converting.
page_text = top_pages_chart['page'].astype(str)
top_pages_chart['page_short'] = page_text.where(
    page_text.str.len() <= 43, page_text.str[:40] + '...'
)

fig, ax = plt.subplots(figsize=(12, 8))
bars = ax.barh(top_pages_chart['page_short'], top_pages_chart['unique_visitors'], color='teal')
ax.set_xlabel('Unique Visitors')
ax.set_ylabel('Page')
ax.set_title('Top 15 Pages by Unique Visitors')
ax.invert_yaxis()  # Largest at top

# Add value labels. The gap between bar end and label is 1% of the longest
# bar; it is the same for every bar, so compute it once instead of per bar.
label_offset = top_pages_chart['unique_visitors'].max() * 0.01
for bar, val in zip(bars, top_pages_chart['unique_visitors']):
    ax.text(val + label_offset, bar.get_y() + bar.get_height() / 2,
            f'{val:,}', va='center', fontsize=9)

plt.tight_layout()
plt.show()

### 8.4 Engagement by Content Type

In [None]:
# Engagement by content type - stacked bar chart (Likes vs Comments)
# Only content types with at least one like or comment are kept (HAVING), and
# only the 15 with the most engagements are charted.
filter_clause = build_filter_clause()

engagement_chart = con.execute(f"""
    SELECT
        COALESCE(p.contenttype, 'Unknown') as content_type,
        SUM(CASE WHEN f.marketingpageidliked IS NOT NULL AND f.marketingpageidliked != '' THEN 1 ELSE 0 END) as likes,
        SUM(f.comments) as comments,
        SUM(CASE WHEN f.marketingpageidliked IS NOT NULL AND f.marketingpageidliked != '' THEN 1 ELSE 0 END) + SUM(f.comments) as total_engagements
    FROM fact f
    LEFT JOIN page_inventory p ON f.marketingpageid = p.marketingpageid
    {filter_clause}
    GROUP BY COALESCE(p.contenttype, 'Unknown')
    HAVING (SUM(CASE WHEN f.marketingpageidliked IS NOT NULL AND f.marketingpageidliked != '' THEN 1 ELSE 0 END) + SUM(f.comments)) > 0
    ORDER BY total_engagements DESC
    LIMIT 15
""").df()

fig, ax = plt.subplots(figsize=(12, 8))

# Create stacked horizontal bar chart: the comments segment starts where the
# likes segment ends (left=likes).
bars1 = ax.barh(engagement_chart['content_type'], engagement_chart['likes'],
                color='coral', label='Likes')
bars2 = ax.barh(engagement_chart['content_type'], engagement_chart['comments'],
                left=engagement_chart['likes'], color='mediumpurple', label='Comments')

ax.set_xlabel('Total Engagements')
ax.set_ylabel('Content Type')
ax.set_title('Engagement by Content Type (Likes + Comments)')
ax.invert_yaxis()  # Largest at top
ax.legend(loc='lower right')

# Add total labels at the end of each stacked bar. The gap is 1% of the
# largest total; it is the same for every bar, so compute it once.
label_offset = engagement_chart['total_engagements'].max() * 0.01
for i, (likes, comments) in enumerate(zip(engagement_chart['likes'], engagement_chart['comments'])):
    total = likes + comments
    ax.text(total + label_offset, i,
            f'{total:,}', va='center', fontsize=9)

plt.tight_layout()
plt.show()

### 8.5 UV by Website

In [None]:
# UV by website bar chart
# Pages without a page_inventory match are grouped under 'Unknown'. A visitor
# seen on several websites is counted once per website.
filter_clause = build_filter_clause()

uv_website_chart = con.execute(f"""
    SELECT
        COALESCE(p.websitename, 'Unknown') as website,
        COUNT(DISTINCT f.viewingcontactid) as unique_visitors,
        SUM(f.views) as total_views
    FROM fact f
    LEFT JOIN page_inventory p ON f.marketingpageid = p.marketingpageid
    {filter_clause}
    GROUP BY COALESCE(p.websitename, 'Unknown')
    ORDER BY unique_visitors DESC
""").df()

# The figure grows by 0.4 inch per website with a minimum height of 6.
fig, ax = plt.subplots(figsize=(12, max(6, len(uv_website_chart) * 0.4)))
bars = ax.barh(uv_website_chart['website'], uv_website_chart['unique_visitors'], color='seagreen')
ax.set_xlabel('Unique Visitors')
ax.set_ylabel('Website')
ax.set_title('Unique Visitors by Website')
ax.invert_yaxis()  # Largest at top

# Add value labels. The gap between bar end and label is 1% of the longest
# bar; it is the same for every bar, so compute it once instead of per bar.
label_offset = uv_website_chart['unique_visitors'].max() * 0.01
for bar, val in zip(bars, uv_website_chart['unique_visitors']):
    ax.text(val + label_offset, bar.get_y() + bar.get_height() / 2,
            f'{val:,}', va='center', fontsize=9)

plt.tight_layout()
plt.show()

# Summary. Adding up the per-website counts would count a visitor once for
# every website they used, overstating the total, so the overall figure is
# computed with its own COUNT(DISTINCT) over the same filtered rows.
total_uv = con.execute(f"""
    SELECT COUNT(DISTINCT f.viewingcontactid)
    FROM fact f
    LEFT JOIN page_inventory p ON f.marketingpageid = p.marketingpageid
    {filter_clause}
""").fetchone()[0]
print(f"Total unique visitors across all websites: {total_uv:,}")

In [None]:
# Uncomment and run once you are finished to release the DuckDB connection held by `con`
# con.close()