# FTC Contextualized Events - Duplicate Analysis

This notebook analyzes duplicate events in `out/ftc-contextualized-events.csv` to identify duplication patterns.

## Duplicate Detection Strategies

1. **Exact Duplicates (by event_id)**: Same `event_id` appearing multiple times - PRIMARY FOCUS
2. **Duplicates by event_id + group_id**: Same event_id within/across groups - SECONDARY FOCUS
3. **Duplicates by event_id + player_id**: Same event_id for same player
4. **Duplicates by event_id + session_id**: Same event_id within same session
5. **Content Duplicates**: Same `player_id` + `event_name` + `event_timestamp` + `payload`

## 1. Setup & Data Loading

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter

# Pandas display options: no column limit, cell text capped at 100 characters,
# no fixed display width
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.max_colwidth', 100),
    ('display.width', None),
):
    pd.set_option(option_name, option_value)

# Matplotlib defaults applied to every figure in this notebook
plt.rcParams['figure.figsize'] = (12, 6)
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

# Read the contextualized-events export into the notebook-wide frame `df`
df = pd.read_csv('out/ftc-contextualized-events.csv', low_memory=False)

# Parse the timestamp column into UTC datetimes; the ISO8601 format accepts
# values both with and without microseconds
df['event_timestamp'] = pd.to_datetime(
    df['event_timestamp'],
    format='ISO8601',
    utc=True,
)

print(f"Loaded {len(df):,} events")
print(f"\nColumns: {list(df.columns)}")

## 2. Data Overview

In [None]:
banner = "=" * 60
print(banner)
print("DATA OVERVIEW")
print(banner)

n_events = len(df)
print(f"\nTotal events: {n_events:,}")

# Distinct-value count for each identifier column
for label, column in (
    ("event_ids", "event_id"),
    ("group_ids", "group_id"),
    ("players", "player_id"),
    ("sessions", "session_id"),
):
    print(f"Unique {label}: {df[column].nunique():,}")

first_ts = df['event_timestamp'].min()
last_ts = df['event_timestamp'].max()
print(f"\nDate range: {first_ts} to {last_ts}")

rule = "-" * 40
print("\n" + rule)
print("Event Types Distribution:")
print(rule)

# Rows per event_name, each shown with its share of all rows
event_counts = df['event_name'].value_counts()
for event_name, count in event_counts.items():
    share = count / n_events * 100
    print(f"  {event_name}: {count:,} ({share:.1f}%)")

# Bar chart of the same distribution
fig, ax = plt.subplots(figsize=(10, 6))
event_counts.plot.bar(ax=ax, color='steelblue', edgecolor='black')
ax.set_title('Event Type Distribution', fontsize=14, fontweight='bold')
ax.set(xlabel='Event Type', ylabel='Count')
ax.tick_params(axis='x', rotation=45)
plt.tight_layout()
plt.show()

## 3. Exact Duplicates (by event_id) - PRIMARY FOCUS

Identifying events where the same `event_id` appears multiple times. This is the most basic and important form of duplication.

In [None]:
print("=" * 60)
print("EXACT DUPLICATES (by event_id) - PRIMARY FOCUS")
print("=" * 60)

# Occurrences per event_id, then only the ids that occur more than once
event_id_counts = df['event_id'].value_counts()
duplicated_event_ids = event_id_counts.loc[event_id_counts.gt(1)]

total_events = len(df)
unique_event_ids = df['event_id'].nunique()
duplicate_event_id_count = len(duplicated_event_ids)

# All rows (first occurrence included) whose event_id is shared with another row
events_with_dup_ids = df.loc[df['event_id'].isin(duplicated_event_ids.index)]
total_duplicate_rows = len(events_with_dup_ids)

# Rows minus distinct event_ids, reported below as a percentage of all rows
surplus_rows = total_events - unique_event_ids

print(f"\nTotal events: {total_events:,}")
print(f"Unique event_ids: {unique_event_ids:,}")
print(f"Duplicated event_ids: {duplicate_event_id_count:,}")
print(f"Events with duplicated event_id: {total_duplicate_rows:,}")
print(f"Duplication rate: {surplus_rows / total_events * 100:.2f}%")

if duplicate_event_id_count == 0:
    print("\nNo exact duplicates found by event_id.")
else:
    divider = "-" * 40

    print("\n" + divider)
    print("Sample Duplicated event_ids (top 10):")
    print(divider)
    for dup_id, occurrences in duplicated_event_ids.head(10).items():
        print(f"  {dup_id}: {occurrences} occurrences")

    print("\n" + divider)
    print("Sample duplicate events (first duplicated event_id):")
    print(divider)
    # value_counts sorts by count, so index[0] is the most frequent event_id
    first_dup_id = duplicated_event_ids.index[0]
    preview_columns = ['event_id', 'player_id', 'session_id', 'event_name', 'event_timestamp', 'group_id']
    display(df.loc[df['event_id'] == first_dup_id, preview_columns])

## 4. Duplicates by event_id + group_id - SECONDARY FOCUS

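Analyzing whether the same `event_id` appears in the same group or across different groups. This helps understand if duplicates are within-batch or cross-batch.

**Note:** the cell below removes `session_end` events from `df`, so this section and every later section exclude them; their counts may therefore differ from Sections 2–3.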

In [None]:
print("=" * 60)
print("DUPLICATES BY event_id + group_id - SECONDARY FOCUS")
print("=" * 60)

# Exclude 'session_end' events.
# NOTE: this rebinds the notebook-wide `df`, so every later cell also runs
# without 'session_end' rows. `.copy()` gives an independent frame, which
# avoids SettingWithCopyWarning when later cells add columns to it.
df = df[df['event_name'] != 'session_end'].copy()

# Composite key built from the two parts, kept outside `df` so the shared
# frame is never modified. The parts are retained so samples can be printed
# without parsing the key back apart (an id may itself contain '_').
key_parts = pd.DataFrame({
    'event_id': df['event_id'],
    'group_id': df['group_id'].astype(str),
})
event_group_key = key_parts['event_id'] + '_' + key_parts['group_id']
event_group_counts = event_group_key.value_counts()
duplicated_event_group = event_group_counts[event_group_counts > 1]

unique_event_group_combos = event_group_key.nunique()
duplicate_combo_count = len(duplicated_event_group)

print(f"\nUnique event_id + group_id combinations: {unique_event_group_combos:,}")
print(f"Duplicated combinations: {duplicate_combo_count:,}")
print(f"Duplication rate: {(len(df) - unique_event_group_combos) / len(df) * 100:.2f}%")

# Analyze within-group vs cross-group duplicates
event_id_counts = df['event_id'].value_counts()
duplicated_event_ids = event_id_counts[event_id_counts > 1].index

# Defaults so the cross-group report at the end of the cell also works when
# no event_id is duplicated (previously a NameError).
within_group = 0
cross_group = 0
groups_per_event = pd.Series(dtype='int64')

if len(duplicated_event_ids) > 0:
    dup_events = df[df['event_id'].isin(duplicated_event_ids)]
    # Number of distinct groups each duplicated event_id appears in
    groups_per_event = dup_events.groupby('event_id')['group_id'].nunique()

    within_group = (groups_per_event == 1).sum()
    cross_group = (groups_per_event > 1).sum()

    print("\n" + "-" * 40)
    print("Duplicate Distribution:")
    print("-" * 40)
    print(f"Total duplicated event_ids: {len(duplicated_event_ids):,}")
    print(f"Within-group duplicates: {within_group:,} ({within_group/len(duplicated_event_ids)*100:.1f}%)")
    print(f"Cross-group duplicates: {cross_group:,} ({cross_group/len(duplicated_event_ids)*100:.1f}%)")

    if duplicate_combo_count > 0:
        print("\n" + "-" * 40)
        print("Sample duplicated event_id + group_id (top 10):")
        print("-" * 40)
        for key, count in duplicated_event_group.head(10).items():
            # Read the parts from the first row carrying this key
            parts = key_parts[event_group_key == key].iloc[0]
            print(f"  event_id={parts['event_id'][:30]}..., group_id={parts['group_id']}: {count} occurrences")

    # Visualization
    fig, ax = plt.subplots(figsize=(8, 6))
    labels = ['Within-Group', 'Cross-Group']
    sizes = [within_group, cross_group]
    colors = ['steelblue', 'coral']
    if sum(sizes) > 0:
        ax.pie(sizes, labels=labels, colors=colors, autopct='%1.1f%%', startangle=90)
        ax.set_title('Distribution of Duplicate Types (Within vs Cross Group)', fontsize=12, fontweight='bold')
    plt.tight_layout()
    plt.show()
else:
    print("\nNo duplicated event_ids found to analyze by group.")

# Print Cross-group duplicates event_id, group_id, player_id, session_id and event_name
print("\nCross-group duplicate event details (event_id, group_id, player_id, session_id, event_name):")
if cross_group > 0:
    cross_group_event_ids = groups_per_event[groups_per_event > 1].index
    cross_dup_events = df[df['event_id'].isin(cross_group_event_ids)][['event_id', 'group_id', 'player_id', 'session_id', 'event_name', 'event_timestamp', 'payload']]
    display(cross_dup_events.head(10))
else:
    print("No cross-group duplicate event_ids found.")





## 5. Duplicates by event_id + player_id

Identifying the same `event_id` appearing for the same player multiple times. This detects re-processing of the same event for a specific player.

In [None]:
print("=" * 60)
print("DUPLICATES BY event_id + player_id")
print("=" * 60)

# Composite key built from the two parts, kept outside `df` so the shared
# frame is never modified. The parts are retained so samples can be printed
# without parsing the key back apart (an id may itself contain '_').
key_parts = pd.DataFrame({
    'event_id': df['event_id'],
    'player_id': df['player_id'].astype(str),
})
event_player_key = key_parts['event_id'] + '_' + key_parts['player_id']
event_player_counts = event_player_key.value_counts()
duplicated_event_player = event_player_counts[event_player_counts > 1]

unique_event_player_combos = event_player_key.nunique()
duplicate_combo_count = len(duplicated_event_player)

print(f"\nUnique event_id + player_id combinations: {unique_event_player_combos:,}")
print(f"Duplicated combinations: {duplicate_combo_count:,}")
print(f"Duplication rate: {(len(df) - unique_event_player_combos) / len(df) * 100:.2f}%")

if duplicate_combo_count > 0:
    print("\n" + "-" * 40)
    print("Sample duplicated event_id + player_id (top 10):")
    print("-" * 40)
    for key, count in duplicated_event_player.head(10).items():
        # Read the parts from the first row carrying this key
        parts = key_parts[event_player_key == key].iloc[0]
        print(f"  event_id={parts['event_id'][:30]}..., player_id={parts['player_id']}: {count} occurrences")

    print("\n" + "-" * 40)
    print("Sample duplicate events (first combo):")
    print("-" * 40)
    # value_counts sorts by count, so index[0] is the most frequent combination
    first_dup_key = duplicated_event_player.index[0]
    display(df[event_player_key == first_dup_key][['event_id', 'player_id', 'session_id', 'event_name', 'event_timestamp', 'group_id']])
else:
    print("\nNo duplicates found by event_id + player_id.")

## 6. Duplicates by event_id + session_id

Identifying the same `event_id` appearing within the same session multiple times.

In [None]:
print("=" * 60)
print("DUPLICATES BY event_id + session_id")
print("=" * 60)

# Composite key built from the two parts, kept outside `df` so the shared
# frame is never modified. The parts are retained so samples can be printed
# without parsing the key back apart (an id may itself contain '_').
key_parts = pd.DataFrame({
    'event_id': df['event_id'],
    'session_id': df['session_id'].astype(str),
})
event_session_key = key_parts['event_id'] + '_' + key_parts['session_id']
event_session_counts = event_session_key.value_counts()
duplicated_event_session = event_session_counts[event_session_counts > 1]

unique_event_session_combos = event_session_key.nunique()
duplicate_combo_count = len(duplicated_event_session)

print(f"\nUnique event_id + session_id combinations: {unique_event_session_combos:,}")
print(f"Duplicated combinations: {duplicate_combo_count:,}")
print(f"Duplication rate: {(len(df) - unique_event_session_combos) / len(df) * 100:.2f}%")

if duplicate_combo_count > 0:
    print("\n" + "-" * 40)
    print("Sample duplicated event_id + session_id (top 10):")
    print("-" * 40)
    for key, count in duplicated_event_session.head(10).items():
        # Read the parts from the first row carrying this key
        parts = key_parts[event_session_key == key].iloc[0]
        print(f"  event_id={parts['event_id'][:30]}..., session_id={parts['session_id'][:20]}...: {count} occurrences")

    print("\n" + "-" * 40)
    print("Sample duplicate events (first combo):")
    print("-" * 40)
    # value_counts sorts by count, so index[0] is the most frequent combination
    first_dup_key = duplicated_event_session.index[0]
    display(df[event_session_key == first_dup_key][['event_id', 'player_id', 'session_id', 'event_name', 'event_timestamp', 'group_id']])
else:
    print("\nNo duplicates found by event_id + session_id.")

## 7. Content-Based Duplicates

Identifying events with the same `player_id` + `event_name` + `event_timestamp` + `payload`. These are events that are identical in content regardless of their `event_id`.

In [None]:
print("=" * 60)
print("CONTENT-BASED DUPLICATES")
print("=" * 60)

# Temporary helper columns (dropped at the end of the cell):
#   payload_str - payload as text, with missing payloads turned into ''
#   content_key - player, event name, timestamp and payload joined by '_'
df['payload_str'] = df['payload'].fillna('').astype(str)
player_part = df['player_id'].astype(str)
timestamp_part = df['event_timestamp'].astype(str)
df['content_key'] = player_part + '_' + df['event_name'] + '_' + timestamp_part + '_' + df['payload_str']

# Occurrences per content key, then only keys seen more than once
content_counts = df['content_key'].value_counts()
duplicated_content = content_counts.loc[content_counts.gt(1)]

unique_content = df['content_key'].nunique()
duplicate_content_count = len(duplicated_content)
events_with_dup_content = df.loc[df['content_key'].isin(duplicated_content.index)]

print(f"\nUnique content combinations: {unique_content:,}")
print(f"Duplicated content combinations: {duplicate_content_count:,}")
print(f"Events with duplicated content: {len(events_with_dup_content):,}")
print(f"Duplication rate: {(len(df) - unique_content) / len(df) * 100:.2f}%")

if duplicate_content_count == 0:
    print("\nNo content-based duplicates found.")
else:
    divider = "-" * 40

    print("\n" + divider)
    print("Sample content duplicates (top 10):")
    print(divider)
    for content_key, occurrences in duplicated_content.head(10).items():
        print(f"  {occurrences} occurrences: {content_key[:80]}...")

    print("\n" + divider)
    print("Sample duplicate events (first content key):")
    print(divider)
    # value_counts sorts by count, so index[0] is the most frequent content key
    first_dup_key = duplicated_content.index[0]
    preview_columns = ['event_id', 'player_id', 'event_name', 'event_timestamp', 'group_id']
    display(df.loc[df['content_key'] == first_dup_key, preview_columns])

# Remove the temporary helper columns again
df.drop(columns=['payload_str', 'content_key'], inplace=True)

## 8. Duplicate Trends by Event Type

Analyzing which event types have the most duplicates.

In [None]:
print("=" * 60)
print("DUPLICATE TRENDS BY EVENT TYPE")
print("=" * 60)

# Flag every row whose event_id occurs more than once anywhere in df
# (temporary column, dropped at the end of the cell)
event_id_counts = df['event_id'].value_counts()
duplicated_event_ids = event_id_counts.index[event_id_counts.gt(1)]
df['is_duplicate'] = df['event_id'].isin(duplicated_event_ids)

# Per event type: row count, flagged-row count, distinct event_ids
event_type_analysis = (
    df.groupby('event_name')
    .agg(
        total_events=('event_id', 'count'),
        duplicate_events=('is_duplicate', 'sum'),
        unique_event_ids=('event_id', 'nunique'),
    )
    .reset_index()
)

# Rate = (rows - distinct event_ids) / rows, computed within each event type
surplus_per_type = event_type_analysis['total_events'] - event_type_analysis['unique_event_ids']
event_type_analysis['duplication_rate'] = surplus_per_type / event_type_analysis['total_events'] * 100
event_type_analysis = event_type_analysis.sort_values('duplicate_events', ascending=False)

table_rule = "-" * 80
print("\nDuplication by Event Type:")
print(table_rule)
print(f"{'Event Type':<25} {'Total':>10} {'Duplicates':>12} {'Rate':>10}")
print(table_rule)
for entry in event_type_analysis.itertuples(index=False):
    print(f"{entry.event_name:<25} {entry.total_events:>10,} {int(entry.duplicate_events):>12,} {entry.duplication_rate:>9.2f}%")

# Two side-by-side horizontal bar charts
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax1, ax2 = axes

# Left: flagged-row count per event type
event_type_analysis_sorted = event_type_analysis.sort_values('duplicate_events', ascending=True)
ax1.barh(
    event_type_analysis_sorted['event_name'],
    event_type_analysis_sorted['duplicate_events'],
    color='coral',
    edgecolor='black',
)
ax1.set_xlabel('Number of Duplicate Events')
ax1.set_title('Duplicate Events by Type', fontsize=12, fontweight='bold')

# Right: duplication rate per event type
event_type_analysis_rate = event_type_analysis.sort_values('duplication_rate', ascending=True)
ax2.barh(
    event_type_analysis_rate['event_name'],
    event_type_analysis_rate['duplication_rate'],
    color='steelblue',
    edgecolor='black',
)
ax2.set_xlabel('Duplication Rate (%)')
ax2.set_title('Duplication Rate by Event Type', fontsize=12, fontweight='bold')

plt.tight_layout()
plt.show()

# Remove the temporary flag column again
df.drop(columns='is_duplicate', inplace=True)

## 9. Duplicate Trends Over Time

Analyzing how duplicates are distributed over time (hourly).

In [None]:
print("=" * 60)
print("DUPLICATE TRENDS OVER TIME")
print("=" * 60)

# Identify event_ids that occur more than once anywhere in df
event_id_counts = df['event_id'].value_counts()
duplicated_event_ids = event_id_counts[event_id_counts > 1].index

# Narrow working frame with only the columns the hourly aggregation needs.
# The shared `df` is left untouched (no temporary columns to add and drop).
hourly_input = pd.DataFrame({
    'event_id': df['event_id'],
    'is_duplicate': df['event_id'].isin(duplicated_event_ids),
    'hour': df['event_timestamp'].dt.floor('h'),
})

# Hourly analysis: row count, flagged-row count and distinct event_ids per hour
hourly_analysis = hourly_input.groupby('hour').agg(
    total_events=('event_id', 'count'),
    duplicate_events=('is_duplicate', 'sum'),
    unique_event_ids=('event_id', 'nunique')
).reset_index()
hourly_analysis['duplication_rate'] = (hourly_analysis['total_events'] - hourly_analysis['unique_event_ids']) / hourly_analysis['total_events'] * 100

print(f"\nTime range: {df['event_timestamp'].min()} to {df['event_timestamp'].max()}")
# Counts only the hourly buckets that contain at least one event
print(f"Total hours covered: {hourly_analysis['hour'].nunique()}")

# Visualization
fig, axes = plt.subplots(2, 1, figsize=(14, 10))

# Plot 1: Events over time
# Only the line plots carry a label: labelling the fills as well made every
# legend entry appear twice.
ax1 = axes[0]
ax1.fill_between(hourly_analysis['hour'], hourly_analysis['total_events'], alpha=0.3, color='steelblue')
ax1.plot(hourly_analysis['hour'], hourly_analysis['total_events'], color='steelblue', linewidth=2, label='Total Events')
ax1.fill_between(hourly_analysis['hour'], hourly_analysis['duplicate_events'], alpha=0.5, color='coral')
ax1.plot(hourly_analysis['hour'], hourly_analysis['duplicate_events'], color='coral', linewidth=2, label='Duplicate Events')
ax1.set_xlabel('Time')
ax1.set_ylabel('Number of Events')
ax1.set_title('Events Over Time', fontsize=12, fontweight='bold')
ax1.legend()
ax1.tick_params(axis='x', rotation=45)

# Plot 2: Duplication rate over time
ax2 = axes[1]
ax2.plot(hourly_analysis['hour'], hourly_analysis['duplication_rate'], color='darkred', linewidth=2, marker='o', markersize=3)
ax2.fill_between(hourly_analysis['hour'], hourly_analysis['duplication_rate'], alpha=0.3, color='darkred')
ax2.set_xlabel('Time')
ax2.set_ylabel('Duplication Rate (%)')
ax2.set_title('Duplication Rate Over Time', fontsize=12, fontweight='bold')
ax2.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

## 10. Duplicates by Group (Batch Analysis)

Analyzing which groups have the most duplicates and understanding the distribution across groups.

In [None]:
print("=" * 60)
print("DUPLICATES BY GROUP")
print("=" * 60)

# Group-level analysis: rows and distinct event_ids per group_id
group_analysis = df.groupby('group_id').agg(
    total_events=('event_id', 'count'),
    unique_event_ids=('event_id', 'nunique')
).reset_index()
# Rows beyond one per event_id inside the same group
group_analysis['duplicates_in_group'] = group_analysis['total_events'] - group_analysis['unique_event_ids']
group_analysis['duplication_rate'] = group_analysis['duplicates_in_group'] / group_analysis['total_events'] * 100
group_analysis = group_analysis.sort_values('duplicates_in_group', ascending=False)

groups_with_dups = group_analysis[group_analysis['duplicates_in_group'] > 0]
print(f"\nTotal groups: {len(group_analysis):,}")
print(f"Groups with duplicates: {len(groups_with_dups):,}")

if len(groups_with_dups) > 0:
    print("\n" + "-" * 40)
    print("Groups with most duplicates (top 10):")
    print("-" * 40)
    # itertuples keeps each column's own type. iterrows() would upcast an
    # all-numeric row to float, printing an integer group_id as e.g. "12.0".
    for row in group_analysis.head(10).itertuples(index=False):
        group_id_full = str(row.group_id)
        # Add the ellipsis only when the id was actually shortened
        group_id_str = (group_id_full[:30] + "...") if len(group_id_full) > 30 else group_id_full
        print(f"  Group {group_id_str}: {int(row.duplicates_in_group):,} duplicates ({row.duplication_rate:.1f}%) of {int(row.total_events):,} events")

    # Visualization - Duplication rate distribution across groups
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    # Plot 1: Histogram of duplication rates
    ax1 = axes[0]
    ax1.hist(group_analysis['duplication_rate'], bins=20, color='steelblue', edgecolor='black', alpha=0.7)
    ax1.set_xlabel('Duplication Rate (%)')
    ax1.set_ylabel('Number of Groups')
    ax1.set_title('Distribution of Duplication Rates Across Groups', fontsize=12, fontweight='bold')

    # Plot 2: Top groups by duplicate count (labels cut to 15 characters)
    ax2 = axes[1]
    top_groups = group_analysis.head(10).copy()
    top_groups['group_label'] = top_groups['group_id'].astype(str).str[:15]
    ax2.barh(top_groups['group_label'], top_groups['duplicates_in_group'], color='coral', edgecolor='black')
    ax2.set_xlabel('Number of Duplicates')
    ax2.set_title('Top 10 Groups by Duplicate Count', fontsize=12, fontweight='bold')

    plt.tight_layout()
    plt.show()
else:
    print("\nNo duplicates found within any group.")

## 11. Summary Statistics

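Overall summary of duplication findings across all detection strategies. Because `session_end` events are removed from `df` in Section 4, the totals here exclude them and may not match Sections 2–3.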

In [None]:
wide_banner = "=" * 70
wide_rule = "-" * 70

print(wide_banner)
print("SUMMARY STATISTICS")
print(wide_banner)

# Denominator shared by every rate below
total_events = len(df)

# Strategy 1: distinct event_ids
unique_event_ids = df['event_id'].nunique()
exact_dup_rate = (total_events - unique_event_ids) / total_events * 100

# Strategies 2-4: event_id joined with one more identifier. Each key lives in
# a temporary column that is dropped as soon as it has been counted.
df['event_group_key'] = df['event_id'] + '_' + df['group_id'].astype(str)
unique_event_group = df['event_group_key'].nunique()
df.drop(columns='event_group_key', inplace=True)
event_group_dup_rate = (total_events - unique_event_group) / total_events * 100

df['event_player_key'] = df['event_id'] + '_' + df['player_id'].astype(str)
unique_event_player = df['event_player_key'].nunique()
df.drop(columns='event_player_key', inplace=True)
event_player_dup_rate = (total_events - unique_event_player) / total_events * 100

df['event_session_key'] = df['event_id'] + '_' + df['session_id'].astype(str)
unique_event_session = df['event_session_key'].nunique()
df.drop(columns='event_session_key', inplace=True)
event_session_dup_rate = (total_events - unique_event_session) / total_events * 100

# Strategy 5: player, event name, timestamp and payload joined into one key
df['payload_str'] = df['payload'].fillna('').astype(str)
df['content_key'] = df['player_id'].astype(str) + '_' + df['event_name'] + '_' + df['event_timestamp'].astype(str) + '_' + df['payload_str']
unique_content = df['content_key'].nunique()
df.drop(columns=['payload_str', 'content_key'], inplace=True)
content_dup_rate = (total_events - unique_content) / total_events * 100

# One (table label, unique count, rate) tuple per strategy, in display order
strategy_results = [
    ('1. Exact (by event_id) - PRIMARY', unique_event_ids, exact_dup_rate),
    ('2. event_id + group_id - SECONDARY', unique_event_group, event_group_dup_rate),
    ('3. event_id + player_id', unique_event_player, event_player_dup_rate),
    ('4. event_id + session_id', unique_event_session, event_session_dup_rate),
    ('5. Content-based (player+event+time+payload)', unique_content, content_dup_rate),
]

# Create summary table
summary_data = {
    'Detection Strategy': [name for name, _, _ in strategy_results],
    'Unique Count': [n_unique for _, n_unique, _ in strategy_results],
    'Duplicate Count': [total_events - n_unique for _, n_unique, _ in strategy_results],
    'Duplication Rate': [f"{rate:.2f}%" for _, _, rate in strategy_results],
}
summary_df = pd.DataFrame(summary_data)

print(f"\nTotal Events: {total_events:,}")
print("\n" + wide_rule)
print("Duplication Summary by Detection Strategy:")
print(wide_rule)
display(summary_df)

# Key findings
print("\n" + wide_banner)
print("KEY FINDINGS")
print(wide_banner)

exact_finding = (
    f"\n- Found {exact_dup_rate:.2f}% exact duplicates by event_id"
    if exact_dup_rate > 0
    else "\n- No exact duplicates found by event_id"
)
print(exact_finding)

if event_group_dup_rate < exact_dup_rate:
    print(f"- event_id + group_id has lower rate ({event_group_dup_rate:.2f}%), indicating some duplicates span groups")

if content_dup_rate > exact_dup_rate:
    print(f"- Content-based analysis reveals {content_dup_rate:.2f}% duplicates (higher than exact ID check)")
    print("  This suggests some duplicate content has different event_ids")

# Bar chart: one bar per strategy
fig, ax = plt.subplots(figsize=(12, 6))
strategies = [
    'Exact\n(event_id)\nPRIMARY',
    'event_id +\ngroup_id\nSECONDARY',
    'event_id +\nplayer_id',
    'event_id +\nsession_id',
    'Content-\nbased',
]
rates = [rate for _, _, rate in strategy_results]
colors = ['coral', 'steelblue', 'mediumseagreen', 'orchid', 'gold']

bars = ax.bar(strategies, rates, color=colors, edgecolor='black')
ax.set_ylabel('Duplication Rate (%)')
ax.set_title('Duplication Rate by Detection Strategy', fontsize=14, fontweight='bold')

# Write each rate just above the top centre of its bar
for bar, rate in zip(bars, rates):
    bar_centre = bar.get_x() + bar.get_width() / 2
    ax.annotate(
        f'{rate:.2f}%',
        xy=(bar_centre, bar.get_height()),
        xytext=(0, 3),
        textcoords="offset points",
        ha='center',
        va='bottom',
        fontweight='bold',
    )

plt.tight_layout()
plt.show()

print("\n" + wide_banner)
print("ANALYSIS COMPLETE")
print(wide_banner)