# your prompt:

Help me debug and fix this user engagement code. The current code shows bounce rates over 100% and negative session times,
both of which are impossible. Please point out where the problems are and what the fixes are. Please include data validation and error handling in the code before making the calculations. Also include code for visualizing the calculations using seaborn and matplotlib. The code is as follows:

def analyze_user_engagement(logs_df):
    """Compute engagement metrics from a log table (the version under review).

    Reads the ``session_id``, ``page_views``, ``duration`` and ``device_type``
    columns of ``logs_df``. No validation or filtering is performed before
    the calculations, so whatever values are in those columns flow straight
    into the results.

    Returns:
        A ``(metrics, device_metrics)`` tuple: ``metrics`` is a dict with the
        keys ``bounce_rate``, ``avg_session_time`` and ``pages_per_session``;
        ``device_metrics`` is ``logs_df`` aggregated by ``device_type``.
    """
    metrics = {
        # NOTE(review): the lambda compares every row's page_views with 1, so
        # the mean is taken over per-row booleans rather than over one
        # bounced/not-bounced flag per session.
        'bounce_rate': logs_df.groupby('session_id')['page_views'].apply(
            lambda x: x == 1).mean(),
        # NOTE(review): despite the key name this is the per-session *sum* of
        # duration (one value per session_id); no average is taken, and
        # negative durations are summed like any other value.
        'avg_session_time': logs_df.groupby('session_id')['duration'].sum(),
        # Per-session mean of the page_views rows (one value per session_id).
        'pages_per_session': logs_df.groupby('session_id')['page_views'].mean()
    }
    
    # Row-level aggregation per device: 'count' tallies non-null session_id
    # rows (not distinct sessions) and 'mean' averages duration over rows.
    device_metrics = logs_df.groupby('device_type').agg({
        'session_id': 'count',
        'duration': 'mean',
        'page_views': 'sum'
    })
    
    return metrics, device_metrics

I asked the AI to debug the code, specifically pointing out the two issues, and asked it to fix them and show where the fixes were needed.
I asked it to first add data validation and error handling to the function before making the calculations, and I also asked
for code to visualize the two calculations with seaborn/matplotlib.

In [2]:
# generated code:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

def _clean_engagement_logs(logs_df: pd.DataFrame) -> pd.DataFrame:
    """Return a validated copy of *logs_df* holding only usable rows.

    Raises:
        TypeError: If *logs_df* is not a DataFrame.
        ValueError: If a required column is missing or no row survives
            cleaning.
    """
    if not isinstance(logs_df, pd.DataFrame):
        raise TypeError(
            f"logs_df must be a pandas DataFrame, got {type(logs_df).__name__}"
        )

    required_cols = {'session_id', 'page_views', 'duration', 'device_type'}
    missing = required_cols - set(logs_df.columns)
    if missing:
        raise ValueError(f"Missing required columns: {missing}")

    cleaned = logs_df.copy()  # never mutate the caller's frame
    for col in ('page_views', 'duration'):
        # Non-numeric entries become NaN and are dropped just below.
        cleaned[col] = pd.to_numeric(cleaned[col], errors='coerce')
    cleaned = cleaned.dropna(
        subset=['session_id', 'page_views', 'duration', 'device_type']
    )

    views = cleaned['page_views']
    duration = cleaned['duration']
    # A session needs at least one page view and a non-negative duration;
    # infinite values would pass ">= 0" and poison every mean, so they are
    # rejected explicitly.
    usable = (
        (views >= 1) & (views < np.inf)
        & (duration >= 0) & (duration < np.inf)
    )
    cleaned = cleaned[usable]
    if cleaned.empty:
        # Without this guard every metric would silently come out as NaN.
        raise ValueError("No valid rows left after cleaning")
    return cleaned


def _plot_engagement_overview(metrics: dict, device_metrics: pd.DataFrame) -> None:
    """Draw the bounce-rate, session-time and per-device duration bar charts."""
    fig, (bounce_ax, time_ax, device_ax) = plt.subplots(1, 3, figsize=(18, 5))

    sns.barplot(x=['Bounce Rate'], y=[metrics['bounce_rate (%)']],
                color='salmon', ax=bounce_ax)
    bounce_ax.set_ylim(0, 100)  # a percentage can never leave this range
    bounce_ax.set_ylabel('Percentage')
    bounce_ax.set_title('Bounce Rate (%)')

    sns.barplot(x=['Avg Session Time'], y=[metrics['avg_session_time (s)']],
                color='skyblue', ax=time_ax)
    time_ax.set_ylabel('Seconds')
    time_ax.set_title('Average Session Time')

    # Newer seaborn deprecates `palette` without `hue`; mapping hue to the x
    # variable keeps the per-device colours, and the redundant legend is
    # removed by hand so this also works on older releases.
    sns.barplot(data=device_metrics, x='device_type', y='avg_duration',
                hue='device_type', palette='viridis', dodge=False,
                ax=device_ax)
    legend = device_ax.get_legend()
    if legend is not None:
        legend.remove()
    device_ax.set_ylabel('Avg Duration (s)')
    device_ax.set_title('Avg Session Duration by Device')

    fig.tight_layout()
    plt.show()


def analyze_user_engagement(logs_df: pd.DataFrame):
    """Validate session logs, compute engagement metrics and chart them.

    Rows are treated as log entries belonging to a session: page views and
    durations are summed per ``session_id`` before any averaging, so a
    session spread over several rows is counted once.

    Args:
        logs_df: Log table with the columns ``session_id``, ``page_views``,
            ``duration`` and ``device_type``. It is not modified.

    Returns:
        ``(metrics, device_metrics)``. ``metrics`` is a dict with the keys
        ``'bounce_rate (%)'``, ``'avg_session_time (s)'`` and
        ``'pages_per_session'``. ``device_metrics`` is a DataFrame with the
        columns ``device_type``, ``sessions``, ``avg_duration`` (mean
        per-session duration) and ``total_page_views``. If validation or
        computation fails, the error is printed and ``({}, pd.DataFrame())``
        is returned. A failure while drawing the charts only prints a
        warning; the computed results are still returned.
    """
    try:
        cleaned = _clean_engagement_logs(logs_df)

        # Overall metrics: one entry per session.
        by_session = cleaned.groupby('session_id').agg(
            views=('page_views', 'sum'),
            duration=('duration', 'sum'),
        )
        metrics = {
            # A bounce is a session whose total page views equal exactly 1;
            # the mean of that boolean flag is always within [0, 1].
            'bounce_rate (%)': round(
                float((by_session['views'] == 1).mean() * 100), 2),
            'avg_session_time (s)': round(
                float(by_session['duration'].mean()), 2),
            'pages_per_session': round(float(by_session['views'].mean()), 2),
        }

        # Device metrics are built from per-session totals so avg_duration
        # is an average *session* duration, consistent with the overall
        # metric and with the chart title, not an average over log rows.
        session_device = cleaned.groupby(
            ['session_id', 'device_type'], as_index=False
        ).agg(
            total_views=('page_views', 'sum'),
            total_duration=('duration', 'sum'),
        )
        device_metrics = session_device.groupby('device_type').agg(
            sessions=('session_id', 'nunique'),
            avg_duration=('total_duration', 'mean'),
            total_page_views=('total_views', 'sum'),
        ).reset_index()

        # Rendering problems (missing backend, headless host, ...) must not
        # throw away results that were computed successfully.
        try:
            _plot_engagement_overview(metrics, device_metrics)
        except Exception as plot_err:
            print(f"Warning: could not render engagement charts: {plot_err}")

        return metrics, device_metrics

    except Exception as e:
        print(f"Error analyzing user engagement: {e}")
        return {}, pd.DataFrame()


# 📌 Example Usage:
if __name__ == "__main__":
    # Small hand-written log table for a smoke run; swap in real logs here.
    data = dict(
        session_id=['s1', 's2', 's2', 's3', 's4', 's5', 's5'],
        user_id=['u1', 'u2', 'u2', 'u3', 'u4', 'u5', 'u5'],
        device_type=['mobile', 'desktop', 'desktop', 'tablet', 'mobile', 'mobile', 'mobile'],
        page_views=[1, 2, 3, 1, 1, 2, 1],
        # The -15 entry is a deliberately invalid (negative) session time.
        duration=[30, 60, 120, -15, 45, 180, 60],
    )
    df = pd.DataFrame.from_dict(data)
    analyze_user_engagement(df)


# follow up prompts:
Can you break down bounce rate by device type.
What does the session duration distribution look like?
Plot a histogram of session durations
Create a boxplot comparing session duration across devices

The debugging went fine, so I asked the AI for some more calculations and visualizations to further explore the issues at hand
and potentially discover new insights.

In [5]:
# final solution:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

def _clean_engagement_logs(logs_df: pd.DataFrame) -> pd.DataFrame:
    """Return a validated copy of *logs_df* holding only usable rows.

    Raises:
        TypeError: If *logs_df* is not a DataFrame.
        ValueError: If a required column is missing or no row survives
            cleaning.
    """
    if not isinstance(logs_df, pd.DataFrame):
        raise TypeError(
            f"logs_df must be a pandas DataFrame, got {type(logs_df).__name__}"
        )

    required_cols = {'session_id', 'page_views', 'duration', 'device_type'}
    missing = required_cols - set(logs_df.columns)
    if missing:
        raise ValueError(f"Missing required columns: {missing}")

    cleaned = logs_df.copy()  # never mutate the caller's frame
    for col in ('page_views', 'duration'):
        # Non-numeric entries become NaN and are dropped just below.
        cleaned[col] = pd.to_numeric(cleaned[col], errors='coerce')
    cleaned = cleaned.dropna(
        subset=['session_id', 'page_views', 'duration', 'device_type']
    )

    views = cleaned['page_views']
    duration = cleaned['duration']
    # A session needs at least one page view and a non-negative duration;
    # infinite values would pass ">= 0" and poison every mean, so they are
    # rejected explicitly.
    usable = (
        (views >= 1) & (views < np.inf)
        & (duration >= 0) & (duration < np.inf)
    )
    cleaned = cleaned[usable]
    if cleaned.empty:
        # Without this guard every metric would silently come out as NaN.
        raise ValueError("No valid rows left after cleaning")
    return cleaned


def _drop_redundant_legend(ax) -> None:
    """Remove the legend seaborn adds when hue merely repeats the x variable."""
    legend = ax.get_legend()
    if legend is not None:
        legend.remove()


def _plot_engagement_dashboard(metrics: dict,
                               bounce_by_device: pd.DataFrame,
                               session_summary: pd.DataFrame) -> None:
    """Draw the 2x2 overview grid and the per-device duration boxplot."""
    fig, axes = plt.subplots(2, 2, figsize=(18, 10))
    overall_ax, time_ax = axes[0]
    device_ax, hist_ax = axes[1]

    # A. Overall bounce rate
    sns.barplot(x=['Bounce Rate'], y=[metrics['bounce_rate (%)']],
                color='salmon', ax=overall_ax)
    overall_ax.set_ylim(0, 100)  # a percentage can never leave this range
    overall_ax.set_ylabel('Percentage')
    overall_ax.set_title('Overall Bounce Rate')

    # B. Average session time
    sns.barplot(x=['Avg Session Time'], y=[metrics['avg_session_time (s)']],
                color='skyblue', ax=time_ax)
    time_ax.set_ylabel('Seconds')
    time_ax.set_title('Average Session Time')

    # C. Bounce rate by device. Newer seaborn deprecates `palette` without
    # `hue`, so hue is mapped to the x variable and the duplicate legend is
    # removed by hand (works on older releases too).
    sns.barplot(data=bounce_by_device, x='device_type', y='bounce_rate (%)',
                hue='device_type', palette='coolwarm', dodge=False,
                ax=device_ax)
    _drop_redundant_legend(device_ax)
    device_ax.set_ylim(0, 100)
    device_ax.set_title("Bounce Rate by Device")
    device_ax.set_ylabel("Bounce Rate (%)")
    device_ax.set_xlabel("Device Type")

    # D. Histogram of session durations. A density estimate is undefined
    # when every session has the same duration, so the KDE overlay is only
    # requested when there is some spread.
    durations = session_summary['total_duration']
    sns.histplot(durations, bins=10, kde=durations.nunique() > 1,
                 color='skyblue', ax=hist_ax)
    hist_ax.set_title("Histogram of Session Durations")
    hist_ax.set_xlabel("Session Duration (s)")
    hist_ax.set_ylabel("Count")

    fig.tight_layout()
    plt.show()

    # Separate figure: session-duration spread per device.
    box_fig, box_ax = plt.subplots(figsize=(8, 5))
    sns.boxplot(data=session_summary, x='device_type', y='total_duration',
                hue='device_type', palette='Set2', dodge=False, ax=box_ax)
    _drop_redundant_legend(box_ax)
    box_ax.set_title("Session Duration by Device Type")
    box_ax.set_xlabel("Device Type")
    box_ax.set_ylabel("Session Duration (s)")
    box_fig.tight_layout()
    plt.show()


def analyze_user_engagement(logs_df: pd.DataFrame):
    """Validate session logs, compute engagement metrics and chart them.

    Rows are treated as log entries belonging to a session: page views and
    durations are summed per ``session_id`` before any averaging, so a
    session spread over several rows is counted once. Besides the overall
    metrics, the charts show bounce rate per device, the distribution of
    session durations and a per-device duration boxplot.

    Args:
        logs_df: Log table with the columns ``session_id``, ``page_views``,
            ``duration`` and ``device_type``. It is not modified.

    Returns:
        ``(metrics, device_metrics)``. ``metrics`` is a dict with the keys
        ``'bounce_rate (%)'``, ``'avg_session_time (s)'`` and
        ``'pages_per_session'``. ``device_metrics`` is a DataFrame with the
        columns ``device_type``, ``sessions``, ``avg_duration`` (mean
        per-session duration) and ``total_page_views``. If validation or
        computation fails, the error is printed and ``({}, pd.DataFrame())``
        is returned. A failure while drawing the charts only prints a
        warning; the computed results are still returned.
    """
    try:
        cleaned = _clean_engagement_logs(logs_df)

        # Overall metrics: one entry per session.
        by_session = cleaned.groupby('session_id').agg(
            views=('page_views', 'sum'),
            duration=('duration', 'sum'),
        )
        metrics = {
            # A bounce is a session whose total page views equal exactly 1;
            # the mean of that boolean flag is always within [0, 1].
            'bounce_rate (%)': round(
                float((by_session['views'] == 1).mean() * 100), 2),
            'avg_session_time (s)': round(
                float(by_session['duration'].mean()), 2),
            'pages_per_session': round(float(by_session['views'].mean()), 2),
        }

        # One row per (session, device); every per-device figure below is
        # derived from it so all breakdowns share the same session totals.
        session_summary = cleaned.groupby(
            ['session_id', 'device_type'], as_index=False
        ).agg(
            total_views=('page_views', 'sum'),
            total_duration=('duration', 'sum'),
        )
        session_summary['bounced'] = session_summary['total_views'] == 1

        # avg_duration is an average *session* duration, consistent with
        # the overall metric, not an average over individual log rows.
        device_metrics = session_summary.groupby('device_type').agg(
            sessions=('session_id', 'nunique'),
            avg_duration=('total_duration', 'mean'),
            total_page_views=('total_views', 'sum'),
        ).reset_index()

        bounce_by_device = (
            session_summary.groupby('device_type')['bounced'].mean().reset_index()
        )
        bounce_by_device['bounce_rate (%)'] = (
            bounce_by_device['bounced'] * 100
        ).round(2)

        # Rendering problems (missing backend, headless host, ...) must not
        # throw away results that were computed successfully.
        try:
            _plot_engagement_dashboard(metrics, bounce_by_device, session_summary)
        except Exception as plot_err:
            print(f"Warning: could not render engagement charts: {plot_err}")

        return metrics, device_metrics

    except Exception as e:
        print(f"Error analyzing user engagement: {e}")
        return {}, pd.DataFrame()

# 📌 Example Usage:
if __name__ == "__main__":
    # Hand-written sample logs used to exercise the analysis end to end.
    data = dict(
        session_id=['s1', 's2', 's2', 's3', 's4', 's5', 's5'],
        user_id=['u1', 'u2', 'u2', 'u3', 'u4', 'u5', 'u5'],
        device_type=['mobile', 'desktop', 'desktop', 'tablet', 'mobile', 'mobile', 'mobile'],
        page_views=[1, 2, 3, 1, 1, 2, 1],
        # One duration (-15) is intentionally negative.
        duration=[30, 60, 120, -15, 45, 180, 60],
    )
    df = pd.DataFrame.from_dict(data)
    analyze_user_engagement(df)

Discussion Questions:

How did different prompts approach error identification?
What validation methods were suggested?
How was time handling improved?