# In [None]:
# ==============================================================================
# Python Scripts for "Video Engagement" Analysis
#
# This file contains two separate scripts used to generate the visualizations
# for the video engagement section of the thesis.
#
# Script 1: Generates a facet grid of bar charts for aggregate metrics.
# Script 2: Generates a line chart for weekly average viewing session duration.
#
# Author: Ziyun Ke
# Date: August 2025
# ==============================================================================

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.dates as mdates

# ==============================================================================
# SCRIPT 1: AGGREGATE VIDEO ENGAGEMENT METRICS (FACET GRID BAR CHART)
# ==============================================================================
print("--- Running Script 1: Aggregate Video Engagement Metrics ---")

# --- 0. Configuration ---
# Define the three courses and their corresponding raw data filenames
raw_data_files_metrics = {
    'HIS': 'video_interactions_course-v1_KULeuvenX+EUROHISx+1T2023.csv',
    'LAW': 'video_interactions_course-v1_KULeuvenX+EUROLAWx+1T2023.csv',
    'GOV': 'video_interactions_course-v1_KULeuvenX+EUROGOVx+1T2023.csv'
}

# Define the column names for analysis
USER_ID_COL_METRICS = 'course_learner_id'
METRIC_COLS = ['duration', 'times_pause', 'times_forward_seek', 'times_backward_seek']

# Define display names for the final plot
METRIC_RENAMES = {
    'duration': 'Average Duration (s)',
    'times_pause': 'Average Pauses',
    'times_forward_seek': 'Average Forward Seeks',
    'times_backward_seek': 'Average Backward Seeks'
}

# Define the color palette for the courses
course_color_palette_metrics = {
    'GOV': '#0072B2',  # Professional Blue
    'HIS': '#D55E00',  # Warm Orange-Red
    'LAW': '#E69F00'   # Golden Yellow
}
# --- End of Configuration ---


def compute_average_metrics(df_raw, user_id_col, metric_cols):
    """Return the course-wide mean of per-student metric totals.

    Each metric column is coerced to numeric (unparseable values become NaN),
    rows missing the user id or any metric are discarded, the remaining rows
    are summed per student, and those per-student totals are averaged.
    The input frame is not modified.

    Args:
        df_raw: DataFrame holding the raw interaction rows.
        user_id_col: Name of the column identifying the student.
        metric_cols: List of metric column names to aggregate.

    Returns:
        A Series indexed by metric column name, or ``None`` when no row
        survives cleaning (so the caller can skip the course instead of
        plotting NaN bars).

    Raises:
        KeyError: If ``user_id_col`` or one of ``metric_cols`` is missing.
    """
    # Build a slim working frame so the caller's DataFrame stays untouched.
    cleaned = df_raw[user_id_col].to_frame()
    for col in metric_cols:
        cleaned[col] = pd.to_numeric(df_raw[col], errors='coerce')
    cleaned = cleaned.dropna()

    if cleaned.empty:
        return None

    per_student_totals = cleaned.groupby(user_id_col)[list(metric_cols)].sum()
    return per_student_totals.mean()


# --- 1. Loop through and process raw data for each course ---
all_course_metrics = []
print("--- Step 1: Processing raw data for each course (Metrics) ---")

for course_name, file_name in raw_data_files_metrics.items():
    print(f"\nProcessing course: {course_name} (File: {file_name})...")
    try:
        df_raw = pd.read_csv(file_name)
        average_metrics_for_course = compute_average_metrics(
            df_raw, USER_ID_COL_METRICS, METRIC_COLS
        )
    except FileNotFoundError:
        print(f" -> WARNING: File not found: {file_name}. Skipping this course.")
        continue
    except KeyError as e:
        print(f" -> WARNING: Column {e} not found in {file_name}. Skipping this course.")
        continue
    except (pd.errors.EmptyDataError, pd.errors.ParserError) as e:
        # An unreadable file should cost one course, not the whole run.
        print(f" -> WARNING: Could not parse {file_name} ({e}). Skipping this course.")
        continue

    if average_metrics_for_course is None:
        print(f" -> WARNING: No valid rows in {file_name}. Skipping this course.")
        continue

    # Tag the averages with the course so the rows can be told apart later.
    average_metrics_for_course['Course'] = course_name
    all_course_metrics.append(average_metrics_for_course)
    print(f" -> {course_name} course processed successfully.")

# --- 2. Create and prepare the final DataFrame for plotting ---
if all_course_metrics:
    # One row per course; metric columns are relabelled with their display names.
    df_plot = pd.DataFrame(all_course_metrics).rename(columns=METRIC_RENAMES)

    # FacetGrid needs long format: one (Course, Metric, Average Value) row each.
    df_plot_long = pd.melt(
        df_plot,
        id_vars='Course',
        var_name='Metric',
        value_name='Average Value',
    )
    print("\n--- Step 2: Data preparation complete! ---")

    # --- 3. Generate the final comparison plot ---
    print("\n--- Step 3: Generating facet grid bar chart ---")

    # One panel per metric, two panels per row, each with its own y-scale.
    facet_options = dict(
        col="Metric",
        col_wrap=2,
        height=6,
        aspect=1.1,
        sharey=False,
        col_order=list(METRIC_RENAMES.values()),
    )
    g = sns.FacetGrid(df_plot_long, **facet_options)

    bar_options = dict(
        x="Course",
        y="Average Value",
        palette=course_color_palette_metrics,
        order=['HIS', 'LAW', 'GOV'],
    )
    g.map_dataframe(sns.barplot, **bar_options)

    g.fig.suptitle(
        'Comparison of Video Engagement Metrics Across Courses',
        fontsize=24,
        fontweight='bold',
        y=1.03,
    )

    # --- Beautify plot and add panel labels (a, b, c, d) ---
    panel_labels = ['(a)', '(b)', '(c)', '(d)']
    for panel_index, panel_ax in enumerate(g.axes.flat):
        # Strip the "Metric = " prefix from the generated title and
        # prepend the panel label instead.
        metric_title = panel_ax.get_title().replace("Metric = ", "")
        panel_ax.set_title(
            f"{panel_labels[panel_index]} {metric_title}",
            fontsize=16,
            fontweight='bold',
        )

        # Axis labels and a light horizontal grid
        panel_ax.set_xlabel("Course", fontsize=12)
        panel_ax.set_ylabel("Average Value", fontsize=12)
        panel_ax.grid(axis='y', linestyle='--', alpha=0.7)

        # Print each bar's value just above it
        for bars in panel_ax.containers:
            panel_ax.bar_label(bars, fmt='%.1f', label_type='edge', padding=3, fontsize=11)

        # Leave 15% headroom so the value labels are not clipped
        _, upper_limit = panel_ax.get_ylim()
        panel_ax.set_ylim(0, upper_limit * 1.15)
    # --- End of beautification ---

    # Reserve the top 3% of the figure for the suptitle.
    g.fig.tight_layout(rect=[0, 0, 1, 0.97])
    plt.show()
else:
    print("\nERROR: No course data was processed successfully. Cannot generate plot.")

# ==============================================================================
# SCRIPT 2: AVERAGE VIEWING SESSION DURATION (WEEKLY LINE CHART)
# ==============================================================================
print("\n\n--- Running Script 2: Average Viewing Session Duration ---")

# --- 0. Configuration ---
raw_data_files_duration = {
    'GOV': 'video_interactions_course-v1_KULeuvenX+EUROGOVx+1T2023.csv',
    'HIS': 'video_interactions_course-v1_KULeuvenX+EUROHISx+1T2023.csv',
    'LAW': 'video_interactions_course-v1_KULeuvenX+EUROLAWx+1T2023.csv'
}

TIMESTAMP_COL = 'start_time'
DURATION_COL = 'duration'

course_color_palette_duration = {
    'GOV': '#0072B2',  # Professional Blue
    'HIS': '#D55E00',  # Warm Orange-Red
    'LAW': '#E69F00'   # Golden Yellow
}
# --- End of Configuration ---


def compute_weekly_avg_duration(df, timestamp_col, duration_col):
    """Return the mean event duration per week (weeks ending on Sunday).

    Timestamps are expected as text such as
    ``'Mon Mar 06 2023 10:00:00 GMT+0100 (Central European Standard Time)'``.
    The trailing ``GMT±hhmm`` offset and the optional parenthesised zone name
    are stripped before parsing, so the recorded wall-clock time is used
    as-is (the offset is discarded, not applied). Rows whose timestamp or
    duration cannot be parsed are left out. The input frame is not modified.

    Args:
        df: DataFrame holding the raw interaction rows.
        timestamp_col: Name of the text column with the event start time.
        duration_col: Name of the column with the event duration.

    Returns:
        A Series of weekly mean durations indexed by week-end date; empty
        when no row has both a parseable timestamp and a numeric duration.

    Raises:
        KeyError: If either column is missing from ``df``.
    """
    # Accept both "+" and "-" offsets, with or without the "(Zone Name)" part;
    # anything else would fail the strict format below and be dropped as NaT.
    tz_suffix = r'\s*GMT[+-]\d{4}(?:\s*\(.*\))?\s*$'
    wall_clock = df[timestamp_col].str.replace(tz_suffix, '', regex=True)
    timestamps = pd.to_datetime(wall_clock, format='%a %b %d %Y %H:%M:%S', errors='coerce')

    # Ensure duration is a numeric type
    durations = pd.to_numeric(df[duration_col], errors='coerce')

    # Keep only rows where both key fields are usable
    events = pd.DataFrame({timestamp_col: timestamps, duration_col: durations}).dropna()

    # Resample by week and calculate the mean duration of all viewing events
    return events.set_index(timestamp_col)[duration_col].resample('W-SUN').mean()


# Dictionary to store the time series results for each course
all_course_series = {}

print("--- Step 1: Processing data for weekly session duration ---")

for course_name, file_name in raw_data_files_duration.items():
    try:
        print(f"\nProcessing course: {course_name}...")
        df = pd.read_csv(file_name)

        weekly_avg_duration = compute_weekly_avg_duration(df, TIMESTAMP_COL, DURATION_COL)

        # An empty series would only add a legend entry with no line.
        if weekly_avg_duration.empty:
            print(f" -> WARNING: No valid viewing events in {file_name}. Skipping this course.")
            continue

        # Store the result in the dictionary
        all_course_series[course_name] = weekly_avg_duration

        print(f" -> {course_name} course processed successfully.")

    except FileNotFoundError:
        print(f" -> WARNING: File not found: {file_name}. Skipping this course.")
    except Exception as e:
        # Deliberate best-effort: report the failure and move on to the next course.
        print(f" -> An error occurred while processing {course_name}: {e}")

# --- 2. Prepare final data and generate the plot ---
if all_course_series:
    # Align every course's weekly series on a shared date index, one column per course.
    plot_df = pd.DataFrame(all_course_series)

    print("\n--- Step 2: Data preparation complete. Generating line chart... ---")

    # --- Plotting ---
    plt.style.use('seaborn-v0_8-talk')
    fig, ax = plt.subplots(figsize=(16, 8))

    # Draw one marked line per course, in the course's theme color
    for course_label, weekly_values in plot_df.items():
        line_color = course_color_palette_duration.get(course_label)
        ax.plot(
            plot_df.index,
            weekly_values,
            marker='o',
            linestyle='-',
            label=course_label,
            color=line_color,
        )

    # --- Beautify Plot ---
    ax.set_title(
        'Comparison of Average Viewing Session Duration Across Courses',
        fontsize=20,
        pad=20,
    )
    ax.set_xlabel('Date (by Week)', fontsize=14)
    ax.set_ylabel('Average Duration per Session (Seconds)', fontsize=14)
    ax.legend(title='Course', fontsize=12)
    ax.grid(True, which='both', linestyle='--', linewidth=0.5)

    # One major tick per month, labelled as an ISO date and slanted for legibility
    x_axis = ax.xaxis
    x_axis.set_major_locator(mdates.MonthLocator(interval=1))
    x_axis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
    plt.xticks(rotation=30, ha="right")

    # Anchor the y-axis at zero
    ax.set_ylim(bottom=0)

    fig.tight_layout()
    plt.show()
else:
    print("\nERROR: No course data was processed successfully. Cannot generate plot.")