# In [None]:
# ==============================================================================
# Python Script for "Number of Active Weeks" Analysis
#
# This script calculates the number of unique weeks each learner was active
# based on session log data. It then generates a histogram to visualize the
# distribution of learner persistence.
#
# Author: Ziyun Ke
# Date: August 2025
# ==============================================================================

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# --- 0. Configuration: File, Column Names, and Plot Label ---
# NOTE: This script needs to be run for each course by changing the input file
# and the course label that is shown in the histogram title.
input_file = 'sessions_course-v1_KULeuvenX+EUROHISx+1T2023.csv'
USER_ID_COL = 'course_learner_id'
TIMESTAMP_COL = 'start_time'
COURSE_LABEL = 'HIS'
# --- End of Configuration ---


def drop_invalid_user_ids(sessions, user_col):
    """Return the rows of *sessions* whose user ID does not end in '_null'.

    Args:
        sessions: DataFrame of session records.
        user_col: Name of the column holding the user ID strings.

    Returns:
        A DataFrame without the '_null' rows. Rows with a missing ID are kept
        here (``na=False``); they are removed by the later missing-value drop.

    Raises:
        KeyError: If *user_col* is not a column of *sessions*.
    """
    is_invalid = sessions[user_col].str.endswith('_null', na=False)
    return sessions[~is_invalid]


def parse_session_timestamps(sessions, user_col, time_col):
    """Parse the timestamp column and drop rows missing an ID or a timestamp.

    Timestamps are expected to look like
    ``Mon Jan 02 2023 10:00:00 GMT+0000 (Coordinated Universal Time)``.
    The ``GMT±hhmm`` offset and the optional parenthesised zone name are
    stripped, so the result is the naive wall-clock time exactly as written;
    no timezone conversion is performed.

    Args:
        sessions: DataFrame of session records (not modified).
        user_col: Name of the user ID column.
        time_col: Name of the column holding the timestamp strings.

    Returns:
        A new DataFrame in which *time_col* is datetime64 and every row has
        both a user ID and a successfully parsed timestamp.

    Raises:
        KeyError: If *user_col* or *time_col* is not a column of *sessions*.
    """
    # Accept both '+' and '-' offsets, with or without the trailing zone name;
    # anything that still does not match the format below becomes NaT.
    without_zone = sessions[time_col].str.replace(
        r' GMT[+-]\d{4}(?: \(.*\))?', '', regex=True
    )
    parsed = pd.to_datetime(
        without_zone, format='%a %b %d %Y %H:%M:%S', errors='coerce'
    )
    return sessions.assign(**{time_col: parsed}).dropna(subset=[user_col, time_col])


def integer_centered_bins(values):
    """Return histogram bin edges that centre one bin on each integer.

    Args:
        values: Non-empty collection of integers exposing ``min()``/``max()``.

    Returns:
        A NumPy array of edges ``min - 0.5, min + 0.5, ..., max + 0.5``.
    """
    return np.arange(values.min(), values.max() + 2) - 0.5


try:
    # --- 1. Data Loading and Preprocessing ---
    print(f"--- Loading session data from: {input_file} ---")
    df_sessions = pd.read_csv(input_file)
    print(f"File loaded successfully! Found {len(df_sessions)} session records.")

    # --- Key Step: Remove invalid user IDs ending in '_null' ---
    print("Removing invalid user IDs...")
    original_rows = len(df_sessions)
    df_sessions = drop_invalid_user_ids(df_sessions, USER_ID_COL)
    print(f"Removed {original_rows - len(df_sessions)} invalid user records.")

    # Clean the timestamp format and drop rows where key information is missing
    print("Cleaning timestamp format...")
    rows_before_parsing = len(df_sessions)
    df_sessions = parse_session_timestamps(df_sessions, USER_ID_COL, TIMESTAMP_COL)
    rows_dropped = rows_before_parsing - len(df_sessions)
    if rows_dropped:
        # Make the data loss visible instead of discarding rows silently.
        print(f"Dropped {rows_dropped} records with a missing user ID or unparseable timestamp.")

    print("Data preprocessing complete.")

    # --- 2. Calculate the Number of Active Weeks for Each User ---
    print(f"\n--- Grouping by '{USER_ID_COL}' and calculating unique weeks... ---")
    # 'W' is the pandas weekly period; a learner's active weeks are the number
    # of distinct weekly periods that contain at least one of their sessions.
    df_sessions['activity_week'] = df_sessions[TIMESTAMP_COL].dt.to_period('W')
    active_weeks_per_user = df_sessions.groupby(USER_ID_COL)['activity_week'].nunique()

    # --- 3. Format the Results ---
    active_weeks_df = active_weeks_per_user.reset_index()
    active_weeks_df.columns = [USER_ID_COL, 'Number_of_Active_Weeks']

    print("Calculation complete!")
    print("\n--- Number of Active Weeks per User (First 10 rows) ---")
    print(active_weeks_df.head(10))

    # --- 4. Visualize Results: Histogram of Active Weeks Distribution ---
    if active_weeks_df.empty:
        # min()/max() of an empty column are NaN, which cannot define bins.
        print("\nNo valid session records remain after preprocessing; skipping the histogram.")
    else:
        print("\n--- Step 4: Generating histogram for the distribution of active weeks ---")
        plt.style.use('seaborn-v0_8-talk')
        fig, ax = plt.subplots(figsize=(12, 7))

        data_to_plot = active_weeks_df['Number_of_Active_Weeks']

        # Set up bins to be centered on integer values
        bins = integer_centered_bins(data_to_plot)

        ax.hist(data_to_plot, bins=bins, color='#D55E00', edgecolor='white', rwidth=0.8)

        ax.set_title(f'Distribution of Active Weeks per Learner ({COURSE_LABEL})', fontsize=18, pad=15)
        ax.set_xlabel('Number of Active Weeks', fontsize=12)
        ax.set_ylabel('Number of Learners', fontsize=12)

        # Set x-axis ticks to be readable: roughly a dozen ticks at most
        max_weeks = int(data_to_plot.max())
        ax.set_xticks(np.arange(0, max_weeks + 1, step=max(1, max_weeks // 12)))

        ax.grid(axis='y', linestyle='--', alpha=0.7)
        fig.tight_layout()
        plt.show()

except FileNotFoundError:
    print(f"ERROR: File not found '{input_file}'. Please ensure the file has been uploaded and the filename is correct.")
except KeyError as e:
    print(f"ERROR: The specified column {e} was not found in the file. Please check the file content and column name configuration.")
except Exception as e:
    # Top-level boundary of the script: report the problem instead of a traceback.
    print(f"An unexpected error occurred during processing: {e}")
