# In [None]:
# ==============================================================================
# Python Scripts for "Assessment Engagement" Analysis
#
# This file contains three separate scripts used to generate the visualizations
# for the assessment engagement section of the thesis.
#
# Script 1: Calculates and visualizes the distribution of overall performance.
# Script 2: Calculates and visualizes the distribution of submission counts per learner.
# Script 3: Generates a line chart for weekly assessment submission patterns.
#
# Author: Ziyun Ke
# Date: August 2025
# ==============================================================================

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.dates as mdates

# ==============================================================================
# SCRIPT 1: DISTRIBUTION OF LEARNER OVERALL PERFORMANCE (HISTOGRAM)
# ==============================================================================
print("--- Running Script 1: Distribution of Learner Overall Performance ---")

# --- 0. Configuration: Please configure your file and column names here ---
# NOTE: This script needs to be run for each course by changing the input file
# AND the course label below (the label is shown in the plot title).
file_with_grades = 'assessments_course-v1_KULeuvenX+EUROGOVx+1T2023.csv'
course_label_grades = 'GOV'

# Please verify these column names match your file
USER_ID_COL_GRADES = 'course_learner_id'
GRADE_COL = 'grade'
MAX_GRADE_COL = 'max_grade'
# --- End of Configuration ---


def mean_score_ratio_per_learner(grades, user_col, grade_col, max_grade_col):
    """Return each learner's mean score ratio (grade / max_grade).

    Args:
        grades: DataFrame with one row per graded submission.
        user_col: Name of the learner-id column.
        grade_col: Name of the column holding the obtained grade.
        max_grade_col: Name of the column holding the maximum grade.

    Returns:
        A Series indexed by learner id with the mean of grade / max_grade
        over that learner's valid submissions. Rows with a missing learner
        id, a missing or non-numeric grade, or a missing, non-numeric or
        zero maximum grade are ignored; learners left without any valid
        row do not appear in the result. The input frame is not modified.
    """
    # Coerce to numbers so stray text values become NaN instead of raising
    grade = pd.to_numeric(grades[grade_col], errors='coerce')
    max_grade = pd.to_numeric(grades[max_grade_col], errors='coerce')

    # A zero maximum would produce a division by zero, so it is excluded
    valid = (
        grades[user_col].notna()
        & grade.notna()
        & max_grade.notna()
        & (max_grade != 0)
    )
    score_ratio = grade[valid] / max_grade[valid]
    return score_ratio.groupby(grades.loc[valid, user_col]).mean()


# --- Load the grade file (only the I/O sits inside the try block) ---
df_grades = None
try:
    print(f"--- Loading grade data for histogram: {file_with_grades} ---")
    df_grades = pd.read_csv(file_with_grades)
    print("Grade file loaded successfully!")
except FileNotFoundError:
    print(f"ERROR: File not found '{file_with_grades}'. Please ensure the file has been uploaded and the filename is correct.")
except pd.errors.EmptyDataError:
    print(f"ERROR: The file '{file_with_grades}' is empty.")

if df_grades is not None:
    # Check if the required columns exist
    required_cols = [USER_ID_COL_GRADES, GRADE_COL, MAX_GRADE_COL]
    if not all(col in df_grades.columns for col in required_cols):
        print(f"ERROR: The file is missing required columns. Needed {required_cols}, but found {list(df_grades.columns)}")
    else:
        # --- 1. Calculate Overall_Performance for each student ---
        student_performance = mean_score_ratio_per_learner(
            df_grades, USER_ID_COL_GRADES, GRADE_COL, MAX_GRADE_COL
        )

        # Convert the score ratio to a percentage scale (0-100)
        student_performance_percent = student_performance * 100
        print(f"Calculated average performance for {len(student_performance_percent)} students.")

        if student_performance_percent.empty:
            print("WARNING: No valid graded submissions were found. Skipping the histogram.")
        else:
            # The bins below only cover 0-100, so anything outside is not drawn;
            # report it rather than dropping it silently.
            outside_range = int(
                ((student_performance_percent < 0) | (student_performance_percent > 100)).sum()
            )
            if outside_range:
                print(f"WARNING: {outside_range} students have an average outside 0-100% and will not appear in the histogram.")

            # --- 2. Plot the histogram ---
            plt.style.use('seaborn-v0_8-talk')
            fig, ax = plt.subplots(figsize=(12, 7))

            bins = list(range(0, 101, 10))
            ax.hist(student_performance_percent, bins=bins, color='#0072B2', edgecolor='white', rwidth=0.9)

            ax.set_title(f'Distribution of Learner Overall Performance ({course_label_grades})', fontsize=20, pad=20)
            ax.set_xlabel('Average Score Range (%)', fontsize=14)
            ax.set_ylabel('Number of Students', fontsize=14)
            ax.set_xticks(bins)
            ax.grid(axis='y', linestyle='--', alpha=0.7)
            plt.show()

# ==============================================================================
# SCRIPT 2: DISTRIBUTION OF SUBMISSION COUNT PER LEARNER (HISTOGRAM)
# ==============================================================================
print("\n\n--- Running Script 2: Distribution of Submission Count per Learner ---")

# --- 0. Configuration ---
# NOTE: This script needs to be run for each course by changing the input file and course_name.
course_name = 'HIS'
input_file_subs_count = 'submissions_course-v1_KULeuvenX+EUROHISx+1T2023.csv'

# Define column names
USER_ID_COL_SUBS_COUNT = 'course_learner_id'
# Kept for reference only: submissions are counted as rows per learner
# (groupby().size()), so this column is not read by the script.
COUNT_COL = 'submission_timestamp'
# --- End of Configuration ---


def count_submissions_per_learner(submissions, user_col):
    """Return the number of submission rows per learner.

    Args:
        submissions: DataFrame with one row per submission.
        user_col: Name of the learner-id column.

    Returns:
        A Series indexed by learner id holding the row count for each
        learner. Rows with a missing learner id are ignored.
    """
    return submissions.dropna(subset=[user_col]).groupby(user_col).size()


def integer_count_bin_edges(max_count):
    """Return histogram bin edges giving every integer 0..max_count its own bar.

    The edges sit at -0.5, 0.5, ..., max_count + 0.5 so that each integer
    count falls in the middle of exactly one bin.
    """
    return np.arange(0, int(max_count) + 2) - 0.5


try:
    # --- 1. Load Data ---
    print(f"--- Loading submission data for {course_name} course: {input_file_subs_count} ---")
    df_subs = pd.read_csv(input_file_subs_count)
    print("File loaded successfully!")

    # Clean data, remove rows with null user IDs
    df_subs.dropna(subset=[USER_ID_COL_SUBS_COUNT], inplace=True)
    print("Data preprocessing complete.")

    # --- 2. Calculate total submissions per student ---
    print("\n--- Step 2: Calculating total submissions per student ---")
    submission_counts_per_student = count_submissions_per_learner(df_subs, USER_ID_COL_SUBS_COUNT)
    print(f"Calculated total submissions for {len(submission_counts_per_student)} students with at least one submission.")

    if submission_counts_per_student.empty:
        # Without this guard the quantile below is NaN and the bin/tick
        # computation fails with an unhelpful error message.
        print("WARNING: No submissions with a learner ID were found. Skipping the histogram.")
    else:
        # --- 3. Plotting the distribution of submission counts ---
        print("\n--- Step 3: Generating histogram for submission count distribution ---")
        plt.style.use('seaborn-v0_8-talk')
        fig, ax = plt.subplots(figsize=(12, 7))

        # Since this data typically has a long tail, we can focus on the core range for a clearer view
        count_quantile_98 = submission_counts_per_student.quantile(0.98)
        print(f"Data diagnosis: 98% of students have {count_quantile_98:.0f} or fewer submissions.")

        # The quantile can be fractional (it is interpolated); round it up so
        # the bins and the axis ticks are built from the same integer maximum.
        max_count = int(np.ceil(count_quantile_98))

        # Learners above the cut-off fall outside every bin; say so explicitly
        omitted_learners = int((submission_counts_per_student > max_count).sum())
        if omitted_learners:
            print(f"NOTE: {omitted_learners} learners with more than {max_count} submissions are not shown in the histogram.")

        # Define bins to ensure each integer count has its own bar
        bins = integer_count_bin_edges(max_count)
        ax.hist(submission_counts_per_student, bins=bins, color='#D55E00', edgecolor='white', rwidth=0.8)

        # --- 4. Beautify Plot ---
        ax.set_title(f'Distribution of Submission Counts per Learner ({course_name})', fontsize=18, pad=15)
        ax.set_xlabel('Total Number of Submissions by a Single Learner', fontsize=12)
        ax.set_ylabel('Number of Learners', fontsize=12)
        ax.grid(axis='y', linestyle='--', alpha=0.7)

        # Set x-axis ticks to integers for readability (at most ~15 ticks)
        ax.set_xticks(np.arange(0, max_count + 1, step=max(1, max_count // 15)))

        fig.tight_layout()
        plt.show()

except FileNotFoundError:
    print(f"ERROR: File not found '{input_file_subs_count}'.")
except KeyError as e:
    print(f"ERROR: The specified column {e} was not found in the file.")
except Exception as e:
    # Top-level boundary of the script: report the problem instead of aborting the run
    print(f"An unexpected error occurred during processing: {e}")

# ==============================================================================
# SCRIPT 3: WEEKLY ASSESSMENT SUBMISSION PATTERN (LINE CHART)
# ==============================================================================
print("\n\n--- Running Script 3: Weekly Assessment Submission Pattern ---")

# --- 0. Configuration ---
raw_data_files_subs_weekly = {
    'HIS': 'submissions_course-v1_KULeuvenX+EUROHISx+1T2023.csv',
    'LAW': 'submissions_course-v1_KULeuvenX+EUROLAWx+1T2023.csv',
    'GOV': 'submissions_course-v1_KULeuvenX+EUROGOVx+1T2023.csv'
}

TIMESTAMP_COL_SUBS_WEEKLY = 'submission_timestamp'

course_color_palette_subs_weekly = {
    'GOV': '#0072B2',  # Professional Blue
    'HIS': '#D55E00',  # Warm Orange-Red
    'LAW': '#E69F00'   # Golden Yellow
}
# --- End of Configuration ---


def parse_submission_timestamps(raw_timestamps):
    """Parse timestamp strings such as 'Mon Jan 02 2023 10:00:00 GMT+0100 (CET)'.

    The trailing ' GMT±hhmm (zone name)' part is removed before parsing, for
    positive and negative offsets alike, so the result is the wall-clock time
    exactly as written in the file (the UTC offset is discarded, not applied).

    Args:
        raw_timestamps: Series of raw timestamp values.

    Returns:
        A datetime Series aligned with the input; values that are missing or
        do not match the expected format become NaT.
    """
    cleaned = raw_timestamps.astype(str).str.replace(r' GMT[+-]\d{4} \(.*\)', '', regex=True)
    return pd.to_datetime(cleaned, format='%a %b %d %Y %H:%M:%S', errors='coerce')


def weekly_submission_counts(submissions, timestamp_col):
    """Return the number of submissions per week (weeks ending on Sunday).

    Args:
        submissions: DataFrame with one row per submission.
        timestamp_col: Name of the column holding the raw timestamp strings.

    Returns:
        A Series indexed by week-ending date with the number of rows whose
        timestamp could be parsed. Empty if no timestamp could be parsed.
    """
    parsed = parse_submission_timestamps(submissions[timestamp_col]).dropna()
    if parsed.empty:
        return pd.Series(dtype='int64')
    return pd.Series(1, index=pd.DatetimeIndex(parsed)).resample('W-SUN').size()


# Dictionary to store the time series results for each course
all_course_series_subs_weekly = {}

print("--- Step 1: Processing data for each course (Weekly Submissions) ---")

# The loop variable is deliberately not called `course_name`, so it cannot
# overwrite a configuration variable of that name used elsewhere in the file.
for weekly_course, file_name in raw_data_files_subs_weekly.items():
    try:
        print(f"\nProcessing course: {weekly_course}...")
        df = pd.read_csv(file_name)

        # --- Data Preprocessing ---
        if TIMESTAMP_COL_SUBS_WEEKLY not in df.columns:
            print(f" -> WARNING: Timestamp column '{TIMESTAMP_COL_SUBS_WEEKLY}' not found in {file_name}. Skipping this course.")
            continue

        # --- Core Calculation ---
        weekly_total_submissions = weekly_submission_counts(df, TIMESTAMP_COL_SUBS_WEEKLY)

        # Every counted row had a valid timestamp, so the difference is the
        # number of rows that were left out.
        skipped_rows = len(df) - int(weekly_total_submissions.sum())
        if skipped_rows:
            print(f" -> WARNING: {skipped_rows} rows with a missing or unparseable timestamp were ignored.")

        if weekly_total_submissions.empty:
            print(f" -> WARNING: No valid timestamps found in {file_name}. Skipping this course.")
            continue

        all_course_series_subs_weekly[weekly_course] = weekly_total_submissions
        print(f" -> {weekly_course} course processed successfully.")

    except FileNotFoundError:
        print(f" -> WARNING: File not found: {file_name}. Skipping this course.")
    except Exception as e:
        # Best effort: one broken file must not prevent the other courses from being plotted
        print(f" -> An error occurred while processing {weekly_course}: {e}")

# --- 2. Prepare final data and generate the plot ---
if not all_course_series_subs_weekly:
    print("\nERROR: No course data was processed successfully. Cannot generate plot.")
else:
    # Combine the time series results into a single DataFrame
    # Correct structure should be: index=time, columns=courses
    plot_df = pd.DataFrame(all_course_series_subs_weekly)

    # Fill missing weeks with 0 (i.e., weeks with no submissions) for a continuous line chart
    plot_df.fillna(0, inplace=True)

    print("\n--- Step 2: Data preparation complete. Generating line chart... ---")

    # --- Plotting ---
    plt.style.use('seaborn-v0_8-talk')
    fig, ax = plt.subplots(figsize=(16, 9))

    # Plot in the desired order; any configured course that is not listed
    # here is appended afterwards so it is still drawn.
    plot_order = ['HIS', 'LAW', 'GOV']
    plot_order += [name for name in plot_df.columns if name not in plot_order]
    for plotted_course in plot_order:
        if plotted_course in plot_df.columns:
            ax.plot(
                plot_df.index,
                plot_df[plotted_course],
                marker='o',
                linestyle='-',
                label=plotted_course,
                # None (course without a palette entry) lets matplotlib pick a color
                color=course_color_palette_subs_weekly.get(plotted_course)
            )

    # --- Beautify Plot ---
    ax.set_title('Comparison of Weekly Assessment Submissions Across Courses', fontsize=20, pad=20)
    ax.set_xlabel('Date (by Week)', fontsize=14)
    ax.set_ylabel('Total Number of Submissions', fontsize=14)
    ax.legend(title='Course', fontsize=12)
    ax.grid(True, which='both', linestyle='--', linewidth=0.5)

    # Format the x-axis to show dates clearly
    ax.xaxis.set_major_locator(mdates.MonthLocator(interval=1))
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
    plt.xticks(rotation=30, ha="right")
    ax.set_ylim(bottom=0)

    fig.tight_layout()
    plt.show()