# In [None]:  (Jupyter cell marker left over from the notebook export)
# ==============================================================================
# Python Scripts for "Discussion Forum Engagement" Analysis
#
# This file contains three separate scripts used to generate the visualizations
# for the discussion forum engagement section of the thesis.
#
# Script 1: Calculates and visualizes the distribution of post counts per learner.
# Script 2: Calculates and visualizes the distribution of post content length.
# Script 3: Generates a line chart for weekly forum post patterns.
#
# Author: Ziyun Ke
# Date: August 2025
# ==============================================================================

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.dates as mdates

# ==============================================================================
# SCRIPT 1: DISTRIBUTION OF POST COUNT PER LEARNER (HISTOGRAM)
# ==============================================================================
print("--- Running Script 1: Distribution of Post Count per Learner ---")

# --- 0. Configuration ---
# NOTE: This script needs to be run for each course by changing the input file
# and the course label (the label only appears in the plot title).
forum_file_count = 'forum_interaction_course-v1_KULeuvenX+EUROGOVx+1T2023.csv'
course_label_count = 'GOV'
USER_ID_COL_COUNT = 'course_learner_id'
# We just need a column that exists for every post record for counting purposes.
# (Not referenced by the code below: rows are counted with .size().)
COUNT_COL = 'post_id'
output_image_file_count = 'forum_post_count_distribution.png'
# --- End of Configuration ---


def post_count_display_limit(post_counts, quantile=0.98):
    """Return the largest post count shown in the histogram, as an integer.

    The limit is the requested quantile of ``post_counts`` rounded *up* to a
    whole number. ``Series.quantile`` interpolates between neighbouring values
    by default, so the raw quantile can be fractional (e.g. 14.5); rounding up
    keeps the plot title, the bin edges and the x-axis ticks in agreement.

    Args:
        post_counts: Series holding one post count per learner.
        quantile: Fraction of learners that must fall within the limit.

    Returns:
        The upper limit of the displayed range as an ``int``.

    Raises:
        ValueError: If ``post_counts`` is empty, because no quantile exists.
    """
    if post_counts.empty:
        raise ValueError("no posts found, so there is nothing to plot")
    raw_limit = post_counts.quantile(quantile)
    # Round away float noise first so an exact integer is never bumped up.
    return int(np.ceil(np.round(raw_limit, 9)))


def integer_centered_bins(upper_limit):
    """Return bin edges that centre one histogram bar on each integer 0..upper_limit."""
    return np.arange(0, upper_limit + 2) - 0.5


try:
    # --- 1. Data Loading and Calculation ---
    print(f"--- Loading forum data: {forum_file_count} ---")
    df_forum_count = pd.read_csv(forum_file_count)
    print("File loaded successfully!")

    if USER_ID_COL_COUNT not in df_forum_count.columns:
        raise ValueError(f"column '{USER_ID_COL_COUNT}' not found in {forum_file_count}")

    # Core step: one row per post, so the group size is the learner's post count.
    post_counts_per_student = df_forum_count.groupby(USER_ID_COL_COUNT).size()

    print(f"Calculated total post counts for {len(post_counts_per_student)} students.")

    # --- 2. Plotting the Histogram ---
    print("Generating histogram for post count distribution...")

    # The data has a long tail, so only the range covering 98% of learners is
    # drawn. Computed before any figure is created so that empty data aborts
    # cleanly without leaving a blank figure behind.
    count_quantile_98 = post_count_display_limit(post_counts_per_student)
    print(f"Data diagnosis: 98% of students have {count_quantile_98:.0f} or fewer posts.")

    plt.style.use('seaborn-v0_8-talk')
    fig, ax = plt.subplots(figsize=(12, 7))

    # Bin edges at x.5 so every bar sits on an integer post count; learners
    # above the limit fall outside the bins and are simply not drawn.
    bins = integer_centered_bins(count_quantile_98)

    ax.hist(post_counts_per_student, bins=bins, color='#0072B2', edgecolor='white', rwidth=0.8)

    # Beautify the plot
    ax.set_title(
        f'Distribution of Post Counts per Learner ({course_label_count}, up to {count_quantile_98:.0f} posts)',
        fontsize=18,
        pad=15,
    )
    ax.set_xlabel('Total Number of Posts by a Single Learner', fontsize=12)
    ax.set_ylabel('Number of Learners', fontsize=12)
    ax.grid(axis='y', linestyle='--', alpha=0.7)

    # Integer ticks, thinned out to roughly 15 labels for wide ranges.
    ax.set_xticks(np.arange(0, count_quantile_98 + 1, step=max(1, count_quantile_98 // 15)))

    fig.tight_layout()
    fig.savefig(output_image_file_count, dpi=300)  # Save the figure
    plt.show()

except FileNotFoundError:
    print(f"ERROR: File not found '{forum_file_count}'.")
except Exception as e:
    # Top-level boundary of the script: report the problem and carry on with
    # the next script instead of aborting the whole file.
    print(f"An error occurred during processing: {e}")

# ==============================================================================
# SCRIPT 2: DISTRIBUTION OF POST CONTENT LENGTH (HISTOGRAM)
# ==============================================================================
print("\n\n--- Running Script 2: Distribution of Post Content Length ---")

# --- 0. Configuration ---
# NOTE: This script needs to be run for each course by changing the input file
# and the course label (the label only appears in the plot title).
forum_file_len = 'forum_interaction_course-v1_KULeuvenX+EUROGOVx+1T2023.csv'
course_label_len = 'GOV'

# Use confirmed column names
POST_ID_COL_LEN = 'post_id'
USER_ID_COL_LEN = 'course_learner_id'
TIMESTAMP_COL_LEN = 'post_timestamp'
BODY_COL = 'post_content'
# --- End of Configuration ---


def parse_post_timestamps_len(raw_timestamps):
    """Convert raw forum timestamp strings into naive pandas datetimes.

    The strings are expected to look like
    ``'Mon Sep 25 2023 10:15:30 GMT+0200 (Central European Summer Time)'``.
    Everything from `` GMT`` onwards is removed before parsing, so the result
    is the wall-clock time exactly as written; the UTC offset is discarded.
    Both ``GMT+hhmm`` and ``GMT-hhmm`` offsets are accepted, with or without
    the trailing zone name in parentheses.

    Args:
        raw_timestamps: Series of timestamp strings.

    Returns:
        A datetime Series of the same length; values that do not match the
        expected format become ``NaT``.
    """
    without_zone = raw_timestamps.str.replace(r' GMT[+-]\d{4}.*$', '', regex=True)
    return pd.to_datetime(without_zone, format='%a %b %d %Y %H:%M:%S', errors='coerce')


try:
    # --- 1. Data Loading and Preprocessing ---
    print(f"--- Loading and preprocessing data: {forum_file_len} ---")
    df_forum_len = pd.read_csv(forum_file_len)

    missing_cols = [col for col in (TIMESTAMP_COL_LEN, BODY_COL) if col not in df_forum_len.columns]
    if missing_cols:
        raise ValueError(f"required column(s) {missing_cols} not found in {forum_file_len}")

    # Clean and convert timestamp format.
    # NOTE(review): the timestamp is not used by the length histogram itself;
    # posts without a valid one are still excluded, as before - confirm that
    # this filter is intended.
    rows_loaded = len(df_forum_len)
    df_forum_len[TIMESTAMP_COL_LEN] = parse_post_timestamps_len(df_forum_len[TIMESTAMP_COL_LEN])
    df_forum_len.dropna(subset=[TIMESTAMP_COL_LEN], inplace=True)

    # Make the exclusion visible instead of silently shrinking the data set.
    rows_dropped = rows_loaded - len(df_forum_len)
    if rows_dropped:
        print(f"WARNING: Dropped {rows_dropped} of {rows_loaded} posts with a missing or unparseable timestamp.")
    if df_forum_len.empty:
        raise ValueError("no posts with a valid timestamp were found")

    # Calculate post length (posts without content count as length 0)
    df_forum_len['post_content_length'] = df_forum_len[BODY_COL].str.len().fillna(0)

    print("Data preprocessing complete.")
    print("-" * 50)

    plt.style.use('seaborn-v0_8-talk')

    # --- Plotting: Histogram of Post Content Length ---
    print("\nGenerating plot for post length distribution...")
    fig_len, ax_len = plt.subplots(figsize=(12, 7))

    # Use the 95th percentile as the upper plotting limit so that a few very
    # long posts do not squash the rest of the distribution.
    length_quantile_95 = df_forum_len['post_content_length'].quantile(0.95)
    print(f"Data diagnosis: 95% of posts have a length of {length_quantile_95:.0f} characters or less.")

    # Filter the data for the core range to plot
    in_core_range = df_forum_len['post_content_length'] <= length_quantile_95
    length_to_plot = df_forum_len.loc[in_core_range, 'post_content_length']

    ax_len.hist(length_to_plot, bins=40, color='#0072B2', edgecolor='white', rwidth=0.9)
    ax_len.set_title(
        f'Distribution of Post Content Length ({course_label_len}, up to {length_quantile_95:.0f} chars)',
        fontsize=18,
        pad=15,
    )
    ax_len.set_xlabel('Post Length (Number of Characters)', fontsize=12)
    ax_len.set_ylabel('Number of Posts', fontsize=12)
    ax_len.grid(axis='y', linestyle='--', alpha=0.7)
    fig_len.tight_layout()
    plt.show()
    print("-" * 50)

except FileNotFoundError:
    print(f"ERROR: File not found '{forum_file_len}'.")
except Exception as e:
    # Top-level boundary of the script: report the problem and carry on with
    # the next script instead of aborting the whole file.
    print(f"An error occurred during processing: {e}")

# ==============================================================================
# SCRIPT 3: WEEKLY FORUM POST COUNTS (LINE CHART)
# ==============================================================================
print("\n\n--- Running Script 3: Weekly Forum Post Counts ---")

# --- 0. Configuration ---
raw_data_files_weekly = {
    'GOV': 'forum_interaction_course-v1_KULeuvenX+EUROGOVx+1T2023.csv',
    'HIS': 'forum_interaction_course-v1_KULeuvenX+EUROHISx+1T2023.csv',
    'LAW': 'forum_interaction_course-v1_KULeuvenX+EUROLAWx+1T2023.csv'
}

TIMESTAMP_COL_WEEKLY = 'post_timestamp'

# Course duration; posts outside this window (end day included) are ignored.
COURSE_START_WEEKLY = '2023-09-25'
COURSE_END_WEEKLY = '2024-05-05'

course_color_palette_weekly = {
    'GOV': '#0072B2',  # Professional Blue
    'HIS': '#D55E00',  # Warm Orange-Red
    'LAW': '#E69F00'   # Golden Yellow
}
# --- End of Configuration ---

# Dictionary to store the time series results for each course
all_course_series_weekly = {}


def parse_post_timestamps_weekly(raw_timestamps):
    """Convert raw forum timestamp strings into naive pandas datetimes.

    The strings are expected to look like
    ``'Mon Sep 25 2023 10:15:30 GMT+0200 (Central European Summer Time)'``.
    Everything from `` GMT`` onwards is removed before parsing, so the result
    is the wall-clock time exactly as written; the UTC offset is discarded.
    Both ``GMT+hhmm`` and ``GMT-hhmm`` offsets are accepted, with or without
    the trailing zone name in parentheses.

    Args:
        raw_timestamps: Series of timestamp strings.

    Returns:
        A datetime Series of the same length; values that do not match the
        expected format become ``NaT``.
    """
    without_zone = raw_timestamps.str.replace(r' GMT[+-]\d{4}.*$', '', regex=True)
    return pd.to_datetime(without_zone, format='%a %b %d %Y %H:%M:%S', errors='coerce')


def weekly_post_counts(timestamps, start, end):
    """Count posts per week (weeks ending on Sunday) between two dates.

    Args:
        timestamps: Datetime Series with one entry per post; ``NaT`` entries
            are ignored.
        start: First day of the window, as a date string.
        end: Last day of the window, as a date string (the whole day counts).

    Returns:
        An integer Series indexed by the Sunday that closes each week, from
        the first to the last week containing a post inside the window.
    """
    # Label-based slicing needs a sorted index: on an unsorted DatetimeIndex
    # pandas 2.x raises KeyError when the boundary dates are not index entries.
    post_times = pd.DatetimeIndex(timestamps.dropna()).sort_values()
    one_per_post = pd.Series(1, index=post_times)
    return one_per_post.loc[start:end].resample('W-SUN').size()


print("--- Step 1: Processing data for each course (Weekly Posts) ---")

for course_name, file_name in raw_data_files_weekly.items():
    try:
        print(f"\nProcessing course: {course_name}...")
        df = pd.read_csv(file_name)

        # --- Data Preprocessing ---
        if TIMESTAMP_COL_WEEKLY not in df.columns:
            print(f" -> WARNING: Timestamp column '{TIMESTAMP_COL_WEEKLY}' not found in {file_name}. Skipping this course.")
            continue

        post_times = parse_post_timestamps_weekly(df[TIMESTAMP_COL_WEEKLY])

        # Make skipped rows visible instead of silently shrinking the counts.
        unparsed = int(post_times.isna().sum())
        if unparsed:
            print(f" -> WARNING: Ignoring {unparsed} post(s) with a missing or unparseable timestamp.")

        # --- Core Calculation ---
        # Restrict to the course duration, then count posts per week.
        weekly_total_posts = weekly_post_counts(post_times, COURSE_START_WEEKLY, COURSE_END_WEEKLY)

        # Store the result in the dictionary
        all_course_series_weekly[course_name] = weekly_total_posts

        print(f" -> {course_name} course processed successfully.")

    except FileNotFoundError:
        print(f" -> WARNING: File not found: {file_name}. Skipping this course.")
    except Exception as e:
        # Per-course boundary: one broken file must not stop the other courses.
        print(f" -> An error occurred while processing {course_name}: {e}")

# --- 2. Prepare final data and generate the plot ---
if all_course_series_weekly:
    # One column per course, aligned on the week index. A week that is absent
    # for a course shows up as NaN after alignment and is plotted as 0 posts.
    plot_df = pd.DataFrame(all_course_series_weekly).fillna(0)

    print("\n--- Step 2: Data preparation complete. Generating line chart... ---")

    # --- Plotting ---
    plt.style.use('seaborn-v0_8-talk')
    fig, ax = plt.subplots(figsize=(16, 9))

    # Draw one line per course in its theme colour (.get() yields None for a
    # course missing from the palette, which hands the choice to matplotlib).
    for course_name, weekly_counts in plot_df.items():
        line_color = course_color_palette_weekly.get(course_name)
        ax.plot(
            weekly_counts.index,
            weekly_counts,
            marker='o',
            linestyle='-',
            label=course_name,
            color=line_color,
        )

    # --- Axis formatting ---
    # One major tick per month, written out as a full date.
    ax.xaxis.set_major_locator(mdates.MonthLocator(interval=1))
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))
    plt.xticks(rotation=30, ha="right")
    ax.set_ylim(bottom=0)

    # --- Titles, labels, legend and grid ---
    ax.set_title('Comparison of Weekly Forum Post Counts Across Courses', fontsize=20, pad=20)
    ax.set_xlabel('Date (by Week)', fontsize=14)
    ax.set_ylabel('Total Number of Posts', fontsize=14)
    ax.legend(title='Course', fontsize=12)
    ax.grid(True, which='both', linestyle='--', linewidth=0.5)

    fig.tight_layout()
    plt.show()
else:
    # Every course was skipped or failed in step 1, so there is nothing to draw.
    print("\nERROR: No course data was processed successfully. Cannot generate plot.")