# In [None]:
# ==============================================================================
# Python Scripts for Follow-up Analysis of Behavioral Patterns
#
# This file contains two separate scripts used to perform the follow-up t-tests
# for the thesis.
#
# Script 1: Analyzes and compares 'Total_Backward_Seeks' between clusters.
# Script 2: Analyzes and compares the assessment re-attempt rate (column
#           'Quiz_Re-attempt_Rate') between clusters.
#
# Author: Ziyun Ke
# Date: August 2025
# ==============================================================================

# --- 0. Setup ---
# If pingouin is not installed in your environment, you may need to install it.
# Uncomment the line below to install.
# !pip install pingouin

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pingouin as pg # Import the pingouin library

# ==============================================================================
# SCRIPT 1: ANALYSIS OF TOTAL BACKWARD SEEKS
# ==============================================================================
print("--- Running Script 1: Analysis of Total Backward Seeks ---")

# --- 0. File and Column Configuration ---
# Both CSV paths are relative, so they resolve against the working directory.
clusters_file_seeks = 'student_cluster_labels.csv'
video_file_seeks = 'video_interactions_course-v1_KULeuvenX+EUROHISx+1T2023.csv'
USER_ID_COL_SEEKS = 'course_learner_id'      # key used to group and to merge
BACKWARD_SEEKS_COL = 'times_backward_seek'   # column that is summed per learner
# --- End of Configuration ---

# Script 1 pipeline: load both CSVs, total the backward seeks per learner,
# left-join the totals onto the cluster labels (learners without video rows
# count as 0), then compare Cluster 0 vs Cluster 1 with Welch's t-test.
# Any failure is reported on stdout rather than raised, so Script 2 below
# still runs.
try:
    # --- 1. Load Data ---
    print("--- Step 1: Loading data files ---")
    df_clusters_seeks = pd.read_csv(clusters_file_seeks)
    df_video_seeks = pd.read_csv(video_file_seeks)
    print("Files loaded successfully!")

    # --- 2. Calculate Total Backward Seeks per User ---
    print("\n--- Step 2: Calculating total backward seeks per user ---")
    # Non-numeric entries are coerced to NaN so that they are discarded
    # together with rows that lack a learner ID.
    df_video_seeks[BACKWARD_SEEKS_COL] = pd.to_numeric(
        df_video_seeks[BACKWARD_SEEKS_COL], errors='coerce'
    )
    df_video_seeks = df_video_seeks.dropna(
        subset=[USER_ID_COL_SEEKS, BACKWARD_SEEKS_COL]
    )
    backward_seeks_per_user = (
        df_video_seeks.groupby(USER_ID_COL_SEEKS)[BACKWARD_SEEKS_COL]
        .sum()
        .reset_index(name='Total_Backward_Seeks')
    )
    print("Calculation complete.")

    # --- 3. Merge Data ---
    print("\n--- Step 3: Merging cluster labels with backward seeks data ---")
    # Left join keeps every learner that has a cluster label.
    merged_df_seeks = pd.merge(
        df_clusters_seeks,
        backward_seeks_per_user,
        on=USER_ID_COL_SEEKS,
        how='left',
    )
    # Learners with no video rows get 0 seeks. The result is assigned back
    # instead of using `df[col].fillna(0, inplace=True)`: that chained
    # in-place form warns on pandas 2.2 and leaves the DataFrame unchanged
    # under Copy-on-Write (the default from pandas 3.0).
    merged_df_seeks['Total_Backward_Seeks'] = (
        merged_df_seeks['Total_Backward_Seeks'].fillna(0)
    )
    print("Data merge and fill complete.")

    # --- 4. Group Comparison and Statistical Test ---
    print("\n--- Step 4: Grouping by cluster, calculating means, and performing t-test ---")

    # Extract data for the two clusters (numeric labels 0 and 1)
    is_cluster0 = merged_df_seeks['Cluster'] == 0
    is_cluster1 = merged_df_seeks['Cluster'] == 1
    cluster0_seeks = merged_df_seeks.loc[is_cluster0, 'Total_Backward_Seeks']
    cluster1_seeks = merged_df_seeks.loc[is_cluster1, 'Total_Backward_Seeks']

    # Calculate and print descriptive statistics (mean, std, etc.)
    # Grouped by the 'Cluster_Name' column for readable row labels.
    print("\nDescriptive Statistics:")
    print(merged_df_seeks.groupby('Cluster_Name')['Total_Backward_Seeks'].describe().round(2))

    # --- Perform independent t-test using pingouin ---
    # Pingouin automatically calculates Cohen's d
    print("\nIndependent Samples T-test Results (from pingouin):")
    # correction=True performs Welch's t-test (equivalent to equal_var=False in scipy)
    ttest_result_pg_seeks = pg.ttest(cluster0_seeks, cluster1_seeks, correction=True)

    # Print the formatted results table
    print(ttest_result_pg_seeks.round(3))

    # Extract p-value and Cohen's d for interpretation
    p_value_seeks = ttest_result_pg_seeks['p-val'].iloc[0]
    cohen_d_seeks = ttest_result_pg_seeks["cohen-d"].iloc[0]

    print("\nConclusion:")
    if p_value_seeks < 0.05:
        print(f"The p-value ({p_value_seeks:.5f}) is less than 0.05, indicating a statistically significant difference in mean backward seeks between the two groups.")
        print(f"Cohen's d is {cohen_d_seeks:.3f}, which indicates the effect size.")
    else:
        print(f"The p-value ({p_value_seeks:.5f}) is not less than 0.05, so we cannot conclude there is a significant difference in mean backward seeks between the groups.")

    # --- 5. Visualization ---
    print("\n--- Step 5: Generating visualization ---")
    plt.style.use('seaborn-v0_8-talk')
    # ... (Visualization code would go here) ...

except FileNotFoundError as e:
    print(f"\nERROR: File not found {e.filename}. Please ensure both CSV files are available.")
except Exception as e:
    # Top-level boundary: report the problem and let the rest of the file run.
    print(f"An unexpected error occurred during processing: {e}")


# ==============================================================================
# SCRIPT 2: ANALYSIS OF ASSESSMENT RE-ATTEMPT RATE
# ==============================================================================
print("\n\n--- Running Script 2: Analysis of Assessment Re-attempt Rate ---")

# --- 0. File and Column Configuration ---
# Both CSV paths are relative, so they resolve against the working directory.
clusters_file_rate = 'student_cluster_labels.csv'
submissions_file_rate = 'submissions_course-v1_KULeuvenX+EUROHISx+1T2023.csv'

# Define common user ID column
USER_ID_COL_RATE = 'course_learner_id'
# Define the quiz/assessment ID column
QUESTION_ID_COL = 'question_id'
# --- End of Configuration ---

# Script 2 pipeline: load both CSVs, derive an assessment ID from each
# submission's question ID, compute per learner the share of attempted
# assessments that were submitted more than once, left-join that rate onto
# the cluster labels (learners without submissions count as 0), then compare
# Cluster 0 vs Cluster 1 with Welch's t-test. Failures are printed, not raised.
try:
    # --- 1. Load Data ---
    print("--- Step 1: Loading data files ---")
    df_clusters_rate = pd.read_csv(clusters_file_rate)
    df_subs_rate = pd.read_csv(submissions_file_rate)
    print("Files loaded successfully!")

    # --- 2. Feature Extraction and Data Preparation ---
    print("\n--- Step 2: Extracting clean assessment IDs and preparing data ---")

    # Capture the run of lowercase hex characters that follows "+block@".
    regex_pattern = r'\+block@([a-f0-9]+)'
    # expand=False returns a Series (one capture group) rather than a
    # one-column DataFrame, which is the natural value for a single column.
    df_subs_rate['assessment_id'] = df_subs_rate[QUESTION_ID_COL].str.extract(
        regex_pattern, expand=False
    )
    # Rows without a learner ID or without a pattern match are discarded.
    df_subs_rate = df_subs_rate.dropna(subset=[USER_ID_COL_RATE, 'assessment_id'])
    print("Assessment ID extraction and data cleaning complete.")

    # --- 3. Calculate Quiz Re-attempt Rate for Each Student ---
    print("\n--- Step 3: Calculating quiz re-attempt rate for each student ---")

    # Number of submission rows per (learner, assessment) pair.
    attempts_per_quiz = df_subs_rate.groupby([USER_ID_COL_RATE, 'assessment_id']).size()
    # Pairs with more than one submission row count as re-attempted.
    reattempted_quizzes = attempts_per_quiz[attempts_per_quiz > 1]
    num_reattempted_per_user = reattempted_quizzes.groupby(USER_ID_COL_RATE).count()
    total_quizzes_per_user = attempts_per_quiz.groupby(USER_ID_COL_RATE).count()
    # The division aligns on learner ID; learners with no re-attempted
    # assessment are missing from the numerator and come out as NaN -> 0.
    re_attempt_rate = (num_reattempted_per_user / total_quizzes_per_user).fillna(0)
    re_attempt_rate_df = re_attempt_rate.reset_index(name='Quiz_Re-attempt_Rate')
    print("Re-attempt rate calculation complete.")

    # --- 4. Merge Data and Perform Final Analysis ---
    print("\n--- Step 4: Merging data and performing final statistical analysis ---")

    # Left join keeps every learner that has a cluster label.
    merged_df_rate = pd.merge(
        df_clusters_rate,
        re_attempt_rate_df,
        on=USER_ID_COL_RATE,
        how='left',
    )
    # Learners with no submissions get a rate of 0. The result is assigned
    # back instead of using `df[col].fillna(0, inplace=True)`: that chained
    # in-place form warns on pandas 2.2 and leaves the DataFrame unchanged
    # under Copy-on-Write (the default from pandas 3.0).
    merged_df_rate['Quiz_Re-attempt_Rate'] = (
        merged_df_rate['Quiz_Re-attempt_Rate'].fillna(0)
    )

    # Calculate descriptive statistics by group
    print("\nDescriptive Statistics:")
    print(merged_df_rate.groupby('Cluster_Name')['Quiz_Re-attempt_Rate'].describe().round(3))

    # --- Perform independent t-test using pingouin ---
    # Extract data for the two clusters for the t-test (numeric labels 0 and 1)
    is_cluster0 = merged_df_rate['Cluster'] == 0
    is_cluster1 = merged_df_rate['Cluster'] == 1
    cluster0_rates = merged_df_rate.loc[is_cluster0, 'Quiz_Re-attempt_Rate']
    cluster1_rates = merged_df_rate.loc[is_cluster1, 'Quiz_Re-attempt_Rate']

    # Use pingouin to perform the t-test, which automatically calculates all necessary values
    # correction=True performs Welch's t-test (no equal-variance assumption).
    ttest_results_rate = pg.ttest(cluster0_rates, cluster1_rates, correction=True)

    print("\nFull Independent Samples T-test Results (from pingouin):")
    print(ttest_results_rate.round(3)) # Print the full results table

    # Extract individual values for easy copying
    t_value = ttest_results_rate['T'].iloc[0]
    p_value_rate = ttest_results_rate['p-val'].iloc[0]
    d_value = ttest_results_rate['cohen-d'].iloc[0]
    df_value = ttest_results_rate['dof'].iloc[0] # Degrees of Freedom

    print("\n--- Key Values for Reporting ---")
    print(f"T-statistic: {t_value:.3f}")
    print(f"Degrees of Freedom (df): {df_value:.3f}")
    print(f"P-value: {p_value_rate:.5f}")
    print(f"Cohen's d: {d_value:.3f}")
    print("------------------------------")

    if p_value_rate < 0.05:
        print("Conclusion: The p-value is less than 0.05, indicating a statistically significant difference in mean re-attempt rates between the groups.")
    else:
        print("Conclusion: The p-value is not less than 0.05, so we cannot conclude there is a significant difference in mean re-attempt rates.")

    # --- 5. Visualization ---
    print("\n--- Step 5: Generating visualization ---")
    # ... (Visualization code would go here) ...

except FileNotFoundError as e:
    print(f"\nERROR: File not found {e.filename}.")
except Exception as e:
    # Top-level boundary: report the problem instead of aborting the script.
    print(f"An unexpected error occurred during processing: {e}")
