In [30]:
import pandas as pd
import numpy as np

# Set the name of your dataset for the final table
datasets = [
    {
        "name": "ASSIST2009",
        "path": "data/assist2009/ASSISTments_2009_2010_skill_builder_data.csv"
    },
    {
        "name": "ASSIST2012",
        "path": "data/assist2012/2012-2013-data-with-predictions-4-final.csv"
        # Note: Make sure this path points to your actual 2012 csv
    }
]

In [31]:
def get_dataset_stats(file_path, dataset_name):
    print(f"Loading {dataset_name}...")

    # 1. Load Data
    # low_memory=False is needed for large files like ASSIST2012
    try:
        df = pd.read_csv(file_path, encoding='latin-1', low_memory=False)
    except FileNotFoundError:
        print(f"Error: File not found at {file_path}")
        return None

    # 2. Standardize Column Names
    # This map covers both 2009 and 2012 naming conventions
    column_map = {
        'user_id': 'user_id',
        'skill_id': 'skill_id',
        'skill_name': 'skill_name',
        'problem_id': 'item_id',
        'correct': 'correct'
    }
    df = df.rename(columns={k: v for k, v in column_map.items() if k in df.columns})

    # 3. Clean Data (Standard AKT/DKT steps)
    # Drop rows without skills
    if 'skill_id' in df.columns:
        df = df.dropna(subset=['skill_id'])

    # Drop duplicates (Critical for 2009, harmless for 2012)
    df = df.drop_duplicates()

    # 4. Calculate Stats
    n_users = df['user_id'].nunique()
    n_interactions = len(df)

    # Items (Questions)
    n_items = df['item_id'].nunique() if 'item_id' in df.columns else 0

    # Skills
    n_skills = df['skill_id'].nunique() if 'skill_id' in df.columns else 0

    # Average Sequence Length
    user_counts = df.groupby('user_id').size()
    avg_len = user_counts.mean()

    # Return as a dictionary row
    return {
        'Dataset': dataset_name,
        '#Students': n_users,
        '#Questions': n_items,
        '#Skills': n_skills,
        '#Interactions': n_interactions,
        'Avg Length': round(avg_len, 2)
    }

In [32]:
stats_list = []

# Loop through the list
for ds in datasets:
    # We call the function defined in the previous cell
    # Make sure Cell 2 has been run!
    stats = get_dataset_stats(ds['path'], ds['name'])

    if stats:
        stats_list.append(stats)

# Create DataFrame
df_final = pd.DataFrame(stats_list)

# Display in Notebook
print("\n--- Final Table ---")
display(df_final)

# Generate LaTeX
print("\n--- LaTeX Code ---")
print(df_final.to_latex(
    index=False,
    caption="Dataset Statistics",
    label="tab:datasets",
    column_format="lccccc"  # Left align name, center stats
))

Loading ASSIST2009...
Loading ASSIST2012...

--- Final Table ---


Unnamed: 0,Dataset,#Students,#Questions,#Skills,#Interactions,Avg Length
0,ASSIST2009,4163,17751,123,459208,110.31
1,ASSIST2012,29018,53091,265,2711813,93.45



--- LaTeX Code ---
\begin{table}
\caption{Dataset Statistics}
\label{tab:datasets}
\begin{tabular}{lccccc}
\toprule
Dataset & #Students & #Questions & #Skills & #Interactions & Avg Length \\
\midrule
ASSIST2009 & 4163 & 17751 & 123 & 459208 & 110.310000 \\
ASSIST2012 & 29018 & 53091 & 265 & 2711813 & 93.450000 \\
\bottomrule
\end{tabular}
\end{table}



In [21]:
# Map your actual column names to standard names
# Format: 'ACTUAL_NAME_IN_CSV': 'standard_name'
column_map = {
    'user_id': 'user_id',
    'skill_id': 'skill_id',     # Sometimes called 'skill_name'
    'problem_id': 'item_id',    # Sometimes called 'item_id'
    'correct': 'correct'
}

# Rename them
df = df.rename(columns={k: v for k, v in column_map.items() if k in df.columns})

# Drop rows where skill_id is missing (Common practice in AKT/DKT papers)
if 'skill_id' in df.columns:
    original_len = len(df)
    df = df.dropna(subset=['skill_id'])
    print(f"Dropped {original_len - len(df)} rows with missing skills.")

Dropped 66326 rows with missing skills.


In [22]:
# 1. Count Uniques
n_users = df['user_id'].nunique()
n_interactions = len(df)

# Check if item/skill columns exist before counting
n_items = df['item_id'].nunique() if 'item_id' in df.columns else 0
n_skills = df['skill_id'].nunique() if 'skill_id' in df.columns else 0

# 2. Calculate Average Sequence Length
# Group by user to count how many interactions each user has
user_interactions = df.groupby('user_id').size()
avg_len = user_interactions.mean()

print(f"Users: {n_users}")
print(f"Interactions: {n_interactions}")
print(f"Avg Length: {avg_len:.2f}")

Users: 4163
Interactions: 459208
Avg Length: 110.31


In [23]:
# Create the summary dataframe
stats_data = {
    'Dataset': [dataset_name],
    'Users': [n_users],
    'Items': [n_items],
    'Skills': [n_skills],
    'Interactions': [n_interactions],
    'Avg Length': [round(avg_len, 2)]
}

df_stats = pd.DataFrame(stats_data)

# Display the table
df_stats

Unnamed: 0,Dataset,Users,Items,Skills,Interactions,Avg Length
0,ASSIST2009,4163,17751,123,459208,110.31


In [24]:
# Print LaTeX code without the index numbers
latex_code = df_stats.to_latex(index=False, caption=f"Statistics for {dataset_name} Dataset")
print(latex_code)

\begin{table}
\caption{Statistics for ASSIST2009 Dataset}
\begin{tabular}{lrrrrr}
\toprule
Dataset & Users & Items & Skills & Interactions & Avg Length \\
\midrule
ASSIST2009 & 4163 & 17751 & 123 & 459208 & 110.310000 \\
\bottomrule
\end{tabular}
\end{table}

