In [2]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from google.colab import drive

from scipy.stats import skew, kurtosis

# Mount Google Drive into the Colab filesystem; the CSV paths used later in
# this notebook are rooted under this mount point.
drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [3]:
# Folder on the mounted Drive that contains the CSV inputs read below.
# Defined once so that relocating the data only requires editing this line.
DATA_DIR = '/content/drive/My Drive/SIADS 699: Capstone/Clean Data'

# One DataFrame per input file; the variable names are used by later cells.
quiz_df = pd.read_csv(f'{DATA_DIR}/quiz_data_complete.csv')
prob_df = pd.read_csv(f'{DATA_DIR}/prob_data_complete.csv')
mastery_df = pd.read_csv(f'{DATA_DIR}/mastery_data.csv')
merged_df = pd.read_csv(f'{DATA_DIR}/merged_data_complete.csv')

In [4]:
# Percent Correct Statistics (All Problems)
#
# This cell reads only `merged_df` and produces, for ALL PROBLEMS:
# - descriptive statistics (count, mean, std, min, 25%, 50%, 75%, max)
#   overall, by student, by topic, plus overall skew and kurtosis
# - heat map of average percent_correct for each student/topic combination
# - line plots of average percent_correct by day overall, by student, by topic
# - histograms of percent_correct overall, by student, by topic

# --- Descriptive statistics -------------------------------------------------
print("Performance Statistics (All Problems) - Overall")
print(merged_df['percent_correct'].describe())
print()
print("Performance Statistics (All Problems) - Skew")
print(skew(merged_df['percent_correct']))
print()
print("Performance Statistics (All Problems) - Kurtosis")
print(kurtosis(merged_df['percent_correct']))
print()
print("Performance Statistics (All Problems) - By Student")
print(merged_df.groupby('username')['percent_correct'].describe())
print()
print("Performance Statistics (All Problems) - By Topic")
print(merged_df.groupby('prob_name')['percent_correct'].describe())

# --- Heat map: mean percent_correct per (student, topic) --------------------
heatmap_df = merged_df.groupby(['username', 'prob_name'])['percent_correct'].mean().reset_index()
heatmap_pivot = heatmap_df.pivot(index='username', columns='prob_name', values='percent_correct')
heatmap_fig = px.imshow(
    heatmap_pivot,
    labels=dict(x="Topic", y="Student", color="Average Performance"),
    # Axis labels must come from the pivot itself. `heatmap_df[...].unique()`
    # yields first-appearance order, which differs from the pivot's sorted
    # columns whenever the first student has not attempted every topic, and
    # would then attach topic names to the wrong columns.
    x=heatmap_pivot.columns,
    y=heatmap_pivot.index,
    color_continuous_scale='Viridis',
    height=800,
    width=1200
)
heatmap_fig.update_layout(
    title='Average Performance by Student and Topic',
    xaxis_title='Topic',
    yaxis_title='Student'
)
heatmap_fig.show()

# --- Daily trends -----------------------------------------------------------
# Parse the timestamp column and derive a calendar-date column to group by.
# Both columns are written back onto merged_df.
merged_df['prob_timestamp'] = pd.to_datetime(merged_df['prob_timestamp'])
merged_df['prob_date'] = merged_df['prob_timestamp'].dt.date

line_fig1 = px.line(
    merged_df.groupby('prob_date')['percent_correct'].mean().reset_index(),
    x='prob_date',
    y='percent_correct',
    title="Average Performance by Day (Overall)"
)
line_fig1.show()

line_fig2 = px.line(
    merged_df.groupby(['prob_date', 'username'])['percent_correct'].mean().reset_index(),
    x='prob_date',
    y='percent_correct',
    color='username',
    title="Average Performance by Day (Per Student)"
)
line_fig2.show()

line_fig3 = px.line(
    merged_df.groupby(['prob_date', 'prob_name'])['percent_correct'].mean().reset_index(),
    x='prob_date',
    y='percent_correct',
    color='prob_name',
    title="Average Performance by Day (Per Topic)"
)
line_fig3.show()

# --- Distributions ----------------------------------------------------------
dist_fig1 = px.histogram(
    merged_df,
    x='percent_correct',
    title="Distribution of Performance (Overall)"
)
dist_fig1.show()

dist_fig2 = px.histogram(
    merged_df,
    x='percent_correct',
    color='username',
    title="Distribution of Performance (Per Student)"
)
dist_fig2.show()

dist_fig3 = px.histogram(
    merged_df,
    x='percent_correct',
    color='prob_name',
    title="Distribution of Performance (Per Topic)"
)
dist_fig3.show()

Performance Statistics (All Problems) - Overall
count    25542.000000
mean         0.860483
std          0.310721
min          0.000000
25%          1.000000
50%          1.000000
75%          1.000000
max          1.000000
Name: percent_correct, dtype: float64

Performance Statistics (All Problems) - Skew
-2.1487374394571335

Performance Statistics (All Problems) - Kurtosis
3.0425290809971237

Performance Statistics (All Problems) - By Student
               count      mean       std  min       25%  50%  75%  max
username                                                              
AyaA2         3102.0  0.836122  0.346483  0.0  1.000000  1.0  1.0  1.0
CharlotteC14  1406.0  0.886913  0.297860  0.0  1.000000  1.0  1.0  1.0
CyrusC3       4491.0  0.819611  0.354269  0.0  0.833333  1.0  1.0  1.0
FoxF2         1704.0  0.855164  0.315456  0.0  1.000000  1.0  1.0  1.0
IyanaI         575.0  0.822609  0.353692  0.0  0.900000  1.0  1.0  1.0
KnoxK2        7898.0  0.920136  0.246500  0.0  1.00000

In [13]:
# Percent Correct Statistics (Quizzes Only)
#
# This cell reads only `quiz_df` and produces, for QUIZZES ONLY:
# - descriptive statistics (count, mean, std, min, 25%, 50%, 75%, max)
#   overall, by student, by topic, plus overall skew and kurtosis
# - heat map of average percent_correct for each student/topic combination,
#   with a star on every cell whose maximum `mastery` value equals 1
# - line plots of average percent_correct by day overall, by student, by topic
# - histograms of percent_correct overall, by student, by topic

# --- Descriptive statistics -------------------------------------------------
print("Performance Statistics (Quizzes Only) - Overall")
print(quiz_df['percent_correct'].describe())
print()
print("Performance Statistics (Quizzes Only) - Skew")
print(skew(quiz_df['percent_correct']))
print()
print("Performance Statistics (Quizzes Only) - Kurtosis")
print(kurtosis(quiz_df['percent_correct']))
print()
print("Performance Statistics (Quizzes Only) - By Student")
print(quiz_df.groupby('username')['percent_correct'].describe())
print()
print("Performance Statistics (Quizzes Only) - By Topic")
print(quiz_df.groupby('quiz_name')['percent_correct'].describe())

# --- Heat map: mean percent_correct per (student, quiz) ---------------------
heatmap_df = quiz_df.groupby(['username', 'quiz_name'])['percent_correct'].mean().reset_index()
pivot_perf = heatmap_df.pivot(index='username', columns='quiz_name', values='percent_correct')

# Student x quiz table of the maximum `mastery` value (0 where a pair is absent).
# Stored as `mastery_pivot`, not `mastery_df`, so the mastery table loaded from
# mastery_data.csv earlier in the notebook is not silently overwritten.
mastery_pivot = quiz_df.groupby(['username', 'quiz_name'])['mastery'].max().unstack(fill_value=0)

heatmap_fig = px.imshow(
    pivot_perf,
    labels=dict(x="Topic", y="Student", color="Average Performance"),
    x=pivot_perf.columns,
    y=pivot_perf.index,
    color_continuous_scale='Viridis',
    height=400,
    width=1200
)

# Row/column positions of every cell whose max mastery equals 1.
mastery_y, mastery_x = np.where(mastery_pivot.values == 1)

# Overlay a star on each such cell. The labels are read from the same frame
# that produced the positions, so the markers land on the matching categories.
heatmap_fig.add_trace(
    go.Scatter(
        x=list(mastery_pivot.columns[mastery_x]),
        y=list(mastery_pivot.index[mastery_y]),
        mode='markers',
        marker=dict(color='white', size=10, symbol='star'),
        name='Mastered',
        showlegend=False
    )
)

heatmap_fig.update_layout(
    title='Average Quiz Performance by Student and Topic',
    xaxis_title='Topic',
    yaxis_title='Student',
    coloraxis_colorbar=dict(
        # `titlefont` is deprecated and rejected by newer plotly releases;
        # the nested title.text / title.font form is the supported spelling.
        title=dict(text='Average Performance', font=dict(size=12)),
        tickfont=dict(size=10)
    ),
    margin=dict(l=50, r=20, t=60, b=50)
)
heatmap_fig.show()

# --- Daily trends -----------------------------------------------------------
line_fig1 = px.line(
    quiz_df.groupby('date')['percent_correct'].mean().reset_index(),
    x='date',
    y='percent_correct',
    title="Average Quiz Performance by Day (Overall)"
)
line_fig1.show()

line_fig2 = px.line(
    quiz_df.groupby(['date', 'username'])['percent_correct'].mean().reset_index(),
    x='date',
    y='percent_correct',
    color='username',
    title="Average Quiz Performance by Day (Per Student)",
    height=600,
    width=1200
)
line_fig2.show()

line_fig3 = px.line(
    quiz_df.groupby(['date', 'quiz_name'])['percent_correct'].mean().reset_index(),
    x='date',
    y='percent_correct',
    color='quiz_name',
    title="Average Quiz Performance by Day (Per Topic)"
)
line_fig3.show()

# --- Distributions ----------------------------------------------------------
dist_fig1 = px.histogram(
    quiz_df,
    x='percent_correct',
    title="Distribution of Quiz Performance (Overall)"
)
dist_fig1.show()

dist_fig2 = px.histogram(
    quiz_df,
    x='percent_correct',
    color='username',
    title="Distribution of Quiz Performance (Per Student)"
)
dist_fig2.show()

dist_fig3 = px.histogram(
    quiz_df,
    x='percent_correct',
    color='quiz_name',
    title="Distribution of Quiz Performance (Per Topic)"
)
dist_fig3.show()

Performance Statistics (Quizzes Only) - Overall
count    1659.000000
mean        0.850738
std         0.213219
min         0.000000
25%         0.750000
50%         1.000000
75%         1.000000
max         1.000000
Name: percent_correct, dtype: float64

Performance Statistics (Quizzes Only) - Skew
-1.455079016709163

Performance Statistics (Quizzes Only) - Kurtosis
1.6629597099559694

Performance Statistics (Quizzes Only) - By Student
              count      mean       std  min    25%  50%  75%  max
username                                                          
AyaA2         150.0  0.857756  0.218997  0.0  0.750  1.0  1.0  1.0
CharlotteC14  139.0  0.929976  0.133117  0.4  0.930  1.0  1.0  1.0
CyrusC3       288.0  0.860946  0.201905  0.0  0.800  1.0  1.0  1.0
FoxF2         159.0  0.916509  0.153712  0.2  0.800  1.0  1.0  1.0
IyanaI         54.0  0.825926  0.243547  0.2  0.725  1.0  1.0  1.0
KnoxK2        236.0  0.869199  0.192621  0.2  0.750  1.0  1.0  1.0
OamirO        216.0  0.7