In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
from google.colab import drive

from scipy.stats import skew, kurtosis

# Mount Google Drive at /content/drive so the dataset path read in the
# following cell resolves (Colab-specific; may prompt for authorization).
drive.mount('/content/drive')

In [None]:
# Location of the cleaned, merged dataset on the mounted Drive.
MERGED_DATA_CSV = '/content/drive/My Drive/SIADS 699: Capstone/Clean Data/merged_data_complete.csv'

# Load the whole file into the frame used by every analysis cell below.
merged_df = pd.read_csv(MERGED_DATA_CSV)

In [None]:
# Print column names, dtypes and non-null counts as a sanity check on the load.
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25542 entries, 0 to 25541
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   username              25542 non-null  object 
 1   prob_name             25542 non-null  object 
 2   prob_category         25542 non-null  object 
 3   code                  25542 non-null  object 
 4   prob_grade_level      25542 non-null  int64  
 5   prob_timestamp        25542 non-null  object 
 6   correct               25542 non-null  int64  
 7   practice              25542 non-null  int64  
 8   mastery               25542 non-null  int64  
 9   mastery_timestamp     20015 non-null  object 
 10  quiz_name             25542 non-null  object 
 11  quiz_category         25542 non-null  object 
 12  quiz_grade_level      25542 non-null  int64  
 13  num_total             25542 non-null  int64  
 14  num_correct           25542 non-null  int64  
 15  percent_correct    

In [None]:
# Mastery Statistics:
# - descriptive statistics (count, mean, std, min, 25%, 50%, 75%, max) overall, by student, by topic
# - heat map showing average mastery for each student/topic combination
# - line plot showing average mastery by day overall, by student, by topic
#
# Side effects on merged_df: `prob_timestamp` is converted to datetime and a
# `prob_date` column is added.

print("Mastery Statistics - Overall")
print(merged_df['mastery'].describe())
print()
print("Mastery Statistics - By Student")
print(merged_df.groupby('username')['mastery'].describe())
print()
print("Mastery Statistics - By Topic")
print(merged_df.groupby('prob_name')['mastery'].describe())

# Mean mastery per (student, topic) pair, in long form.
heatmap_df = merged_df.groupby(['username', 'prob_name'])['mastery'].mean().reset_index()

# Wide student x topic matrix. pivot() sorts both axes, so the axis labels must
# be read from the pivoted frame itself: `heatmap_df['prob_name'].unique()` is
# in first-appearance order and can differ from the sorted column order when
# the first student has not attempted every topic, which would mislabel cells.
mastery_matrix = heatmap_df.pivot(index='username', columns='prob_name', values='mastery')

heatmap_fig = px.imshow(
    mastery_matrix,
    labels=dict(x="Topic", y="Student", color="Average Mastery"),
    x=mastery_matrix.columns.tolist(),
    y=mastery_matrix.index.tolist(),
    color_continuous_scale='Viridis',
    height=800,
    width=1200
)
heatmap_fig.update_layout(
    title='Average Mastery by Student and Topic',
    xaxis_title='Topic',
    yaxis_title='Student'
)
heatmap_fig.show()

# Parse timestamps once (a no-op if already datetime) and derive the calendar
# date used as the x-axis for the daily trend lines.
merged_df['prob_timestamp'] = pd.to_datetime(merged_df['prob_timestamp'])
merged_df['prob_date'] = merged_df['prob_timestamp'].dt.date

# Daily mean mastery across all students and topics.
line_fig1 = px.line(
    merged_df.groupby('prob_date')['mastery'].mean().reset_index(),
    x='prob_date',
    y='mastery',
    title="Average Mastery by Day (Overall)"
)

line_fig1.show()

# Daily mean mastery, one line per student.
line_fig2 = px.line(
    merged_df.groupby(['prob_date', 'username'])['mastery'].mean().reset_index(),
    x='prob_date',
    y='mastery',
    color='username',
    title="Average Mastery by Day (Per Student)"
)
line_fig2.show()

# Daily mean mastery, one line per topic.
line_fig3 = px.line(
    merged_df.groupby(['prob_date', 'prob_name'])['mastery'].mean().reset_index(),
    x='prob_date',
    y='mastery',
    color='prob_name',
    title="Average Mastery by Day (Per Topic)"
)
line_fig3.show()

Mastery Statistics - Overall
count    25542.000000
mean         0.516013
std          0.499753
min          0.000000
25%          0.000000
50%          1.000000
75%          1.000000
max          1.000000
Name: mastery, dtype: float64

Mastery Statistics - By Student
               count      mean       std  min  25%  50%  75%  max
username                                                         
AyaA2         3102.0  0.446486  0.497208  0.0  0.0  0.0  1.0  1.0
CharlotteC14  1406.0  0.286629  0.452347  0.0  0.0  0.0  1.0  1.0
CyrusC3       4491.0  0.509909  0.499957  0.0  0.0  1.0  1.0  1.0
FoxF2         1704.0  0.214202  0.410388  0.0  0.0  0.0  0.0  1.0
IyanaI         575.0  0.215652  0.411632  0.0  0.0  0.0  0.0  1.0
KnoxK2        7898.0  0.607116  0.488422  0.0  0.0  1.0  1.0  1.0
OamirO        2319.0  0.590772  0.491797  0.0  0.0  1.0  1.0  1.0
OarielO       1445.0  0.375779  0.484491  0.0  0.0  0.0  1.0  1.0
PaytonP4      2602.0  0.732129  0.442935  0.0  0.0  1.0  1.0  1.0

Maste

In [None]:
# Problems Before Mastery Statistics:
# - descriptive statistics (count, mean, std, min, 25%, 50%, 75%, max) overall, by student, by topic
# - skew and kurtosis overall, by student, by topic
# - histogram showing distribution overall, by student, by topic
# - line plot showing average problems before mastery by month overall, by student, by topic
# - heat map showing problems before mastery for each student/topic combination

# Keep only rows with a valid mastery_timestamp. The explicit .copy() makes
# this an independent frame, so adding `prob_month` below does not raise
# pandas' SettingWithCopyWarning and never writes through to merged_df.
filtered_merged_df = merged_df[merged_df['mastery_timestamp'].notna()].copy()

print("Problems Before Mastery Statistics - Overall")
print(filtered_merged_df['probs_before_mastery'].describe())
print()
print("Problems Before Mastery Statistics - Skew")
print(skew(filtered_merged_df['probs_before_mastery']))
print()
print("Problems Before Mastery Statistics - Kurtosis")
print(kurtosis(filtered_merged_df['probs_before_mastery']))
print()
print("Problems Before Mastery Statistics - By Student")
print(filtered_merged_df.groupby('username')['probs_before_mastery'].describe())
print()
print("Problems Before Mastery Statistics - By Topic")
print(filtered_merged_df.groupby('prob_name')['probs_before_mastery'].describe())

# Distribution across all rows.
dist_fig1 = px.histogram(
    filtered_merged_df,
    x='probs_before_mastery',
    title="Distribution of Problems Before Mastery (Overall)"
)
dist_fig1.show()

# Distribution coloured by student.
dist_fig2 = px.histogram(
    filtered_merged_df,
    x='probs_before_mastery',
    color='username',
    title="Distribution of Problems Before Mastery (Per Student)"
)
dist_fig2.show()

# Distribution coloured by topic.
dist_fig3 = px.histogram(
    filtered_merged_df,
    x='probs_before_mastery',
    color='prob_name',
    title="Distribution of Problems Before Mastery (Per Topic)"
)
dist_fig3.show()

# pd.to_datetime is a no-op on an already-parsed column, so this cell no longer
# depends on an earlier cell having converted `prob_timestamp` in place.
# NOTE(review): .dt.month is the calendar month number only; if the data spans
# more than one year, the same month of different years is averaged together.
# Confirm that is intended.
filtered_merged_df['prob_month'] = pd.to_datetime(filtered_merged_df['prob_timestamp']).dt.month

# Monthly mean across all students and topics.
line_fig1 = px.line(
    filtered_merged_df.groupby('prob_month')['probs_before_mastery'].mean().reset_index(),
    x='prob_month',
    y='probs_before_mastery',
    title="Average Problems Before Mastery by Month (Overall)"
)
line_fig1.show()

# Monthly mean, one line per student.
line_fig2 = px.line(
    filtered_merged_df.groupby(['prob_month', 'username'])['probs_before_mastery'].mean().reset_index(),
    x='prob_month',
    y='probs_before_mastery',
    color='username',
    title="Average Problems Before Mastery by Month (Per Student)"
)
line_fig2.show()

# Monthly mean, one line per topic.
line_fig3 = px.line(
    filtered_merged_df.groupby(['prob_month', 'prob_name'])['probs_before_mastery'].mean().reset_index(),
    x='prob_month',
    y='probs_before_mastery',
    color='prob_name',
    title="Average Problems Before Mastery by Month (Per Topic)"
)
line_fig3.show()

# Mean problems-before-mastery per (student, topic) pair, in long form.
heatmap_df = filtered_merged_df.groupby(['username', 'prob_name'])['probs_before_mastery'].mean().reset_index()

# Wide student x topic matrix. pivot() sorts both axes, so the axis labels are
# read from the pivoted frame: `heatmap_df['prob_name'].unique()` is in
# first-appearance order and can disagree with the sorted columns, which would
# attach topic names to the wrong cells.
pbm_matrix = heatmap_df.pivot(index='username', columns='prob_name', values='probs_before_mastery')

heatmap_fig = px.imshow(
    pbm_matrix,
    labels=dict(x="Topic", y="Student", color="Average Problems Before Mastery"),
    x=pbm_matrix.columns.tolist(),
    y=pbm_matrix.index.tolist(),
    color_continuous_scale='Viridis',
    title="Average Problems Before Mastery by Student and Topic",
    height=800,
    width=1200
)
heatmap_fig.show()

Problems Before Mastery Statistics - Overall
count    20015.000000
mean       117.370472
std        161.247082
min          0.000000
25%          8.000000
50%         28.000000
75%        148.000000
max        515.000000
Name: probs_before_mastery, dtype: float64

Problems Before Mastery Statistics - Skew
1.3380197019548483

Problems Before Mastery Statistics - Kurtosis
0.1998724646765404

Problems Before Mastery Statistics - By Student
               count        mean         std  min   25%   50%    75%    max
username                                                                   
AyaA2         2041.0  142.274375  175.974768  0.0  20.0  32.0  239.0  462.0
CharlotteC14  1154.0  115.592721  174.948870  0.0   8.0  23.0   85.0  462.0
CyrusC3       3505.0   75.006562  116.060719  0.0   0.0  22.0   85.0  515.0
FoxF2         1193.0  105.547360  128.608341  0.0   1.0  47.0  179.0  462.0
IyanaI         447.0  104.335570  150.600827  0.0   0.0  28.0  107.0  462.0
KnoxK2        5867.0  171.9



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

