<a href="https://colab.research.google.com/github/drscook/day1_materials/blob/main/day1_materials_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Day 1 materials
I've uploaded all data and code (wrangling & analysis) to https://github.com/drscook/day1_materials.

- Run the setup code once.
- `data`: either `students` or `courses`, depending on whether you want a student-level or a course-level analysis
- `vars`: list of variables (column names) used for the analysis
- `height`: height of the image; it must be adjusted for analyses of different sizes

## Setup - run once

In [2]:
# run once to update packages
import google
! pip install -U ipython-autotime numpy pandas matplotlib plotly_express
google.colab.output.clear() # clear messy output from update process
get_ipython().kernel.do_shutdown(True) # restart runtime so updates take effect - ignore "session crashed" pop-up

{'status': 'ok', 'restart': True}

In [3]:
%reload_ext autotime
import numpy as np, pandas as pd, matplotlib.pyplot as plt, plotly.express as px
from IPython.core.display import HTML
# Both tables are read from the data folder of the project repository; keeping the
# prefix in one place makes it easy to point the notebook at another branch or fork.
DATA_URL = 'https://raw.githubusercontent.com/drscook/day1_materials/main/data'
students = pd.read_csv(f'{DATA_URL}/students.csv')  # input for student-level analysis
courses = pd.read_csv(f'{DATA_URL}/courses.csv')  # input for course-level analysis

def disp(df, max_rows=100, max_cols=200, **kwargs):
    """Show `df` as an HTML table in the cell output.

    `df` is first wrapped in `pd.DataFrame`, so any input that constructor
    accepts can be shown.  `max_rows` and `max_cols` are handed to
    `DataFrame.to_html` as its truncation limits, and any extra keyword
    arguments are forwarded to `to_html` unchanged.
    """
    frame = pd.DataFrame(df)
    markup = frame.to_html(max_rows=max_rows, max_cols=max_cols, **kwargs)
    display(HTML(markup))

def analyze(data, vars, height=1000):
    """Summarise and plot the columns `vars` of `data`.

    Rows with a missing value in any of `vars` are dropped first.  The selected
    columns are then split by dtype into numeric "value" columns and non-numeric
    "grouping" columns, and the analysis depends on that split:

    * No numeric column: every combination of the grouping columns is counted
      and a sunburst and an icicle chart are shown, with the hierarchy following
      the order of `vars`.
    * Otherwise: `describe()` statistics of each numeric column are computed per
      group and one box plot per numeric column is shown, with the first
      grouping column on the y-axis and the second one (if any) as colour.

    Columns whose name contains 'spend_per_crse' are clipped to [0, 300] for the
    plots only; the returned table is computed from the unclipped values.

    Parameters
    ----------
    data : pandas.DataFrame
        Table to analyse.
    vars : list of str
        Column names of `data` to use; at least one must be non-numeric.
    height : int, default 1000
        Height passed to every plotly figure.

    Returns
    -------
    pandas.DataFrame
        Count case: indexed by the grouping columns, with `ct` (number of rows)
        and `pct` (integer percentage, rounded down, of `ct` within the group
        formed by all grouping columns except the last; within the whole table
        when there is only one grouping column).
        Numeric case: the grouped `describe()` table.

    Raises
    ------
    ValueError
        If none of `vars` is a non-numeric column.
    """
    X = data[vars].dropna()
    is_numeric = X.dtypes.apply(pd.api.types.is_numeric_dtype)
    val = is_numeric[is_numeric].index.tolist()
    grp = is_numeric[~is_numeric].index.tolist()
    if not grp:
        # groupby([]) would fail further down with an opaque "No group keys passed!".
        raise ValueError('analyze needs at least one non-numeric (grouping) column in vars')

    # Plotting copy: sorted by group, with spending capped so outliers do not squash the plots.
    Y = X.sort_values(grp)
    capped = Y.filter(like='spend_per_crse').columns
    if len(capped):
        Y[capped] = Y[capped].clip(0, 300)

    if len(val) == 0:
        # Every selected column is a grouping key, so the group sizes are the counts.
        Z = X.groupby(grp).size().to_frame('ct')
        parents = grp[:-1]
        if parents:
            # Total of each parent group, repeated on every child row (same index as Z).
            total = Z['ct'].groupby(level=parents).transform('sum')
        else:
            # Single grouping column: percentages are relative to all rows.
            total = Z['ct'].sum()
        Z['pct'] = 100 * Z['ct'] // total
        px.sunburst(Y, path=vars, height=height).update_traces(sort=False).show()
        px.icicle(Y, path=vars, height=height).update_traces(sort=False).show()
    else:
        Z = X.groupby(grp)[val].describe()
        for v in val:
            px.box(Y, x=v, y=grp[0], title=v, points='all', height=height,
                category_orders={g: np.unique(X[g]) for g in grp},
                color=grp[1] if len(grp) > 1 else None).show()
    return Z

time: 202 ms (started: 2024-03-08 20:09:53 +00:00)


## Analysis

In [4]:
# Student level: counts of `first` within each class/campus combination.
analyze(
    data=students,
    vars=['class', 'campus', 'first'],
    height=1000,
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,ct,pct
class,campus,first,Unnamed: 3_level_1,Unnamed: 4_level_1
0_hs,dist,1_some_first,2,33
0_hs,dist,3_all_first,4,66
0_other,dist,2_most_first,2,100
0_other,ftw,1_some_first,1,50
0_other,ftw,3_all_first,1,50
...,...,...,...,...
5_gr,steph,3_all_first,8,30
5_gr,waco,0_none_first,2,22
5_gr,waco,1_some_first,1,11
5_gr,waco,2_most_first,4,44


time: 731 ms (started: 2024-03-08 20:09:53 +00:00)


In [5]:
# Student level: spend_per_crse and pct_crse_pay summarised by class and campus.
analyze(
    data=students,
    vars=['class', 'campus', 'spend_per_crse', 'pct_crse_pay'],
    height=1800,
)









Unnamed: 0_level_0,Unnamed: 1_level_0,spend_per_crse,spend_per_crse,spend_per_crse,spend_per_crse,spend_per_crse,spend_per_crse,spend_per_crse,spend_per_crse,pct_crse_pay,pct_crse_pay,pct_crse_pay,pct_crse_pay,pct_crse_pay,pct_crse_pay,pct_crse_pay,pct_crse_pay
Unnamed: 0_level_1,Unnamed: 1_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
class,campus,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2
0_hs,dist,5.0,7.4,15.453155,0.0,0.0,0.0,2.0,35.0,5.0,90.0,22.36068,50.0,100.0,100.0,100.0,100.0
0_other,dist,2.0,64.5,31.819805,42.0,53.25,64.5,75.75,87.0,2.0,62.5,17.67767,50.0,56.25,62.5,68.75,75.0
0_other,ftw,2.0,47.0,38.183766,20.0,33.5,47.0,60.5,74.0,2.0,100.0,0.0,100.0,100.0,100.0,100.0,100.0
0_other,steph,9.0,115.0,39.344631,48.0,94.0,122.0,150.0,159.0,9.0,97.222222,8.333333,75.0,100.0,100.0,100.0,100.0
0_other,waco,1.0,241.0,,241.0,241.0,241.0,241.0,241.0,1.0,100.0,,100.0,100.0,100.0,100.0,100.0
1_fr,dist,3.0,74.333333,70.301731,21.0,34.5,48.0,101.0,154.0,3.0,91.666667,14.433757,75.0,87.5,100.0,100.0,100.0
1_fr,steph,185.0,63.859459,37.239264,0.0,39.0,56.0,84.0,234.0,185.0,68.324324,24.591504,20.0,50.0,75.0,83.0,100.0
2_so,dist,9.0,86.333333,25.588083,26.0,75.0,94.0,102.0,108.0,9.0,80.666667,14.908052,67.0,67.0,75.0,100.0,100.0
2_so,ftw,1.0,133.0,,133.0,133.0,133.0,133.0,133.0,1.0,100.0,,100.0,100.0,100.0,100.0,100.0
2_so,steph,139.0,75.964029,46.556403,0.0,49.5,67.0,96.5,400.0,139.0,76.661871,22.080076,20.0,60.0,80.0,100.0,100.0


time: 391 ms (started: 2024-03-08 20:10:12 +00:00)


In [6]:
# Course level: spend_per_crse and pct_crse_pay summarised by subject.
analyze(
    data=courses,
    vars=['subj', 'spend_per_crse', 'pct_crse_pay'],
    height=10000,
)

Unnamed: 0_level_0,spend_per_crse,spend_per_crse,spend_per_crse,spend_per_crse,spend_per_crse,spend_per_crse,spend_per_crse,spend_per_crse,pct_crse_pay,pct_crse_pay,pct_crse_pay,pct_crse_pay,pct_crse_pay,pct_crse_pay,pct_crse_pay,pct_crse_pay
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
subj,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
acct,83.0,84.783133,38.735516,20.0,60.00,75.0,104.00,225.0,83.0,84.012048,20.906000,20.0,75.0,100.0,100.00,100.0
acom,12.0,87.583333,60.533174,25.0,58.00,58.0,95.00,200.0,12.0,73.916667,29.081260,33.0,40.0,83.5,100.00,100.0
acrs,1.0,922.000000,,922.0,922.00,922.0,922.00,922.0,1.0,50.000000,,50.0,50.0,50.0,50.00,50.0
adri,1.0,88.000000,,88.0,88.00,88.0,88.00,88.0,1.0,75.000000,,75.0,75.0,75.0,75.00,75.0
aest,1.0,45.000000,,45.0,45.00,45.0,45.00,45.0,1.0,50.000000,,50.0,50.0,50.0,50.00,50.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
tarl,1.0,53.000000,,53.0,53.00,53.0,53.00,53.0,1.0,100.000000,,100.0,100.0,100.0,100.00,100.0
teca,3.0,47.333333,18.823744,35.0,36.50,38.0,53.50,69.0,3.0,78.666667,25.794056,50.0,68.0,86.0,93.00,100.0
univ,10.0,69.300000,34.589819,36.0,40.50,54.0,101.25,118.0,10.0,63.900000,12.784105,40.0,60.0,63.5,73.00,80.0
vete,6.0,52.666667,14.665151,40.0,42.50,51.5,53.00,80.0,6.0,71.500000,8.117881,60.0,67.0,71.0,78.75,80.0


time: 597 ms (started: 2024-03-08 20:10:16 +00:00)


In [7]:
# Course level: spend_per_crse and pct_crse_pay summarised by subject and class.
analyze(
    data=courses,
    vars=['subj', 'class', 'spend_per_crse', 'pct_crse_pay'],
    height=20000,
)









Unnamed: 0_level_0,Unnamed: 1_level_0,spend_per_crse,spend_per_crse,spend_per_crse,spend_per_crse,spend_per_crse,spend_per_crse,spend_per_crse,spend_per_crse,pct_crse_pay,pct_crse_pay,pct_crse_pay,pct_crse_pay,pct_crse_pay,pct_crse_pay,pct_crse_pay,pct_crse_pay
Unnamed: 0_level_1,Unnamed: 1_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
subj,class,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2
acct,0_other,4.0,71.000000,58.889727,20.0,20.0,71.0,122.00,122.0,4.0,100.000000,0.000000,100.0,100.00,100.0,100.00,100.0
acct,1_fr,14.0,72.714286,34.668604,29.0,60.0,67.5,75.75,167.0,14.0,68.500000,24.139817,20.0,60.00,80.0,82.25,100.0
acct,2_so,20.0,68.900000,36.533906,29.0,49.0,60.0,72.75,190.0,20.0,70.850000,23.061189,25.0,55.25,77.5,85.00,100.0
acct,3_jr,25.0,94.560000,37.731596,26.0,75.0,100.0,103.00,225.0,25.0,92.600000,13.238202,50.0,80.00,100.0,100.00,100.0
acct,4_sr,17.0,96.235294,36.360228,50.0,72.0,88.0,113.00,173.0,17.0,93.058824,11.442632,67.0,80.00,100.0,100.00,100.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
wses,1_fr,5.0,46.000000,18.493242,16.0,40.0,58.0,58.00,58.0,5.0,68.000000,43.817805,20.0,20.00,100.0,100.00,100.0
wses,2_so,3.0,78.333333,4.041452,76.0,76.0,76.0,79.50,83.0,3.0,88.666667,9.814955,83.0,83.00,83.0,91.50,100.0
wses,3_jr,3.0,25.000000,0.000000,25.0,25.0,25.0,25.00,25.0,3.0,50.000000,0.000000,50.0,50.00,50.0,50.00,50.0
wses,4_sr,2.0,60.000000,0.000000,60.0,60.0,60.0,60.00,60.0,2.0,100.000000,0.000000,100.0,100.00,100.0,100.00,100.0


time: 1.69 s (started: 2024-03-08 20:10:25 +00:00)
