In [2]:
import pandas as pd
import numpy as np
# Load the raw student-performance CSV; later cells derive `scores` and `students` from this frame.
students0 = pd.read_csv('data/StudentsPerformance.csv')
# Bare last expression so the notebook renders the loaded frame.
students0

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75
...,...,...,...,...,...,...,...,...
995,female,group E,master's degree,standard,completed,88,99,95
996,male,group C,high school,free/reduced,none,62,55,55
997,female,group C,high school,free/reduced,completed,59,71,65
998,female,group D,some college,standard,completed,68,78,77


In [3]:
# Short score labels: the first whitespace-separated word of each of the last three column names.
scores = [column_name.split()[0] for column_name in students0.columns[-3:]]


In [4]:
# Relabel the columns positionally with short names; the trailing score columns reuse `scores`.
students = students0.set_axis(
    ['gender', 'origin', 'parent_ed', 'lunch', 'testprep', *scores],
    axis=1,
)
students

Unnamed: 0,gender,origin,parent_ed,lunch,testprep,math,reading,writing
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75
...,...,...,...,...,...,...,...,...
995,female,group E,master's degree,standard,completed,88,99,95
996,male,group C,high school,free/reduced,none,62,55,55
997,female,group C,high school,free/reduced,completed,59,71,65
998,female,group D,some college,standard,completed,68,78,77


# 1

In [5]:
from scipy import stats as st
# Show every column when rendering wide frames (e.g. the multi-level describe() tables).
# Use the canonical lower-case option key rather than relying on pandas' pattern matching.
pd.set_option("display.max_columns", None)

In [6]:
from itertools import combinations
def test_indep(df, col_to_grpby, colstotest):
    out = {'sample1':[], 'sample2':[]}
    for statcol in colstotest:
        out[statcol+'_t']=[]
        out[statcol+'_p']=[]
    varset = df.loc[:,col_to_grpby].unique()
    varpairs = list(combinations(varset, 2))
    grps = df.groupby(col_to_grpby)
    for pair in varpairs:
        v1, v2 = pair
        smpl1 = grps.get_group(v1)[colstotest]
        smpl2 = grps.get_group(v2)[colstotest]
        t, p = st.ttest_ind(smpl1, smpl2, equal_var=False)
        out['sample1'].append(v1)
        out['sample2'].append(v2)
        for i,statcol in enumerate(colstotest):
            out[statcol+'_t'].append(t[i])
            out[statcol+'_p'].append(p[i])
    return pd.DataFrame(out).set_index(['sample1','sample2'])


In [7]:
def test_relationship(df, col_to_grpby, colstotest):
    out = {'variable': []}
    for statcol in colstotest:
        out[statcol+'_t']=[]
        out[statcol+'_p']=[]
    varset = df.loc[:,col_to_grpby].unique()
    
    for var in varset:
        smpl = df[df[col_to_grpby]==var]
        ctrl = df[df[col_to_grpby]!=var]
        t, p = st.ttest_ind(smpl[colstotest], ctrl[colstotest], equal_var=False)
        out['variable'].append(var)
        for i,statcol in enumerate(colstotest):
            out[statcol+'_t'].append(t[i])
            out[statcol+'_p'].append(p[i])
    return pd.DataFrame(out).set_index(['variable'])


In [8]:
def test_effect_of(df, category, score_set, confidence_level=.9):
    categ_vals = df.loc[:,category].unique()
    if len(categ_vals)==2:
        print(f'Only 2 values of {category} to compare. Assume value {categ_vals[0]} is compared against value {categ_vals[1]}')
        ctrl_cat = categ_vals[1]
        categ_vals = [categ_vals[0]]
    else: ctrl_cat = 'other'
    
    for i,score_col in enumerate(score_set):
        rescolname = score_col+'_eff'
        res = {category:[], rescolname:[], 'p':[] }
        
        for val in categ_vals:
            smpl = df[df[category]==val][score_col]
            ctrl = df[df[category]!=val][score_col]
            n1, n2 = smpl.shape[0], ctrl.shape[0]
            x1, s1  = smpl.mean(), smpl.std()
            x2, s2 = ctrl.mean(), ctrl.std()
            dof = min([(n1-1), (n2-1)])
            alpha = 1-confidence_level
            deltax = x1-x2

            t, p = st.ttest_ind(smpl, ctrl, equal_var=False)
            if p<(alpha/2):
                if deltax<0: comp='lower than'
                else: comp='greater than'
                #print(f'For {score_col} scores, {val} scores are {abs(deltax):.2f} {comp} {ctrl_cat} scores (p={p:.5g}).')

                res[category].append(val)
                res[rescolname].append(round(deltax,3))
                res['p'].append(p)
        display(pd.DataFrame(res).set_index([category]).sort_values(rescolname))


## Gender

In [9]:
# Summary statistics (count, mean, std, min, quartiles, max) of each score column per gender, rounded to 3 decimals.
students.groupby('gender')[scores].describe().round(3)

Unnamed: 0_level_0,math,math,math,math,math,math,math,math,reading,reading,reading,reading,reading,reading,reading,reading,writing,writing,writing,writing,writing,writing,writing,writing
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
gender,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2
female,518.0,63.633,15.491,0.0,54.0,65.0,74.0,100.0,518.0,72.608,14.378,17.0,63.25,73.0,83.0,100.0,518.0,72.467,14.845,10.0,64.0,74.0,82.0,100.0
male,482.0,68.728,14.356,27.0,59.0,69.0,79.0,100.0,482.0,65.473,13.932,23.0,56.0,66.0,75.0,100.0,482.0,63.311,14.114,15.0,53.0,64.0,73.75,100.0


In [10]:
# Welch t-test (statistic and p-value) between each pair of gender values, for every score column.
test_indep(students, 'gender', ['math', 'reading', 'writing'])

Unnamed: 0_level_0,Unnamed: 1_level_0,math_t,math_p,reading_t,reading_p,writing_t,writing_p
sample1,sample2,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
female,male,-5.398001,8.420838e-08,7.968357,4.376297e-15,9.997719,1.7118090000000001e-22


In [11]:
# Welch t-test of each gender value against all other rows, for every score column.
test_relationship(students, 'gender', ['math', 'reading', 'writing'])

Unnamed: 0_level_0,math_t,math_p,reading_t,reading_p,writing_t,writing_p
variable,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
female,-5.398001,8.420838e-08,7.968357,4.376297e-15,9.997719,1.7118090000000001e-22
male,5.398001,8.420838e-08,-7.968357,4.376297e-15,-9.997719,1.7118090000000001e-22


In [12]:
# Mean score difference by gender, one table per score; only rows passing test_effect_of's significance filter are shown.
test_effect_of(students, 'gender', ['math', 'reading', 'writing'])

Only 2 values of gender to compare. Assume value female is compared against value male


Unnamed: 0_level_0,math_eff,p
gender,Unnamed: 1_level_1,Unnamed: 2_level_1
female,-5.095,8.420838e-08


Unnamed: 0_level_0,reading_eff,p
gender,Unnamed: 1_level_1,Unnamed: 2_level_1
female,7.135,4.376297e-15


Unnamed: 0_level_0,writing_eff,p
gender,Unnamed: 1_level_1,Unnamed: 2_level_1
female,9.156,1.7118090000000001e-22


## Race/Ethnicity

In [13]:
# Summary statistics (count, mean, std, min, quartiles, max) of each score column per origin group, rounded to 3 decimals.
students.groupby('origin')[scores].describe().round(3)

Unnamed: 0_level_0,math,math,math,math,math,math,math,math,reading,reading,reading,reading,reading,reading,reading,reading,writing,writing,writing,writing,writing,writing,writing,writing
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
origin,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2
group A,89.0,61.629,14.523,28.0,51.0,61.0,71.0,100.0,89.0,64.674,15.544,23.0,53.0,64.0,74.0,100.0,89.0,62.674,15.468,19.0,51.0,62.0,73.0,97.0
group B,190.0,63.453,15.468,8.0,54.0,63.0,74.0,97.0,190.0,67.353,15.177,24.0,56.0,67.0,79.75,97.0,190.0,65.6,15.625,15.0,55.25,67.0,78.0,96.0
group C,319.0,64.464,14.853,0.0,55.0,65.0,74.0,98.0,319.0,69.103,13.997,17.0,60.0,71.0,78.5,100.0,319.0,67.828,14.983,10.0,57.0,68.0,79.0,100.0
group D,262.0,67.363,13.769,26.0,59.0,69.0,77.0,100.0,262.0,70.031,13.895,31.0,60.25,71.0,79.0,100.0,262.0,70.145,14.368,32.0,61.0,72.0,80.0,100.0
group E,140.0,73.821,15.534,30.0,64.75,74.5,85.0,100.0,140.0,73.029,14.874,26.0,63.0,74.0,84.0,100.0,140.0,71.407,15.114,22.0,62.0,72.0,80.25,100.0


In [14]:
# Welch t-test (statistic and p-value) between each pair of origin groups, for every score column.
test_indep(students, 'origin', ['math', 'reading', 'writing'])

Unnamed: 0_level_0,Unnamed: 1_level_0,math_t,math_p,reading_t,reading_p,writing_t,writing_p
sample1,sample2,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
group B,group C,-0.724066,0.4694654,-1.295458,0.195965,-1.579592,0.115023
group B,group A,0.957158,0.3397544,1.351609,0.178312,1.467814,0.143963
group B,group D,-2.776631,0.005765788,-1.918007,0.055849,-3.156822,0.00172
group B,group E,-6.003507,5.587367e-09,-3.396475,0.000774,-3.40034,0.000763
group C,group A,1.620139,0.1073983,2.427651,0.016561,2.798064,0.00588
group C,group D,-2.43662,0.01513,-0.797582,0.425452,-1.897467,0.058276
group C,group E,-6.021181,6.00726e-09,-2.649679,0.008567,-2.342327,0.019909
group A,group D,-3.259755,0.001388804,-2.883088,0.004566,-4.00694,9.9e-05
group A,group E,-6.026064,8.143028e-09,-4.031218,8.2e-05,-4.201638,4.1e-05
group D,group E,-4.12866,4.940865e-05,-1.969494,0.049927,-0.811387,0.417853


In [15]:
# Welch t-test of each origin group against all other rows, for every score column.
test_relationship(students, 'origin', ['math', 'reading', 'writing'])

Unnamed: 0_level_0,math_t,math_p,reading_t,reading_p,writing_t,writing_p
variable,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
group B,-2.624216,0.009163496,-1.849679,0.065433,-2.422209,0.016068
group C,-2.347409,0.01920952,-0.09931,0.920922,-0.324818,0.745426
group A,-3.023175,0.00312712,-2.875811,0.004894,-3.445376,0.000821
group D,1.681025,0.09336365,1.147298,0.251822,2.689204,0.007407
group E,6.396221,1.305356e-09,3.323334,0.001073,2.829794,0.005166


In [16]:
# Mean score difference of each origin group versus the rest, one table per score; only rows passing test_effect_of's significance filter are shown.
test_effect_of(students, 'origin', ['math', 'reading', 'writing'])

Unnamed: 0_level_0,math_eff,p
origin,Unnamed: 1_level_1,Unnamed: 2_level_1
group A,-4.895,0.00312712
group B,-3.255,0.009163496
group C,-2.386,0.01920952
group E,8.991,1.305356e-09


Unnamed: 0_level_0,reading_eff,p
origin,Unnamed: 1_level_1,Unnamed: 2_level_1
group A,-4.934,0.004894
group E,4.488,0.001073


Unnamed: 0_level_0,writing_eff,p
origin,Unnamed: 1_level_1,Unnamed: 2_level_1
group A,-5.905,0.000821
group B,-3.03,0.016068
group D,2.833,0.007407
group E,3.899,0.005166


## Parental Education Level

In [17]:
# Summary statistics (count, mean, std, min, quartiles, max) of each score column per parental education level, rounded to 3 decimals.
students.groupby('parent_ed')[scores].describe().round(3)

Unnamed: 0_level_0,math,math,math,math,math,math,math,math,reading,reading,reading,reading,reading,reading,reading,reading,writing,writing,writing,writing,writing,writing,writing,writing
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
parent_ed,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2
associate's degree,222.0,67.883,15.112,26.0,57.0,67.0,80.0,100.0,222.0,70.928,13.869,31.0,61.0,72.5,81.0,100.0,222.0,69.896,14.311,35.0,58.0,70.5,80.0,100.0
bachelor's degree,118.0,69.39,14.944,29.0,61.0,68.0,79.0,100.0,118.0,73.0,14.285,41.0,63.0,73.0,82.75,100.0,118.0,73.381,14.728,38.0,62.5,74.0,83.0,100.0
high school,196.0,62.138,14.54,8.0,53.75,63.0,72.0,99.0,196.0,64.704,14.132,24.0,54.0,66.0,74.25,99.0,196.0,62.449,14.086,15.0,52.0,64.0,73.0,100.0
master's degree,59.0,69.746,15.154,40.0,55.5,73.0,81.0,95.0,59.0,75.373,13.775,42.0,65.5,76.0,84.5,100.0,59.0,75.678,13.731,46.0,67.0,75.0,85.0,100.0
some college,226.0,67.128,14.313,19.0,59.0,67.5,76.0,100.0,226.0,69.46,14.057,23.0,60.0,70.5,79.75,100.0,226.0,68.841,15.012,19.0,60.0,70.0,79.0,99.0
some high school,179.0,63.497,15.928,0.0,53.0,65.0,74.0,97.0,179.0,66.939,15.479,17.0,56.5,67.0,79.0,100.0,179.0,64.888,15.736,10.0,54.0,66.0,77.0,100.0


In [18]:
# Welch t-test (statistic and p-value) between each pair of parental education levels, for every score column.
test_indep(students, 'parent_ed', ['math', 'reading', 'writing'])

Unnamed: 0_level_0,Unnamed: 1_level_0,math_t,math_p,reading_t,reading_p,writing_t,writing_p
sample1,sample2,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
bachelor's degree,some college,1.35176,0.177788,2.193731,0.029239,2.696501,0.007500636
bachelor's degree,master's degree,-0.147988,0.882612,-1.067007,0.288111,-1.023623,0.308012
bachelor's degree,associate's degree,0.881688,0.378824,1.286078,0.199694,2.097365,0.03703999
bachelor's degree,high school,4.207306,3.6e-05,5.004121,1e-06,6.475066,5.371391e-10
bachelor's degree,some high school,3.238958,0.001355,3.460588,0.000628,4.731752,3.645162e-06
some college,master's degree,-1.194859,0.235391,-2.923456,0.004356,-3.339152,0.001191762
some college,associate's degree,-0.542421,0.587801,-1.112453,0.266543,-0.761923,0.4465092
some college,high school,3.542139,0.000443,3.45653,0.000604,4.508913,8.477638e-06
some college,some high school,2.382003,0.017735,1.695107,0.090911,2.561659,0.0108091
master's degree,associate's degree,0.839771,0.403237,2.199874,0.030329,2.849058,0.005384403


In [19]:
# Welch t-test of each parental education level against all other rows, for every score column.
test_relationship(students, 'parent_ed', ['math', 'reading', 'writing'])

Unnamed: 0_level_0,math_t,math_p,reading_t,reading_p,writing_t,writing_p
variable,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
bachelor's degree,2.550775,0.011744,3.094489,0.002349,4.170403,5.092989e-05
some college,1.219312,0.223463,0.349907,0.726601,0.892244,0.37284
master's degree,1.910833,0.060406,3.553719,0.000705,4.368442,4.435996e-05
associate's degree,2.004125,0.045811,2.111061,0.035427,2.137279,0.03321387
high school,-4.207024,3.4e-05,-4.906701,2e-06,-6.119474,2.790397e-09
some high school,-2.428904,0.015848,-2.154557,0.032155,-2.995209,0.003014391


In [20]:
# Mean score difference of each parental education level versus the rest, one table per score; only rows passing test_effect_of's significance filter are shown.
test_effect_of(students, 'parent_ed', ['math', 'reading', 'writing'])

Unnamed: 0_level_0,math_eff,p
parent_ed,Unnamed: 1_level_1,Unnamed: 2_level_1
high school,-4.914,3.4e-05
some high school,-3.157,0.015848
associate's degree,2.306,0.045811
bachelor's degree,3.742,0.011744


Unnamed: 0_level_0,reading_eff,p
parent_ed,Unnamed: 1_level_1,Unnamed: 2_level_1
high school,-5.553,2e-06
some high school,-2.717,0.032155
associate's degree,2.261,0.035427
bachelor's degree,4.344,0.002349
master's degree,6.593,0.000705


Unnamed: 0_level_0,writing_eff,p
parent_ed,Unnamed: 1_level_1,Unnamed: 2_level_1
high school,-6.971,2.790397e-09
some high school,-3.856,0.003014391
associate's degree,2.368,0.03321387
bachelor's degree,6.04,5.092989e-05
master's degree,8.102,4.435996e-05


# 2

In [21]:
# Mean score difference between the two lunch types, one table per score; only rows passing test_effect_of's significance filter are shown.
test_effect_of(students, 'lunch', ['math', 'reading', 'writing'])

Only 2 values of lunch to compare. Assume value standard is compared against value free/reduced


Unnamed: 0_level_0,math_eff,p
lunch,Unnamed: 1_level_1,Unnamed: 2_level_1
standard,11.113,5.539585e-28


Unnamed: 0_level_0,reading_eff,p
lunch,Unnamed: 1_level_1,Unnamed: 2_level_1
standard,7.001,8.421689e-13


Unnamed: 0_level_0,writing_eff,p
lunch,Unnamed: 1_level_1,Unnamed: 2_level_1
standard,7.801,1.716147e-14


Students with standard lunch score higher on all three tests than students with free/reduced lunch. This is likely due to confounding factors rather than the lunch itself: a student receiving free/reduced lunch is more likely to have a home life less conducive to academics (more disruptions, fewer books, etc.).

# 3

In [22]:
# Mean score difference between the two test-prep statuses, one table per score; only rows passing test_effect_of's significance filter are shown.
test_effect_of(students, 'testprep', ['math', 'reading', 'writing'])

Only 2 values of testprep to compare. Assume value none is compared against value completed


Unnamed: 0_level_0,math_eff,p
testprep,Unnamed: 1_level_1,Unnamed: 2_level_1
none,-5.618,1.042562e-08


Unnamed: 0_level_0,reading_eff,p
testprep,Unnamed: 1_level_1,Unnamed: 2_level_1
none,-7.36,4.388808e-15


Unnamed: 0_level_0,writing_eff,p
testprep,Unnamed: 1_level_1,Unnamed: 2_level_1
none,-9.914,2.66274e-25


Prep courses seem to have a positive effect on exam scores across the board.

# 4

In [24]:
# Pairwise correlation matrix of the three score columns.
students[['math', 'reading', 'writing']].corr()

Unnamed: 0,math,reading,writing
math,1.0,0.81758,0.802642
reading,0.81758,1.0,0.954598
writing,0.802642,0.954598,1.0


Reading and writing scores are correlated most with each other (at 0.95), likely because they employ similar skills such as verbal fluency.