In [15]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [2]:
filename = '../data/sat-scores.csv'

df = pd.read_csv(filename,
                usecols=['Year', 'State.Code', 'Total.Math', 
                         'Family Income.Less than 20k.Math', 
                         'Family Income.Between 20-40k.Math', 
                         'Family Income.Between 40-60k.Math', 
                         'Family Income.Between 60-80k.Math',
                         'Family Income.Between 80-100k.Math',
                         'Family Income.More than 100k.Math'])
df.head()

Unnamed: 0,Year,State.Code,Total.Math,Family Income.Between 20-40k.Math,Family Income.Between 40-60k.Math,Family Income.Between 60-80k.Math,Family Income.Between 80-100k.Math,Family Income.Less than 20k.Math,Family Income.More than 100k.Math
0,2005,AL,559,513,539,550,566,462,588
1,2005,AK,519,492,517,513,528,464,541
2,2005,AZ,530,498,520,524,534,485,554
3,2005,AR,552,513,543,553,570,489,572
4,2005,CA,522,477,506,521,535,451,566


In [3]:
# Rename the income-related column names
df.columns = ['Year', 'State.Code', 'Total.Math',
                      'income<20k',
                      '20k<income<40k',
                      '40k<income<60k',
                      '60k<income<80k',
                      '80k<income<100k',
                      'income>100k',
                      ]
df.head()

Unnamed: 0,Year,State.Code,Total.Math,income<20k,20k<income<40k,40k<income<60k,60k<income<80k,80k<income<100k,income>100k
0,2005,AL,559,513,539,550,566,462,588
1,2005,AK,519,492,517,513,528,464,541
2,2005,AZ,530,498,520,524,534,485,554
3,2005,AR,552,513,543,553,570,489,572
4,2005,CA,522,477,506,521,535,451,566


In [12]:
# Find the average SAT math score for each income level, grouped and then sorted by year.
df.groupby('Year').mean(numeric_only=True).sort_index()

Unnamed: 0_level_0,Total.Math,income<20k,20k<income<40k,40k<income<60k,60k<income<80k,80k<income<100k,income>100k
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2005,535.653846,488.653846,522.673077,536.076923,548.942308,427.596154,572.173077
2006,537.480769,502.923077,523.769231,534.903846,550.461538,461.019231,572.519231
2007,535.339623,494.849057,519.490566,533.188679,545.698113,457.924528,565.169811
2008,535.981132,523.622642,547.471698,549.188679,557.641509,478.641509,564.566038
2009,540.803922,527.823529,550.980392,553.941176,565.333333,482.058824,585.784314
2010,540.843137,499.27451,522.0,534.235294,547.627451,477.039216,569.27451
2011,533.226415,494.886792,513.415094,528.660377,541.849057,460.45283,563.245283
2012,533.603774,492.056604,512.45283,525.773585,538.301887,458.773585,557.320755
2013,532.622642,490.132075,511.377358,520.320755,537.396226,469.358491,556.339623
2014,534.283019,497.641509,514.943396,527.169811,543.132075,459.415094,555.433962


In [13]:
# For each year in our data set, find out much better each income group did,
# on average, than the next-poorer group of students. Do we see (just by looking 
# at the data) any income group that did worse, in any year, 
# than the next-poorer students?

df.groupby('Year')[['income<20k',
                      '20k<income<40k',
                      '40k<income<60k',
                      '60k<income<80k',
                      '80k<income<100k',
                      'income>100k']].mean().T.pct_change()

Year,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015
income<20k,,,,,,,,,,,
20k<income<40k,0.069618,0.04145,0.049796,0.045546,0.043872,0.045517,0.037439,0.041451,0.043346,0.034768,0.045059
40k<income<60k,0.025645,0.021259,0.026368,0.003136,0.005374,0.023439,0.029694,0.025994,0.017489,0.023743,0.026038
60k<income<80k,0.023999,0.029085,0.023462,0.015391,0.020566,0.025068,0.024947,0.023828,0.032817,0.030279,0.028277
80k<income<100k,-0.221054,-0.162486,-0.160846,-0.141668,-0.147302,-0.128898,-0.150219,-0.147739,-0.126606,-0.154137,-0.174429
income>100k,0.338116,0.241855,0.234199,0.179518,0.215172,0.19335,0.223242,0.214806,0.185319,0.209002,0.259097


In [6]:
# Which income bracket, on average, had the greatest advantage over the next-poorer income bracket?

df.groupby('Year')[['income<20k',
                      '20k<income<40k',
                      '40k<income<60k',
                      '60k<income<80k',
                      '80k<income<100k',
                      'income>100k']].mean().T.pct_change().T.mean().sort_values(ascending=False).head()

income>100k        0.226698
20k<income<40k     0.045260
60k<income<80k     0.025247
40k<income<60k     0.020744
80k<income<100k   -0.155944
dtype: float64

In [7]:
# Can we find, in a calculated and automated way, which income levels
# consistently (i.e., across all years) did worse than the next-poorest group?

change = df.groupby('Year')[['income<20k',
                      '20k<income<40k',
                      '40k<income<60k',
                      '60k<income<80k',
                      '80k<income<100k',
                      'income>100k']].mean().T.pct_change() 

change[change <= 0].dropna()

Year,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015
80k<income<100k,-0.221054,-0.162486,-0.160846,-0.141668,-0.147302,-0.128898,-0.150219,-0.147739,-0.126606,-0.154137,-0.174429
