In [12]:
import pandas as pd
import numpy as np
import util

proficiency_path = './data/dc/proficiency/'

In [13]:
# process 2014 DC CAS
filename = 'dc_proficiency_2014_cleaned.csv'

df = pd.read_csv(proficiency_path + filename)
df['year'] = '2014'
df = df.rename(columns={
    'school_name': 'school',
    'subgroup': 'group_state',
    'tested_grade': 'grade',
    'math_testtakers': 'math_num_tested',
    'read_testtakers': 'read_num_tested'
})

shared_columns = ['year', 'school_id', 'school', 'group_state', 'grade']
# split into math and reading
df_math = df[shared_columns + ['math_num_tested', 'math_proficient', 'math_advanced']]
df_ela = df[shared_columns + ['read_num_tested', 'read_proficient', 'read_advanced']]

# set subjects & rename columns
df_math['subject'] = 'Math'
df_ela['subject'] = 'ELA'
df_math = df_math.rename(columns={
    'math_num_tested': 'num_tested',
    'math_proficient': 'proficient',
    'math_advanced': 'advanced'
})
df_ela = df_ela.rename(columns={
    'read_num_tested': 'num_tested',
    'read_proficient': 'proficient',
    'read_advanced': 'advanced'
})

# union files
df = df_math.append(df_ela, ignore_index=True, sort=True)

# drop suppressed rows and null rows
print(df.shape)
df = df[df['num_tested'] != 'n<10']
print(df.shape)
df = df[~df['school_id'].isnull()]
print(df.shape)

# calculate combined num_at_level of proficient & advanced
df['num_at_level'] = df.apply(lambda x: int(float(x['proficient']) + float(x['advanced'])), axis=1)

# calculate pct_at_level
df['pct_at_level'] = df.apply(lambda x: x['num_at_level'] / float(x['num_tested']), axis=1)

# set district & district id
df['district_id'] = '001'
df['district'] = 'District of Columbia Public Schools'

df['school_id'] = [str(int(x)) for x in df['school_id']]

df['proficient_tf'] = 1
df['performance_level'] = 'Proficient and Advanced'
del df['proficient']
del df['advanced']

print(df.head())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


(23694, 9)
(10406, 9)
(9504, 9)
          grade           group_state num_tested  \
0             3      African American         49   
1  all students      African American         49   
2             3                   All         49   
3  all students                   All         49   
8             3  Direct Certification         17   

                                   school school_id subject  year  \
0          Achievement Preparatory PCS-El       217    Math  2014   
1  Achievement Preparatory PCS-Elementary       217    Math  2014   
2  Achievement Preparatory PCS-Elementary       217    Math  2014   
3  Achievement Preparatory PCS-Elementary       217    Math  2014   
8  Achievement Preparatory PCS-Elementary       217    Math  2014   

   num_at_level  pct_at_level district_id  \
0            11      0.224490         001   
1            11      0.224490         001   
2            11      0.224490         001   
3            11      0.224490         001   
8             6

In [14]:
# process 2015-2016 PARCC & MSAA
filename = 'dc_proficiency_2015_2016_cleaned.csv'

df2 = pd.read_csv(proficiency_path + filename)
df2 = df2.rename(columns={
    'LEA Code': 'district_id',
    'LEA Name': 'district',
    'School Code': 'school_id',
    'School Name': 'school',
    'Subject': 'subject',
    'Subgroup Value': 'group_state',
    'Tested Grade/Subject': 'grade',
    'Total valid test takers': 'num_tested'
})

print(df2.columns)

# create year
df2['year'] = ['2015' if x == '2014-15' else '2016' for x in df2['School Year']]
print(df2['year'].drop_duplicates())

# drop suppressed rows
print(df2.shape)
df2 = df2[df2['num_tested'] != 'n<25']
print(df2.shape)

df2['pct_at_level'] = df2.apply(lambda x: float(x['% level 4+'].replace('%','')) / 100, axis=1)
df2['num_at_level'] = df2.apply(lambda x: int(float(x['num_tested']) * x['pct_at_level']), axis=1)
df2['proficient_tf'] = 1
df2['performance_level'] = 'Level 4 & 5'
df2 = df2[['year', 'district_id', 'district', 'school_id', 'school', 'subject', 'grade', 'group_state', 'num_tested', 'pct_at_level', 'num_at_level', 'proficient_tf', 'performance_level']]
print(df2.head())

Index(['School Year', 'School Ward', 'district_id', 'district', 'school_id',
       'school', 'Assessment Type', 'subject', 'grade', 'Grade of Enrollment',
       'Subgroup', 'group_state', '% level 4+', '% level 3+', '% level 1',
       '% level 2', '% level 3', '% level 4', '% level 5', 'num_tested'],
      dtype='object')
0    2015
1    2016
Name: year, dtype: object
(128982, 21)
(41904, 21)
   year  district_id                             district  school_id  \
0  2015          155  Achievement Preparatory Academy PCS        217   
1  2016          155  Achievement Preparatory Academy PCS        217   
4  2016          155  Achievement Preparatory Academy PCS        217   
5  2015          155  Achievement Preparatory Academy PCS        217   
6  2016          155  Achievement Preparatory Academy PCS        217   

                                   school subject grade group_state  \
0  Achievement Preparatory PCS Elementary     ELA   All         All   
1  Achievement Preparatory 

In [15]:
# process 2017 PARCC & MSAA
filename = 'dc_proficiency_2017_cleaned.csv'

df3 = pd.read_csv(proficiency_path + filename)
df3 = df3.rename(columns={
    'LEA Code': 'district_id',
    'LEA Name': 'district',
    'School Code': 'school_id',
    'School Name': 'school',
    'Subject': 'subject',
    'Subgroup Value': 'group_state',
    'Tested Grade/Subject': 'grade',
    'Total Number Valid Test Takers': 'num_tested'
})

print(df3.columns)

# create year
df3['year'] = '2017'

# drop suppressed rows
print(df3.shape)
df3 = df3[df3['num_tested'] != 'n<25']
print(df3.shape)
df3 = df3[~df3['Percent Meeting or Exceeding Expectations'].isnull()]
print(df3.shape)

df3['pct_at_level'] = df3.apply(lambda x: float(x['Percent Meeting or Exceeding Expectations'].replace('%','')) / 100, axis=1)
df3['num_at_level'] = df3.apply(lambda x: int(float(x['num_tested']) * x['pct_at_level']), axis=1)
df3['proficient_tf'] = 1
df3['performance_level'] = 'Level 4 & 5'
df3 = df3[['year', 'district_id', 'district', 'school_id', 'school', 'subject', 'grade', 'group_state', 'num_tested', 'pct_at_level', 'num_at_level', 'proficient_tf', 'performance_level']]
print(df3.head())

Index(['School Ward', 'district_id', 'district', 'school_id', 'school',
       'Assessment Type', 'subject', 'grade', 'Grade of Enrollment',
       'Subgroup', 'group_state', 'Percent Meeting or Exceeding Expectations',
       'Percent Level 3+', 'Percent Level 1', 'Percent Level 2',
       'Percent Level 3', 'Percent Level 4', 'Percent Level 5', 'num_tested'],
      dtype='object')
(76020, 20)
(30732, 20)
(22837, 20)
   year  district_id                             district  school_id  \
0  2017          155  Achievement Preparatory Academy PCS        217   
1  2017          155  Achievement Preparatory Academy PCS        217   
2  2017          155  Achievement Preparatory Academy PCS        217   
3  2017          155  Achievement Preparatory Academy PCS        217   
5  2017          155  Achievement Preparatory Academy PCS        217   

                                              school subject grade  \
0  Achievement Preparatory Academy PCS Wahler Pla...     ELA     3   
1  Ac

In [16]:
# process 2018 PARCC & MSAA
filename = 'dc_proficiency_2018_cleaned.csv'

df4 = pd.read_csv(proficiency_path + filename)
df4 = df4.rename(columns={
    'LEA Code': 'district_id',
    'LEA Name': 'district',
    'School Code': 'school_id',
    'School Name': 'school',
    'Subject': 'subject',
    'Subgroup Value': 'group_state',
    'Tested Grade/Subject': 'grade',
    'Total Number Valid Test Takers': 'num_tested'
})

print(df4.columns)

# create year
df4['year'] = '2018'

# drop suppressed rows
print(df4.shape)
df4 = df4[df4['num_tested'] != 'n<10']
print(df4.shape)
df4 = df4[~df4['Percent Meeting or Exceeding Expectations'].isnull()]
print(df4.shape)

df4['pct_at_level'] = df4.apply(lambda x: float(x['Percent Meeting or Exceeding Expectations'].replace('%','')) / 100, axis=1)
df4['num_at_level'] = df4.apply(lambda x: int(float(x['num_tested']) * x['pct_at_level']), axis=1)
df4['proficient_tf'] = 1
df4['performance_level'] = 'Level 4 & 5'
df4 = df4[['year', 'district_id', 'district', 'school_id', 'school', 'subject', 'grade', 'group_state', 'num_tested', 'pct_at_level', 'num_at_level', 'proficient_tf', 'performance_level']]
print(df4.head())

Index(['School Ward', 'district_id', 'district', 'school_id', 'school',
       'Assessment Type', 'subject', 'grade', 'Grade of Enrollment',
       'Subgroup', 'group_state', 'Percent Meeting or Exceeding Expectations',
       'Percent Level 3+', 'Percent Level 1', 'Percent Level 2',
       'Percent Level 3', 'Percent Level 4', 'Percent Level 5', 'num_tested',
       'Percent Subclaim Level 1\n(Met or Exceeded Expectations)',
       'Percent Subclaim Level 2\n(Nearly Met Expectations)',
       'Percent Subclaim Level 3\n(Below Expectations)'],
      dtype='object')
(88297, 23)
(54796, 23)
(35081, 23)
   year  district_id                             district  school_id  \
0  2018          155  Achievement Preparatory Academy PCS        217   
6  2018          155  Achievement Preparatory Academy PCS        217   
7  2018          155  Achievement Preparatory Academy PCS        217   
8  2018          155  Achievement Preparatory Academy PCS        217   
9  2018          155  Achievemen

In [17]:
# process 2019 PARCC & MSAA
filename = 'dc_proficiency_2019_cleaned.csv'

df5 = pd.read_csv(proficiency_path + filename)
df5 = df5.rename(columns={
    'LEA Code': 'district_id',
    'LEA Name': 'district',
    'School Code': 'school_id',
    'School Name': 'school',
    'Subject': 'subject',
    'Subgroup Value': 'group_state',
    'Tested Grade/Subject': 'grade',
    'Total Number Valid Test Takers': 'num_tested'
})

print(df5.columns)

# create year
df5['year'] = '2019'

# drop suppressed rows
print(df5.shape)
df5 = df5[df5['num_tested'] != 'n<10']
print(df5.shape)
df5 = df5[~df5['Percent Meeting or Exceeding Expectations'].isnull()]
print(df5.shape)

df5['pct_at_level'] = df5.apply(lambda x: float(x['Percent Meeting or Exceeding Expectations'].replace('%','')) / 100, axis=1)
df5['num_at_level'] = df5.apply(lambda x: int(float(x['num_tested']) * x['pct_at_level']), axis=1)
df5['proficient_tf'] = 1
df5['performance_level'] = 'Level 4 & 5'
df5 = df5[['year', 'district_id', 'district', 'school_id', 'school', 'subject', 'grade', 'group_state', 'num_tested', 'pct_at_level', 'num_at_level', 'proficient_tf', 'performance_level']]
print(df5.head())

Index(['School Ward', 'district_id', 'district', 'school_id', 'school',
       'Assessment Type', 'subject', 'grade', 'Grade of Enrollment',
       'Subgroup', 'group_state', 'Percent Meeting or Exceeding Expectations',
       'Percent Level 3+', 'Percent Level 1', 'Percent Level 2',
       'Percent Level 3', 'Percent Level 4', 'Percent Level 5', 'num_tested',
       'Percent Subclaim Level 1\n(Met or Exceeded Expectations)',
       'Percent Subclaim Level 2\n(Nearly Met Expectations)',
       'Percent Subclaim Level 3\n(Below Expectations)'],
      dtype='object')
(80321, 23)
(50095, 23)
(32319, 23)
    year  district_id                             district  school_id  \
5   2019            1  District of Columbia Public Schools        175   
6   2019            1  District of Columbia Public Schools        175   
7   2019            1  District of Columbia Public Schools        175   
8   2019            1  District of Columbia Public Schools        175   
11  2019            1  Dist

In [18]:
# append all proficiency files
print(df.shape)
df = df.append(df2, ignore_index=True, sort=True)
print(df.shape)
df = df.append(df3, ignore_index=True, sort=True)
print(df.shape)
df = df.append(df4, ignore_index=True, sort=True)
print(df.shape)
df = df.append(df5, ignore_index=True, sort=True)
print(df.shape)
print(df.columns)

(9504, 13)
(51408, 13)
(74245, 13)
(109326, 13)
(141645, 13)
Index(['district', 'district_id', 'grade', 'group_state', 'num_at_level',
       'num_tested', 'pct_at_level', 'performance_level', 'proficient_tf',
       'school', 'school_id', 'subject', 'year'],
      dtype='object')


In [19]:
# unify data types
print(df.dtypes)
df['num_at_level'] = df['num_at_level'].astype('float64')
df['num_tested'] = df['num_tested'].astype('float64')
df['proficient_tf'] = df['proficient_tf'].astype(bool)
df['district_id'] = df['district_id'].astype(int)
df['district_id'] = df['district_id'].astype(str)
df['school_id'] = df['school_id'].astype(int)
df['school_id'] = df['school_id'].astype(str)
print(df.dtypes)

district              object
district_id           object
grade                 object
group_state           object
num_at_level           int64
num_tested            object
pct_at_level         float64
performance_level     object
proficient_tf          int64
school                object
school_id             object
subject               object
year                  object
dtype: object
district              object
district_id           object
grade                 object
group_state           object
num_at_level         float64
num_tested           float64
pct_at_level         float64
performance_level     object
proficient_tf           bool
school                object
school_id             object
subject               object
year                  object
dtype: object


In [20]:
# export final dataset
df.to_csv('./data/finalized/dc_proficiency.csv', index=False)