In [53]:
import pandas as pd
import numpy as np
import util

proficiency_path = './data/pa/proficiency/'

# import pssa
pssa_files = [
    'pa_proficiency_2019_pssa_cleaned.csv'
    , 'pa_proficiency_2018_pssa_cleaned.csv'
    , 'pa_proficiency_2017_pssa_cleaned.csv'
    , 'pa_proficiency_2016_pssa_cleaned.csv'
    , 'pa_proficiency_2015_pssa_cleaned.csv'
]

pssa_years = [x[15:19] for x in pssa_files]

df_pssa = pd.read_csv(proficiency_path + pssa_files[0])
df_pssa['year'] = df_pssa.apply(lambda x: pssa_years[0], axis=1)

for i in range(1, len(pssa_files)):
    temp = pd.read_csv(proficiency_path + pssa_files[i])
    temp['year'] = temp.apply(lambda x: pssa_years[i], axis=1)
    df_pssa = df_pssa.append(temp, ignore_index = True, sort=True)

# rename columns
df_pssa = df_pssa.rename(columns={
    'AUN': 'district_id'
    , 'District': 'district'
    , 'Grade': 'grade'
    , 'Group': 'group_state'
    , 'Number Scored': 'num_tested'
    , 'School': 'school'
    , 'School Number': 'school_id'
    , 'Subject': 'subject'
    , '% Advanced': 'Percent Advanced'
    , '% Proficient': 'Percent Proficient'
    , '% Basic': 'Percent Basic'
    , '% Below Basic': 'Percent Below Basic'
})

# rename Total and School Total to All Grades
df_pssa['grade'] = [x if (x != 'Total' and x != 'School Total') else 'All Grades' for x in df_pssa['grade']]

In [54]:
# import keystone
keystone_files = [
    'pa_proficiency_2019_keystone_cleaned.csv'
    , 'pa_proficiency_2018_keystone_cleaned.csv'
    , 'pa_proficiency_2017_keystone_cleaned.csv'
    , 'pa_proficiency_2016_keystone_cleaned.csv'
    , 'pa_proficiency_2015_keystone_cleaned.csv'
]

keystone_years = [x[15:19] for x in keystone_files]

df_keystone = pd.read_csv(proficiency_path + keystone_files[0])
df_keystone['year'] = df_keystone.apply(lambda x: keystone_years[0], axis=1)

for i in range(1, len(keystone_files)):
    temp = pd.read_csv(proficiency_path + keystone_files[i])
    temp['year'] = temp.apply(lambda x: keystone_years[i], axis=1)
    df_keystone = df_keystone.append(temp, ignore_index = True, sort=True)

# rename columns
df_keystone = df_keystone.rename(columns={
    'AUN': 'district_id'
    , 'District': 'district'
    , 'Grade': 'grade'
    , 'Group': 'group_state'
    , 'Number Scored': 'num_tested'
    , 'School': 'school'
    , 'School Number': 'school_id'
    , 'Subject': 'subject'
})

# drop null rows
df_keystone = df_keystone.dropna(subset=['Percent Advanced', 'Percent Proficient', 'Percent Basic', 'Percent Below Basic'], how='all')

In [55]:
# add district IDs to 2015 data
temp_2015 = df_keystone[df_keystone.year.eq('2015')]
del temp_2015['district_id']
temp_not = df_keystone[df_keystone['year'] != '2015']
districts = temp_not[['district_id', 'district']]
districts = districts.drop_duplicates()
districts['district'] = districts['district'].str.strip()
temp_2015['district'] = temp_2015['district'].str.strip()
temp_2015 = temp_2015.merge(districts, on='district', how='left')

df_keystone = temp_2015.append(temp_not, ignore_index = True, sort=True)
del temp_2015
del temp_not
del districts

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [56]:
# create All Grades groups for years 2016 - 2019
temp = df_keystone[df_keystone['year'] != '2015']
temp['grade'] = temp.apply(lambda x: 'All Grades', axis = 1)
df_keystone = df_keystone.append(temp, ignore_index = True, sort=True)

# rename 2015 Total grade to All Grades
df_keystone['grade'] = [x if x != 'Total' else 'All Grades' for x in df_keystone['grade']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [57]:
# combine test files into one file
df = df_pssa.append(df_keystone, ignore_index = True, sort = True)

In [58]:
# reshape file
id_vars = ['year', 'district_id', 'district', 'school_id', 'school', 'grade', 'group_state', 'subject', 'num_tested']
value_vars = ['Percent Advanced', 'Percent Proficient', 'Percent Basic', 'Percent Below Basic']
df = pd.melt(df, id_vars = id_vars
             , value_vars = value_vars
             , var_name = 'performance_level'
             , value_name = 'pct_at_level')
print(df.head())

   year  district_id              district  school_id  \
0  2019  112011103.0  BERMUDIAN SPRINGS SD     7302.0   
1  2019  112011103.0  BERMUDIAN SPRINGS SD     7302.0   
2  2019  112011103.0  BERMUDIAN SPRINGS SD     7302.0   
3  2019  112011103.0  BERMUDIAN SPRINGS SD     7302.0   
4  2019  112011103.0  BERMUDIAN SPRINGS SD     7302.0   

                     school       grade                   group_state  \
0  BERMUDIAN SPRINGS EL SCH           3                  All Students   
1  BERMUDIAN SPRINGS EL SCH           4                  All Students   
2  BERMUDIAN SPRINGS EL SCH  All Grades                  All Students   
3  BERMUDIAN SPRINGS EL SCH           3  Historically Underperforming   
4  BERMUDIAN SPRINGS EL SCH           4  Historically Underperforming   

                 subject  num_tested performance_level pct_at_level  
0  English Language Arts       140.0  Percent Advanced          7.9  
1  English Language Arts       123.0  Percent Advanced         24.4  
2  Engli

In [59]:
# filter out suppressed values, create num_at_level
df = df[df['pct_at_level'] != 'IS']
df = df[~df['pct_at_level'].isna()]
df['pct_at_level'] = pd.to_numeric(df['pct_at_level']) / 100.0
df['num_at_level'] = df.apply(lambda x: int(round(x['pct_at_level'] * x['num_tested'])), axis = 1)

In [60]:
# standardize subjects
def clean_subjects(row):
    if row['subject'] == 'Algebra I' or row['subject'] == 'M':
        return 'Math'
    if row['subject'] == 'English Language Arts' or row['subject'] == 'E' or row['subject'] == 'Literature':
        return 'ELA'
    if row['subject'] == 'Biology' or row['subject'] == 'S':
        return 'Science'
    return row['subject']

df['subject'] = df.apply(lambda x: clean_subjects(x), axis = 1)

In [61]:
# drop rows with null num_tested and num_at_level
print(df.shape)
df = df.dropna(subset=['num_tested', 'num_at_level'])
print(df.shape)

(1118872, 12)
(1118872, 12)


In [62]:
print(df.groupby(['year', 'grade', 'group_state'])['school_id'].count())

year  grade       group_state                 
2015  11          All Students                     8240
                  Historically Underperforming     7968
      3           All Students                    13056
                  Historically Underperforming    12504
      4           All Students                    19088
                  Historically Underperforming    18316
      5           All Students                    11856
                  Historically Underperforming    11360
      6           All Students                     8564
                  Historically Underperforming     8352
      7           All Students                     6996
                  Historically Underperforming     6900
      8           All Students                    10448
                  Historically Underperforming    10204
      All Grades  All Students                    36148
                  Historically Underperforming    35544
2016  11.0        All Students                     8268
 

In [63]:
# reaggregate file
grouped_by = ['year', 'district_id', 'district', 'school_id', 'school', 'grade', 'group_state', 'subject', 'performance_level']
df1 = df.groupby(grouped_by, as_index = False)['num_tested'].sum()
df2 = df.groupby(grouped_by, as_index = False)['num_at_level'].sum()
df = pd.merge(df1, df2, on = grouped_by)

year  grade       group_state                 
2015  11          All Students                     8016
                  Historically Underperforming     7816
      3           All Students                    13056
                  Historically Underperforming    12504
      4           All Students                    19088
                  Historically Underperforming    18316
      5           All Students                    11856
                  Historically Underperforming    11360
      6           All Students                     8564
                  Historically Underperforming     8352
      7           All Students                     6996
                  Historically Underperforming     6900
      8           All Students                    10448
                  Historically Underperforming    10204
      All Grades  All Students                    33540
                  Historically Underperforming    33140
2016  11.0        All Students                     8268
 

In [64]:
# rename performance levels
df['performance_level'] = [x.replace('Percent ', '') for x in df['performance_level']]

In [65]:
# add proficient_tf
print(df.shape)
df['proficient_tf'] = [1 if x == 'Advanced' or x == 'Proficient' else 0 for x in df['performance_level']]
df = df[df['proficient_tf']  == 1]
print(df.shape)

(1093084, 11)
(546542, 12)


In [66]:
# remove rows with no scores
print(df.shape)
df = df.dropna(subset=['num_tested', 'num_at_level'])
print(df.shape)

(546542, 12)
(546542, 12)


In [67]:
# roll up into a single performance level
print(df.shape)
grouped_by = ['year', 'district_id', 'district', 'school_id', 'school', 'subject', 'grade', 'group_state', 'proficient_tf']
df = df.groupby(grouped_by, as_index=False).agg({'num_tested': 'sum', 'num_at_level': 'sum'})
print(df.shape)

(546542, 12)
(273271, 11)
year  grade       group_state                 
2015  11          All Students                    2004
                  Historically Underperforming    1954
      3           All Students                    3264
                  Historically Underperforming    3126
      4           All Students                    4772
                  Historically Underperforming    4579
      5           All Students                    2964
                  Historically Underperforming    2840
      6           All Students                    2141
                  Historically Underperforming    2088
      7           All Students                    1749
                  Historically Underperforming    1725
      8           All Students                    2612
                  Historically Underperforming    2551
      All Grades  All Students                    8385
                  Historically Underperforming    8285
2016  11.0        All Students                 

In [68]:
# recalculate pct_at_level
df['pct_at_level'] = df['num_at_level'] / df['num_tested']

year  grade       group_state                 
2015  11          All Students                    2004
                  Historically Underperforming    1954
      3           All Students                    3264
                  Historically Underperforming    3126
      4           All Students                    4772
                  Historically Underperforming    4579
      5           All Students                    2964
                  Historically Underperforming    2840
      6           All Students                    2141
                  Historically Underperforming    2088
      7           All Students                    1749
                  Historically Underperforming    1725
      8           All Students                    2612
                  Historically Underperforming    2551
      All Grades  All Students                    8385
                  Historically Underperforming    8285
2016  11.0        All Students                    2067
                  

In [69]:
print(df.dtypes)
df['district_id'] = df['district_id'].astype('int64')
df['district_id'] = df['district_id'].astype(str)
df['school_id'] = df['school_id'].astype('int64')
df['school_id'] = df['school_id'].astype(str)
df['proficient_tf'] = df['proficient_tf'].astype(bool)
df['num_at_level'] = df['num_at_level'].astype('float64')
print(df.dtypes)

year              object
district_id      float64
district          object
school_id        float64
school            object
subject           object
grade             object
group_state       object
proficient_tf      int64
num_tested       float64
num_at_level       int64
pct_at_level     float64
dtype: object
year              object
district_id       object
district          object
school_id         object
school            object
subject           object
grade             object
group_state       object
proficient_tf       bool
num_tested       float64
num_at_level     float64
pct_at_level     float64
dtype: object


In [71]:
# export final dataset
df.to_csv('./data/finalized/pa_proficiency.csv', index=False)