In [25]:
import pandas as pd
import numpy as np
import util

proficiency_path = './data/pa/proficiency/'

# import pssa
pssa_files = [
    'pa_proficiency_2019_pssa_cleaned.csv'
    , 'pa_proficiency_2018_pssa_cleaned.csv'
    , 'pa_proficiency_2017_pssa_cleaned.csv'
    , 'pa_proficiency_2016_pssa_cleaned.csv'
    , 'pa_proficiency_2015_pssa_cleaned.csv'
]

pssa_years = [x[15:19] for x in pssa_files]

df_pssa = pd.read_csv(proficiency_path + pssa_files[0])
df_pssa['year'] = df_pssa.apply(lambda x: pssa_years[0], axis=1)

for i in range(1, len(pssa_files)):
    temp = pd.read_csv(proficiency_path + pssa_files[i])
    temp['year'] = temp.apply(lambda x: pssa_years[i], axis=1)
    df_pssa = df_pssa.append(temp, ignore_index = True, sort=True)

# rename columns
df_pssa = df_pssa.rename(columns={
    'AUN': 'district_id'
    , 'District': 'district'
    , 'Grade': 'grade'
    , 'Group': 'group_state'
    , 'Number Scored': 'num_tested'
    , 'School': 'school'
    , 'School Number': 'school_id'
    , 'Subject': 'subject'
    , '% Advanced': 'Percent Advanced'
    , '% Proficient': 'Percent Proficient'
    , '% Basic': 'Percent Basic'
    , '% Below Basic': 'Percent Below Basic'
})

# rename Total and School Total to All Grades
df_pssa['grade'] = [x if (x != 'Total' and x != 'School Total') else 'All Grades' for x in df_pssa['grade']]

In [26]:
# import keystone
keystone_files = [
    'pa_proficiency_2019_keystone_cleaned.csv'
    , 'pa_proficiency_2018_keystone_cleaned.csv'
    , 'pa_proficiency_2017_keystone_cleaned.csv'
    , 'pa_proficiency_2016_keystone_cleaned.csv'
    , 'pa_proficiency_2015_keystone_cleaned.csv'
]

keystone_years = [x[15:19] for x in keystone_files]

df_keystone = pd.read_csv(proficiency_path + keystone_files[0])
df_keystone['year'] = df_keystone.apply(lambda x: keystone_years[0], axis=1)

for i in range(1, len(keystone_files)):
    temp = pd.read_csv(proficiency_path + keystone_files[i])
    temp['year'] = temp.apply(lambda x: keystone_years[i], axis=1)
    df_keystone = df_keystone.append(temp, ignore_index = True, sort=True)

# rename columns
df_keystone = df_keystone.rename(columns={
    'AUN': 'district_id'
    , 'District': 'district'
    , 'Grade': 'grade'
    , 'Group': 'group_state'
    , 'Number Scored': 'num_tested'
    , 'School': 'school'
    , 'School Number': 'school_id'
    , 'Subject': 'subject'
})

# drop null rows
df_keystone = df_keystone.dropna(subset=['Percent Advanced', 'Percent Proficient', 'Percent Basic', 'Percent Below Basic'], how='all')

In [27]:
# create All Grades groups for years 2016 - 2019
temp = df_keystone[df_keystone['year'] != '2015']
temp['grade'] = temp.apply(lambda x: 'All Grades', axis = 1)
df_keystone = df_keystone.append(temp, ignore_index = True, sort=True)

# rename 2015 Total grade to All Grades
df_keystone['grade'] = [x if x != 'Total' else 'All Grades' for x in df_keystone['grade']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [28]:
# combine test files into one file
df = df_pssa.append(df_keystone, ignore_index = True, sort = True)

  County Growth  Growth** Percent Advanced Percent Basic Percent Below Basic  \
0  Adams    NaN       NaN              7.9          27.1                12.1   
1  Adams    NaN       NaN             24.4          27.6                10.6   
2  Adams    NaN       NaN             15.6          27.4                11.4   
3  Adams    NaN       NaN              4.5          31.8                22.7   
4  Adams    NaN       NaN             16.2          36.8                16.2   

  Percent Proficient  Unnamed: 13  Unnamed: 14  Unnamed: 15  ...  Year  \
0               52.9          NaN          NaN          NaN  ...   NaN   
1               37.4          NaN          NaN          NaN  ...   NaN   
2               45.6          NaN          NaN          NaN  ...   NaN   
3               40.9          NaN          NaN          NaN  ...   NaN   
4               30.9          NaN          NaN          NaN  ...   NaN   

               district  district_id       grade  \
0  BERMUDIAN SPRINGS S

In [29]:
# reshape file
id_vars = ['year', 'district_id', 'district', 'school_id', 'school', 'grade', 'group_state', 'subject', 'num_tested']
value_vars = ['Percent Advanced', 'Percent Proficient', 'Percent Basic', 'Percent Below Basic']
df = pd.melt(df, id_vars = id_vars
             , value_vars = value_vars
             , var_name = 'performance_level'
             , value_name = 'pct_at_level')
print(df.head())

   year  district_id              district  school_id  \
0  2019  112011103.0  BERMUDIAN SPRINGS SD     7302.0   
1  2019  112011103.0  BERMUDIAN SPRINGS SD     7302.0   
2  2019  112011103.0  BERMUDIAN SPRINGS SD     7302.0   
3  2019  112011103.0  BERMUDIAN SPRINGS SD     7302.0   
4  2019  112011103.0  BERMUDIAN SPRINGS SD     7302.0   

                     school       grade                   group_state  \
0  BERMUDIAN SPRINGS EL SCH           3                  All Students   
1  BERMUDIAN SPRINGS EL SCH           4                  All Students   
2  BERMUDIAN SPRINGS EL SCH  All Grades                  All Students   
3  BERMUDIAN SPRINGS EL SCH           3  Historically Underperforming   
4  BERMUDIAN SPRINGS EL SCH           4  Historically Underperforming   

                 subject  num_tested performance_level pct_at_level  
0  English Language Arts       140.0  Percent Advanced          7.9  
1  English Language Arts       123.0  Percent Advanced         24.4  
2  Engli

In [41]:
# filter out suppressed values, create num_at_level
df = df[df['pct_at_level'] != 'IS']
df = df[~df['pct_at_level'].isna()]
df['pct_at_level'] = pd.to_numeric(df['pct_at_level']) / 100.0
df['num_at_level'] = df.apply(lambda x: int(round(x['pct_at_level'] * x['num_tested'])), axis = 1)

   year  district_id              district  school_id  \
0  2019  112011103.0  BERMUDIAN SPRINGS SD     7302.0   
1  2019  112011103.0  BERMUDIAN SPRINGS SD     7302.0   
2  2019  112011103.0  BERMUDIAN SPRINGS SD     7302.0   
3  2019  112011103.0  BERMUDIAN SPRINGS SD     7302.0   
4  2019  112011103.0  BERMUDIAN SPRINGS SD     7302.0   

                     school       grade                   group_state  \
0  BERMUDIAN SPRINGS EL SCH           3                  All Students   
1  BERMUDIAN SPRINGS EL SCH           4                  All Students   
2  BERMUDIAN SPRINGS EL SCH  All Grades                  All Students   
3  BERMUDIAN SPRINGS EL SCH           3  Historically Underperforming   
4  BERMUDIAN SPRINGS EL SCH           4  Historically Underperforming   

                 subject  num_tested performance_level  pct_at_level  \
0  English Language Arts       140.0  Percent Advanced         0.079   
1  English Language Arts       123.0  Percent Advanced         0.244   
2 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


In [44]:
# standardize subjects
def clean_subjects(row):
    if row['subject'] == 'Algebra I' or row['subject'] == 'M':
        return 'Math'
    if row['subject'] == 'English Language Arts' or row['subject'] == 'E' or row['subject'] == 'Literature':
        return 'ELA'
    if row['subject'] == 'Biology' or row['subject'] == 'S':
        return 'Science'
    return row['subject']

df['subject'] = df.apply(lambda x: clean_subjects(x), axis = 1)

0         English Language Arts
6                          Math
12                      Science
245384                Algebra I
245386                  Biology
245388               Literature
261734                        E
261738                        M
261742                        S
Name: subject, dtype: object
0         ELA
6        Math
12    Science
Name: subject, dtype: object


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':


In [45]:
# reaggregate file
grouped_by = ['year', 'district_id', 'district', 'school_id', 'school', 'grade', 'group_state', 'subject', 'performance_level']
df1 = df.groupby(grouped_by, as_index = False)['num_tested'].sum()
df2 = df.groupby(grouped_by, as_index = False)['num_at_level'].sum()
df = pd.merge(df1, df2, on = grouped_by)
print(df.head())

   year  district_id                     district  school_id  \
0  2015  100510000.0  FIRST PHILA CS FOR LITERACY     7744.0   
1  2015  100510000.0  FIRST PHILA CS FOR LITERACY     7744.0   
2  2015  100510000.0  FIRST PHILA CS FOR LITERACY     7744.0   
3  2015  100510000.0  FIRST PHILA CS FOR LITERACY     7744.0   
4  2015  100510000.0  FIRST PHILA CS FOR LITERACY     7744.0   

                        school grade   group_state subject  \
0  FIRST PHILA CS FOR LITERACY     3  All Students     ELA   
1  FIRST PHILA CS FOR LITERACY     3  All Students     ELA   
2  FIRST PHILA CS FOR LITERACY     3  All Students     ELA   
3  FIRST PHILA CS FOR LITERACY     3  All Students     ELA   
4  FIRST PHILA CS FOR LITERACY     3  All Students    Math   

     performance_level  num_tested  num_at_level  
0     Percent Advanced       149.0             2  
1        Percent Basic       149.0            63  
2  Percent Below Basic       149.0            40  
3   Percent Proficient       149.0    

In [46]:
# rename performance levels
df['performance_level'] = [x.replace('Percent ', '') for x in df['performance_level']]

0       Advanced
1          Basic
2    Below Basic
3     Proficient
Name: performance_level, dtype: object


In [48]:
# recalculate pct_at_level
df['pct_at_level'] = df['num_at_level'] / df['num_tested']

   year  district_id                     district  school_id  \
0  2015  100510000.0  FIRST PHILA CS FOR LITERACY     7744.0   
1  2015  100510000.0  FIRST PHILA CS FOR LITERACY     7744.0   
2  2015  100510000.0  FIRST PHILA CS FOR LITERACY     7744.0   
3  2015  100510000.0  FIRST PHILA CS FOR LITERACY     7744.0   
4  2015  100510000.0  FIRST PHILA CS FOR LITERACY     7744.0   

                        school grade   group_state subject performance_level  \
0  FIRST PHILA CS FOR LITERACY     3  All Students     ELA          Advanced   
1  FIRST PHILA CS FOR LITERACY     3  All Students     ELA             Basic   
2  FIRST PHILA CS FOR LITERACY     3  All Students     ELA       Below Basic   
3  FIRST PHILA CS FOR LITERACY     3  All Students     ELA        Proficient   
4  FIRST PHILA CS FOR LITERACY     3  All Students    Math          Advanced   

   num_tested  num_at_level  pct_at_level  
0       149.0             2      0.013423  
1       149.0            63      0.422819  
2 

In [49]:
# add proficient_tf
df['proficient_tf'] = [1 if x == 'Advanced' or x == 'Proficient' else 0 for x in df['performance_level']]

   proficient_tf performance_level
0              1          Advanced
1              0             Basic
2              0       Below Basic
3              1        Proficient


In [50]:
# export final dataset
df.to_csv('./data/finalized/pa_proficiency.csv', index=False)