In [96]:
# 'a' file structure, grades 4-8 results by grade and group
import pandas as pd
import numpy as np
import util

proficiency_path = './data/la/proficiency/'

proficiency_files = [
    'la_proficiency_a_2015.csv',
    'la_proficiency_a_2016.csv',
    'la_proficiency_a_2017.csv',
    'la_proficiency_a_2018.csv',
    'la_proficiency_a_2019.csv'
]

years = [x[17:21] for x in proficiency_files]

df = pd.read_csv(proficiency_path + proficiency_files[0], dtype='str')
df['year'] = df.apply(lambda x: years[0], axis=1)

for i in range(1, len(proficiency_files)):
    temp = pd.read_csv(proficiency_path + proficiency_files[i], dtype='str')
    temp['year'] = temp.apply(lambda x: years[i], axis=1)
    temp['Subgroup'] = temp['Subgroup'].fillna('Total Population')
    df = df.append(temp, ignore_index = True, sort=True)

In [97]:
# drop non-School records
print(df.shape)
df = df[df['Summary Level'] == 'School']
print(df.shape)

(421447, 28)
(384067, 28)


In [98]:
# rename columns
df = df.rename(columns={
    'School Code': 'school_id',
    'School Name': 'school',
    'School System Code': 'district_id',
    'School System Name': 'district',
    'Subgroup': 'group_state',
    'Grade': 'grade'
})
print(df.columns)

Index(['ELA % Advanced', 'ELA % Approaching Basic', 'ELA % Basic',
       'ELA % Mastery', 'ELA % Unsatisfactory', 'grade',
       'Mathematics % Advanced', 'Mathematics % Approaching Basic',
       'Mathematics % Basic', 'Mathematics % Mastery',
       'Mathematics % Unsatisfactory', 'school_id', 'school', 'district_id',
       'district', 'Science % Advanced', 'Science % Approaching Basic',
       'Science % Basic', 'Science % Mastery', 'Science % Unsatisfactory',
       'Social Studies % Advanced', 'Social Studies % Approaching Basic',
       'Social Studies % Basic', 'Social Studies % Mastery',
       'Social Studies % Unsatisfactory', 'group_state', 'Summary Level',
       'year'],
      dtype='object')


In [99]:
# split into math & ela files, drop other subjects
math_columns = ['year', 'district_id', 'district', 'school_id', 'school', 'group_state', 'grade', 'Mathematics % Advanced', 'Mathematics % Approaching Basic', 'Mathematics % Basic', 'Mathematics % Mastery', 'Mathematics % Unsatisfactory']
ela_columns = ['year', 'district_id', 'district', 'school_id', 'school', 'group_state', 'grade', 'ELA % Advanced', 'ELA % Approaching Basic', 'ELA % Basic', 'ELA % Mastery', 'ELA % Unsatisfactory']
df_math = df[math_columns]
df_ela = df[ela_columns]

In [100]:
# drop NR rows
print(df_math.shape)
df_math = df_math[df_math['Mathematics % Advanced'] != 'NR']
print(df_math.shape)
print(df_ela.shape)
df_ela = df_ela[df_ela['ELA % Advanced'] != 'NR']
print(df_ela.shape)

(384067, 12)
(158755, 12)
(384067, 12)
(159402, 12)


In [101]:
# process <5 rows
def suppressedCount(row):
    list = []
    list.append(row['advanced'])
    list.append(row['approaching basic'])
    list.append(row['basic'])
    list.append(row['mastery'])
    list.append(row['unsatisfactory'])
    return list.count('≤5') + list.count('<5')

df_math = df_math.rename(columns={
    'Mathematics % Advanced': 'advanced',
    'Mathematics % Approaching Basic': 'approaching basic',
    'Mathematics % Basic': 'basic',
    'Mathematics % Mastery': 'mastery',
    'Mathematics % Unsatisfactory': 'unsatisfactory'
})
df_math['suppressedCount'] = df_math.apply(lambda x: suppressedCount(x), axis = 1)

df_ela = df_ela.rename(columns={
    'ELA % Advanced': 'advanced',
    'ELA % Approaching Basic': 'approaching basic',
    'ELA % Basic': 'basic',
    'ELA % Mastery': 'mastery',
    'ELA % Unsatisfactory': 'unsatisfactory'
})
df_ela['suppressedCount'] = df_ela.apply(lambda x: suppressedCount(x), axis = 1)

In [102]:
def replaceSuppressed(row, column, advancedTF):
    if row[column] == '≤5' or row[column] == '<5':
        if row['suppressedCount'] == 1:
            if advancedTF:
                if (100.0 - float(row['approaching basic']) - float(row['basic']) - float(row['mastery']) - float(row['unsatisfactory'])) > 0:
                    return 100.0 - float(row['approaching basic']) - float(row['basic']) - float(row['mastery']) - float(row['unsatisfactory'])
                return 0
            if (100.0 - float(row['approaching basic']) - float(row['basic']) - float(row['advanced']) - float(row['unsatisfactory'])) > 0:
                return 100.0 - float(row['approaching basic']) - float(row['basic']) - float(row['advanced']) - float(row['unsatisfactory'])
            return 0
        if row['suppressedCount'] == 2:
            return 2.5
        if row['suppressedCount'] == 3:
            return 1.67
        if row['suppressedCount'] == 4:
            return 1.25
    return row[column]
df_math['cleanedAdvanced'] = df_math.apply(lambda x: replaceSuppressed(x, 'advanced', 1), axis = 1)
df_math['cleanedMastery'] = df_math.apply(lambda x: replaceSuppressed(x, 'mastery', 0), axis = 1)
df_ela['cleanedAdvanced'] = df_ela.apply(lambda x: replaceSuppressed(x, 'advanced', 1), axis = 1)
df_ela['cleanedMastery'] = df_ela.apply(lambda x: replaceSuppressed(x, 'mastery', 0), axis = 1)

In [103]:
def convertFloat(row, column):
    try:
        return float(row[column])
    except:
        return float(row[column].replace('>', '').replace('<', '').replace('≤', '').replace('≥', ''))

# add advanced and mastery
def calcPct(row):
    return (convertFloat(row, 'cleanedAdvanced') + convertFloat(row,'cleanedMastery')) / 100.0

df_math['pct_at_level'] = df_math.apply(lambda x: calcPct(x), axis = 1)
df_ela['pct_at_level'] = df_ela.apply(lambda x: calcPct(x), axis = 1)

In [104]:
# append subjects
df_math['subject'] = 'Math'
df_ela['subject'] = 'ELA'
df = df_math.append(df_ela, ignore_index=True, sort=True)

In [105]:
# set performance_level and proficient_tf
df['proficient_tf'] = 1
df['performance_level'] = 'Mastery and Advanced'

In [106]:
# remove rows with no scores
print(df.shape)
df = df[df['pct_at_level'] != 0]
df = df.dropna(subset=['pct_at_level'])
print(df.shape)

(318157, 19)
(318157, 19)


In [107]:
# filter columns
df = df[['district', 'district_id', 'grade', 'group_state', 'pct_at_level', 'school', 'school_id', 'subject', 'year', 'proficient_tf']]

In [108]:
# calculate 'All' grades rows
grouped_by = ['district', 'district_id', 'group_state', 'school', 'school_id', 'subject', 'year', 'proficient_tf']
df_allgrades = df.groupby(grouped_by, as_index=False)['pct_at_level'].mean()
df_allgrades['grade'] = 'All'
print(df_allgrades.head(5))

                          district district_id                group_state  \
0  A.E. Phillips Laboratory School         322  Black or African American   
1  A.E. Phillips Laboratory School         322  Black or African American   
2  A.E. Phillips Laboratory School         322  Black or African American   
3  A.E. Phillips Laboratory School         322  Black or African American   
4  A.E. Phillips Laboratory School         322  Black or African American   

                             school school_id subject  year  proficient_tf  \
0  A. E. Phillips Laboratory School    322001     ELA  2015              1   
1  A. E. Phillips Laboratory School    322001     ELA  2016              1   
2  A. E. Phillips Laboratory School    322001     ELA  2017              1   
3  A. E. Phillips Laboratory School    322001     ELA  2018              1   
4  A. E. Phillips Laboratory School    322001     ELA  2019              1   

   pct_at_level grade  
0       0.80000   All  
1       0.74670   Al

In [109]:
# append 'All' grades rows to main df
print(df.shape)
df = df.append(df_allgrades, ignore_index=True, sort=True)
del df_allgrades
print(df.shape)

(318157, 10)
(400443, 10)


In [110]:
print(df.dtypes)
df['proficient_tf'] = df['proficient_tf'].astype(bool)
print(df.dtypes)

district          object
district_id       object
grade             object
group_state       object
pct_at_level     float64
proficient_tf      int64
school            object
school_id         object
subject           object
year              object
dtype: object
district          object
district_id       object
grade             object
group_state       object
pct_at_level     float64
proficient_tf       bool
school            object
school_id         object
subject           object
year              object
dtype: object


In [111]:
# 'b' file structure, 2017 - 2019 EOC
b_files = [
    'la_proficiency_b_2017.csv'
    , 'la_proficiency_b_2018.csv'
    , 'la_proficiency_b_2019.csv'
]

years = [x[17:21] for x in b_files]

df_b = pd.DataFrame()

for i in range(0, len(b_files)):
    temp = pd.read_csv(proficiency_path + b_files[i], dtype='str')
    temp['year'] = temp.apply(lambda x: years[i], axis=1)
    df_b = df_b.append(temp, ignore_index = True, sort=True)

In [112]:
# drop non-School records
print(df_b.shape)
df_b = df_b[df_b['Summary Level'] == 'School']
print(df_b.shape)

(45277, 74)
(40587, 74)


In [113]:
# rename columns
df_b = df_b.rename(columns={
    'School System Code': 'district_id'
    , 'School System Name': 'district'
    , 'School Code': 'school_id'
    , 'School Name': 'school'
})

In [114]:
# create combined subgroup column
df_b['Group'] = df_b['Group'].fillna('NA')
df_b['Subgroup'] = df_b['Subgroup'].fillna('NA')
df_b['group_state'] = df_b['Group'] + df_b['Subgroup']

In [115]:
# create grade as HS/EOC
df_b['grade'] = 'HS/EOC'

In [116]:
# pivot dataset
id_vars = ['year', 'district_id', 'district', 'school_id', 'school', 'grade', 'group_state']
value_vars = ['Algebra I % Advanced', 'Algebra I % Excellent', 'Algebra I % Good', 'Algebra I % Mastery','Biology % Excellent', 'Biology % Good','Biology (EOC) % Excellent','Biology (EOC) % Good','English I % Advanced','English I % Mastery','English II % Advanced','English II % Excellent', 'English II % Good', 'English II % Mastery', 'English III % Excellent', 'English III % Good','English III (EOC) % Excellent','English III (EOC) % Good','Geometry % Advanced', 'Geometry % Excellent', 'Geometry % Fair', 'Geometry % Good','Geometry % Mastery','U.S. History  % Excellent', 'U.S. History  % Good', 'U.S. History % Advanced', 'U.S. History % Mastery', 'U.S. History (EOC) % Excellent', 'U.S. History (EOC) % Good', 'U.S. History (LEAP 2025) % Advanced', 'U.S. History (LEAP 2025) % Mastery']
df_b = pd.melt(df_b, id_vars = id_vars, value_vars = value_vars, var_name = 'subject' , value_name = 'pct_at_level')
print(df_b.head(5))

   year district_id       district school_id                    school  \
0  2017           1  Acadia Parish      1005  Church Point High School   
1  2017           1  Acadia Parish      1005  Church Point High School   
2  2017           1  Acadia Parish      1005  Church Point High School   
3  2017           1  Acadia Parish      1005  Church Point High School   
4  2017           1  Acadia Parish      1005  Church Point High School   

    grade                                group_state               subject  \
0  HS/EOC                         Total PopulationNA  Algebra I % Advanced   
1  HS/EOC                                 GenderMale  Algebra I % Advanced   
2  HS/EOC                               GenderFemale  Algebra I % Advanced   
3  HS/EOC                   EthnicityHispanic/Latino  Algebra I % Advanced   
4  HS/EOC  EthnicityAmerican Indian or Alaska Native  Algebra I % Advanced   

  pct_at_level  
0          NaN  
1          NaN  
2          NaN  
3          NaN  
4

In [117]:
# trim everything from % on from column names
df_b ['new_subject'] = [x[:x.find('%')] for x in df_b['subject']]
df_b ['new_subject'] = df_b ['new_subject'].str.strip()
del df_b['subject']
df_b = df_b.rename(columns={'new_subject':'subject'})

In [118]:
df_b['proficient_tf'] = 1

In [119]:
# convert pct_at_level to float
def toFloat(row, column):
    try:
        return float(row[column])
    except:
        return np.nan

df_b['pct_at_level'] = df_b.apply(lambda x: toFloat(x, 'pct_at_level'), axis=1)

In [120]:
# drop null and zero rows
print(df_b.shape)
df_b = df_b.dropna(subset=['pct_at_level'])
print(df_b.shape)
df_b = df_b[df_b['pct_at_level'] != 0]
print(df_b.shape)

(1258197, 10)
(127595, 10)
(127595, 10)


In [121]:
# divide by 100
df_b['pct_at_level'] = df_b['pct_at_level'] / 100

In [122]:
# calculate 'All' grades rows
grouped_by = ['district', 'district_id', 'group_state', 'school', 'school_id', 'subject', 'year', 'proficient_tf']
df_allgrades = df_b.groupby(grouped_by, as_index=False)['pct_at_level'].mean()
df_allgrades['grade'] = 'All'

# append 'All' grades rows to main df
print(df_b.shape)
df_b = df_b.append(df_allgrades, ignore_index=True, sort=True)
del df_allgrades
print(df_b.shape)

(127595, 10)
(203515, 10)


In [123]:
# unify dtypes
print(df_b.dtypes)
df_b['proficient_tf'] = df_b['proficient_tf'].astype(bool)
print(df_b.dtypes)

district          object
district_id       object
grade             object
group_state       object
pct_at_level     float64
proficient_tf      int64
school            object
school_id         object
subject           object
year              object
dtype: object
district          object
district_id       object
grade             object
group_state       object
pct_at_level     float64
proficient_tf       bool
school            object
school_id         object
subject           object
year              object
dtype: object


In [124]:
# 'c' file structure, 2009 - 2012 EOC
c_files = [
    'la_proficiency_c_20092012_algebrai.csv'
    , 'la_proficiency_c_20092012_biology.csv'
    , 'la_proficiency_c_20092012_englishii.csv'
    , 'la_proficiency_c_20092012_englishiii.csv'
    , 'la_proficiency_c_20092012_geometry.csv'
]

subjects = [x[26:x.find('.csv')] for x in c_files]
print(subjects)

df_c = pd.DataFrame()

for i in range(0, len(c_files)):
    temp = pd.read_csv(proficiency_path + c_files[i], dtype='str')
    temp['subject'] = subjects[i]
    temp['grade'] = 'HS/EOC'
    temp['group_state'] = 'All Students'
    df_c = df_c.append(temp, ignore_index = True, sort=True)
print(df_c.head(5))

['algebrai', 'biology', 'englishii', 'englishiii', 'geometry']
  2009 to 2012 Growth December 2008 and May 2009 December 2009 and May 2010  \
0                  -8                         39                         20   
1                   3                         47                         54   
2                   6                         54                         42   
3                  20                         26                         37   
4                   6                         53                         44   

  December 2010 and May 2011 December 2011 and May 2012 School Code  \
0                         29                         31        1005   
1                         50                         50        1007   
2                         49                         60        1017   
3                         49                         46        1021   
4                         66                         59        1034   

                School Name School 

In [125]:
# rename columns
df_c = df_c.rename(columns={
    'School System Code': 'district_id'
    , 'School System Name': 'district'
    ,'School Code': 'school_id'
    ,'School Name': 'school'
    ,'December 2008 and May 2009': '2009'
    ,'December 2009 and May 2010': '2010'
    ,'December 2010 and May 2011': '2011'
    ,'December 2011 and May 2012': '2012'
})

In [126]:
# pivot scores
id_vars = ['district_id', 'district', 'school_id', 'school', 'grade', 'subject', 'group_state']
value_vars = ['2009', '2010', '2011', '2012']
df_c = pd.melt(df_c, id_vars = id_vars, value_vars = value_vars, var_name = 'year' , value_name = 'pct_at_level')
print(df_c.head(5))

  district_id       district school_id                    school   grade  \
0           1  Acadia Parish      1005  Church Point High School  HS/EOC   
1           1  Acadia Parish      1007       Crowley High School  HS/EOC   
2           1  Acadia Parish      1017       Midland High School  HS/EOC   
3           1  Acadia Parish      1021         Rayne High School  HS/EOC   
4           1  Acadia Parish      1034          Iota High School  HS/EOC   

    subject   group_state  year pct_at_level  
0  algebrai  All Students  2009           39  
1  algebrai  All Students  2009           47  
2  algebrai  All Students  2009           54  
3  algebrai  All Students  2009           26  
4  algebrai  All Students  2009           53  


In [127]:
# convert scores to float, drop nulls and zeros
# convert pct_at_level to float
def toFloat(row, column):
    try:
        return float(row[column])
    except:
        return np.nan

df_c['pct_at_level'] = df_c.apply(lambda x: toFloat(x, 'pct_at_level'), axis=1)

# drop null and zero rows
print(df_c.shape)
df_c = df_c.dropna(subset=['pct_at_level'])
print(df_c.shape)
df_c = df_c[df_c['pct_at_level'] != 0]
print(df_c.shape)

(7836, 9)
(4600, 9)
(4600, 9)


In [128]:
# divide by 100
df_c['pct_at_level'] = df_c['pct_at_level'] / 100

In [129]:
# calculate 'All' grades rows
grouped_by = ['district', 'district_id', 'group_state', 'school', 'school_id', 'subject', 'year']
df_allgrades = df_c.groupby(grouped_by, as_index=False)['pct_at_level'].mean()
df_allgrades['grade'] = 'All'

# append 'All' grades rows to main df
print(df_c.shape)
df_c = df_c.append(df_allgrades, ignore_index=True, sort=True)
del df_allgrades
print(df_c.shape)

(4600, 9)
(9200, 9)


In [130]:
# create proficient_tf
df_c['proficient_tf'] = 1

In [131]:
# unify dtypes
print(df_c.dtypes)
df_c['proficient_tf'] = df_c['proficient_tf'].astype(bool)
print(df_c.dtypes)

district          object
district_id       object
grade             object
group_state       object
pct_at_level     float64
school            object
school_id         object
subject           object
year              object
proficient_tf      int64
dtype: object
district          object
district_id       object
grade             object
group_state       object
pct_at_level     float64
school            object
school_id         object
subject           object
year              object
proficient_tf       bool
dtype: object


In [132]:
# 'd' file structure, 2013 EOC
d_files = [
    'la_proficiency_d_2013_algebrai.csv'
    , 'la_proficiency_d_2013_biology.csv'
    , 'la_proficiency_d_2013_englishii.csv'
    , 'la_proficiency_d_2013_englishiii.csv'
    , 'la_proficiency_d_2013_geometry.csv'
]

subjects = [x[22:x.find('.csv')] for x in d_files]
print(subjects)

df_d = pd.DataFrame()

for i in range(0, len(d_files)):
    temp = pd.read_csv(proficiency_path + d_files[i], dtype='str')
    temp['subject'] = subjects[i]
    temp['grade'] = 'HS/EOC'
    temp['group_state'] = 'All Students'
    temp['year'] = '2013'
    temp['proficient_tf'] = 1
    df_d = df_d.append(temp, ignore_index = True, sort=True)
print(df_d.head(5))

['algebrai', 'biology', 'englishii', 'englishiii', 'geometry']
  Excellent Fair Good Needs Improvement School Code               School Name  \
0         4   44   25                28        1005  Church Point High School   
1        16   27   31                26        1007       Crowley High School   
2        36    8   53                 3        1017       Midland High School   
3        23   29   33                15        1021         Rayne High School   
4        14   36   34                17        1034          Iota High School   

  School System Code School System Name Total   grade   group_state  \
0                  1      Acadia Parish  ≥130  HS/EOC  All Students   
1                  1      Acadia Parish  ≥170  HS/EOC  All Students   
2                  1      Acadia Parish   ≥50  HS/EOC  All Students   
3                  1      Acadia Parish  ≥180  HS/EOC  All Students   
4                  1      Acadia Parish  ≥130  HS/EOC  All Students   

   proficient_tf   subj

In [133]:
# rename columns
df_d = df_d.rename(columns={
    'School System Code': 'district_id',
    'School System Name': 'district',
    'School Code': 'school_id',
    'School Name': 'school'
})

In [134]:
# convert Excellent and Good to float, all together, drop nulls and zeros
df_d['Excellent'] = df_d.apply(lambda x: toFloat(x, 'Excellent'), axis=1)
df_d['Good'] = df_d.apply(lambda x: toFloat(x, 'Good'), axis=1)
df_d['pct_at_level'] = df_d['Excellent'] + df_d['Good']

# drop null and zero rows
print(df_d.shape)
df_d = df_d.dropna(subset=['pct_at_level'])
print(df_d.shape)
df_d = df_d[df_d['pct_at_level'] != 0]
print(df_d.shape)

(2029, 15)
(1618, 15)
(1618, 15)


In [135]:
# divide pct at level by 100
df_d['pct_at_level'] = df_d['pct_at_level'] / 100

In [136]:
# calculate 'All' grades rows
grouped_by = ['district', 'district_id', 'group_state', 'school', 'school_id', 'subject', 'year', 'proficient_tf']
df_allgrades = df_d.groupby(grouped_by, as_index=False)['pct_at_level'].mean()
df_allgrades['grade'] = 'All'

# append 'All' grades rows to main df
print(df_d.shape)
df_d = df_d.append(df_allgrades, ignore_index=True, sort=True)
del df_allgrades
print(df_d.shape)

(1618, 15)
(3236, 15)


In [137]:
# filter to wanted columns
df_d = df_d[['district', 'district_id', 'group_state', 'school', 'school_id', 'subject', 'grade', 'year', 'proficient_tf', 'pct_at_level']]

In [138]:
# unify dtypes
print(df_d.dtypes)
df_d['proficient_tf'] = df_d['proficient_tf'].astype(bool)
print(df_d.dtypes)

district          object
district_id       object
group_state       object
school            object
school_id         object
subject           object
grade             object
year              object
proficient_tf      int64
pct_at_level     float64
dtype: object
district          object
district_id       object
group_state       object
school            object
school_id         object
subject           object
grade             object
year              object
proficient_tf       bool
pct_at_level     float64
dtype: object


In [139]:
# join all results together
df = df.append(df_b, ignore_index=True, sort=True)
df = df.append(df_c, ignore_index=True, sort=True)
df = df.append(df_d, ignore_index=True, sort=True)
print(df.dtypes)

district          object
district_id       object
grade             object
group_state       object
pct_at_level     float64
proficient_tf       bool
school            object
school_id         object
subject           object
year              object
dtype: object


In [140]:
print(df[['year', 'grade', 'group_state', 'subject']].drop_duplicates())

        year   grade                 group_state     subject
0       2015       6   Black or African American        Math
1       2015       6  Economically Disadvantaged        Math
2       2015       6            Total Population        Math
3       2015       6                       White        Math
4       2015       7   Black or African American        Math
5       2015       7  Economically Disadvantaged        Math
6       2015       7            Total Population        Math
7       2015       7                       White        Math
8       2015       8   Black or African American        Math
9       2015       8  Economically Disadvantaged        Math
10      2015       8    Students with Disability        Math
11      2015       8            Total Population        Math
12      2015       8                       White        Math
13      2015       3  Economically Disadvantaged        Math
14      2015       3            Total Population        Math
15      2015       3    

In [141]:
# export final dataset
df.to_csv('./data/finalized/la_proficiency.csv', index=False)