In [1]:
import pandas as pd
import numpy as np
import util

proficiency_path = './data/ma/proficiency/raw/'

# import and append all files from file path
# from os import listdir
# from os.path import isfile, join
# onlyfiles = [f for f in listdir(proficiency_path) if isfile(join(proficiency_path, f))]

# initialize list of years, grades, and group
years = ['2019', '2018', '2017', '2016', '2015', '2014', '2013']
grades = ['ALL', '03', '04', '05', '06', '07', '08', '10', 'HS SCI']
groups = ['All Students', 'Non-Econ. Disadvantaged', 'Econ. Disadvantaged', 'Non-Low Income', 'Low income', 'High needs', 'Migrant', 'Moderate or High Level of Need', 'Amer. Ind. or Alaska Nat.', 'Afr. Amer./Black', 'Asian', 'Hispanic/Latino', 'White', 'Nat. Haw. or Pacif. Isl.', 'Multi-race, Non-Hisp./Lat.', 'Former EL', 'EL', 'EL and Former EL', 'Ever EL', 'Students w/disabilities', 'Non-Title1', 'Title1', 'Non-Disabled', 'Foster Care', 'Homeless', 'Military']

df_mcas = pd.DataFrame()

# build files list
for year in years:
    for grade in grades:
        for group in groups:
            filename = 'mcas_' + grade.replace('.', '').replace('/', '') + '_' + group.replace('.', '').replace('/', '') + '_' + year.replace('.', '').replace('/', '') + '.xlsx'
            temp = pd.read_excel(proficiency_path + filename, dtype='str', header = 1)
            temp['year'] = year
            temp['grade'] = grade
            temp['group_state'] = group
            df_mcas = df_mcas.append(temp, ignore_index = True, sort=True)

print(df_mcas.head())

   A # A % Ave. SGP     CPI Included In Ave. SGP Included In Median SGP  \
0   33  35      NaN    95.8                  NaN                    NaN   
1   37  27      NaN    90.3                  NaN                    NaN   
2   10  16      NaN    81.0                  NaN                    NaN   
3  295  72      NaN    99.1                  NaN                    NaN   
4    5  12      NaN    82.3                  NaN                    NaN   

  Median SGP NI # NI %  P #  ... P+A % School Code  \
0        NaN   10   11   51  ...    88    04450105   
1        NaN   37   27   63  ...    72    00010505   
2        NaN   28   44   23  ...    52    04120530   
3        NaN    7    2  105  ...    98    06000505   
4        NaN   11   27   21  ...    63    06030505   

                                         School Name Student Included  \
0  Abby Kelley Foster Charter Public (District) -...               95   
1                           Abington - Abington High              139   
2  Ac

In [2]:
# import next-gen MCAS results
years = ['2019', '2018', '2017']
grades = ['ALL (03-08)', '03', '04', '05', '06', '07', '08', '10']
groups = ['All Students', 'Afr. Amer./Black', 'Hispanic/Latino', 'Econ. Disadvantaged', 'White']

nextgen_path = './data/ma/proficiency/nextgen_mcas/'

df_ng = pd.DataFrame()

# build files list
for year in years:
    for grade in grades:
        for group in groups:
            filename = 'mcas_nextgen_' + grade.replace('.', '').replace('/', '') + '_' + group.replace('.', '').replace('/', '') + '_' + year.replace('.', '').replace('/', '') + '.xlsx'
            temp = pd.read_excel(nextgen_path + filename, dtype='str', header = 1)
            temp['year'] = year
            temp['grade'] = grade
            temp['group_state'] = group
            df_ng = df_ng.append(temp, ignore_index = True, sort=True)
print(df_ng.head(5))

  Ach. PCTL Avg. Scaled Score           E # E % Included In SGP           M #  \
0        38             499.4            37   5             578           309   
1        35             495.9            23   3             579           259   
2        27             492.5             4   2             NaN            75   
3        47             501.8            51   8             634           300   
4        41             497.4            29   4             634           265   

  M %         M+E # M+E %          NM #  ... No. of Students Included  \
0  44           346    49            50  ...                      699   
1  37           282    40            60  ...                      699   
2  32            79    34            21  ...                      234   
3  46           351    54            46  ...                      654   
4  40           294    45            53  ...                      655   

           PM # PM %             SGP School Code  \
0           303   43  

In [3]:
# rename next gen columns, then append to df
df_ng = df_ng.rename(columns={
    'M+E #': 'P+A #',
    'No. of Students Included': 'Student Included'
})
df = df_mcas.append(df_ng, ignore_index=True, sort=True)

In [4]:
# rename columns
df = df.rename(columns={
    'Subject': 'subject',
    'P+A #': 'num_at_level',
    'Student Included': 'num_tested'
})
print(df.columns)

Index(['A #', 'A %', 'Ach. PCTL', 'Ave. SGP', 'Avg. Scaled Score', 'CPI',
       'E #', 'E %', 'Included In Ave. SGP', 'Included In Median SGP',
       'Included In SGP', 'M #', 'M %', 'M+E %', 'Median SGP', 'NI #', 'NI %',
       'NM #', 'NM %', 'P #', 'P %', 'num_at_level', 'P+A %', 'PM #', 'PM %',
       'SGP', 'School Code', 'School Name', 'num_tested', 'subject', 'W/F #',
       'W/F %', 'grade', 'group_state', 'year'],
      dtype='object')


In [5]:
# split school_id and school into district id and code
def splitSchool(row, type):
    if type == 'school':
        return row['School Name'][row['School Name'].find(' - ') + 3:]
    if type == 'district':
        return row['School Name'][:row['School Name'].find(' - ')]

df['district_id'] = df.apply(lambda x: x['School Code'][:4], axis = 1)
df['district'] = df.apply(lambda x: splitSchool(x, 'district'), axis = 1)
df['school_id'] = df.apply(lambda x: x['School Code'][4:], axis = 1)
df['school'] = df.apply(lambda x: splitSchool(x, 'school'), axis = 1)

In [6]:
# add performance level
df['performance_level'] = 'Proficient and Advanced'
df['proficient_tf'] = 1

In [7]:
# drop unneeded columns
df = df[['year', 'district_id', 'district', 'school_id', 'school', 'grade', 'group_state', 'subject', 'proficient_tf', 'num_at_level', 'num_tested']]

In [8]:
# remove state total rows
print(df.shape)
df = df[df['district'] != 'State Totals']
print(df.shape)

(700035, 11)
(698004, 11)


In [9]:
# convert num_at_level and num_tested to float
def toFloat(row, column):
    try:
        return float(row[column])
    except:
        return np.nan

df['num_at_level'] = df.apply(lambda x: toFloat(x,'num_at_level'), axis=1)
df['num_tested'] = df.apply(lambda x: toFloat(x,'num_tested'), axis=1)

In [10]:
# remove rows with null scores
print(df.shape)
df = df.dropna(subset=['num_at_level', 'num_tested'])
print(df.shape)
df = df[df['num_at_level'] != 0]
print(df.shape)

(698004, 11)
(697677, 11)
(685406, 11)


In [11]:
# roll up since we imported next gen scores for some groups
print(df.shape)
grouped_by = ['year', 'district_id', 'district', 'school_id', 'school', 'grade', 'group_state', 'subject', 'proficient_tf']
df1 = df.groupby(grouped_by, as_index=False)['num_at_level'].sum()
print(df1.shape)
df2 = df.groupby(grouped_by, as_index=False)['num_tested'].sum()
print(df2.shape)
df = df1.merge(df2, on=grouped_by)
print(df.shape)
del df1
del df2

(685406, 11)
(685406, 10)
(685406, 10)
(685406, 11)


In [12]:
# create pct_at_level
df['pct_at_level'] = df['num_at_level'] / df['num_tested']

In [13]:
# unify data types
print(df.dtypes)
df['proficient_tf'] = df['proficient_tf'].astype(bool)
print(df.dtypes)

year              object
district_id       object
district          object
school_id         object
school            object
grade             object
group_state       object
subject           object
proficient_tf      int64
num_at_level     float64
num_tested       float64
pct_at_level     float64
dtype: object
year              object
district_id       object
district          object
school_id         object
school            object
grade             object
group_state       object
subject           object
proficient_tf       bool
num_at_level     float64
num_tested       float64
pct_at_level     float64
dtype: object


In [15]:
# export final dataset
df.to_csv('./data/finalized/ma_proficiency.csv', index=False)