In [72]:
import pandas as pd
import numpy as np
import util

proficiency_path = './data/mn/proficiency/'

# Tab-separated raw files read from `proficiency_path + 'raw/'`.
proficiency_files = [
    '2013MCA3MathPublicFilter9.tab'
    , '2013MCA3ReadingPublicFilter9.tab'
    , '2013MCAMathPublicFilter9.tab'
    , '2013MTAS3MathPublicFilter9.tab'
    , '2013MTAS3ReadingPublicFilter9.tab'
    , '2013MTASMathPublicFilter9.tab'
    , '2014MCA3MathPublicFilter9.tab'
    , '2014MCA3ReadingPublicFilter9.tab'
    , '2014MTAS3MathPublicFilter9.tab'
    , '2014MTAS3ReadingPublicFilter9.tab'
    , '2015MCA3MathPublicFilter9.tab'
    , '2015MCA3ReadingPublicFilter9.tab'
    , '2015MTAS3MathPublicFilter9.tab'
    , '2015MTAS3ReadingPublicFilter9.tab'
    , '2016MCA3MathPublicFilter9.tab'
    , '2016MCA3ReadingPublicFilter9.tab'
    , '2016MTAS3MathPublicFilter9.tab'
    , '2016MTAS3ReadingPublicFilter9.tab'
    , '2017MCA3MathPublicFilter9.tab'
    , '2017MCA3ReadingPublicFilter9.tab'
    , '2017MTAS3MathPublicFilter9.tab'
    , '2017MTAS3ReadingPublicFilter9.tab'
    , '2018MCA3MathPublicFilter9.tab'
    , '2018MCA3ReadingPublicFilter9.tab'
    , '2018MTAS3MathPublicFilter9.tab'
    , '2018MTAS3ReadingPublicFilter9.tab'
]

# The first four characters of each filename are parsed as the year.
years = [int(x[:4]) for x in proficiency_files]

# Columns kept from every raw file so the per-file frames stack cleanly.
# 'filename' and 'year' are added below; the rest must exist in the file.
proficiency_columns = [
    'countLevel1'
    , 'countLevel2'
    , 'countLevel3'
    , 'countLevel4'
    , 'countTested'
    , 'districtName'
    , 'districtNumber'
    , 'grade'
    , 'schoolName'
    , 'schoolNumber'
    , 'subject'
    , 'reportCategory'
    , 'ReportDescription'
    , 'filename'
    , 'year'
]

# Read every file once, tag it with its year and source filename, and keep
# only the shared columns. Frames are collected in a list and concatenated a
# single time (DataFrame.append was removed in pandas 2.0, and growing a frame
# inside a loop copies it on every iteration).
frames = []
for filename, year in zip(proficiency_files, years):
    raw = pd.read_csv(proficiency_path + 'raw/' + filename, sep='\t', header=0)
    raw['year'] = year
    raw['filename'] = filename

    # Only a column mismatch is tolerated; any other failure (unreadable file,
    # interrupted kernel, ...) now surfaces instead of being swallowed by a
    # bare `except:`.
    missing_columns = [col for col in proficiency_columns if col not in raw.columns]
    if missing_columns:
        print('Columns for ' + filename + ' not matching.')
        continue

    frames.append(raw[proficiency_columns])

if not frames:
    raise ValueError('No proficiency files matched the expected columns.')

df = pd.concat(frames, ignore_index=True, sort=True)

In [73]:
# import 2019 data and append to other years
# The 2019 files are read with read_csv defaults and use different column
# names from the earlier files, so they are loaded separately and renamed
# below to the earlier schema.
files = ['2019 Math MCA MTAS Results.csv', '2019 Reading MCA MTAS Results.csv']

# Collect the per-file frames and concatenate once (DataFrame.append was
# removed in pandas 2.0).
frames_2019 = []
for file in files:
    temp = pd.read_csv(proficiency_path + file)
    temp['year'] = 2019
    temp['filename'] = file
    frames_2019.append(temp)

df_2019 = pd.concat(frames_2019, ignore_index=True, sort=True)

# .copy() so the column assignment below works on an independent frame rather
# than on a slice of the concatenated one.
df_2019 = df_2019[[
    'Count Level D'
    , 'Count Level E'
    , 'Count Level M'
    , 'Count Level P'
    , 'Count Valid Scores MCA'
    , 'Count Valid Scores MTAS'
    , 'District Name'
    , 'District Number'
    , 'Grade'
    , 'School Name'
    , 'School Number'
    , 'Subject'
    , 'Group Category'
    , 'Student Group'
    , 'filename'
    , 'year'
]].copy()

# combine valid scores count
# NOTE(review): a missing value in either count makes the total NaN - confirm
# that is the intended treatment for rows reporting only one of the two tests.
df_2019['countTested'] = df_2019['Count Valid Scores MCA'] + df_2019['Count Valid Scores MTAS']
df_2019 = df_2019.drop(columns=['Count Valid Scores MCA', 'Count Valid Scores MTAS'])

# rename fields to match other years
# Level letters are mapped D -> 1, P -> 2, M -> 3, E -> 4 (presumably
# Does not meet / Partially meets / Meets / Exceeds - confirm against the
# source data dictionary).
df_2019 = df_2019.rename(columns={
    "District Name": "districtName"
    , "District Number": "districtNumber"
    , "School Name": "schoolName"
    , "School Number": "schoolNumber"
    , 'Grade': 'grade'
    , 'Subject': 'subject'
    , 'Group Category': 'reportCategory'
    , 'Student Group' : 'ReportDescription'
    , 'Count Level D' : 'countLevel1'
    , 'Count Level P' : 'countLevel2'
    , 'Count Level M' : 'countLevel3'
    , 'Count Level E' : 'countLevel4'
})

In [74]:
# append 2019 data to all other data
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# sort=True orders the columns alphabetically when the two frames list them
# in a different order.
df = pd.concat([df, df_2019], ignore_index=True, sort=True)

# Switch to the output column names. Both inputs already use lower-case
# 'grade' and 'subject', so those need no entry here; 'reportCategory',
# 'filename' and 'year' are kept under their existing names.
df = df.rename(columns={
    "districtName": "district"
    , "districtNumber": "district_id"
    , "schoolName": "school"
    , "schoolNumber": "school_id"
    , 'ReportDescription' : 'group_state'
    , 'countLevel1' : 'Level 1'
    , 'countLevel2' : 'Level 2'
    , 'countLevel3' : 'Level 3'
    , 'countLevel4' : 'Level 4'
    , 'countTested' : 'num_tested'
})

In [75]:
# Keep only rows whose school name is something other than 'All Schools'
# (rows with a missing school name are kept as well).
df = df.loc[df['school'].ne('All Schools')]

In [76]:
# Keep a row only if at least one of the four level counts is present.
df = df.loc[df[['Level 1', 'Level 2', 'Level 3', 'Level 4']].notna().any(axis=1)]

In [77]:
# Expand the one-letter subject codes; every other value passes through as is.
# NOTE(review): 'MATH' is upper-case while 'Reading' is title-case - confirm
# this matches the subject values that arrive already spelled out.
subject_names = {'M': 'MATH', 'R': 'Reading'}
df['subject'] = [subject_names.get(code, code) for code in df['subject']]

In [78]:
# Go from wide to long: one output row per input row and performance level.
# Columns that are neither identifiers nor level counts are dropped by melt.
id_vars = ['year', 'district_id', 'district', 'school_id', 'school', 'grade', 'group_state', 'subject', 'num_tested']
df = df.melt(
    id_vars=id_vars,
    value_vars=['Level 1', 'Level 2', 'Level 3', 'Level 4'],
    var_name='performance_level',
    value_name='num_at_level',
)

In [79]:
# create All Grades rollups
# Sum both counts over every key except `grade`, so each
# year/district/school/group/subject/performance-level combination gets one
# extra row labelled 'All Grades'. All rows sharing those keys are summed
# together. groupby skips rows where any key is missing.
grouped_by = ['year', 'district_id', 'district', 'school_id', 'school', 'group_state', 'subject', 'performance_level']

# One groupby over both value columns gives the same result as summing each
# column separately and merging the two results on the keys.
df_allgrades = df.groupby(grouped_by, as_index=False)[['num_tested', 'num_at_level']].sum()
df_allgrades['grade'] = 'All Grades'

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
df = pd.concat([df, df_allgrades], ignore_index=True, sort=True)

In [80]:
# create proficient_tf
def proficientTF (row):
    """Return 1 when the row's performance level counts as proficient, else 0.

    `row` is anything indexable by 'performance_level' (a DataFrame row or a
    dict). 'Level 3' and 'Level 4' are the proficient levels; every other
    value, including a missing one, yields 0.
    """
    return 1 if row['performance_level'] in ('Level 3', 'Level 4') else 0
# Flag every row by passing it to proficientTF (the function is handed to
# apply directly; no wrapper lambda is needed).
df['proficient_tf'] = df.apply(proficientTF, axis=1)

In [81]:
# create pct_at_level
# Share of tested students at each level, computed column-wise rather than
# with a row-by-row apply. A num_tested of 0 (the All Grades sum produces 0
# when every contributing count is missing) is masked to NaN first, so the
# result is NaN for those rows instead of a division-by-zero error or inf.
df['pct_at_level'] = df['num_at_level'] / df['num_tested'].where(df['num_tested'] != 0)

In [82]:
# Write the finished table to CSV without the row index.
df.to_csv(path_or_buf='./data/finalized/mn_proficiency.csv', index=False)