In [13]:
import pandas as pd
import numpy as np
import util

proficiency_path = './data/tn/proficiency/'

proficiency_files = [
    'tn_proficiency_2012_cleaned.csv'
    , 'tn_proficiency_2013_cleaned.csv'
    , 'tn_proficiency_2014_cleaned.csv'
    , 'tn_proficiency_2015_cleaned.csv'
    , 'tn_proficiency_2016_cleaned.csv'
    , 'tn_proficiency_2017_cleaned.csv'
    , 'tn_proficiency_2018_cleaned.csv'
    , 'tn_proficiency_2019_cleaned.csv'
]

years = [x[15:19] for x in proficiency_files]

df = pd.read_csv(proficiency_path + proficiency_files[0])
df['year'] = df.apply(lambda x: years[0], axis=1)

for i in range(1, len(proficiency_files)):
    temp = pd.read_csv(proficiency_path + proficiency_files[i])
    temp['year'] = temp.apply(lambda x: years[i], axis=1)
    df = df.append(temp, ignore_index = True, sort=True)

In [14]:
# get school and district names for 2017 data
df_2017 = df[df['year'] == '2017']
print(df_2017.shape)
df_all = df[df['year'] != '2017']
print(df_all.shape)
del df_2017['school']
del df_2017['district']

df_ids = df_all[['district_id', 'district', 'school_id', 'school']].drop_duplicates()
df_2017 = pd.merge(df_2017, df_ids, on=['district_id', 'school_id'], how = 'left')
print(df_2017.shape)
df = df_all.append(df_2017, ignore_index = True, sort = True)
del df_2017
del df_all
del df_ids

(440633, 32)
(2443008, 32)
(766162, 32)


In [15]:
# rename aggregate group names
df['group_state'] = [x if x != 'All' else 'All Groups' for x in df['group_state']]
df['group_state'] = [x if x != 'All Students' else 'All Groups' for x in df['group_state']]
print(df['group_state'].drop_duplicates())

0                                                 All Groups
1                                                      Black
2          Economically Disadvantaged (Free or Reduced Pr...
3                                 Students with Disabilities
4                                                      White
24                                                     Asian
34                                                  Hispanic
453                                          Native American
1367                                          Hawaiian or PI
162781                          Hawaiian or Pacific Islander
162782                            Economically Disadvantaged
162784                             English Language Learners
162785                  English Language Learners with T1/T2
162786                        Black/Hispanic/Native American
582903                        Non-Economically Disadvantaged
582904                        Non-Students with Disabilities
582905                  

In [4]:
# create subject map
# subject_map = [['Algebra I', 'Math']
#                , ['Algebra II', 'Math']
#                , ['Biology I', 'Science']
#                , ['English I', 'ELA']
#                , ['English II', 'ELA']
#                , ['English III', 'ELA']
#                , ['Math', 'Math']
#                , ['Reading/Language', 'ELA']
#                , ['Science', 'Science']
#                , ['US History', 'US History']
#                , ['RLA', 'ELA']
#                , ['Social Studies', 'Social Studies']
#                , ['Chemistry', 'Science']
#                , ['Integrated Math I', 'Math']
#                , ['Integrated Math II', 'Math']
#                , ['Geometry', 'Math']
#                , ['Integrated Math III', 'Math']
#                , ['ELA', 'ELA']
#               ]
# df_subjects = pd.DataFrame(subject_map, columns = ['subject', 'subject_cleaned']) 

In [5]:
# join subject map to assessment data
# df = pd.merge(df, df_subjects, on='subject')

In [6]:
# delete uncleaned subject field and rename new
# del df['subject']
# df = df.rename(columns={"subject_cleaned": "subject"})

In [16]:
print(df.columns)

Index(['Number Advanced', 'Number Approaching', 'Number Basic', 'Number Below',
       'Number Below Basic', 'Number Enrolled', 'Number Mastered',
       'Number On Track', 'Number Proficient', 'Percent Advanced',
       'Percent Approaching', 'Percent Basic', 'Percent Below',
       'Percent Below Basic', 'Percent Below Basic or Basic',
       'Percent Mastered', 'Percent On Track', 'Percent On Track or Mastered',
       'Percent Proficient', 'Percent Proficient or Advanced', 'district',
       'district_id', 'enrolled', 'grade', 'group_state', 'num_tested',
       'school', 'school_id', 'subject', 'test', 'tested', 'year'],
      dtype='object')


In [17]:
id_vars = ['year', 'district_id', 'district', 'school_id', 'school', 'grade', 'group_state', 'subject', 'num_tested']
# reshape scores
df = pd.melt(df, id_vars=id_vars
        , value_vars=[
            'Number Advanced'
            , 'Number Mastered'
            , 'Number On Track'
            , 'Number Proficient'],
        var_name='performance_level'
        , value_name='num_at_level')

In [18]:
# create proficient_tf
df['proficient_tf'] = 1

In [19]:
# convert num_at_level and num_tested to float
def toFloat(row, column):
    try:
        return float(row[column])
    except:
        return np.nan
df['num_at_level'] = df.apply(lambda x: toFloat(x, 'num_at_level'), axis = 1)
df['num_tested'] = df.apply(lambda x: toFloat(x, 'num_tested'), axis = 1)

In [20]:
# remove rows with no scores
print(df.shape)
df = df.dropna(subset=['num_tested', 'num_at_level'])
print(df.shape)

(12836680, 12)
(1784074, 12)


In [21]:
# roll up into a single performance level
print(df.shape)
grouped_by = ['year', 'district_id', 'district', 'school_id', 'school', 'subject', 'grade', 'group_state', 'proficient_tf']
df = df.groupby(grouped_by, as_index=False).agg({'num_tested': 'sum', 'num_at_level': 'sum'})
print(df.shape)

(1784074, 12)
(891225, 11)


In [23]:
# create pct_at_level
df['pct_at_level'] = df['num_at_level'] / df['num_tested']

In [25]:
print(df.dtypes)
df['district_id'] = df['district_id'].astype(str)
df['school_id'] = df['school_id'].astype(str)
df['proficient_tf'] = df['proficient_tf'].astype(bool)
print(df.dtypes)

year              object
district_id        int64
district          object
school_id          int64
school            object
subject           object
grade             object
group_state       object
proficient_tf      int64
num_tested       float64
num_at_level     float64
pct_at_level     float64
dtype: object
year              object
district_id       object
district          object
school_id         object
school            object
subject           object
grade             object
group_state       object
proficient_tf       bool
num_tested       float64
num_at_level     float64
pct_at_level     float64
dtype: object


In [26]:
# save as cleaned proficiency
df.to_csv('./data/finalized/tn_proficiency.csv', index=False)