In [3]:
# import files
import pandas as pd
import numpy as np

path = './data/studentdiscipline/raw/'

years = ['2018-19', '2017-18']
types = ['All Offenses', 'Bullying', 'Destruction of school property due to arson', 'Felony conviction or complaint', 'Homicide (murder or manslaughter)', 'Illegal substances', 'Kidnapping (abduction)', 'Non-drug, non-violent or non-criminal-related offense', 'Other violence or substance-related offense', 'Physical attack (battery)', 'Physical fight ', 'Robbery using force', 'Sexual assault', 'Sexual harassment', 'Theft (school, staff or student property)', 'Threat of physical attack', 'Threat of robbery', 'Vandalismdestruction of property', 'Weapon on school premises']
groups = ['All Students', 'English Learner', 'Economically disadvantaged', 'Students wdisabilities', 'High needs', 'Female', 'Male', 'Amer Ind or Alaska Nat', 'Asian', 'Afr AmerBlack', 'HispanicLatino', 'Multi-race, Non-HispLat', 'Nat Haw or Pacif Isl', 'White']

df = pd.DataFrame()

for year in years:
    for type in types:
        for group in groups:
            temp = pd.read_excel(path + 'ma_studentdiscipline_' + type + '_' + group + '_' + year + '.xlsx', dtype={'District Name':str,'District Code':str,'Students':np.float64,'Students Disciplined':np.float64,'% In-School Suspension':np.float64,'% Out-of-School Suspension':np.float64,'% Expulsion':np.float64,'% Removed to Alternate Setting':np.float64,'% Emergency Removal':np.float64,'% Students with a School-Based Arrest':np.float64},skiprows=[0], thousands=',')
            temp['year'] = year
            temp['type'] = type
            temp['group_state'] = group
            df = df.append(temp, ignore_index=True, sort=True)

df = df.rename(columns={'District Name': 'district'})

print(df.shape)
print(df.head(5))

(56982, 13)
   % Emergency Removal  % Expulsion  % In-School Suspension  \
0                  0.0          0.0                     0.2   
1                  0.0          0.0                     3.0   
2                  0.5          0.0                     2.0   
3                  0.0          0.0                     0.3   
4                  0.0          0.0                     1.4   

   % Out-of-School Suspension  % Removed to Alternate Setting  \
0                         2.4                             0.0   
1                         1.7                             0.0   
2                         8.4                             0.0   
3                         0.5                             0.0   
4                         1.4                             0.0   

   % Students with a School-Based Arrest District Code  \
0                                    0.0      04450000   
1                                    0.0      00010000   
2                                    0.0    

In [4]:
# add manual data changes
print(df.shape)
df = df[df['District Code'] != '04070000']
print(df.shape)
df = df[df['District Code'] != '04110000']
print(df.shape)
df = df[df['District Code'] != '04230000']
print(df.shape)
df = df[df['District Code'] != '04270000']
print(df.shape)
df = df[df['District Code'] != '04520000']
print(df.shape)
df = df[df['District Code'] != '04710000']
print(df.shape)
df = df[df['District Code'] != '04770000']
print(df.shape)
df = df[df['District Code'] != '04800000']
print(df.shape)
df = df[df['District Code'] != '35050000']
print(df.shape)
df = df[df['District Code'] != '35110000']
print(df.shape)

# remove district totals & Horace Mann
print(df.shape)
df = df[df['District Code'] != '00000000']
print(df.shape)
df = df[~df['district'].str.contains('Horace Mann')]
print(df.shape)

(56982, 13)
(56916, 13)
(56758, 13)
(56758, 13)
(56730, 13)
(56616, 13)
(56616, 13)
(56569, 13)
(56361, 13)
(56168, 13)
(56125, 13)
(56125, 13)
(55691, 13)
(55691, 13)


In [5]:
df.to_csv('temp.csv')

In [6]:
# drop rows with null values
print(df.shape)
df = df.dropna(subset=['% Emergency Removal', '% Expulsion', '% In-School Suspension', '% Out-of-School Suspension', '% Removed to Alternate Setting', '% Students with a School-Based Arrest'], how='all')
print(df.shape)

(55691, 13)
(20264, 13)


In [7]:
print(df.columns)

Index(['% Emergency Removal', '% Expulsion', '% In-School Suspension',
       '% Out-of-School Suspension', '% Removed to Alternate Setting',
       '% Students with a School-Based Arrest', 'District Code', 'district',
       'Students', 'Students Disciplined', 'group_state', 'type', 'year'],
      dtype='object')


In [8]:
# remove last four digits of school code
df['district_id'] = df.apply(lambda x: x['District Code'][:4], axis = 1)
del df['District Code']
print(df['district_id'].drop_duplicates().head(5))

0    0445
1    0001
2    0412
3    0600
4    0003
Name: district_id, dtype: object


In [9]:
print(df.dtypes)
df['district_id'] = df['district_id'].astype('int64')
print(df.dtypes)

% Emergency Removal                      float64
% Expulsion                              float64
% In-School Suspension                   float64
% Out-of-School Suspension               float64
% Removed to Alternate Setting           float64
% Students with a School-Based Arrest    float64
district                                  object
Students                                 float64
Students Disciplined                     float64
group_state                               object
type                                      object
year                                      object
district_id                               object
dtype: object
% Emergency Removal                      float64
% Expulsion                              float64
% In-School Suspension                   float64
% Out-of-School Suspension               float64
% Removed to Alternate Setting           float64
% Students with a School-Based Arrest    float64
district                                  object
Studen

In [10]:
# use most recent district name per district_id
df_leas = df[['year', 'district_id', 'district']]
df_leas = df_leas.sort_values(['year', 'district_id', 'district'], ascending = False)
df_leas = df_leas.drop_duplicates(subset=['district_id'])
del df_leas['year']

# join back to df
del df['district']
print(df.shape)
df = df.merge(df_leas, on = ['district_id'])
print(df.shape)

(20264, 12)
(20264, 13)


In [11]:
# import file for charter flag
df_flags = pd.read_csv('./data/finalized/charter_to_district.csv')
df_flags = df_flags[df_flags.fy.eq('fy21')]
df_flags = df_flags[['charter_lea_code']].drop_duplicates()
df_flags = df_flags.rename(columns={'charter_lea_code': 'district_id'})
df_flags['charter_flag'] = 1

print(df.shape)
df = df.merge(df_flags, on = ['district_id'], how = 'left')
del df_flags
print(df.shape)
print(df.head(5))

(20264, 13)
(20264, 14)
   % Emergency Removal  % Expulsion  % In-School Suspension  \
0                  0.0          0.0                     0.2   
1                  0.0          0.0                     0.0   
2                  0.0          0.0                     1.0   
3                  0.0          0.0                     0.3   
4                  0.0          0.0                     0.1   

   % Out-of-School Suspension  % Removed to Alternate Setting  \
0                         2.4                             0.0   
1                         1.4                             0.0   
2                         3.1                             0.0   
3                         2.2                             0.0   
4                         0.9                             0.0   

   % Students with a School-Based Arrest  Students  Students Disciplined  \
0                                    0.0    1440.0                  35.0   
1                                    0.0     624.0    

In [12]:
# edit year field
print(df['year'].drop_duplicates())
df['year'] = ['20' + x[-2:] for x in df['year']]
print(df['year'].drop_duplicates())

0     2018-19
14    2017-18
Name: year, dtype: object
0     2019
14    2018
Name: year, dtype: object


In [13]:
print(df.columns)

Index(['% Emergency Removal', '% Expulsion', '% In-School Suspension',
       '% Out-of-School Suspension', '% Removed to Alternate Setting',
       '% Students with a School-Based Arrest', 'Students',
       'Students Disciplined', 'group_state', 'type', 'year', 'district_id',
       'district', 'charter_flag'],
      dtype='object')


In [14]:
# join charter reg and geo file
df_reg = pd.read_csv('./data/finalized/MCPSA Charter Reg and Geo Affiliations.csv')
df_reg = df_reg[['Charter LEA ID', 'Charter LEA Name', 'Regional Affiliation/s', 'Geographic Location/s']]
print(df_reg.head(5))
df_reg['district_id'] = df_reg.apply(lambda x: int(x['Charter LEA ID'] / 10000), axis = 1)
del df_reg['Charter LEA ID']
del df_reg['Charter LEA Name']

print(df.shape)
df = df.merge(df_reg, on=['district_id'], how='left')
print(df.shape)

   Charter LEA ID                                   Charter LEA Name  \
0         4070405  Dudley Street Neighborhood Charter School (Dis...   
1         4090205  Alma del Mar Charter School (District) - Alma ...   
2         4100205  Excel Academy Charter (District) - Excel Acade...   
3         4110305  Boston Green Academy Horace Mann Charter Schoo...   
4         4120530  Academy Of the Pacific Rim Charter Public (Dis...   

  Regional Affiliation/s Geographic Location/s  
0                 Boston                Boston  
1                Gateway          Southeastern  
2                 Boston                Boston  
3                 Boston                Boston  
4                 Boston                Boston  
(20264, 14)
(20810, 16)


In [15]:
print(df.columns)

Index(['% Emergency Removal', '% Expulsion', '% In-School Suspension',
       '% Out-of-School Suspension', '% Removed to Alternate Setting',
       '% Students with a School-Based Arrest', 'Students',
       'Students Disciplined', 'group_state', 'type', 'year', 'district_id',
       'district', 'charter_flag', 'Regional Affiliation/s',
       'Geographic Location/s'],
      dtype='object')


In [16]:
# join charter_to_district data for calculating multipliers
df_mult = pd.read_csv('./data/finalized/charter_to_district.csv')
print(df_mult.dtypes)
print(df_mult.shape)

# filter to just fy21, since we're missing many years in the data
df_mult = df_mult[df_mult.fy.eq('fy21')]
print(df_mult.shape)
del df_mult['year']

df_charter = df[df['charter_flag'] == 1]
df_district = df[df['charter_flag'] != 1]

df_mult = df_mult.merge(df_charter, left_on=['charter_lea_code'], right_on = ['district_id'], how='left')
print(df_mult.shape)
df_mult = df_mult.merge(df_district, left_on=['sending_lea_code', 'year', 'group_state'], right_on = ['district_id', 'year','group_state'], how='outer', suffixes=['_charter', '_district'])
print(df_mult.shape)
print(df_mult.head(5))

fy                            object
year                           int64
charter_lea_code               int64
sending_lea_code               int64
enrolled_n                     int64
physical_charter_location     object
chartered_to_serve           float64
dtype: object
(3513, 7)
(966, 7)
(46090, 22)
(275251, 36)
     fy  charter_lea_code  sending_lea_code  enrolled_n  \
0  fy21             409.0               3.0         2.0   
1  fy21             409.0               3.0         2.0   
2  fy21             409.0               3.0         2.0   
3  fy21             409.0               3.0         2.0   
4  fy21             409.0               3.0         2.0   

  physical_charter_location  chartered_to_serve  % Emergency Removal_charter  \
0                       NaN                 NaN                          0.0   
1                       NaN                 NaN                          0.0   
2                       NaN                 NaN                          0.0   
3       

In [41]:
# review grade/group/year breakdown of data
# df.groupby(['year', 'grade', 'group_state'])['num'].sum().to_csv('temp.csv')

In [17]:
# export dataset for QA
df.to_csv('./data/finalized/discipline.csv')

In [18]:
# export final dataset
df_mult.to_csv('./data/finalized/discipline_with_multipliers.csv', index=False)