In [1]:
import pandas as pd

## Year 2021

### Assessment Data

https://caaspp-elpac.ets.org/caaspp/researchfiles/sb_ca2021_1_csv_v2.zip

In [2]:
data = pd.read_csv('test21.txt', sep = '^')

In [3]:
data.columns

Index(['County Code', 'District Code', 'School Code', 'Filler', 'Test Year',
       'Student Group ID', 'Test Type', 'Total Tested at Reporting Level',
       'Total Tested with Scores at Reporting Level', 'Grade', 'Test ID',
       'Students Enrolled', 'Students Tested', 'Mean Scale Score',
       'Percentage Standard Exceeded', 'Percentage Standard Met',
       'Percentage Standard Met and Above', 'Percentage Standard Nearly Met',
       'Percentage Standard Not Met', 'Students with Scores',
       'Area 1 Percentage Above Standard', 'Area 1 Percentage Near Standard',
       'Area 1 Percentage Below Standard', 'Area 2 Percentage Above Standard',
       'Area 2 Percentage Near Standard', 'Area 2 Percentage Below Standard',
       'Area 3 Percentage Above Standard', 'Area 3 Percentage Near Standard',
       'Area 3 Percentage Below Standard', 'Area 4 Percentage Above Standard',
       'Area 4 Percentage Near Standard', 'Area 4 Percentage Below Standard',
       'Type ID'],
      dtype=

In [4]:
condition = ((data['Grade'] == 11) & (data['Test ID'] == 2) & (data['School Code'] != 0) & 
            (data['Percentage Standard Met and Above'].notnull()) &
            (data['Percentage Standard Met and Above'] != '*'))

In [8]:
test_data = data.loc[condition, ['County Code', 'District Code', 'School Code', 'Test Year',
                   'Students with Scores','Percentage Standard Met and Above']].rename(columns = {
    'Students with Scores':'totaltested', 'Percentage Standard Met and Above':'mathpass', 'Test Year':'year'
})
test_data.loc[:, ['County Code', 'District Code', 'School Code', 'year']] = test_data.loc[:, [
    'County Code', 'District Code', 'School Code', 'year']].astype('str')
test_data['mathpass'] = round(test_data['mathpass'].astype('float'), 2)

for i in list(test_data['School Code']):
    if len(i) == 6:
        test_data.loc[test_data['School Code'] == i, 'School Code'] = '0' + i

test_data['schoolcode'] = test_data['County Code'] + test_data['District Code'] + test_data['School Code']

test_data = test_data.loc[test_data['totaltested'].astype('int') > 0, :]

test_data.head()

Unnamed: 0,County Code,District Code,School Code,year,totaltested,mathpass,schoolcode
46,1,10017,112607,2021,59,22.03,1100170112607
422,1,61127,130450,2021,218,83.49,1611270130450
482,1,61143,131177,2021,56,73.21,1611430131177
630,1,61150,132225,2021,249,86.35,1611500132225
634,1,61150,133876,2021,32,34.38,1611500133876


### Demographics Data

https://dq.cde.ca.gov/dataquest/dlfile/dlfile.aspx?cLevel=School&cYear=2020-21&cCat=Enrollment&cPage=filesenr.asp

In [9]:
data = pd.read_csv('enr21.txt', sep = '\t')

In [10]:
data.columns

Index(['CDS_CODE', 'COUNTY', 'DISTRICT', 'SCHOOL', 'ETHNIC', 'GENDER', 'KDGN',
       'GR_1', 'GR_2', 'GR_3', 'GR_4', 'GR_5', 'GR_6', 'GR_7', 'GR_8',
       'UNGR_ELM', 'GR_9', 'GR_10', 'GR_11', 'GR_12', 'UNGR_SEC', 'ENR_TOTAL',
       'ADULT'],
      dtype='object')

In [11]:
total_enroll = data.loc[:, ['CDS_CODE', 'ETHNIC', 'GENDER', 'ENR_TOTAL']].groupby('CDS_CODE'
    ).sum().reset_index().loc[:,['CDS_CODE', 'ENR_TOTAL']].rename(columns = {'ENR_TOTAL':'total'})
total_enroll = total_enroll.loc[total_enroll['total'] > 0, :]

group_enroll = data.loc[:, ['CDS_CODE', 'ETHNIC', 'GENDER', 'ENR_TOTAL']].groupby(['CDS_CODE', 'ETHNIC']
    ).sum().reset_index()

race_map = {5:'hispanic', 6:'black', 7:'white'}

demo = group_enroll.loc[group_enroll['ETHNIC'].isin([5, 6, 7]), :]

demo['ETHNIC'] = demo['ETHNIC'].map(race_map)

demo = demo.merge(total_enroll, on ='CDS_CODE')

demo['percent'] = round((demo['ENR_TOTAL'] / demo['total']) * 100, 2)

demo = demo.pivot(columns = 'ETHNIC', index = 'CDS_CODE', values = 'percent').reset_index().fillna(0)

demo = demo.merge(total_enroll, on = 'CDS_CODE').rename(columns = {'total':'totalenrolled', 
                                                                   'CDS_CODE':'schoolcode'})

demo['schoolcode'] = demo['schoolcode'].astype('str')

demo.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  demo['ETHNIC'] = demo['ETHNIC'].map(race_map)


Unnamed: 0,schoolcode,black,hispanic,white,totalenrolled
0,1100170112607,35.54,49.51,4.17,408
1,1100170123968,18.58,63.24,1.98,253
2,1100170124172,8.83,6.67,9.01,555
3,1100170125567,20.51,29.87,27.09,395
4,1100170130401,62.0,32.0,2.0,50


### FRPM Data

https://www.cde.ca.gov/ds/ad/documents/frpm2021.xlsx

In [12]:
data = pd.read_csv('frpm21.csv', header = None).iloc[1:, :]

data.columns = data.iloc[0, :]

data = data.iloc[1:, :]

data.columns

Index(['Academic Year', 'County Code', 'District Code', 'School Code',
       'County Name', 'District Name', 'School Name', 'District Type',
       'School Type', 'Educational \nOption Type', 'NSLP \nProvision \nStatus',
       'Charter \nSchool \n(Y/N)', 'Charter \nSchool \nNumber',
       'Charter \nFunding \nType', 'IRC', 'Low Grade', 'High Grade',
       'Enrollment \n(K-12)', 'Free Meal \nCount \n(K-12)',
       'Percent (%) \nEligible Free \n(K-12)', 'FRPM Count \n(K-12)',
       'Percent (%) \nEligible FRPM \n(K-12)', 'Enrollment \n(Ages 5-17)',
       'Free Meal \nCount \n(Ages 5-17)',
       'Percent (%) \nEligible Free \n(Ages 5-17)', 'FRPM Count \n(Ages 5-17)',
       'Percent (%) \nEligible FRPM \n(Ages 5-17)',
       'CALPADS Fall 1 \nCertification Status'],
      dtype='object', name=1)

In [13]:
frpm = data.loc[:, ['County Code', 'District Code', 'School Code',
            'Percent (%) \nEligible FRPM \n(Ages 5-17)']].rename(columns = {'Academic Year':'year',
            'Percent (%) \nEligible FRPM \n(Ages 5-17)':'lowincome'})

frpm['lowincome'] = round(frpm['lowincome'].str.rstrip('%').astype('float'), 2)
frpm['County Code'] = frpm['County Code'].astype('int').astype('str')
frpm['schoolcode'] = frpm['County Code'] + frpm['District Code'] + frpm['School Code']

frpm = frpm.loc[:, ['schoolcode', 'lowincome']]

frpm.head()

1,schoolcode,lowincome
2,1100170130419,72.0
3,1100170130401,100.0
4,1100170137448,74.1
5,1100170123968,69.8
6,1100170136101,26.9


In [14]:
math21 = frpm.merge(demo, on = 'schoolcode').merge(test_data, on = 'schoolcode')

## Year 2019

### Assessment Data

https://caaspp-elpac.ets.org/caaspp/researchfiles/sb_ca2019_1_csv_v4.zip

In [15]:
data = pd.read_csv('test19.txt', sep = ',')

In [16]:
data.columns

Index(['County Code', 'District Code', 'School Code', 'Filler', 'Test Year',
       'Subgroup ID', 'Test Type', 'Total Tested At Entity Level',
       'Total Tested with Scores', 'Grade', 'Test Id',
       'CAASPP Reported Enrollment', 'Students Tested', 'Mean Scale Score',
       'Percentage Standard Exceeded', 'Percentage Standard Met',
       'Percentage Standard Met and Above', 'Percentage Standard Nearly Met',
       'Percentage Standard Not Met', 'Students with Scores',
       'Area 1 Percentage Above Standard', 'Area 1 Percentage Near Standard',
       'Area 1 Percentage Below Standard', 'Area 2 Percentage Above Standard',
       'Area 2 Percentage Near Standard', 'Area 2 Percentage Below Standard',
       'Area 3 Percentage Above Standard', 'Area 3 Percentage Near Standard',
       'Area 3 Percentage Below Standard', 'Area 4 Percentage Above Standard',
       'Area 4 Percentage Near Standard', 'Area 4 Percentage Below Standard'],
      dtype='object')

In [17]:
condition = ((data['Grade'] == 11) & (data['Test Id'] == 2) & (data['School Code'] != 0) 
            & (data['Percentage Standard Met and Above'].notnull()) &
            (data['Percentage Standard Met and Above'] != '*'))

In [18]:
test_data = data.loc[condition, ['County Code', 'District Code', 'School Code', 'Test Year',
                   'Students with Scores','Percentage Standard Met and Above']].rename(columns = {
    'Students with Scores':'totaltested', 'Percentage Standard Met and Above':'mathpass', 
    'Test Year':'year'
})
test_data.loc[:, ['County Code', 'District Code', 'School Code', 'year']] = test_data.loc[:, [
    'County Code', 'District Code', 'School Code', 'year']].astype('str')
test_data['mathpass'] = round(test_data['mathpass'].astype('float'), 2)

for i in list(test_data['School Code']):
    if len(i) == 6:
        test_data.loc[test_data['School Code'] == i, 'School Code'] = '0' + i

test_data['schoolcode'] = test_data['County Code'] + test_data['District Code'] + test_data['School Code']

test_data = test_data.loc[test_data['totaltested'].astype('int') > 0, :]

test_data.head()

Unnamed: 0,County Code,District Code,School Code,year,totaltested,mathpass,schoolcode
42,1,10017,112607,2019,84,10.71,1100170112607
124,1,10017,136101,2019,18,33.33,1100170136101
170,1,15725,115725,2019,63,23.81,1157250115725
230,1,31617,131763,2019,35,2.86,1316170131763
250,1,61119,106401,2019,48,89.58,1611190106401


### Demographics Data

https://dq.cde.ca.gov/dataquest/dlfile/dlfile.aspx?cLevel=School&cYear=2018-19&cCat=Enrollment&cPage=filesenr.asp

In [19]:
data = pd.read_csv('enr19.txt', sep = '\t')

In [20]:
data.columns

Index(['CDS_CODE', 'COUNTY', 'DISTRICT', 'SCHOOL', 'ETHNIC', 'GENDER', 'KDGN',
       'GR_1', 'GR_2', 'GR_3', 'GR_4', 'GR_5', 'GR_6', 'GR_7', 'GR_8',
       'UNGR_ELM', 'GR_9', 'GR_10', 'GR_11', 'GR_12', 'UNGR_SEC', 'ENR_TOTAL',
       'ADULT'],
      dtype='object')

In [21]:
total_enroll = data.loc[:, ['CDS_CODE', 'ETHNIC', 'GENDER', 'ENR_TOTAL']].groupby('CDS_CODE'
    ).sum().reset_index().loc[:,['CDS_CODE', 'ENR_TOTAL']].rename(columns = {'ENR_TOTAL':'total'})
total_enroll = total_enroll.loc[total_enroll['total'] > 0, :]

group_enroll = data.loc[:, ['CDS_CODE', 'ETHNIC', 'GENDER', 'ENR_TOTAL']].groupby(['CDS_CODE', 'ETHNIC']
    ).sum().reset_index()

race_map = {5:'hispanic', 6:'black', 7:'white'}

demo = group_enroll.loc[group_enroll['ETHNIC'].isin([5, 6, 7]), :]

demo['ETHNIC'] = demo['ETHNIC'].map(race_map)

demo = demo.merge(total_enroll, on ='CDS_CODE')

demo['percent'] = round((demo['ENR_TOTAL'] / demo['total']) * 100, 2)

demo = demo.pivot(columns = 'ETHNIC', index = 'CDS_CODE', values = 'percent').reset_index().fillna(0)

demo = demo.merge(total_enroll, on = 'CDS_CODE').rename(columns = {'total':'totalenrolled', 
                                                                   'CDS_CODE':'schoolcode'})

demo['schoolcode'] = demo['schoolcode'].astype('str')

demo.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  demo['ETHNIC'] = demo['ETHNIC'].map(race_map)


Unnamed: 0,schoolcode,black,hispanic,white,totalenrolled
0,1100170112607,32.73,56.36,3.64,385
1,1100170123968,14.94,57.68,2.9,241
2,1100170124172,6.07,5.84,8.09,445
3,1100170125567,18.98,25.69,30.56,432
4,1100170130401,63.04,26.09,10.87,46


### FRPM Data

https://www.cde.ca.gov/ds/ad/documents/frpm1819.xlsx

In [22]:
data = pd.read_csv('frpm19.csv', header = None).iloc[1:, :]

data.columns = data.iloc[0, :]

data = data.iloc[1:, :]

data.columns

Index(['Academic Year', 'County Code', 'District Code', 'School Code',
       'County Name', 'District Name', 'School Name', 'District Type',
       'School Type', 'Educational \nOption Type', 'NSLP \nProvision \nStatus',
       'Charter \nSchool \n(Y/N)', 'Charter \nSchool \nNumber',
       'Charter \nFunding \nType', 'IRC', 'Low Grade', 'High Grade',
       'Enrollment \n(K-12)', 'Free Meal \nCount \n(K-12)',
       'Percent (%) \nEligible Free \n(K-12)', 'FRPM Count \n(K-12)',
       'Percent (%) \nEligible FRPM \n(K-12)', 'Enrollment \n(Ages 5-17)',
       'Free Meal \nCount \n(Ages 5-17)',
       'Percent (%) \nEligible Free \n(Ages 5-17)', 'FRPM Count \n(Ages 5-17)',
       'Percent (%) \nEligible FRPM \n(Ages 5-17)',
       'CALPADS Fall 1 \nCertification Status'],
      dtype='object', name=1)

In [23]:
frpm = data.loc[:, ['County Code', 'District Code', 'School Code',
            'Percent (%) \nEligible FRPM \n(Ages 5-17)']].rename(columns = {'Academic Year':'year',
            'Percent (%) \nEligible FRPM \n(Ages 5-17)':'lowincome'})

frpm['lowincome'] = round(frpm['lowincome'].str.rstrip('%').astype('float'), 2)
frpm['County Code'] = frpm['County Code'].astype('int').astype('str')
frpm['schoolcode'] = frpm['County Code'] + frpm['District Code'] + frpm['School Code']

frpm = frpm.loc[:, ['schoolcode', 'lowincome']]

frpm.head()

1,schoolcode,lowincome
2,1100170112607,78.6
3,1100170123968,51.3
4,1100170124172,14.4
5,1100170125567,30.9
6,1100170130401,100.0


In [24]:
math19 = frpm.merge(demo, on = 'schoolcode').merge(test_data, on = 'schoolcode')

## Year 2018

### Assessment Data

https://caaspp-elpac.ets.org/caaspp/researchfiles/sb_ca2018_1_csv_v3.zip

In [25]:
data = pd.read_csv('test18.txt', sep = ',')

In [26]:
data.columns

Index(['County Code', 'District Code', 'School Code', 'Filler', 'Test Year',
       'Subgroup ID', 'Test Type', 'Total Tested At Entity Level',
       'Total Tested with Scores', 'Grade', 'Test Id',
       'CAASPP Reported Enrollment', 'Students Tested', 'Mean Scale Score',
       'Percentage Standard Exceeded', 'Percentage Standard Met',
       'Percentage Standard Met and Above', 'Percentage Standard Nearly Met',
       'Percentage Standard Not Met', 'Students with Scores',
       'Area 1 Percentage Above Standard', 'Area 1 Percentage Near Standard',
       'Area 1 Percentage Below Standard', 'Area 2 Percentage Above Standard',
       'Area 2 Percentage Near Standard', 'Area 2 Percentage Below Standard',
       'Area 3 Percentage Above Standard', 'Area 3 Percentage Near Standard',
       'Area 3 Percentage Below Standard', 'Area 4 Percentage Above Standard',
       'Area 4 Percentage Near Standard', 'Area 4 Percentage Below Standard'],
      dtype='object')

In [27]:
condition = ((data['Grade'] == 11) & (data['Test Id'] == 2) & (data['School Code'] != 0) 
            & (data['Percentage Standard Met and Above'].notnull()) &
            (data['Percentage Standard Met and Above'] != '*'))

In [28]:
test_data = data.loc[condition, ['County Code', 'District Code', 'School Code', 'Test Year',
                   'Students with Scores','Percentage Standard Met and Above']].rename(columns = {
    'Students with Scores':'totaltested', 'Percentage Standard Met and Above':'mathpass', 
    'Test Year':'year'
})
test_data.loc[:, ['County Code', 'District Code', 'School Code', 'year']] = test_data.loc[:, [
    'County Code', 'District Code', 'School Code', 'year']].astype('str')
test_data['mathpass'] = round(test_data['mathpass'].astype('float'), 2)

for i in list(test_data['School Code']):
    if len(i) == 6:
        test_data.loc[test_data['School Code'] == i, 'School Code'] = '0' + i

test_data['schoolcode'] = test_data['County Code'] + test_data['District Code'] + test_data['School Code']

test_data = test_data.loc[test_data['totaltested'].astype('int') > 0, :]

test_data.head()

Unnamed: 0,County Code,District Code,School Code,year,totaltested,mathpass,schoolcode
41,1,10017,112607,2018,96,19.79,1100170112607
86,1,10017,130401,2018,15,0.0,1100170130401
94,1,10017,130419,2018,23,0.0,1100170130419
205,1,31617,131763,2018,37,5.41,1316170131763
225,1,61119,106401,2018,44,88.64,1611190106401


### Demographics Data

https://dq.cde.ca.gov/dataquest/dlfile/dlfile.aspx?cLevel=School&cYear=2017-18&cCat=Enrollment&cPage=filesenr.asp

In [29]:
data = pd.read_csv('enr18.txt', sep = '\t')

In [30]:
data.columns

Index(['CDS_CODE', 'COUNTY', 'DISTRICT', 'SCHOOL', 'ETHNIC', 'GENDER', 'KDGN',
       'GR_1', 'GR_2', 'GR_3', 'GR_4', 'GR_5', 'GR_6', 'GR_7', 'GR_8',
       'UNGR_ELM', 'GR_9', 'GR_10', 'GR_11', 'GR_12', 'UNGR_SEC', 'ENR_TOTAL',
       'ADULT'],
      dtype='object')

In [31]:
total_enroll = data.loc[:, ['CDS_CODE', 'ETHNIC', 'GENDER', 'ENR_TOTAL']].groupby('CDS_CODE'
    ).sum().reset_index().loc[:,['CDS_CODE', 'ENR_TOTAL']].rename(columns = {'ENR_TOTAL':'total'})
total_enroll = total_enroll.loc[total_enroll['total'] > 0, :]

group_enroll = data.loc[:, ['CDS_CODE', 'ETHNIC', 'GENDER', 'ENR_TOTAL']].groupby(['CDS_CODE', 'ETHNIC']
    ).sum().reset_index()

race_map = {5:'hispanic', 6:'black', 7:'white'}

demo = group_enroll.loc[group_enroll['ETHNIC'].isin([5, 6, 7]), :]

demo['ETHNIC'] = demo['ETHNIC'].map(race_map)

demo = demo.merge(total_enroll, on ='CDS_CODE')

demo['percent'] = round((demo['ENR_TOTAL'] / demo['total']) * 100, 2)

demo = demo.pivot(columns = 'ETHNIC', index = 'CDS_CODE', values = 'percent').reset_index().fillna(0)

demo = demo.merge(total_enroll, on = 'CDS_CODE').rename(columns = {'total':'totalenrolled', 
                                                                   'CDS_CODE':'schoolcode'})

demo['schoolcode'] = demo['schoolcode'].astype('str')

demo.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  demo['ETHNIC'] = demo['ETHNIC'].map(race_map)


Unnamed: 0,schoolcode,black,hispanic,white,totalenrolled
0,1100170112607,33.41,55.69,2.66,413
1,1100170123968,19.05,57.94,3.97,252
2,1100170124172,4.91,4.39,8.27,387
3,1100170125567,19.42,26.56,30.36,448
4,1100170130401,69.86,16.44,5.48,73


### FRPM Data

https://www.cde.ca.gov/ds/ad/documents/frpm1718.xlsx

In [32]:
data = pd.read_csv('frpm18.csv', header = None).iloc[1:, :]

data.columns = data.iloc[0, :]

data = data.iloc[1:, :]

data.columns

Index(['Academic Year', 'County Code', 'District Code', 'School Code',
       'County Name', 'District Name', 'School Name', 'District Type',
       'School Type', 'Educational \nOption Type', 'NSLP \nProvision \nStatus',
       'Charter \nSchool \n(Y/N)', 'Charter \nSchool \nNumber',
       'Charter \nFunding \nType', 'IRC', 'Low Grade', 'High Grade',
       'Enrollment \n(K-12)', 'Free Meal \nCount \n(K-12)',
       'Percent (%) \nEligible Free \n(K-12)', 'FRPM Count \n(K-12)',
       'Percent (%) \nEligible FRPM \n(K-12)', 'Enrollment \n(Ages 5-17)',
       'Free Meal \nCount \n(Ages 5-17)',
       'Percent (%) \nEligible Free \n(Ages 5-17)', 'FRPM Count \n(Ages 5-17)',
       'Percent (%) \nEligible FRPM \n(Ages 5-17)',
       '2017-18 \nCALPADS Fall 1 \nCertification Status'],
      dtype='object', name=1)

In [33]:
frpm = data.loc[:, ['County Code', 'District Code', 'School Code',
            'Percent (%) \nEligible FRPM \n(Ages 5-17)']].rename(columns = {'Academic Year':'year',
            'Percent (%) \nEligible FRPM \n(Ages 5-17)':'lowincome'})

frpm['lowincome'] = round(frpm['lowincome'].str.rstrip('%').astype('float'), 2)
frpm['County Code'] = frpm['County Code'].astype('int').astype('str')
frpm['schoolcode'] = frpm['County Code'] + frpm['District Code'] + frpm['School Code']

frpm = frpm.loc[:, ['schoolcode', 'lowincome']]

frpm.head()

1,schoolcode,lowincome
2,1100170112607,74.6
3,1100170123968,70.0
4,1100170124172,13.7
5,1100170125567,29.0
6,1100170130401,100.0


In [34]:
math18 = frpm.merge(demo, on = 'schoolcode').merge(test_data, on = 'schoolcode')

## Year 2017

### Assessment Data

https://caaspp-elpac.ets.org/caaspp/researchfiles/sb_ca2017_1_csv_v2.zip

In [35]:
data = pd.read_csv('test17.txt', sep = ',')

In [36]:
data.columns

Index(['County Code', 'District Code', 'School Code', 'Filler', 'Test Year',
       'Subgroup ID', 'Test Type', 'Total Tested At Entity Level',
       'Total Tested with Scores', 'Grade', 'Test Id',
       'CAASPP Reported Enrollment', 'Students Tested', 'Mean Scale Score',
       'Percentage Standard Exceeded', 'Percentage Standard Met',
       'Percentage Standard Met and Above', 'Percentage Standard Nearly Met',
       'Percentage Standard Not Met', 'Students with Scores',
       'Area 1 Percentage Above Standard', 'Area 1 Percentage Near Standard',
       'Area 1 Percentage Below Standard', 'Area 2 Percentage Above Standard',
       'Area 2 Percentage Near Standard', 'Area 2 Percentage Below Standard',
       'Area 3 Percentage Above Standard', 'Area 3 Percentage Near Standard',
       'Area 3 Percentage Below Standard', 'Area 4 Percentage Above Standard',
       'Area 4 Percentage Near Standard', 'Area 4 Percentage Below Standard'],
      dtype='object')

In [37]:
condition = ((data['Grade'] == 11) & (data['Test Id'] == 2) & (data['School Code'] != 0) 
            & (data['Percentage Standard Met and Above'].notnull()) &
            (data['Percentage Standard Met and Above'] != '*'))

In [38]:
test_data = data.loc[condition, ['County Code', 'District Code', 'School Code', 'Test Year',
                   'Students with Scores','Percentage Standard Met and Above']].rename(columns = {
    'Students with Scores':'totaltested', 'Percentage Standard Met and Above':'mathpass', 
    'Test Year':'year'
})
test_data.loc[:, ['County Code', 'District Code', 'School Code', 'year']] = test_data.loc[:, [
    'County Code', 'District Code', 'School Code', 'year']].astype('str')
test_data['mathpass'] = round(test_data['mathpass'].astype('float'), 2)

for i in list(test_data['School Code']):
    if len(i) == 6:
        test_data.loc[test_data['School Code'] == i, 'School Code'] = '0' + i

test_data['schoolcode'] = test_data['County Code'] + test_data['District Code'] + test_data['School Code']

test_data = test_data.loc[test_data['totaltested'].astype('int') > 0, :]

test_data.head()

Unnamed: 0,County Code,District Code,School Code,year,totaltested,mathpass,schoolcode
43,1,10017,112607,2017,90,18.89,1100170112607
86,1,10017,130401,2017,24,0.0,1100170130401
94,1,10017,130419,2017,20,0.0,1100170130419
185,1,31617,131763,2017,29,0.0,1316170131763
205,1,61119,106401,2017,45,84.44,1611190106401


### Demographics Data

https://dq.cde.ca.gov/dataquest/dlfile/dlfile.aspx?cLevel=School&cYear=2016-17&cCat=Enrollment&cPage=filesenr.asp

In [39]:
data = pd.read_csv('enr17.txt', sep = '\t')

In [40]:
data.columns

Index(['CDS_CODE', 'COUNTY', 'DISTRICT', 'SCHOOL', 'ETHNIC', 'GENDER', 'KDGN',
       'GR_1', 'GR_2', 'GR_3', 'GR_4', 'GR_5', 'GR_6', 'GR_7', 'GR_8',
       'UNGR_ELM', 'GR_9', 'GR_10', 'GR_11', 'GR_12', 'UNGR_SEC', 'ENR_TOTAL',
       'ADULT'],
      dtype='object')

In [41]:
total_enroll = data.loc[:, ['CDS_CODE', 'ETHNIC', 'GENDER', 'ENR_TOTAL']].groupby('CDS_CODE'
    ).sum().reset_index().loc[:,['CDS_CODE', 'ENR_TOTAL']].rename(columns = {'ENR_TOTAL':'total'})
total_enroll = total_enroll.loc[total_enroll['total'] > 0, :]

group_enroll = data.loc[:, ['CDS_CODE', 'ETHNIC', 'GENDER', 'ENR_TOTAL']].groupby(['CDS_CODE', 'ETHNIC']
    ).sum().reset_index()

race_map = {5:'hispanic', 6:'black', 7:'white'}

demo = group_enroll.loc[group_enroll['ETHNIC'].isin([5, 6, 7]), :]

demo['ETHNIC'] = demo['ETHNIC'].map(race_map)

demo = demo.merge(total_enroll, on ='CDS_CODE')

demo['percent'] = round((demo['ENR_TOTAL'] / demo['total']) * 100, 2)

demo = demo.pivot(columns = 'ETHNIC', index = 'CDS_CODE', values = 'percent').reset_index().fillna(0)

demo = demo.merge(total_enroll, on = 'CDS_CODE').rename(columns = {'total':'totalenrolled', 
                                                                   'CDS_CODE':'schoolcode'})

demo['schoolcode'] = demo['schoolcode'].astype('str')

demo.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  demo['ETHNIC'] = demo['ETHNIC'].map(race_map)


Unnamed: 0,schoolcode,black,hispanic,white,totalenrolled
0,1100170112607,37.22,52.61,4.47,403
1,1100170123968,11.06,60.1,7.69,208
2,1100170124172,3.92,5.04,9.8,357
3,1100170125567,20.05,25.67,30.75,374
4,1100170130401,66.67,24.14,2.3,87


### FRPM Data

https://www.cde.ca.gov/ds/ad/documents/frpm1617.xls

In [42]:
data = pd.read_csv('frpm17.csv', header = None).iloc[1:, :]

data.columns = data.iloc[0, :]

data = data.iloc[1:, :]

data.columns

Index(['Academic Year', 'County Code', 'District Code', 'School Code',
       'County Name', 'District Name', 'School Name', 'District Type',
       'School Type', 'Educational \nOption Type', 'NSLP \nProvision \nStatus',
       'Charter \nSchool \n(Y/N)', 'Charter \nSchool \nNumber',
       'Charter \nFunding \nType', 'IRC', 'Low Grade', 'High Grade',
       'Enrollment \n(K-12)', 'Free Meal \nCount \n(K-12)',
       'Percent (%) \nEligible Free \n(K-12)', 'FRPM Count \n(K-12)',
       'Percent (%) \nEligible FRPM \n(K-12)', 'Enrollment \n(Ages 5-17)',
       'Free Meal \nCount \n(Ages 5-17)',
       'Percent (%) \nEligible Free \n(Ages 5-17)', 'FRPM Count \n(Ages 5-17)',
       'Percent (%) \nEligible FRPM \n(Ages 5-17)',
       '2016-17 \nCALPADS Fall 1 \nCertification Status'],
      dtype='object', name=1)

In [43]:
frpm = data.loc[:, ['County Code', 'District Code', 'School Code',
            'Percent (%) \nEligible FRPM \n(Ages 5-17)']].rename(columns = {'Academic Year':'year',
            'Percent (%) \nEligible FRPM \n(Ages 5-17)':'lowincome'})

frpm['lowincome'] = round(frpm['lowincome'].str.rstrip('%').astype('float'), 2)
frpm['County Code'] = frpm['County Code'].astype('int').astype('str')
frpm['schoolcode'] = frpm['County Code'] + frpm['District Code'] + frpm['School Code']

frpm = frpm.loc[:, ['schoolcode', 'lowincome']]

frpm.head()

1,schoolcode,lowincome
2,1100170112607,74.3
3,1100170123968,63.3
4,1100170124172,9.2
5,1100170125567,27.5
6,1100170130401,100.0


In [44]:
math17 = frpm.merge(demo, on = 'schoolcode').merge(test_data, on = 'schoolcode')

## Year 2016

### Assessment Data

https://caaspp-elpac.ets.org/caaspp/researchfiles/sb_ca2016_1_csv_v3.zip

In [45]:
data = pd.read_csv('test16.txt', sep = ',')

In [46]:
data.columns

Index(['County Code', 'District Code', 'School Code', 'Filler', 'Test Year',
       'Subgroup ID', 'Test Type', 'Total CAASPP Enrollment',
       'Total Tested At Entity Level', 'Total Tested with Scores', 'Grade',
       'Test Id', 'CAASPP Reported Enrollment', 'Students Tested',
       'Mean Scale Score', 'Percentage Standard Exceeded',
       'Percentage Standard Met', 'Percentage Standard Met and Above',
       'Percentage Standard Nearly Met', 'Percentage Standard Not Met',
       'Students with Scores', 'Area 1 Percentage Above Standard',
       'Area 1 Percentage Near Standard', 'Area 1 Percentage Below Standard',
       'Area 2 Percentage Above Standard', 'Area 2 Percentage Near Standard',
       'Area 2 Percentage Below Standard', 'Area 3 Percentage Above Standard',
       'Area 3 Percentage Near Standard', 'Area 3 Percentage Below Standard',
       'Area 4 Percentage Above Standard', 'Area 4 Percentage Near Standard',
       'Area 4 Percentage Below Standard'],
      dtype='o

In [47]:
condition = ((data['Grade'] == 11) & (data['Test Id'] == 2) & (data['School Code'] != 0) 
            & (data['Percentage Standard Met and Above'].notnull()) &
            (data['Percentage Standard Met and Above'] != '*'))

In [48]:
test_data = data.loc[condition, ['County Code', 'District Code', 'School Code', 'Test Year',
                   'Students with Scores','Percentage Standard Met and Above']].rename(columns = {
    'Students with Scores':'totaltested', 'Percentage Standard Met and Above':'mathpass', 
    'Test Year':'year'
})
test_data.loc[:, ['County Code', 'District Code', 'School Code', 'year']] = test_data.loc[:, [
    'County Code', 'District Code', 'School Code', 'year']].astype('str')
test_data['mathpass'] = round(test_data['mathpass'].astype('float'), 2)

for i in list(test_data['School Code']):
    if len(i) == 6:
        test_data.loc[test_data['School Code'] == i, 'School Code'] = '0' + i

test_data['schoolcode'] = test_data['County Code'] + test_data['District Code'] + test_data['School Code']

test_data = test_data.loc[test_data['totaltested'].astype('int') > 0, :]

test_data.head()

Unnamed: 0,County Code,District Code,School Code,year,totaltested,mathpass,schoolcode
47,1,10017,112607,2016,89,11.0,1100170112607
83,1,10017,130401,2016,17,0.0,1100170130401
90,1,10017,130419,2016,36,3.0,1100170130419
182,1,31617,131763,2016,38,0.0,1316170131763
202,1,61119,106401,2016,46,96.0,1611190106401


### Demographics Data

https://dq.cde.ca.gov/dataquest/dlfile/dlfile.aspx?cLevel=School&cYear=2015-16&cCat=Enrollment&cPage=filesenr.asp

In [49]:
data = pd.read_csv('enr16.txt', sep = '\t')

In [50]:
data.columns

Index(['CDS_CODE', 'COUNTY', 'DISTRICT', 'SCHOOL', 'ETHNIC', 'GENDER', 'KDGN',
       'GR_1', 'GR_2', 'GR_3', 'GR_4', 'GR_5', 'GR_6', 'GR_7', 'GR_8',
       'UNGR_ELM', 'GR_9', 'GR_10', 'GR_11', 'GR_12', 'UNGR_SEC', 'ENR_TOTAL',
       'ADULT'],
      dtype='object')

In [51]:
total_enroll = data.loc[:, ['CDS_CODE', 'ETHNIC', 'GENDER', 'ENR_TOTAL']].groupby('CDS_CODE'
    ).sum().reset_index().loc[:,['CDS_CODE', 'ENR_TOTAL']].rename(columns = {'ENR_TOTAL':'total'})
total_enroll = total_enroll.loc[total_enroll['total'] > 0, :]

group_enroll = data.loc[:, ['CDS_CODE', 'ETHNIC', 'GENDER', 'ENR_TOTAL']].groupby(['CDS_CODE', 'ETHNIC']
    ).sum().reset_index()

race_map = {5:'hispanic', 6:'black', 7:'white'}

demo = group_enroll.loc[group_enroll['ETHNIC'].isin([5, 6, 7]), :]

demo['ETHNIC'] = demo['ETHNIC'].map(race_map)

demo = demo.merge(total_enroll, on ='CDS_CODE')

demo['percent'] = round((demo['ENR_TOTAL'] / demo['total']) * 100, 2)

demo = demo.pivot(columns = 'ETHNIC', index = 'CDS_CODE', values = 'percent').reset_index().fillna(0)

demo = demo.merge(total_enroll, on = 'CDS_CODE').rename(columns = {'total':'totalenrolled', 
                                                                   'CDS_CODE':'schoolcode'})

demo['schoolcode'] = demo['schoolcode'].astype('str')

demo.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  demo['ETHNIC'] = demo['ETHNIC'].map(race_map)


Unnamed: 0,schoolcode,black,hispanic,white,totalenrolled
0,1100170112607,38.33,51.35,3.19,407
1,1100170123968,17.71,48.96,10.42,192
2,1100170124172,3.26,4.23,9.12,307
3,1100170125567,21.45,25.55,32.49,317
4,1100170130401,47.54,36.89,4.92,122


### FRPM Data

https://www.cde.ca.gov/ds/ad/documents/frpm1516.xls

In [52]:
data = pd.read_csv('frpm16.csv', header = None).iloc[1:, :]

data.columns = data.iloc[0, :]

data = data.iloc[1:, :]

data.columns

Index(['Academic Year', 'County Code', 'District Code', 'School Code',
       'County Name', 'District Name', 'School Name', 'District Type',
       'School Type', 'Educational \nOption Type', 'NSLP \nProvision \nStatus',
       'Charter \nSchool \n(Y/N)', 'Charter \nSchool \nNumber',
       'Charter \nFunding \nType', 'IRC', 'Low Grade', 'High Grade',
       'Enrollment \n(K-12)', 'Free Meal \nCount \n(K-12)',
       'Percent (%) \nEligible Free \n(K-12)', 'FRPM Count \n(K-12)',
       'Percent (%) \nEligible FRPM \n(K-12)', 'Enrollment \n(Ages 5-17)',
       'Free Meal \nCount \n(Ages 5-17)',
       'Percent (%) \nEligible Free \n(Ages 5-17)', 'FRPM Count \n(Ages 5-17)',
       'Percent (%) \nEligible FRPM \n(Ages 5-17)',
       '2015-16 \nCALPADS Fall 1 \nCertification Status'],
      dtype='object', name=1)

In [53]:
frpm = data.loc[:, ['County Code', 'District Code', 'School Code',
            'Percent (%) \nEligible FRPM \n(Ages 5-17)']].rename(columns = {'Academic Year':'year',
            'Percent (%) \nEligible FRPM \n(Ages 5-17)':'lowincome'})

frpm['lowincome'] = round(frpm['lowincome'].str.rstrip('%').astype('float'), 2)
frpm['County Code'] = frpm['County Code'].astype('int').astype('str')
frpm['schoolcode'] = frpm['County Code'] + frpm['District Code'] + frpm['School Code']

frpm = frpm.loc[:, ['schoolcode', 'lowincome']]

frpm.head()

1,schoolcode,lowincome
2,1100170112607,66.8
3,1100170123968,82.7
4,1100170124172,6.8
5,1100170125567,34.3
6,1100170130401,54.5


In [54]:
math16 = frpm.merge(demo, on = 'schoolcode').merge(test_data, on = 'schoolcode')

## Year 2015

### Assessment Data

https://caaspp-elpac.ets.org/caaspp/researchfiles/sb_ca2015_1_csv_v3.zip

In [55]:
data = pd.read_csv('test15.txt', sep = ',')

In [56]:
data.columns

Index(['County Code', 'District Code', 'School Code', 'Filler', 'Test Year',
       'Subgroup ID', 'Test Type', 'Total CAASPP Enrollment',
       'Total Tested At Entity Level', 'Total Tested at Subgroup Level',
       'Grade', 'Test Id', 'CAASPP Reported Enrollment', 'Students Tested',
       'Mean Scale Score', 'Percentage Standard Exceeded',
       'Percentage Standard Met', 'Percentage Standard Met and Above',
       'Percentage Standard Nearly Met', 'Percentage Standard Not Met',
       'Students with Scores', 'Area 1 Percentage Above Standard',
       'Area 1 Percentage At or Near Standard',
       'Area 1 Percentage Below Standard', 'Area 2 Percentage Above Standard',
       'Area 2 Percentage At or Near Standard',
       'Area 2 Percentage Below Standard', 'Area 3 Percentage Above Standard',
       'Area 3 Percentage At or Near Standard',
       'Area 3 Percentage Below Standard', 'Area 4 Percentage Above Standard',
       'Area 4 Percentage At or Near Standard',
       'Area 4

In [57]:
condition = ((data['Grade'] == 11) & (data['Test Id'] == 2) & (data['School Code'] != 0) 
            & (data['Percentage Standard Met and Above'].notnull()) &
            (data['Percentage Standard Met and Above'] != '*'))

In [58]:
test_data = data.loc[condition, ['County Code', 'District Code', 'School Code', 'Test Year',
                   'Students with Scores','Percentage Standard Met and Above']].rename(columns = {
    'Students with Scores':'totaltested', 'Percentage Standard Met and Above':'mathpass', 
    'Test Year':'year'
})
test_data.loc[:, ['County Code', 'District Code', 'School Code', 'year']] = test_data.loc[:, [
    'County Code', 'District Code', 'School Code', 'year']].astype('str')
test_data['mathpass'] = round(test_data['mathpass'].astype('float'), 2)

for i in list(test_data['School Code']):
    if len(i) == 6:
        test_data.loc[test_data['School Code'] == i, 'School Code'] = '0' + i

test_data['schoolcode'] = test_data['County Code'] + test_data['District Code'] + test_data['School Code']

test_data = test_data.loc[test_data['totaltested'].astype('int') > 0, :]

test_data.head()

Unnamed: 0,County Code,District Code,School Code,year,totaltested,mathpass,schoolcode
62,1,10017,109835,2015,55,5.0,1100170109835
67,1,10017,112607,2015,85,8.0,1100170112607
71,1,10017,118489,2015,58,21.0,1100170118489
102,1,10017,130401,2015,39,3.0,1100170130401
110,1,10017,130419,2015,27,0.0,1100170130419


### Demographics Data

https://dq.cde.ca.gov/dataquest/dlfile/dlfile.aspx?cLevel=School&cYear=2014-15&cCat=Enrollment&cPage=filesenr.asp

In [59]:
data = pd.read_csv('enr15.txt', sep = '\t')

In [60]:
data.columns

Index(['CDS_CODE', 'COUNTY', 'DISTRICT', 'SCHOOL', 'ETHNIC', 'GENDER', 'KDGN',
       'GR_1', 'GR_2', 'GR_3', 'GR_4', 'GR_5', 'GR_6', 'GR_7', 'GR_8',
       'UNGR_ELM', 'GR_9', 'GR_10', 'GR_11', 'GR_12', 'UNGR_SEC', 'ENR_TOTAL',
       'ADULT'],
      dtype='object')

In [61]:
total_enroll = data.loc[:, ['CDS_CODE', 'ETHNIC', 'GENDER', 'ENR_TOTAL']].groupby('CDS_CODE'
    ).sum().reset_index().loc[:,['CDS_CODE', 'ENR_TOTAL']].rename(columns = {'ENR_TOTAL':'total'})
total_enroll = total_enroll.loc[total_enroll['total'] > 0, :]

group_enroll = data.loc[:, ['CDS_CODE', 'ETHNIC', 'GENDER', 'ENR_TOTAL']].groupby(['CDS_CODE', 'ETHNIC']
    ).sum().reset_index()

race_map = {5:'hispanic', 6:'black', 7:'white'}

demo = group_enroll.loc[group_enroll['ETHNIC'].isin([5, 6, 7]), :]

demo['ETHNIC'] = demo['ETHNIC'].map(race_map)

demo = demo.merge(total_enroll, on ='CDS_CODE')

demo['percent'] = round((demo['ENR_TOTAL'] / demo['total']) * 100, 2)

demo = demo.pivot(columns = 'ETHNIC', index = 'CDS_CODE', values = 'percent').reset_index().fillna(0)

demo = demo.merge(total_enroll, on = 'CDS_CODE').rename(columns = {'total':'totalenrolled', 
                                                                   'CDS_CODE':'schoolcode'})

demo['schoolcode'] = demo['schoolcode'].astype('str')

demo.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  demo['ETHNIC'] = demo['ETHNIC'].map(race_map)


Unnamed: 0,schoolcode,black,hispanic,white,totalenrolled
0,1100170109835,7.91,15.27,47.93,1087
1,1100170112607,41.27,46.84,2.78,395
2,1100170118489,28.69,64.75,0.82,244
3,1100170123968,25.13,45.03,12.57,191
4,1100170124172,3.89,4.67,8.95,257


### FRPM Data

https://www.cde.ca.gov/ds/ad/documents/frpm1415.xls

In [62]:
data = pd.read_csv('frpm15.csv', header = None).iloc[1:, :]

data.columns = data.iloc[0, :]

data = data.iloc[1:, :]

data.columns

Index(['Academic Year', 'County Code', 'District Code', 'School Code',
       'County Name', 'District Name', 'School Name', 'District Type',
       'School Type', 'Educational \nOption Type', 'NSLP \nProvision \nStatus',
       'Charter \nSchool \n(Y/N)', 'Charter \nSchool \nNumber',
       'Charter \nFunding \nType', 'IRC', 'Low Grade', 'High Grade',
       'Enrollment \n(K-12)', 'Free Meal \nCount \n(K-12)',
       'Percent (%) \nEligible Free \n(K-12)', 'FRPM Count \n(K-12)',
       'Percent (%) \nEligible FRPM \n(K-12)', 'Enrollment \n(Ages 5-17)',
       'Free Meal \nCount \n(Ages 5-17)',
       'Percent (%) \nEligible Free \n(Ages 5-17)', 'FRPM Count \n(Ages 5-17)',
       'Percent (%) \nEligible FRPM \n(Ages 5-17)',
       '2013-14 \nCALPADS Fall 1 \nCertification Status'],
      dtype='object', name=1)

In [63]:
frpm = data.loc[:, ['County Code', 'District Code', 'School Code',
            'Percent (%) \nEligible FRPM \n(Ages 5-17)']].rename(columns = {'Academic Year':'year',
            'Percent (%) \nEligible FRPM \n(Ages 5-17)':'lowincome'}).dropna()

frpm['lowincome'] = round(frpm['lowincome'].str.rstrip('%').astype('float'), 2)
frpm['County Code'] = frpm['County Code'].astype('int').astype('str')
frpm['schoolcode'] = frpm['County Code'] + frpm['District Code'] + frpm['School Code']

frpm = frpm.loc[:, ['schoolcode', 'lowincome']]

frpm.head()

1,schoolcode,lowincome
2,1100170109835,65.6
3,1100170112607,48.4
4,1100170118489,73.0
5,1100170123968,73.2
6,1100170124172,8.2


In [64]:
math15 = frpm.merge(demo, on = 'schoolcode').merge(test_data, on = 'schoolcode')

### Merge Data (2015 - 2021)

In [65]:
mutual_school = list(set(math15.schoolcode).intersection(set(math16.schoolcode)).intersection(
    set(math17.schoolcode)).intersection(set(math18.schoolcode)).intersection(
    set(math19.schoolcode)).intersection(set(math21.schoolcode)))

mutual_math15 = math15.loc[math15['schoolcode'].isin(mutual_school)]
mutual_math16 = math16.loc[math16['schoolcode'].isin(mutual_school)]
mutual_math17 = math17.loc[math17['schoolcode'].isin(mutual_school)]
mutual_math18 = math18.loc[math18['schoolcode'].isin(mutual_school)]
mutual_math19 = math19.loc[math19['schoolcode'].isin(mutual_school)]
mutual_math21 = math21.loc[math21['schoolcode'].isin(mutual_school)]

In [66]:
pd.concat([mutual_math15,mutual_math16,mutual_math17,mutual_math18,
           mutual_math19,mutual_math21]).to_csv('mathpass_ca_15_21.csv')

### Pandemic Instruction Modality Data

In [67]:
mode = pd.read_csv('schoolmode.csv')

In [68]:
mode.columns

Index(['StateName', 'StateAbbrev', 'DataLevel', 'Charter', 'SchoolName',
       'SchoolType', 'NCESSchoolID', 'StateAssignedSchoolID', 'DistrictName',
       'DistrictType', 'NCESDistrictID', 'StateAssignedDistrictID',
       'EnrollmentTotal', 'TimePeriodInterval', 'TimePeriodStart',
       'TimePeriodEnd', 'LearningModel', 'LearningModelGrK5',
       'LearningModelGr68', 'LearningModelGr912', 'LearningModelStateCat',
       'LearningModelStateCatGrK5', 'LearningModelStateCatGr68',
       'LearningModelStateCatGr912', 'EnrollmentInPerson', 'EnrollmentHybrid',
       'EnrollmentVirtual', 'StaffCount', 'StaffCountInPerson'],
      dtype='object')

In [69]:
charter_mode = mode.loc[:, ['StateAssignedSchoolID', 'Charter']].drop_duplicates().rename(columns = {'StateAssignedSchoolID':
             'schoolcode', 'Charter':'charter'})
charter_map = {'No':0, 'Yes':1}
charter_mode['charter'] = charter_mode['charter'].map(charter_map)
charter_mode.head()

Unnamed: 0,schoolcode,charter
0,19642121995596,0
11,19642126010862,0
22,19642121930361,0
33,19642126071369,0
44,19642126010953,0


In [70]:
ca_mode = mode.groupby(['StateAssignedSchoolID'])['LearningModel'].value_counts(
    normalize = True).to_frame().rename(columns = {'LearningModel':'normalized'}).reset_index().rename(
columns = {'StateAssignedSchoolID':'schoolcode'})

ca_mode = ca_mode.pivot(columns = 'LearningModel', values = 'normalized', 
index = 'schoolcode').reset_index().fillna(0).loc[:, ['schoolcode', 'Hybrid', 'Virtual']].rename(
columns = {'Hybrid':'hybridper', 'Virtual':'virtualper'})

ca_mode['schoolmode'] = 1 * ca_mode['virtualper'] + 0.5 * ca_mode['hybridper']

ca_mode.loc[:, ['hybridper', 'virtualper', 'schoolmode']] = round(
    ca_mode.loc[:, ['hybridper', 'virtualper', 'schoolmode']], 2)

ca_mode.head()

LearningModel,schoolcode,hybridper,virtualper,schoolmode
0,1100170112607,0.0,1.0,1.0
1,1100170123968,0.0,1.0,1.0
2,1100170124172,0.0,1.0,1.0
3,1100170125567,0.0,1.0,1.0
4,1100170130419,0.0,1.0,1.0


### Merge Test Data and Schooling Mode Data

In [76]:
test = pd.read_csv('mathpass_ca_15_21.csv').iloc[:, 1:]
test_mode = test.merge(ca_mode, on = 'schoolcode').merge(charter_mode, on ='schoolcode').rename(
    columns = {'County Code':'countycode', 'District Code':'districtcode'}).drop(
    columns = ['School Code', 'totalenrolled'])

test_mode.loc[test_mode['year'] != 2021, ['hybridper', 'virtualper', 'schoolmode']] = 0
test_mode['year'] = test_mode['year'].astype('str')
test_mode['schoolcode'] = test_mode['schoolcode'].astype('str') + 'ca'
test_mode['countycode'] = test_mode['countycode'].astype('str') + 'ca'
test_mode['districtcode'] = test_mode['countycode'] + test_mode['districtcode'].astype('str') + 'ca'
test_mode['state'] = 'ca'

test_mode.to_csv('final_data_ca_mathpass.csv')