In [2]:
import pandas as pd

## Year 2021

### Drop-out Data

https://www3.cde.ca.gov/demo-downloads/acgr/acgr21.txt

In [3]:
data = pd.read_csv('drop21.csv')

In [4]:
data.columns

Index(['AcademicYear', 'AggregateLevel', 'CountyCode', 'DistrictCode',
       'SchoolCode', 'CountyName', 'DistrictName', 'SchoolName',
       'CharterSchool', 'DASS', 'ReportingCategory', 'CohortStudents',
       'Regular HS Diploma Graduates (Count)',
       'Regular HS Diploma Graduates (Rate)', 'Met UC/CSU Grad Req's (Count)',
       'Met UC/CSU Grad Req's (Rate)', 'Seal of Biliteracy (Count)',
       'Seal of Biliteracy (Rate)', 'Golden State Seal Merit Diploma (Count)',
       'Golden State Seal Merit Diploma (Rate', 'CHSPE Completer (Count)',
       'CHSPE Completer (Rate)', 'Adult Ed. HS Diploma (Count)',
       'Adult Ed. HS Diploma (Rate)', 'SPED Certificate (Count)',
       'SPED Certificate (Rate)', 'GED Completer (Count)',
       'GED Completer (Rate)', 'Other Transfer (Count)',
       'Other Transfer (Rate)', 'Dropout (Count)', 'Dropout (Rate)',
       'Still Enrolled (Count)', 'Still Enrolled (Rate)'],
      dtype='object')

In [5]:
condition = ((data['AggregateLevel'] == 'S') & (data['ReportingCategory'] == 'TA')
             & (data['CohortStudents'] != '*') & (data['CharterSchool'] == 'All') &
            (data['DASS'] == 'All'))

In [6]:
drop_data = data.loc[condition, ['CountyCode', 'DistrictCode', 'SchoolCode', 'CohortStudents',
                   'Dropout (Rate)']].rename(columns = {'CohortStudents':'totalenrolled',
                                                        'Dropout (Rate)':'dropout'})
drop_data.loc[:, ['CountyCode', 'DistrictCode', 'SchoolCode']] = drop_data.loc[:, [
    'CountyCode', 'DistrictCode', 'SchoolCode']].astype('int').astype('str')
drop_data['dropout'] = round(drop_data['dropout'].astype('float'), 2)

for i in list(drop_data['SchoolCode']):
    if len(i) != 7:
        add_len = 7 - len(i)
        add_str = '0' * add_len
        drop_data.loc[drop_data['SchoolCode'] == i, 'SchoolCode'] = add_str + i

drop_data['schoolcode'] = drop_data['CountyCode'] + drop_data['DistrictCode'] + drop_data['SchoolCode']

drop_data = drop_data.drop_duplicates().drop(columns = 'SchoolCode')

drop_data.head()

Unnamed: 0,CountyCode,DistrictCode,totalenrolled,dropout,schoolcode
66219,1,10017,94,9.6,1100170112607
66287,1,10017,63,66.7,1100170130401
66355,1,10017,63,57.1,1100170130419
66420,1,10017,19,15.8,1100170136101
66479,1,10017,106,18.9,1100170136226


### Demographics Data

https://dq.cde.ca.gov/dataquest/dlfile/dlfile.aspx?cLevel=School&cYear=2020-21&cCat=Enrollment&cPage=filesenr.asp

In [7]:
data = pd.read_csv('enr21.txt', sep = '\t')

In [8]:
data.columns

Index(['CDS_CODE', 'COUNTY', 'DISTRICT', 'SCHOOL', 'ETHNIC', 'GENDER', 'KDGN',
       'GR_1', 'GR_2', 'GR_3', 'GR_4', 'GR_5', 'GR_6', 'GR_7', 'GR_8',
       'UNGR_ELM', 'GR_9', 'GR_10', 'GR_11', 'GR_12', 'UNGR_SEC', 'ENR_TOTAL',
       'ADULT'],
      dtype='object')

In [9]:
total_enroll = data.loc[:, ['CDS_CODE', 'ETHNIC', 'GENDER', 'ENR_TOTAL']].groupby('CDS_CODE'
    ).sum().reset_index().loc[:,['CDS_CODE', 'ENR_TOTAL']].rename(columns = {'ENR_TOTAL':'total'})
total_enroll = total_enroll.loc[total_enroll['total'] > 0, :]

group_enroll = data.loc[:, ['CDS_CODE', 'ETHNIC', 'GENDER', 'ENR_TOTAL']].groupby(['CDS_CODE', 'ETHNIC']
    ).sum().reset_index()

race_map = {5:'hispanic', 6:'black', 7:'white'}

demo = group_enroll.loc[group_enroll['ETHNIC'].isin([5, 6, 7]), :]

demo['ETHNIC'] = demo['ETHNIC'].map(race_map)

demo = demo.merge(total_enroll, on ='CDS_CODE')

demo['percent'] = round((demo['ENR_TOTAL'] / demo['total']) * 100, 2)

demo = demo.pivot(columns = 'ETHNIC', index = 'CDS_CODE', values = 'percent').reset_index().fillna(0)

demo = demo.merge(total_enroll, on = 'CDS_CODE').rename(columns = {'CDS_CODE':'schoolcode'}).drop(
    columns = 'total')

demo['schoolcode'] = demo['schoolcode'].astype('str')

demo.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  demo['ETHNIC'] = demo['ETHNIC'].map(race_map)


Unnamed: 0,schoolcode,black,hispanic,white
0,1100170112607,35.54,49.51,4.17
1,1100170123968,18.58,63.24,1.98
2,1100170124172,8.83,6.67,9.01
3,1100170125567,20.51,29.87,27.09
4,1100170130401,62.0,32.0,2.0


### FRPM Data

https://www.cde.ca.gov/ds/ad/documents/frpm2021.xlsx

In [10]:
data = pd.read_csv('frpm21.csv', header = None).iloc[1:, :]

data.columns = data.iloc[0, :]

data = data.iloc[1:, :]

data.columns

Index(['Academic Year', 'County Code', 'District Code', 'School Code',
       'County Name', 'District Name', 'School Name', 'District Type',
       'School Type', 'Educational \nOption Type', 'NSLP \nProvision \nStatus',
       'Charter \nSchool \n(Y/N)', 'Charter \nSchool \nNumber',
       'Charter \nFunding \nType', 'IRC', 'Low Grade', 'High Grade',
       'Enrollment \n(K-12)', 'Free Meal \nCount \n(K-12)',
       'Percent (%) \nEligible Free \n(K-12)', 'FRPM Count \n(K-12)',
       'Percent (%) \nEligible FRPM \n(K-12)', 'Enrollment \n(Ages 5-17)',
       'Free Meal \nCount \n(Ages 5-17)',
       'Percent (%) \nEligible Free \n(Ages 5-17)', 'FRPM Count \n(Ages 5-17)',
       'Percent (%) \nEligible FRPM \n(Ages 5-17)',
       'CALPADS Fall 1 \nCertification Status'],
      dtype='object', name=1)

In [11]:
frpm = data.loc[:, ['County Code', 'District Code', 'School Code',
            'Percent (%) \nEligible FRPM \n(Ages 5-17)']].rename(columns = {'Academic Year':'year',
            'Percent (%) \nEligible FRPM \n(Ages 5-17)':'lowincome'})

frpm['lowincome'] = round(frpm['lowincome'].str.rstrip('%').astype('float'), 2)
frpm['County Code'] = frpm['County Code'].astype('int').astype('str')
frpm['schoolcode'] = frpm['County Code'] + frpm['District Code'] + frpm['School Code']

frpm = frpm.loc[:, ['schoolcode', 'lowincome']]

frpm.head()

1,schoolcode,lowincome
2,1100170130419,72.0
3,1100170130401,100.0
4,1100170137448,74.1
5,1100170123968,69.8
6,1100170136101,26.9


In [12]:
drop21 = frpm.merge(demo, on = 'schoolcode').merge(drop_data, on = 'schoolcode')
drop21.totalenrolled = drop21.totalenrolled.astype('int')
drop21['year'] = '2021'
drop21.head()

Unnamed: 0,schoolcode,lowincome,black,hispanic,white,CountyCode,DistrictCode,totalenrolled,dropout,year
0,1100170130419,72.0,34.29,56.19,3.81,1,10017,63,57.1,2021
1,1100170130401,100.0,62.0,32.0,2.0,1,10017,63,66.7,2021
2,1100170136101,26.9,3.18,14.61,35.58,1,10017,19,15.8,2021
3,1100170112607,78.1,35.54,49.51,4.17,1,10017,94,9.6,2021
4,1100170136226,63.2,37.31,46.27,7.46,1,10017,106,18.9,2021


## Year 2019

### Drop-out Data

https://www3.cde.ca.gov/demo-downloads/acgr/acgr19.txt

In [13]:
data = pd.read_csv('drop19.csv')

In [14]:
data.columns

Index(['AcademicYear', 'AggregateLevel', 'CountyCode', 'DistrictCode',
       'SchoolCode', 'CountyName', 'DistrictName', 'SchoolName',
       'CharterSchool', 'DASS', 'ReportingCategory', 'CohortStudents',
       'Regular HS Diploma Graduates (Count)',
       'Regular HS Diploma Graduates (Rate)', 'Met UC/CSU Grad Req's (Count)',
       'Met UC/CSU Grad Req's (Rate)', 'Seal of Biliteracy (Count)',
       'Seal of Biliteracy (Rate)', 'Golden State Seal Merit Diploma (Count)',
       'Golden State Seal Merit Diploma (Rate', 'CHSPE Completer (Count)',
       'CHSPE Completer (Rate)', 'Adult Ed. HS Diploma (Count)',
       'Adult Ed. HS Diploma (Rate)', 'SPED Certificate (Count)',
       'SPED Certificate (Rate)', 'GED Completer (Count)',
       'GED Completer (Rate)', 'Other Transfer (Count)',
       'Other Transfer (Rate)', 'Dropout (Count)', 'Dropout (Rate)',
       'Still Enrolled (Count)', 'Still Enrolled (Rate)'],
      dtype='object')

In [15]:
condition = ((data['AggregateLevel'] == 'S') & (data['ReportingCategory'] == 'TA')
             & (data['CohortStudents'] != '*') & (data['CharterSchool'] == 'All') &
            (data['DASS'] == 'All'))

In [16]:
drop_data = data.loc[condition, ['CountyCode', 'DistrictCode', 'SchoolCode', 'CohortStudents',
                   'Dropout (Rate)']].rename(columns = {'CohortStudents':'totalenrolled',
                                                        'Dropout (Rate)':'dropout'})
drop_data.loc[:, ['CountyCode', 'DistrictCode', 'SchoolCode']] = drop_data.loc[:, [
    'CountyCode', 'DistrictCode', 'SchoolCode']].astype('int').astype('str')
drop_data['dropout'] = round(drop_data['dropout'].astype('float'), 2)

for i in list(drop_data['SchoolCode']):
    if len(i) != 7:
        add_len = 7 - len(i)
        add_str = '0' * add_len
        drop_data.loc[drop_data['SchoolCode'] == i, 'SchoolCode'] = add_str + i

drop_data['schoolcode'] = drop_data['CountyCode'] + drop_data['DistrictCode'] + drop_data['SchoolCode']

drop_data = drop_data.drop_duplicates().drop(columns = 'SchoolCode')

drop_data.head()

Unnamed: 0,CountyCode,DistrictCode,totalenrolled,dropout,schoolcode
55740,1,10017,85,3.5,1100170112607
55793,1,10017,77,67.5,1100170130401
55849,1,10017,80,46.3,1100170130419
55961,1,10017,73,30.1,1100170136226
56047,1,31617,47,57.4,1316170131763


### Demographics Data

https://dq.cde.ca.gov/dataquest/dlfile/dlfile.aspx?cLevel=School&cYear=2018-19&cCat=Enrollment&cPage=filesenr.asp

In [17]:
data = pd.read_csv('enr19.txt', sep = '\t')

In [18]:
data.columns

Index(['CDS_CODE', 'COUNTY', 'DISTRICT', 'SCHOOL', 'ETHNIC', 'GENDER', 'KDGN',
       'GR_1', 'GR_2', 'GR_3', 'GR_4', 'GR_5', 'GR_6', 'GR_7', 'GR_8',
       'UNGR_ELM', 'GR_9', 'GR_10', 'GR_11', 'GR_12', 'UNGR_SEC', 'ENR_TOTAL',
       'ADULT'],
      dtype='object')

In [19]:
total_enroll = data.loc[:, ['CDS_CODE', 'ETHNIC', 'GENDER', 'ENR_TOTAL']].groupby('CDS_CODE'
    ).sum().reset_index().loc[:,['CDS_CODE', 'ENR_TOTAL']].rename(columns = {'ENR_TOTAL':'total'})
total_enroll = total_enroll.loc[total_enroll['total'] > 0, :]

group_enroll = data.loc[:, ['CDS_CODE', 'ETHNIC', 'GENDER', 'ENR_TOTAL']].groupby(['CDS_CODE', 'ETHNIC']
    ).sum().reset_index()

race_map = {5:'hispanic', 6:'black', 7:'white'}

demo = group_enroll.loc[group_enroll['ETHNIC'].isin([5, 6, 7]), :]

demo['ETHNIC'] = demo['ETHNIC'].map(race_map)

demo = demo.merge(total_enroll, on ='CDS_CODE')

demo['percent'] = round((demo['ENR_TOTAL'] / demo['total']) * 100, 2)

demo = demo.pivot(columns = 'ETHNIC', index = 'CDS_CODE', values = 'percent').reset_index().fillna(0)

demo = demo.merge(total_enroll, on = 'CDS_CODE').rename(columns = {'CDS_CODE':'schoolcode'}).drop(
    columns = 'total')

demo['schoolcode'] = demo['schoolcode'].astype('str')

demo.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  demo['ETHNIC'] = demo['ETHNIC'].map(race_map)


Unnamed: 0,schoolcode,black,hispanic,white
0,1100170112607,32.73,56.36,3.64
1,1100170123968,14.94,57.68,2.9
2,1100170124172,6.07,5.84,8.09
3,1100170125567,18.98,25.69,30.56
4,1100170130401,63.04,26.09,10.87


### FRPM Data

https://www.cde.ca.gov/ds/ad/documents/frpm1819.xlsx

In [20]:
data = pd.read_csv('frpm19.csv', header = None).iloc[1:, :]

data.columns = data.iloc[0, :]

data = data.iloc[1:, :]

data.columns

Index(['Academic Year', 'County Code', 'District Code', 'School Code',
       'County Name', 'District Name', 'School Name', 'District Type',
       'School Type', 'Educational \nOption Type', 'NSLP \nProvision \nStatus',
       'Charter \nSchool \n(Y/N)', 'Charter \nSchool \nNumber',
       'Charter \nFunding \nType', 'IRC', 'Low Grade', 'High Grade',
       'Enrollment \n(K-12)', 'Free Meal \nCount \n(K-12)',
       'Percent (%) \nEligible Free \n(K-12)', 'FRPM Count \n(K-12)',
       'Percent (%) \nEligible FRPM \n(K-12)', 'Enrollment \n(Ages 5-17)',
       'Free Meal \nCount \n(Ages 5-17)',
       'Percent (%) \nEligible Free \n(Ages 5-17)', 'FRPM Count \n(Ages 5-17)',
       'Percent (%) \nEligible FRPM \n(Ages 5-17)',
       'CALPADS Fall 1 \nCertification Status'],
      dtype='object', name=1)

In [21]:
frpm = data.loc[:, ['County Code', 'District Code', 'School Code',
            'Percent (%) \nEligible FRPM \n(Ages 5-17)']].rename(columns = {'Academic Year':'year',
            'Percent (%) \nEligible FRPM \n(Ages 5-17)':'lowincome'})

frpm['lowincome'] = round(frpm['lowincome'].str.rstrip('%').astype('float'), 2)
frpm['County Code'] = frpm['County Code'].astype('int').astype('str')
frpm['schoolcode'] = frpm['County Code'] + frpm['District Code'] + frpm['School Code']

frpm = frpm.loc[:, ['schoolcode', 'lowincome']]

frpm.head()

1,schoolcode,lowincome
2,1100170112607,78.6
3,1100170123968,51.3
4,1100170124172,14.4
5,1100170125567,30.9
6,1100170130401,100.0


In [22]:
drop19 = frpm.merge(demo, on = 'schoolcode').merge(drop_data, on = 'schoolcode')
drop19.totalenrolled = drop19.totalenrolled.astype('int')
drop19['year'] = '2019'
drop19.head()

Unnamed: 0,schoolcode,lowincome,black,hispanic,white,CountyCode,DistrictCode,totalenrolled,dropout,year
0,1100170112607,78.6,32.73,56.36,3.64,1,10017,85,3.5,2019
1,1100170130401,100.0,63.04,26.09,10.87,1,10017,77,67.5,2019
2,1100170130419,84.5,31.58,56.39,6.02,1,10017,80,46.3,2019
3,1100170136226,50.0,46.48,42.25,1.41,1,10017,73,30.1,2019
4,1316170131763,100.0,6.99,51.88,22.58,1,31617,47,57.4,2019


## Year 2018

### Drop-out Data

https://www3.cde.ca.gov/demo-downloads/acgr/acgr18.txt

In [23]:
data = pd.read_csv('drop18.csv')

In [24]:
data.columns

Index(['AcademicYear', 'AggregateLevel', 'CountyCode', 'DistrictCode',
       'SchoolCode', 'CountyName', 'DistrictName', 'SchoolName',
       'CharterSchool', 'DASS', 'ReportingCategory', 'CohortStudents',
       'Regular HS Diploma Graduates (Count)',
       'Regular HS Diploma Graduates (Rate)', 'Met UC/CSU Grad Req's (Count)',
       'Met UC/CSU Grad Req's (Rate)', 'Seal of Biliteracy (Count)',
       'Seal of Biliteracy (Rate)', 'Golden State Seal Merit Diploma (Count)',
       'Golden State Seal Merit Diploma (Rate', 'CHSPE Completer (Count)',
       'CHSPE Completer (Rate)', 'Adult Ed. HS Diploma (Count)',
       'Adult Ed. HS Diploma (Rate)', 'SPED Certificate (Count)',
       'SPED Certificate (Rate)', 'GED Completer (Count)',
       'GED Completer (Rate)', 'Other Transfer (Count)',
       'Other Transfer (Rate)', 'Dropout (Count)', 'Dropout (Rate)',
       'Still Enrolled (Count)', 'Still Enrolled (Rate)'],
      dtype='object')

In [25]:
condition = ((data['AggregateLevel'] == 'S') & (data['ReportingCategory'] == 'TA')
             & (data['CohortStudents'] != '*') & (data['CharterSchool'] == 'All') &
            (data['DASS'] == 'All'))

In [26]:
drop_data = data.loc[condition, ['CountyCode', 'DistrictCode', 'SchoolCode', 'CohortStudents',
                   'Dropout (Rate)']].rename(columns = {'CohortStudents':'totalenrolled',
                                                        'Dropout (Rate)':'dropout'})
drop_data.loc[:, ['CountyCode', 'DistrictCode', 'SchoolCode']] = drop_data.loc[:, [
    'CountyCode', 'DistrictCode', 'SchoolCode']].astype('int').astype('str')
drop_data['dropout'] = round(drop_data['dropout'].astype('float'), 2)

for i in list(drop_data['SchoolCode']):
    if len(i) != 7:
        add_len = 7 - len(i)
        add_str = '0' * add_len
        drop_data.loc[drop_data['SchoolCode'] == i, 'SchoolCode'] = add_str + i

drop_data['schoolcode'] = drop_data['CountyCode'] + drop_data['DistrictCode'] + drop_data['SchoolCode']

drop_data = drop_data.drop_duplicates().drop(columns = 'SchoolCode')

drop_data.head()

Unnamed: 0,CountyCode,DistrictCode,totalenrolled,dropout,schoolcode
55970,1,10017,87,4.6,1100170112607
56027,1,10017,93,65.6,1100170130401
56088,1,10017,95,38.9,1100170130419
56150,1,10017,69,31.9,1100170136226
56243,1,31617,38,2.6,1316170131763


### Demographics Data

https://dq.cde.ca.gov/dataquest/dlfile/dlfile.aspx?cLevel=School&cYear=2017-18&cCat=Enrollment&cPage=filesenr.asp

In [27]:
data = pd.read_csv('enr18.txt', sep = '\t')

In [28]:
data.columns

Index(['CDS_CODE', 'COUNTY', 'DISTRICT', 'SCHOOL', 'ETHNIC', 'GENDER', 'KDGN',
       'GR_1', 'GR_2', 'GR_3', 'GR_4', 'GR_5', 'GR_6', 'GR_7', 'GR_8',
       'UNGR_ELM', 'GR_9', 'GR_10', 'GR_11', 'GR_12', 'UNGR_SEC', 'ENR_TOTAL',
       'ADULT'],
      dtype='object')

In [29]:
total_enroll = data.loc[:, ['CDS_CODE', 'ETHNIC', 'GENDER', 'ENR_TOTAL']].groupby('CDS_CODE'
    ).sum().reset_index().loc[:,['CDS_CODE', 'ENR_TOTAL']].rename(columns = {'ENR_TOTAL':'total'})
total_enroll = total_enroll.loc[total_enroll['total'] > 0, :]

group_enroll = data.loc[:, ['CDS_CODE', 'ETHNIC', 'GENDER', 'ENR_TOTAL']].groupby(['CDS_CODE', 'ETHNIC']
    ).sum().reset_index()

race_map = {5:'hispanic', 6:'black', 7:'white'}

demo = group_enroll.loc[group_enroll['ETHNIC'].isin([5, 6, 7]), :]

demo['ETHNIC'] = demo['ETHNIC'].map(race_map)

demo = demo.merge(total_enroll, on ='CDS_CODE')

demo['percent'] = round((demo['ENR_TOTAL'] / demo['total']) * 100, 2)

demo = demo.pivot(columns = 'ETHNIC', index = 'CDS_CODE', values = 'percent').reset_index().fillna(0)

demo = demo.merge(total_enroll, on = 'CDS_CODE').rename(columns = {'CDS_CODE':'schoolcode'}).drop(
    columns = 'total')

demo['schoolcode'] = demo['schoolcode'].astype('str')

demo.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  demo['ETHNIC'] = demo['ETHNIC'].map(race_map)


Unnamed: 0,schoolcode,black,hispanic,white
0,1100170112607,33.41,55.69,2.66
1,1100170123968,19.05,57.94,3.97
2,1100170124172,4.91,4.39,8.27
3,1100170125567,19.42,26.56,30.36
4,1100170130401,69.86,16.44,5.48


### FRPM Data

https://www.cde.ca.gov/ds/ad/documents/frpm1718.xlsx

In [30]:
data = pd.read_csv('frpm18.csv', header = None).iloc[1:, :]

data.columns = data.iloc[0, :]

data = data.iloc[1:, :]

data.columns

Index(['Academic Year', 'County Code', 'District Code', 'School Code',
       'County Name', 'District Name', 'School Name', 'District Type',
       'School Type', 'Educational \nOption Type', 'NSLP \nProvision \nStatus',
       'Charter \nSchool \n(Y/N)', 'Charter \nSchool \nNumber',
       'Charter \nFunding \nType', 'IRC', 'Low Grade', 'High Grade',
       'Enrollment \n(K-12)', 'Free Meal \nCount \n(K-12)',
       'Percent (%) \nEligible Free \n(K-12)', 'FRPM Count \n(K-12)',
       'Percent (%) \nEligible FRPM \n(K-12)', 'Enrollment \n(Ages 5-17)',
       'Free Meal \nCount \n(Ages 5-17)',
       'Percent (%) \nEligible Free \n(Ages 5-17)', 'FRPM Count \n(Ages 5-17)',
       'Percent (%) \nEligible FRPM \n(Ages 5-17)',
       '2017-18 \nCALPADS Fall 1 \nCertification Status'],
      dtype='object', name=1)

In [31]:
frpm = data.loc[:, ['County Code', 'District Code', 'School Code',
            'Percent (%) \nEligible FRPM \n(Ages 5-17)']].rename(columns = {'Academic Year':'year',
            'Percent (%) \nEligible FRPM \n(Ages 5-17)':'lowincome'})

frpm['lowincome'] = round(frpm['lowincome'].str.rstrip('%').astype('float'), 2)
frpm['County Code'] = frpm['County Code'].astype('int').astype('str')
frpm['schoolcode'] = frpm['County Code'] + frpm['District Code'] + frpm['School Code']

frpm = frpm.loc[:, ['schoolcode', 'lowincome']]

frpm.head()

1,schoolcode,lowincome
2,1100170112607,74.6
3,1100170123968,70.0
4,1100170124172,13.7
5,1100170125567,29.0
6,1100170130401,100.0


In [32]:
drop18 = frpm.merge(demo, on = 'schoolcode').merge(drop_data, on = 'schoolcode')
drop18.totalenrolled = drop18.totalenrolled.astype('int')
drop18['year'] = '2018'
drop18.head()

Unnamed: 0,schoolcode,lowincome,black,hispanic,white,CountyCode,DistrictCode,totalenrolled,dropout,year
0,1100170112607,74.6,33.41,55.69,2.66,1,10017,87,4.6,2018
1,1100170130401,100.0,69.86,16.44,5.48,1,10017,93,65.6,2018
2,1100170130419,81.8,35.06,48.85,5.75,1,10017,95,38.9,2018
3,1100170136226,66.7,56.52,30.43,4.35,1,10017,69,31.9,2018
4,1316170131763,99.7,7.73,50.93,23.2,1,31617,38,2.6,2018


## Year 2017

### Drop-out Data

https://www3.cde.ca.gov/demo-downloads/acgr/acgr17.txt

In [33]:
data = pd.read_csv('drop17.csv')

In [34]:
data.columns

Index(['AcademicYear', 'AggregateLevel', 'CountyCode', 'DistrictCode',
       'SchoolCode', 'CountyName', 'DistrictName', 'SchoolName',
       'CharterSchool', 'DASS', 'ReportingCategory', 'CohortStudents',
       'Regular HS Diploma Graduates (Count)',
       'Regular HS Diploma Graduates (Rate)', 'Met UC/CSU Grad Req's (Count)',
       'Met UC/CSU Grad Req's (Rate)', 'Seal of Biliteracy (Count)',
       'Seal of Biliteracy (Rate)', 'Golden State Seal Merit Diploma (Count)',
       'Golden State Seal Merit Diploma (Rate', 'CHSPE Completer (Count)',
       'CHSPE Completer (Rate)', 'Adult Ed. HS Diploma (Count)',
       'Adult Ed. HS Diploma (Rate)', 'SPED Certificate (Count)',
       'SPED Certificate (Rate)', 'GED Completer (Count)',
       'GED Completer (Rate)', 'Other Transfer (Count)',
       'Other Transfer (Rate)', 'Dropout (Count)', 'Dropout (Rate)',
       'Still Enrolled (Count)', 'Still Enrolled (Rate)'],
      dtype='object')

In [35]:
condition = ((data['AggregateLevel'] == 'S') & (data['ReportingCategory'] == 'TA')
             & (data['CohortStudents'] != '*') & (data['CharterSchool'] == 'All') &
            (data['DASS'] == 'All'))

In [36]:
drop_data = data.loc[condition, ['CountyCode', 'DistrictCode', 'SchoolCode', 'CohortStudents',
                   'Dropout (Rate)']].rename(columns = {'CohortStudents':'totalenrolled',
                                                        'Dropout (Rate)':'dropout'})
drop_data.loc[:, ['CountyCode', 'DistrictCode', 'SchoolCode']] = drop_data.loc[:, [
    'CountyCode', 'DistrictCode', 'SchoolCode']].astype('int').astype('str')
drop_data['dropout'] = round(drop_data['dropout'].astype('float'), 2)

for i in list(drop_data['SchoolCode']):
    if len(i) != 7:
        add_len = 7 - len(i)
        add_str = '0' * add_len
        drop_data.loc[drop_data['SchoolCode'] == i, 'SchoolCode'] = add_str + i

drop_data['schoolcode'] = drop_data['CountyCode'] + drop_data['DistrictCode'] + drop_data['SchoolCode']

drop_data = drop_data.drop_duplicates().drop(columns = 'SchoolCode')

drop_data.head()

Unnamed: 0,CountyCode,DistrictCode,totalenrolled,dropout,schoolcode
55347,1,10017,81,4.9,1100170112607
55401,1,10017,95,55.8,1100170130401
55461,1,10017,171,36.8,1100170130419
55518,1,31609,12,0.0,1316090131755
55569,1,31617,47,0.0,1316170131763


### Demographics Data

https://dq.cde.ca.gov/dataquest/dlfile/dlfile.aspx?cLevel=School&cYear=2016-17&cCat=Enrollment&cPage=filesenr.asp

In [37]:
data = pd.read_csv('enr17.txt', sep = '\t')

In [38]:
data.columns

Index(['CDS_CODE', 'COUNTY', 'DISTRICT', 'SCHOOL', 'ETHNIC', 'GENDER', 'KDGN',
       'GR_1', 'GR_2', 'GR_3', 'GR_4', 'GR_5', 'GR_6', 'GR_7', 'GR_8',
       'UNGR_ELM', 'GR_9', 'GR_10', 'GR_11', 'GR_12', 'UNGR_SEC', 'ENR_TOTAL',
       'ADULT'],
      dtype='object')

In [39]:
total_enroll = data.loc[:, ['CDS_CODE', 'ETHNIC', 'GENDER', 'ENR_TOTAL']].groupby('CDS_CODE'
    ).sum().reset_index().loc[:,['CDS_CODE', 'ENR_TOTAL']].rename(columns = {'ENR_TOTAL':'total'})
total_enroll = total_enroll.loc[total_enroll['total'] > 0, :]

group_enroll = data.loc[:, ['CDS_CODE', 'ETHNIC', 'GENDER', 'ENR_TOTAL']].groupby(['CDS_CODE', 'ETHNIC']
    ).sum().reset_index()

race_map = {5:'hispanic', 6:'black', 7:'white'}

demo = group_enroll.loc[group_enroll['ETHNIC'].isin([5, 6, 7]), :]

demo['ETHNIC'] = demo['ETHNIC'].map(race_map)

demo = demo.merge(total_enroll, on ='CDS_CODE')

demo['percent'] = round((demo['ENR_TOTAL'] / demo['total']) * 100, 2)

demo = demo.pivot(columns = 'ETHNIC', index = 'CDS_CODE', values = 'percent').reset_index().fillna(0)

demo = demo.merge(total_enroll, on = 'CDS_CODE').rename(columns = {'CDS_CODE':'schoolcode'}).drop(
    columns = 'total')

demo['schoolcode'] = demo['schoolcode'].astype('str')

demo.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  demo['ETHNIC'] = demo['ETHNIC'].map(race_map)


Unnamed: 0,schoolcode,black,hispanic,white
0,1100170112607,37.22,52.61,4.47
1,1100170123968,11.06,60.1,7.69
2,1100170124172,3.92,5.04,9.8
3,1100170125567,20.05,25.67,30.75
4,1100170130401,66.67,24.14,2.3


### FRPM Data

https://www.cde.ca.gov/ds/ad/documents/frpm1617.xls

In [40]:
data = pd.read_csv('frpm17.csv', header = None).iloc[1:, :]

data.columns = data.iloc[0, :]

data = data.iloc[1:, :]

data.columns

Index(['Academic Year', 'County Code', 'District Code', 'School Code',
       'County Name', 'District Name', 'School Name', 'District Type',
       'School Type', 'Educational \nOption Type', 'NSLP \nProvision \nStatus',
       'Charter \nSchool \n(Y/N)', 'Charter \nSchool \nNumber',
       'Charter \nFunding \nType', 'IRC', 'Low Grade', 'High Grade',
       'Enrollment \n(K-12)', 'Free Meal \nCount \n(K-12)',
       'Percent (%) \nEligible Free \n(K-12)', 'FRPM Count \n(K-12)',
       'Percent (%) \nEligible FRPM \n(K-12)', 'Enrollment \n(Ages 5-17)',
       'Free Meal \nCount \n(Ages 5-17)',
       'Percent (%) \nEligible Free \n(Ages 5-17)', 'FRPM Count \n(Ages 5-17)',
       'Percent (%) \nEligible FRPM \n(Ages 5-17)',
       '2016-17 \nCALPADS Fall 1 \nCertification Status'],
      dtype='object', name=1)

In [41]:
frpm = data.loc[:, ['County Code', 'District Code', 'School Code',
            'Percent (%) \nEligible FRPM \n(Ages 5-17)']].rename(columns = {'Academic Year':'year',
            'Percent (%) \nEligible FRPM \n(Ages 5-17)':'lowincome'})

frpm['lowincome'] = round(frpm['lowincome'].str.rstrip('%').astype('float'), 2)
frpm['County Code'] = frpm['County Code'].astype('int').astype('str')
frpm['schoolcode'] = frpm['County Code'] + frpm['District Code'] + frpm['School Code']

frpm = frpm.loc[:, ['schoolcode', 'lowincome']]

frpm.head()

1,schoolcode,lowincome
2,1100170112607,74.3
3,1100170123968,63.3
4,1100170124172,9.2
5,1100170125567,27.5
6,1100170130401,100.0


In [42]:
drop17 = frpm.merge(demo, on = 'schoolcode').merge(drop_data, on = 'schoolcode')
drop17.totalenrolled = drop17.totalenrolled.astype('int')
drop17['year'] = '2017'
drop17.head()

Unnamed: 0,schoolcode,lowincome,black,hispanic,white,CountyCode,DistrictCode,totalenrolled,dropout,year
0,1100170112607,74.3,37.22,52.61,4.47,1,10017,81,4.9,2017
1,1100170130401,100.0,66.67,24.14,2.3,1,10017,95,55.8,2017
2,1100170130419,79.9,39.26,45.04,5.37,1,10017,171,36.8,2017
3,1316090131755,100.0,2.56,53.85,25.64,1,31609,12,0.0,2017
4,1316170131763,92.8,8.68,50.0,22.37,1,31617,47,0.0,2017


### Merge Data (2017 - 2021)

In [44]:
mutual_school = list(set(drop17.schoolcode).intersection(set(drop18.schoolcode)).intersection(
    set(drop19.schoolcode)).intersection(set(drop21.schoolcode)))

mutual_drop17 = drop17.loc[drop17['schoolcode'].isin(mutual_school)]
mutual_drop18 = drop18.loc[drop18['schoolcode'].isin(mutual_school)]
mutual_drop19 = drop19.loc[drop19['schoolcode'].isin(mutual_school)]
mutual_drop21 = drop21.loc[drop21['schoolcode'].isin(mutual_school)]

In [45]:
pd.concat([mutual_drop17,mutual_drop18, mutual_drop19,mutual_drop21]).rename(columns = {
    'CountyCode':'countycode', 'DistrictCode':'districtcode'
}).to_csv('dropout_ca_17_21.csv')

### Pandemic Instruction Modality Data

In [46]:
mode = pd.read_csv('schoolmode.csv')

In [47]:
mode.columns

Index(['StateName', 'StateAbbrev', 'DataLevel', 'Charter', 'SchoolName',
       'SchoolType', 'NCESSchoolID', 'StateAssignedSchoolID', 'DistrictName',
       'DistrictType', 'NCESDistrictID', 'StateAssignedDistrictID',
       'EnrollmentTotal', 'TimePeriodInterval', 'TimePeriodStart',
       'TimePeriodEnd', 'LearningModel', 'LearningModelGrK5',
       'LearningModelGr68', 'LearningModelGr912', 'LearningModelStateCat',
       'LearningModelStateCatGrK5', 'LearningModelStateCatGr68',
       'LearningModelStateCatGr912', 'EnrollmentInPerson', 'EnrollmentHybrid',
       'EnrollmentVirtual', 'StaffCount', 'StaffCountInPerson'],
      dtype='object')

In [48]:
charter_mode = mode.loc[:, ['StateAssignedSchoolID', 'Charter']].drop_duplicates().rename(columns = {'StateAssignedSchoolID':
             'schoolcode', 'Charter':'charter'})
charter_map = {'No':0, 'Yes':1}
charter_mode['charter'] = charter_mode['charter'].map(charter_map)
charter_mode.head()

Unnamed: 0,schoolcode,charter
0,19642121995596,0
11,19642126010862,0
22,19642121930361,0
33,19642126071369,0
44,19642126010953,0


In [49]:
ca_mode = mode.groupby(['StateAssignedSchoolID'])['LearningModel'].value_counts(
    normalize = True).to_frame().rename(columns = {'LearningModel':'normalized'}).reset_index().rename(
columns = {'StateAssignedSchoolID':'schoolcode'})

ca_mode = ca_mode.pivot(columns = 'LearningModel', values = 'normalized', 
index = 'schoolcode').reset_index().fillna(0).loc[:, ['schoolcode', 'Hybrid', 'Virtual']].rename(
columns = {'Hybrid':'hybridper', 'Virtual':'virtualper'})

ca_mode['schoolmode'] = 1 * ca_mode['virtualper'] + 0.5 * ca_mode['hybridper']

ca_mode.loc[:, ['hybridper', 'virtualper', 'schoolmode']] = round(
    ca_mode.loc[:, ['hybridper', 'virtualper', 'schoolmode']], 2)

ca_mode.head()

LearningModel,schoolcode,hybridper,virtualper,schoolmode
0,1100170112607,0.0,1.0,1.0
1,1100170123968,0.0,1.0,1.0
2,1100170124172,0.0,1.0,1.0
3,1100170125567,0.0,1.0,1.0
4,1100170130419,0.0,1.0,1.0


### Merge Test Data and Schooling Mode Data

In [50]:
drop = pd.read_csv('dropout_ca_17_21.csv').iloc[:, 1:]
drop_mode = drop.merge(ca_mode, on = 'schoolcode').merge(charter_mode, on ='schoolcode').rename(
    columns = {'CountyCode':'countycode', 'DistrictCode':'districtcode'})

drop_mode.loc[drop_mode['year'] != 2021, ['hybridper', 'virtualper', 'schoolmode']] = 0
drop_mode['year'] = drop_mode['year'].astype('str')
drop_mode['schoolcode'] = drop_mode['schoolcode'].astype('str') + 'ca'
drop_mode['countycode'] = drop_mode['countycode'].astype('str') + 'ca'
drop_mode['districtcode'] = drop_mode['countycode'] + drop_mode['districtcode'].astype('str') + 'ca'
drop_mode['state'] = 'ca'

drop_mode.to_csv('final_data_ca_dropout.csv')