### Equitability of Education: Impacts on College Entrance Exam Performance

## Data Import and Cleaning

In [1]:
# Imports:
# ...
import numpy as np
# Data Wrangling, Cleaning, Reading from CSV file
import pandas as pd
# Hypothesis Testing 
from random import randint
# Vizualizations: 
import matplotlib.pyplot as plt
# Vizualizations: 
import seaborn as sns

#### Import Data Sets

In [2]:
# Load the three raw inputs in one pass, in this order:
#   actdf    - 2019 Californian ACT scores
#   satdf    - 2019 Californian SAT scores
#   equitydf - 2019 California school district equitability rankings
actdf, satdf, equitydf = map(
    pd.read_csv,
    (
        '../data/act_2019_ca.csv',
        '../data/sat_2019_ca.csv',
        '../data/equitable_county_rank.csv',
    ),
)

#### Clean and Inspect Raw Data

In [3]:
# 2019 Californian SAT Scores

# Drop the unnamed column filled with NaN's
satdf.drop(labels='Unnamed: 25', axis='columns', inplace=True)

# Drop the last row filled with NaN's
satdf.drop([len(satdf)-1], inplace=True)

# The dtypes of three ID-code columns can be reduced to int now that there are no more NaN's
dtypes = {'CDS':'int64', 'CDCode':'int32', 'CCode':'int16', 'Enroll12':'int32', 'NumTSTTakr12':'int16'}
satdf = satdf.astype(dtype = dtypes)

# Use MAGIC Method to work with clean satdf in Notebook 2
%store satdf

# A link to the data dictionary for the satdf-dataset is in the README
satdf.info()

Stored 'satdf' (DataFrame)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2579 entries, 0 to 2578
Data columns (total 25 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   CDS                    2579 non-null   int64  
 1   CCode                  2579 non-null   int16  
 2   CDCode                 2579 non-null   int32  
 3   SCode                  2579 non-null   float64
 4   RType                  2579 non-null   object 
 5   SName                  1982 non-null   object 
 6   DName                  2521 non-null   object 
 7   CName                  2579 non-null   object 
 8   Enroll12               2579 non-null   int32  
 9   NumTSTTakr12           2579 non-null   int16  
 10  NumERWBenchmark12      2304 non-null   object 
 11  PctERWBenchmark12      2304 non-null   object 
 12  NumMathBenchmark12     2304 non-null   object 
 13  PctMathBenchmark12     2304 non-null   object 
 14  Enroll11               2579 n

In [94]:
# Count the records whose 12th-grade enrollment minus ACT test takers is negative.
# 31 such records exist, so 'PctGE21' (% scoring >= 21 points) does not reflect
# Class of 2019 students only.
actdf.loc[(actdf['Enroll12'] - actdf['NumTstTakr']).lt(0)].shape[0]

31

In [4]:
# 2019 Californian ACT Scores

# Drop the unnamed column filled with NaN's
actdf.drop(labels='Unnamed: 17', axis='columns', inplace=True)

# Drop the last row filled with NaN's
actdf.drop([len(actdf)-1], inplace=True)

# The dtypes of three ID-code columns can be reduced to int now that there are no more NaN's
dtypes = {'CDS':'int64', 'CDCode':'int32', 'CCode':'int16', 'Enroll12':'int32', 'NumTstTakr':'int16'}
actdf = actdf.astype(dtype = dtypes)

# Add column 'PctTstTakr': 'NumTstTakr' / 'Enroll12'
# Use MAGIC Method to work with clean actdf in Notebook 2
%store actdf

# A link to the data dictionary for the actdf-dataset is in the README
actdf.info()

Stored 'actdf' (DataFrame)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2309 entries, 0 to 2308
Data columns (total 17 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   CDS         2309 non-null   int64  
 1   CCode       2309 non-null   int16  
 2   CDCode      2309 non-null   int32  
 3   SCode       1787 non-null   float64
 4   RType       2309 non-null   object 
 5   SName       1729 non-null   object 
 6   DName       2251 non-null   object 
 7   CName       2309 non-null   object 
 8   Enroll12    2309 non-null   int32  
 9   NumTstTakr  2309 non-null   int16  
 10  AvgScrRead  1953 non-null   object 
 11  AvgScrEng   1953 non-null   object 
 12  AvgScrMath  1953 non-null   object 
 13  AvgScrSci   1953 non-null   object 
 14  NumGE21     1953 non-null   object 
 15  PctGE21     1953 non-null   object 
 16  Year        2309 non-null   object 
dtypes: float64(1), int16(2), int32(2), int64(1), object(11)
memory usage: 279.6+ KB


In [6]:
# Non-null value counts per column for each record type ('RType'):
# X = 1 State record, C = 58 County records, D = 522 District records, S = 1728 School records
actdf.groupby('RType').count()

Unnamed: 0_level_0,CDS,CCode,CDCode,SCode,SName,DName,CName,Enroll12,NumTstTakr,AvgScrRead,AvgScrEng,AvgScrMath,AvgScrSci,NumGE21,PctGE21,Year
RType,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
C,58,58,58,58,0,0,58,58,58,57,57,57,57,57,57,58
D,522,522,522,0,0,522,522,522,522,459,459,459,459,459,459,522
S,1728,1728,1728,1728,1728,1728,1728,1728,1728,1436,1436,1436,1436,1436,1436,1728
X,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1


In [87]:
# One frame per record type, unpacked in the order X (state), C (county),
# D (district), S (school), so later cells can refer to each level by name.
state, county, district, school = (
    actdf.loc[actdf['RType'].eq(code)] for code in ('X', 'C', 'D', 'S')
)

# Show the single California state-level record
state

Unnamed: 0,CDS,CCode,CDCode,SCode,RType,SName,DName,CName,Enroll12,NumTstTakr,AvgScrRead,AvgScrEng,AvgScrMath,AvgScrSci,NumGE21,PctGE21,Year
2070,0,0,0,0.0,X,State of California,State of California,State of California,489650,17132,22,22,22,22,45466,55.0,2018-19


ALPINE COUNTY:
* Due to the county's low population density, high school students generally transfer to other districts for grades 9-12, either Douglas County School District in Nevada or Lake Tahoe Unified School District in El Dorado County, California. https://alpinecoe.k12.ca.us/
* Alpine County Unified School District is the only district in Alpine County.

In [88]:
# County records reporting zero ACT test takers.
# Alpine (California's least populous county, ~1,100 residents; founded in the 1860's
# silver boom, ski resorts since the 1960's) is the only match. It has a single
# school district, "Alpine County Unified School District".
c = county.loc[county['NumTstTakr'].eq(0)]
c.loc[:, ['CDS', 'CCode', 'RType', 'CName', 'Enroll12', 'NumTstTakr']]

Unnamed: 0,CDS,CCode,RType,CName,Enroll12,NumTstTakr
1103,2000000000000,2,C,Alpine,0,0


In [77]:
# District records for the 2018-19 school year with ZERO ACT test takers:
# 63 districts spread over 39 counties.

# District-level rows where nobody took the ACT
d = district.loc[district['NumTstTakr'].eq(0)]
# One row per county that contains at least one such district
d_c = d.groupby('CName')[['CDS']].count()

# Counties containing at least one district with zero ACT test takers
print(f'Number of counties: {d_c.shape[0]}')
# Districts with zero ACT test takers
print(f'Number of districts: {d.shape[0]}')

# Show the matching district records (informative columns only)
d.loc[:, ['DName', 'CName', 'Enroll12', 'NumTstTakr']]

Number of counties: 39
Number of districts: 63


Unnamed: 0,DName,CName,Enroll12,NumTstTakr
1122,Alameda County Office of Education,Alameda,170,0
1127,Emery Unified,Alameda,40,0
1139,Amador County Office of Education,Amador,15,0
1145,Feather Falls Union Elementary,Butte,16,0
1149,Calaveras County Office of Education,Calaveras,65,0
...,...,...,...,...
1619,Tuolumne County Superintendent of Schools,Tuolumne,34,0
1622,Big Oak Flat-Groveland Unified,Tuolumne,32,0
1634,Yolo County Office of Education,Yolo,87,0
1636,Esparto Unified,Yolo,46,0


In [72]:
# School records for the 2018-19 school year with ZERO ACT test takers:
# 292 schools, under 192 distinct district names, across 50 counties.

# School-level rows where nobody took the ACT
s = school.loc[school['NumTstTakr'].eq(0)]
# One row per county containing at least one such school
s_c = s.groupby('CName')[['CDS']].count()
# One row per district name containing at least one such school
s_d = s.groupby('DName')[['CDS']].count()

# Counties containing at least one school with zero ACT test takers
print(f'Number of counties: {s_c.shape[0]}')
# District names containing at least one school with zero ACT test takers
print(f'Number of districts: {s_d.shape[0]}')
# Schools with zero ACT test takers
print(f'Number of schools: {s.shape[0]}')

# Show the matching school records (informative columns only)
s.loc[:, ['SName','DName', 'CName', 'Enroll12', 'NumTstTakr']]

Number of counties: 50
Number of districts: 192
Number of schools: 292


Unnamed: 0,SName,DName,CName,Enroll12,NumTstTakr
0,21st Century Learning Institute,Beaumont Unified,Riverside,18,0
1,ABC Secondary (Alternative),ABC Unified,Los Angeles,58,0
2,Abraham Lincoln Alternative,Southern Kern Unified,Kern,18,0
17,Accelerated Charter High,Tulare Joint Union High,Tulare,78,0
28,Alder Grove Charter School 2,South Bay Union Elementary,Humboldt,51,0
...,...,...,...,...,...
2285,Wilson Alternative,Santa Clara Unified,Santa Clara,140,0
2300,Youth Opportunities Unlimited,Los Angeles Unified,Los Angeles,75,0
2301,YouthBuild Charter School of California,Inyo County Office of Education,Inyo,194,0
2305,Yuba County Career Preparatory Charter,Yuba County Office of Education,Yuba,102,0


In [89]:
# County records with at least 1 but fewer than 15 ACT test takers (3 counties)
c = county.loc[county['NumTstTakr'].gt(0) & county['NumTstTakr'].lt(15)]

# Show only the informative columns
c.loc[:, ['CDS', 'CCode', 'RType', 'CName', 'Enroll12', 'NumTstTakr']]

Unnamed: 0,CDS,CCode,RType,CName,Enroll12,NumTstTakr
1076,46000000000000,46,C,Sierra,23,3
1100,32000000000000,32,C,Plumas,163,9
1105,8000000000000,8,C,Del Norte,363,5


In [81]:
# District records with at least 1 but fewer than 15 ACT test takers:
# 109 districts spread over 46 counties.

# District-level rows with 1-14 ACT test takers
d = district.loc[district['NumTstTakr'].gt(0) & district['NumTstTakr'].lt(15)]
# One row per county that contains at least one such district
d_c = d.groupby('CName')[['CDS']].count()

# Counties containing at least one district with 1-14 ACT test takers
print(f'Number of Counties: {d_c.shape[0]}')
# Districts with 1-14 ACT test takers
print(f'Number of Districts: {d.shape[0]}')

# Show the matching district records (informative columns only)
d.loc[:, ['DName', 'CName', 'Enroll12', 'NumTstTakr']]

Number of Counties: 46
Number of Districts: 109


Unnamed: 0,DName,CName,Enroll12,NumTstTakr
1141,Butte County Office of Education,Butte,207,10
1142,Biggs Unified,Butte,57,5
1144,Durham Unified,Butte,72,3
1150,Bret Harte Union High,Calaveras,170,12
1152,Colusa Unified,Colusa,109,5
...,...,...,...,...
1621,Summerville Union High,Tuolumne,189,12
1623,Ventura County Office of Education,Ventura,323,2
1625,Mesa Union Elementary,Ventura,18,1
1638,Winters Joint Unified,Yolo,119,6


In [82]:
# School records for the 2018-19 school year with at least 1 but fewer than 15
# ACT test takers: 420 schools, under 257 distinct district names, across 51 counties.

# School-level rows with 1-14 ACT test takers
s = school.loc[school['NumTstTakr'].gt(0) & school['NumTstTakr'].lt(15)]
# One row per county containing at least one such school
s_c = s.groupby('CName')[['CDS']].count()
# One row per district name containing at least one such school
s_d = s.groupby('DName')[['CDS']].count()

# Counties containing at least one school with 1-14 ACT test takers
print(f'Number of Counties: {s_c.shape[0]}')
# District names containing at least one school with 1-14 ACT test takers
print(f'Number of Districts: {s_d.shape[0]}')
# Schools with 1-14 ACT test takers
print(f'Number of Schools: {s.shape[0]}')

# Show the matching school records (informative columns only)
s.loc[:, ['SName','DName', 'CName', 'Enroll12', 'NumTstTakr']]

Number of Counties: 51
Number of Districts: 257
Number of Schools: 420


Unnamed: 0,SName,DName,CName,Enroll12,NumTstTakr
7,Academies of the Antelope Valley,Antelope Valley Union High,Los Angeles,14,1
8,Academy (The)- SF @McAteer,San Francisco Unified,San Francisco,84,9
9,Academy for Academic Excellence,Apple Valley Unified,San Bernardino,97,13
10,Academy for Multilingual Arts and Science at M...,Los Angeles Unified,Los Angeles,141,10
11,Academy of Careers and Exploration,Helendale Elementary,San Bernardino,63,5
...,...,...,...,...,...
2283,Willows High,Willows Unified,Glenn,115,5
2288,Winters High,Winters Joint Unified,Yolo,102,6
2296,Yerba Buena High,East Side Union High,Santa Clara,420,13
2304,Yuba City Independence Academy,Yuba City Unified,Sutter,21,1


In [109]:
# Re-display the single state-level (RType 'X') record to inspect its 'NumTstTakr' value
state

Unnamed: 0,CDS,CCode,CDCode,SCode,RType,SName,DName,CName,Enroll12,NumTstTakr,AvgScrRead,AvgScrEng,AvgScrMath,AvgScrSci,NumGE21,PctGE21,Year
2070,0,0,0,0.0,X,State of California,State of California,State of California,489650,17132,22,22,22,22,45466,55.0,2018-19


17132

In [127]:
# Total ACT test takers summed at each aggregation level.
# Observed ordering: state < district < school < county
# Printed order: county, school, district, state.
#
# A comment may not follow a backslash line continuation (that is a SyntaxError), and
# inside the parentheses of print() no continuation character is needed at all.
print(
    county['NumTstTakr'].sum(),
    school['NumTstTakr'].sum(),    # More in schools than districts?
    district['NumTstTakr'].sum(),
    # What even is ~17K 'NumTstTakr' in the entire state??
    # Note: 82668 - 65536 = 17132, i.e. the state figure equals the county total wrapped
    # at 2**16, which is consistent with 'NumTstTakr' having been cast to int16 upstream.
    state['NumTstTakr'].sum(),
)

82668 82654 82619 17132


In [137]:
# Summary statistics over the four per-'RType' totals of 'NumTstTakr'.
# The totals disagree widely between record types: std dev ~ 32.8K
print(actdf.groupby('RType')['NumTstTakr'].sum().describe())

count        4.000000
mean     66268.250000
std      32757.506482
min      17132.000000
25%      66247.250000
50%      82636.500000
75%      82657.500000
max      82668.000000
Name: NumTstTakr, dtype: float64


In [129]:
# County-level test-taker total (82668) minus school-level total (82654), taken from the printed sums
82668-82654

14

In [130]:
# County-level test-taker total (82668) minus district-level total (82619), taken from the printed sums
82668-82619

49

In [136]:
# Summary statistics over the four per-'RType' totals of 'Enroll12'.
# The enrollment totals also disagree between record types: std dev ~ 22.9K
print(actdf.groupby('RType')['Enroll12'].sum().describe())

count         4.000000
mean     478062.750000
std       22868.199279
min      443762.000000
25%      477832.250000
50%      489419.500000
75%      489650.000000
max      489650.000000
Name: Enroll12, dtype: float64


In [131]:
# Certainty of error: the enrollment totals differ between aggregation levels.
# Observed ordering: school < district < county = state
# Tuple order below: (school, county, district, state)
tuple(level['Enroll12'].sum() for level in (school, county, district, state))

(443762, 489650, 489189, 489650)

In [85]:
# EDA (TODO: turn this into a visual)
# Display the number of district-level records (non-null 'DName') for each county
district.groupby('CName')['DName'].count()

CName
Alameda            17
Amador              2
Butte               8
Calaveras           3
Colusa              4
Contra Costa       11
Del Norte           2
El Dorado           5
Fresno             24
Glenn               5
Humboldt            8
Imperial            8
Inyo                3
Kern               13
Kings               7
Lake                5
Lassen              4
Los Angeles        63
Madera              6
Marin               5
Mariposa            1
Mendocino          10
Merced              8
Modoc               2
Mono                3
Monterey           10
Napa                4
Nevada              2
Orange             17
Placer              7
Plumas              1
Riverside          21
Sacramento         11
San Benito          2
San Bernardino     25
San Diego          29
San Francisco       2
San Joaquin        11
San Luis Obispo     8
San Mateo           9
Santa Barbara      13
Santa Clara        12
Santa Cruz          6
Shasta              7
Sierra              1
Sisk

In [86]:
# Null count per column of the ACT data.
# There are 58 county-level records, and the 58 null 'DName' entries are exactly those rows:
# each one is an aggregate over all the districts in its county, so it has no district name.
actdf.isnull().sum()

CDS             0
CCode           0
CDCode          0
SCode         522
RType           0
SName         580
DName          58
CName           0
Enroll12        0
NumTstTakr      0
AvgScrRead    356
AvgScrEng     356
AvgScrMath    356
AvgScrSci     356
NumGE21       356
PctGE21       356
Year            0
dtype: int64

In [None]:
# 2019 Californian SAT Scores: preview the first five rows
# A link to the data dictionary for the satdf-dataset is in the README
satdf.head()

In [None]:
# 2019 California School District Equitability Rankings: preview the first five rows
equitydf.head()

3. Check for any obvious issues with the observations (keep in mind the minimum & maximum possible values for each test/subtest).
4. Fix any errors you identified in steps 2-3.
5. Display the data types of each feature.
6. Fix any incorrect data types found in step 5.
    - Fix any individual values preventing other columns from being the appropriate type.
    - If your dataset has a column of percents (ex. '50%', '30.5%', etc.), use the function you wrote in Part 1 (coding challenges, number 3) to convert this to floats! *Hint*: use `.map()` or `.apply()`.
7. Rename Columns.
    - Column names should be all lowercase.
    - Column names should not contain spaces (underscores will suffice--this allows for using the `df.column_name` method to access columns in addition to `df['column_name']`).
    - Column names should be unique and informative.
8. Drop unnecessary rows (if needed).
9. Merge dataframes that can be merged.
10. Perform any additional cleaning that you feel is necessary.
11. Save your cleaned and merged dataframes as csv files.

#### Check for missing values

In [35]:
# A link to the data dictionary for the actdf-dataset is in the README
# Null count per column of the ACT data:
# - 'SCode' is null on the 522 district records
# - 'SName' is null on the 522 district records plus the 58 county records (580 total)
# - 'DName' is null on the 58 county records
actdf.isnull().sum()

CDS             0
CCode           0
CDCode          0
SCode         522
RType           0
SName         580
DName          58
CName           0
Enroll12        0
NumTstTakr      0
AvgScrRead    356
AvgScrEng     356
AvgScrMath    356
AvgScrSci     356
NumGE21       356
PctGE21       356
Year            0
dtype: int64

In [41]:
# Null count per column for school-level records only.
# The 292 rows missing score columns match, in count, the 292 schools with zero ACT test takers.
school.isnull().sum()

CDS             0
CCode           0
CDCode          0
SCode           0
RType           0
SName           0
DName           0
CName           0
Enroll12        0
NumTstTakr      0
AvgScrRead    292
AvgScrEng     292
AvgScrMath    292
AvgScrSci     292
NumGE21       292
PctGE21       292
Year            0
dtype: int64

In [36]:
# A link to the data dictionary for the satdf-dataset is in the README
# Null count per column of the SAT data
satdf.isnull().sum()

CDS                        0
CCode                      0
CDCode                     0
SCode                      0
RType                      0
SName                    597
DName                     58
CName                      0
Enroll12                   0
NumTSTTakr12               0
NumERWBenchmark12        275
PctERWBenchmark12        275
NumMathBenchmark12       275
PctMathBenchmark12       275
Enroll11                   0
NumTSTTakr11               0
NumERWBenchmark11        310
PctERWBenchmark11        310
NumMathBenchmark11       310
PctMathBenchmark11       310
TotNumBothBenchmark12    275
PctBothBenchmark12       275
TotNumBothBenchmark11    310
PctBothBenchmark11       310
Year                       0
dtype: int64

In [33]:
# The equity data: null count per column (no column has missing values)
equitydf.isnull().sum()

Rank*                                                                 0
School District                                                       0
Score                                                                 0
Expenditures for Public Elementary and Secondary Schools per Pupil    0
Income by School District                                             0
dtype: int64

In [34]:
# Display the equity rankings; note that both dollar columns are strings like "$14,525"
equitydf

Unnamed: 0,Rank*,School District,Score,Expenditures for Public Elementary and Secondary Schools per Pupil,Income by School District
0,1,Los Nietos School District,0.03,"$14,525","$63,516"
1,2,Gen Shafter Elementary School District,0.13,"$17,245","$49,167"
2,3,Browns Elementary School District,0.17,"$10,664","$83,942"
3,4,Pajaro Valley Unified School District,0.19,"$13,924","$66,867"
4,5,Bonita Unified School District,0.27,"$10,602","$84,202"
...,...,...,...,...,...
914,915,Ross Elementary School District,311.99,"$20,752","$250,000"
915,916,Woodside Elementary School District,338.51,"$27,516","$232,708"
916,917,Silver Fork Elementary School District,436.07,"$72,333","$63,125"
917,918,Desert Center Unified School District,440.19,"$77,200","$40,156"


In [None]:
# Clean the ACT, SAT and equity data, merge them at district level, and save the result.
# The whole cell is written to be safely re-runnable (every frame is rebuilt from
# actdf / satdf, and the equitydf clean-up steps are idempotent).

# --- Split records by aggregation level --------------------------------------------
# RType == 'C' entries contain aggregate County info
act_countydf = actdf[actdf['RType'] == 'C'].copy()
sat_countydf = satdf[satdf['RType'] == 'C'].copy()

# RType == 'D' entries contain aggregate School District info
act_districtdf = actdf[actdf['RType'] == 'D'].copy()
sat_districtdf = satdf[satdf['RType'] == 'D'].copy()

# --- Keep only the records that actually report scores -----------------------------
# Step 1: drop records whose benchmark percentage is missing (NaN).
# NOTE: despite the 'null_' prefix (kept so existing references keep working),
# these frames hold the NON-null records.
null_district_actdf = act_districtdf[act_districtdf['PctGE21'].notna()]
null_district_satdf = sat_districtdf[sat_districtdf['PctBothBenchmark12'].notna()]
null_county_actdf = act_countydf[act_countydf['PctGE21'].notna()]
null_county_satdf = sat_countydf[sat_countydf['PctBothBenchmark12'].notna()]

# Step 2: drop records whose benchmark percentage is the placeholder '*'
# (per the original notes, '*' marks records with fewer than 15 test takers).
district_scores_actdf = null_district_actdf[null_district_actdf['PctGE21'] != '*'].copy()
district_scores_satdf = null_district_satdf[null_district_satdf['PctBothBenchmark12'] != '*'].copy()
county_scores_actdf = null_county_actdf[null_county_actdf['PctGE21'] != '*'].copy()
county_scores_satdf = null_county_satdf[null_county_satdf['PctBothBenchmark12'] != '*'].copy()

# --- Fix incorrect data types ------------------------------------------------------
# ID codes and head counts become plain 64-bit integers in every frame.
act_int_dtypes = dict.fromkeys(['CCode', 'CDCode', 'Enroll12', 'NumTstTakr'], 'int64')
act_districtdf = act_districtdf.astype(act_int_dtypes)
act_countydf = act_countydf.astype(act_int_dtypes)
district_scores_actdf = district_scores_actdf.astype(act_int_dtypes)
county_scores_actdf = county_scores_actdf.astype(act_int_dtypes)

sat_int_dtypes = dict.fromkeys(['CCode', 'CDCode', 'Enroll12', 'NumTSTTakr12'], 'int64')
sat_districtdf = sat_districtdf.astype(sat_int_dtypes)
sat_countydf = sat_countydf.astype(sat_int_dtypes)
district_scores_satdf = district_scores_satdf.astype(sat_int_dtypes)
county_scores_satdf = county_scores_satdf.astype(sat_int_dtypes)

# The benchmark count / percentage columns were read as text because of the '*'
# placeholders; with those rows removed they can be parsed as numbers.
for act_scores in (county_scores_actdf, district_scores_actdf):
    act_scores['NumGE21'] = act_scores['NumGE21'].apply(int)
    act_scores['PctGE21'] = act_scores['PctGE21'].apply(float)

for sat_scores in (county_scores_satdf, district_scores_satdf):
    sat_scores['TotNumBothBenchmark12'] = sat_scores['TotNumBothBenchmark12'].apply(int)
    sat_scores['PctBothBenchmark12'] = sat_scores['PctBothBenchmark12'].apply(float)

# --- Remove excess columns and rename ----------------------------------------------
# Focus on DISTRICT data for the sake of time.
id_cols_act = ['CCode', 'CDCode', 'DName', 'CName', 'Enroll12', 'NumTstTakr']
id_cols_sat = ['CCode', 'CDCode', 'DName', 'CName', 'Enroll12', 'NumTSTTakr12']

# Lowercase, underscore-separated names; rename() ignores keys a frame does not have.
shared_names = {'CCode':'county_code', 'CName':'county_name', 'CDCode':'district_code',
                'DName':'district_name', 'Enroll12':'enrolled_seniors'}
act_names = {**shared_names, 'NumTstTakr':'tested_seniors',
             'NumGE21':'num_over_benchmark_act', 'PctGE21':'pct_over_benchmark_act'}
sat_names = {**shared_names, 'NumTSTTakr12':'tested_seniors',
             'TotNumBothBenchmark12':'num_over_benchmark_sat', 'PctBothBenchmark12':'pct_over_benchmark_sat'}

act_districtdf = act_districtdf[id_cols_act].rename(columns=act_names)
district_scores_actdf = district_scores_actdf[id_cols_act + ['NumGE21', 'PctGE21']].rename(columns=act_names)

sat_districtdf = sat_districtdf[id_cols_sat].rename(columns=sat_names)
district_scores_satdf = district_scores_satdf[
    id_cols_sat + ['TotNumBothBenchmark12', 'PctBothBenchmark12']
].rename(columns=sat_names)

# --- Clean the equity rankings -----------------------------------------------------
# Rename first (a no-op on re-run), then clean by the new names so the cell is idempotent.
equitydf = equitydf.rename(columns={'Rank*':'rank', 'School District':'district_name', 'Score':'score','Expenditures for Public Elementary and Secondary Schools per Pupil':'expenditures_per_pupil','Income by School District':'income'})

# Strip '$' and ',' from the dollar columns and store them as integers.
# astype(str) lets this also run on values that were already converted.
for money_col in ['expenditures_per_pupil', 'income']:
    equitydf[money_col] = (
        equitydf[money_col].astype(str).str.replace(r'[$,]', '', regex=True).astype('int64')
    )

# Drop 'School District' from the district name, then trim and lowercase it so it can
# be matched against the test-score district names.
equitydf['district_name'] = (
    equitydf['district_name']
    .str.replace('School District', '', regex=False)
    .str.strip()
    .str.lower()
)

# --- Merge -------------------------------------------------------------------------
# Hov helped me merge by showing me str.strip() and giving insight into the underlying problem
# NOTE(review): both joins match on district name alone. If two districts in different
# counties share a name, their rows will be cross-matched - confirm and consider adding
# 'district_code' to the first join.
standardized_test = district_scores_satdf.merge(district_scores_actdf, on='district_name')
standardized_test['district_name'] = standardized_test['district_name'].str.strip().str.lower()
finaldf = standardized_test.merge(equitydf, on='district_name')

# --- Save the cleaned and merged dataframe as a csv file ---------------------------
finaldf.to_csv('./finaldf.csv', index=False)

# Summary of the merged data; kept as the last expression so the notebook displays it.
finaldf[ ['district_name', 'num_over_benchmark_sat', 'num_over_benchmark_act','pct_over_benchmark_sat','pct_over_benchmark_act','rank', 'score']].describe()