### Equitability of Education: Impacts on College Entrance Exam Performance

## Data Import and Cleaning

In [146]:
# Imports:
# ...
import numpy as np
# Data Wrangling, Cleaning, Reading from CSV file
import pandas as pd
# Hypothesis Testing 
from random import randint
# Visualizations:
import matplotlib.pyplot as plt
# Visualizations:
import seaborn as sns

#### Import Data Sets

In [147]:
# Read the three raw CSV inputs. The paths are relative, so they resolve
# against the kernel's working directory.
# 2019 Californian ACT Scores
actdf = pd.read_csv('../data/act_2019_ca.csv') 
# 2019 Californian SAT Scores
satdf = pd.read_csv('../data/sat_2019_ca.csv')
# 2019 California School District Equitability Rankings
equitydf = pd.read_csv('../data/equitable_county_rank.csv')

#### Display Raw Data

In [148]:
# 2019 Californian SAT Scores

# Drop the unnamed column filled with NaN's
satdf.drop(labels='Unnamed: 25', axis='columns', inplace=True)

# Drop the last row filled with NaN's
satdf.drop([len(satdf)-1], inplace=True)

# The dtypes of three ID-code columns can be reduced to int now that there are no more NaN's
dtypes = {'CDS':'int64', 'CDCode':'int32', 'CCode':'int16', 'Enroll12':'int32', 'NumTSTTakr12':'int16'}
satdf = satdf.astype(dtype = dtypes)

# Consider only 12th Grade data
satdf = satdf[['CDS',
               'CCode',
               'CDCode',
               'SCode',
               'RType',
               'SName',
               'DName',
               'CName',
               'Enroll12',
               'NumTSTTakr12',
               'NumERWBenchmark12',
               'PctERWBenchmark12',
               'NumMathBenchmark12',
               'PctMathBenchmark12',
               'TotNumBothBenchmark12',
               'PctBothBenchmark12',
               'Year']]

# Use MAGIC Method to work with clean satdf in Notebook 2
%store satdf

# A link to the data dictionary for the satdf-dataset is in the README
satdf.info()

Stored 'satdf' (DataFrame)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2579 entries, 0 to 2578
Data columns (total 17 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   CDS                    2579 non-null   int64  
 1   CCode                  2579 non-null   int16  
 2   CDCode                 2579 non-null   int32  
 3   SCode                  2579 non-null   float64
 4   RType                  2579 non-null   object 
 5   SName                  1982 non-null   object 
 6   DName                  2521 non-null   object 
 7   CName                  2579 non-null   object 
 8   Enroll12               2579 non-null   int32  
 9   NumTSTTakr12           2579 non-null   int16  
 10  NumERWBenchmark12      2304 non-null   object 
 11  PctERWBenchmark12      2304 non-null   object 
 12  NumMathBenchmark12     2304 non-null   object 
 13  PctMathBenchmark12     2304 non-null   object 
 14  TotNumBothBenchmark12  2304 n

In [151]:
# 2019 Californian ACT Scores

# Drop the unnamed column filled with NaN's
actdf.drop(labels='Unnamed: 17', axis='columns', inplace=True)

# Drop the last row filled with NaN's
actdf.drop([len(actdf)-1], inplace=True)

# The dtypes of three ID-code columns can be reduced to int now that there are no more NaN's
dtypes = {'CDS':'int64', 'CDCode':'int32', 'CCode':'int16', 'Enroll12':'int32', 'NumTstTakr':'int16'}
actdf = actdf.astype(dtype = dtypes)

# Add column 'PctTstTakr': 'NumTstTakr' / 'Enroll12'
# Use MAGIC Method to work with clean actdf in Notebook 2
%store actdf

# A link to the data dictionary for the actdf-dataset is in the README
actdf.info()

Stored 'actdf' (DataFrame)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2309 entries, 0 to 2308
Data columns (total 17 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   CDS         2309 non-null   int64  
 1   CCode       2309 non-null   int16  
 2   CDCode      2309 non-null   int32  
 3   SCode       1787 non-null   float64
 4   RType       2309 non-null   object 
 5   SName       1729 non-null   object 
 6   DName       2251 non-null   object 
 7   CName       2309 non-null   object 
 8   Enroll12    2309 non-null   int32  
 9   NumTstTakr  2309 non-null   int16  
 10  AvgScrRead  1953 non-null   object 
 11  AvgScrEng   1953 non-null   object 
 12  AvgScrMath  1953 non-null   object 
 13  AvgScrSci   1953 non-null   object 
 14  NumGE21     1953 non-null   object 
 15  PctGE21     1953 non-null   object 
 16  Year        2309 non-null   object 
dtypes: float64(1), int16(2), int32(2), int64(1), object(11)
memory usage: 279.6+ KB


In [152]:
# Number of ACT records that report more test takers than enrolled 12th graders
# (31 in the saved run). Presumably these scores cover more than the 2019
# senior class -- TODO confirm.
# Data-dictionary reminder: 'PctGE21' = % of scores Greater Than or Equal to 21 points.
len(actdf.loc[actdf['NumTstTakr'] > actdf['Enroll12']])

31

In [155]:
# SAT records that report more 12th-grade test takers than enrolled 12th graders
# (7 school records in the saved run). Possibly some students were counted more
# than once -- TODO confirm.
satdf.loc[satdf['NumTSTTakr12'] > satdf['Enroll12']]

Unnamed: 0,CDS,CCode,CDCode,SCode,RType,SName,DName,CName,Enroll12,NumTSTTakr12,NumERWBenchmark12,PctERWBenchmark12,NumMathBenchmark12,PctMathBenchmark12,TotNumBothBenchmark12,PctBothBenchmark12,Year
782,1612590111856,1,161259,111856.0,S,American Indian Public High,Oakland Unified,Alameda,62,63,44,69.84,41,65.08,34,53.97,2018-19
975,19647090100602,19,1964709,100602.0,S,"Lennox Mathematics, Science and Technology Aca...",Lennox,Los Angeles,143,144,84,58.33,56,38.89,54,37.50,2018-19
1056,30666703030608,30,3066670,3030608.0,S,Middle College High,Santa Ana Unified,Orange,86,87,71,81.61,37,42.53,35,40.23,2018-19
1482,19647330132084,19,1964733,132084.0,S,Alliance Marine - Innovation and Technology 6-...,Los Angeles Unified,Los Angeles,0,1,*,*,*,*,*,*,2018-19
1527,7617960730291,7,761796,730291.0,S,Middle College High,West Contra Costa Unified,Contra Costa,72,73,60,82.19,42,57.53,41,56.16,2018-19
1566,19647331996610,19,1964733,1996610.0,S,Los Angeles Leadership Academy,Los Angeles Unified,Los Angeles,44,46,17,36.96,5,10.87,5,10.87,2018-19
1625,19647330127795,19,1964733,127795.0,S,Contreras Learning Center-School of Social Jus...,Los Angeles Unified,Los Angeles,90,92,20,21.74,9,9.78,8,8.70,2018-19


---
### Record Type "RType": X-State, C-County, D-District, S-School

#### ACT

In [156]:
# Non-null value counts per column, broken down by record type ('RType').
# 1-State Record, 58-County Records, 522-District Records, 1728-School Records
actdf.groupby('RType').count()

Unnamed: 0_level_0,CDS,CCode,CDCode,SCode,SName,DName,CName,Enroll12,NumTstTakr,AvgScrRead,AvgScrEng,AvgScrMath,AvgScrSci,NumGE21,PctGE21,Year
RType,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
C,58,58,58,58,0,0,58,58,58,57,57,57,57,57,57,58
D,522,522,522,0,0,522,522,522,522,459,459,459,459,459,459,522
S,1728,1728,1728,1728,1728,1728,1728,1728,1728,1436,1436,1436,1436,1436,1436,1728
X,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1


#### SAT

In [158]:
# Non-null value counts per column, broken down by record type ('RType').
# 1-State Record, 58-County Records, 539-District Records, 1981-School Records
satdf.groupby('RType').count()

Unnamed: 0_level_0,CDS,CCode,CDCode,SCode,SName,DName,CName,Enroll12,NumTSTTakr12,NumERWBenchmark12,PctERWBenchmark12,NumMathBenchmark12,PctMathBenchmark12,TotNumBothBenchmark12,PctBothBenchmark12,Year
RType,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
C,58,58,58,58,0,0,58,58,58,57,57,57,57,57,57,58
D,539,539,539,539,0,539,539,539,539,501,501,501,501,501,501,539
S,1981,1981,1981,1981,1981,1981,1981,1981,1981,1745,1745,1745,1745,1745,1745,1981
X,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1


---

#### Assign records by type for readability

In [161]:
# Split each exam's records by record type so later cells can work on one
# aggregation level at a time. ACT frames use the bare name, SAT frames the
# '_sat' suffix.
act_rtype = actdf['RType']
sat_rtype = satdf['RType']

# State ('X')
state, state_sat = actdf.loc[act_rtype.eq('X')], satdf.loc[sat_rtype.eq('X')]

# County ('C')
county, county_sat = actdf.loc[act_rtype.eq('C')], satdf.loc[sat_rtype.eq('C')]

# District ('D')
district, district_sat = actdf.loc[act_rtype.eq('D')], satdf.loc[sat_rtype.eq('D')]

# School ('S')
school, school_sat = actdf.loc[act_rtype.eq('S')], satdf.loc[sat_rtype.eq('S')]

In [162]:
# California State Record: ACT (the single 'X'-type row)
# NOTE(review): if 'NumTstTakr' here is far below the county/district totals,
# check the integer width used when the column was cast upstream.
state

Unnamed: 0,CDS,CCode,CDCode,SCode,RType,SName,DName,CName,Enroll12,NumTstTakr,AvgScrRead,AvgScrEng,AvgScrMath,AvgScrSci,NumGE21,PctGE21,Year
2070,0,0,0,0.0,X,State of California,State of California,State of California,489650,17132,22,22,22,22,45466,55.0,2018-19


In [160]:
# California State Record: SAT (the single 'X'-type row)
# NOTE(review): 'NumTSTTakr12' is a head-count and cannot be negative; a negative
# value here means the column was cast to an integer type too narrow for the state total.
state_sat

Unnamed: 0,CDS,CCode,CDCode,SCode,RType,SName,DName,CName,Enroll12,NumTSTTakr12,NumERWBenchmark12,PctERWBenchmark12,NumMathBenchmark12,PctMathBenchmark12,TotNumBothBenchmark12,PctBothBenchmark12,Year
2036,0,0,0,0.0,X,State of California,State of California,State of California,489650,-26197,116500,68.36,81551,47.86,77178,45.29,2018-19


---

ALPINE COUNTY:
* California's least populous county (~1,100 residents).
* Students transfer for grades 9-12, to either Lake Tahoe Unified School District in El Dorado County, or Douglas County-NV. https://alpinecoe.k12.ca.us/
* Alpine County Unified School District is the only district in Alpine County.

In [172]:
# ACT: county-level records reporting zero test takers
c = county.loc[county['NumTstTakr'].eq(0)]
# Show only the identifying and head-count columns
c.loc[:, ['CDS', 'CCode', 'RType', 'CName', 'Enroll12', 'NumTstTakr']]

Unnamed: 0,CDS,CCode,RType,CName,Enroll12,NumTstTakr
1103,2000000000000,2,C,Alpine,0,0


In [174]:
# SAT: county-level records reporting zero 12th-grade test takers
c_sat = county_sat.loc[county_sat['NumTSTTakr12'].eq(0)]
# Show only the identifying and head-count columns
c_sat.loc[:, ['CDS', 'CCode', 'RType', 'CName', 'Enroll12', 'NumTSTTakr12']]

Unnamed: 0,CDS,CCode,RType,CName,Enroll12,NumTSTTakr12
2012,2000000000000,2,C,Alpine,0,0


---

### District Data

#### ACT

In [167]:
# ACT district records with ZERO test takers in the 2018-19 School Year
# (saved run: 63 districts spread over 39 counties).

# District rows reporting no ACT test takers
d = district.loc[district['NumTstTakr'].eq(0)]
# One row per county, counting that county's zero-taker districts
d_c = d.groupby('CName')[['CDS']].count()

# Counties containing at least one zero-taker district
print(f'Number of counties: {d_c.shape[0]}')
# Districts with no ACT test takers
print(f'Number of districts: {d.shape[0]}')

# Display District-Record DataFrame
d.loc[:, ['DName', 'CName', 'Enroll12', 'NumTstTakr']]

Number of counties: 39
Number of districts: 63


Unnamed: 0,DName,CName,Enroll12,NumTstTakr
1122,Alameda County Office of Education,Alameda,170,0
1127,Emery Unified,Alameda,40,0
1139,Amador County Office of Education,Amador,15,0
1145,Feather Falls Union Elementary,Butte,16,0
1149,Calaveras County Office of Education,Calaveras,65,0
...,...,...,...,...
1619,Tuolumne County Superintendent of Schools,Tuolumne,34,0
1622,Big Oak Flat-Groveland Unified,Tuolumne,32,0
1634,Yolo County Office of Education,Yolo,87,0
1636,Esparto Unified,Yolo,46,0


#### SAT

In [None]:
# SAT district records reporting ZERO 12th-grade test takers in the 2018-19 School Year.
# (This cell sits under the SAT heading, so it must read the SAT frames and
# must not overwrite the ACT variables `d` / `d_c`.)

# For readability
d_sat = district_sat[district_sat['NumTSTTakr12']==0]
# Districts grouped by county
d_c_sat = d_sat.groupby('CName')[['CDS']].count()

# Counties containing at least one district with ZERO students taking the SATs
print(f'Number of counties: {len(d_c_sat)}')
# Districts with ZERO Test Taking 12th Grade Students
print(f'Number of districts: {len(d_sat)}')

# Display District-Record DataFrame
d_sat[['DName', 'CName', 'Enroll12', 'NumTSTTakr12']]

---

In [171]:
# ACT district records with zero test takers, all columns.
# NOTE(review): the saved output includes rows with non-zero 'NumTstTakr',
# so it does not match this filter -- re-run the cell to refresh it.
district[district['NumTstTakr']==0]

Unnamed: 0,CDS,CCode,CDCode,SCode,RType,SName,DName,CName,Enroll12,NumTstTakr,AvgScrRead,AvgScrEng,AvgScrMath,AvgScrSci,NumGE21,PctGE21,Year
1122,1100170000000,1,110017,,D,,Alameda County Office of Education,Alameda,170,0,,,,,,,2018-19
1123,1611190000000,1,161119,,D,,Alameda Unified,Alameda,919,155,27,26,26,25,134,86.45,2018-19
1124,1611270000000,1,161127,,D,,Albany City Unified,Alameda,307,58,28,27,27,26,51,87.93,2018-19
1125,1611430000000,1,161143,,D,,Berkeley Unified,Alameda,820,227,26,25,25,24,166,73.13,2018-19
1126,1611500000000,1,161150,,D,,Castro Valley Unified,Alameda,737,106,26,25,25,25,82,77.36,2018-19
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1639,57727100000000,57,5772710,,D,,Woodland Joint Unified,Yolo,767,139,20,19,19,20,53,38.13,2018-19
1640,58105870000000,58,5810587,,D,,Yuba County Office of Education,Yuba,166,0,,,,,,,2018-19
1641,58727280000000,58,5872728,,D,,Camptonville Elementary,Yuba,42,4,*,*,*,*,*,*,2018-19
1642,58727360000000,58,5872736,,D,,Marysville Joint Unified,Yuba,602,42,20,18,19,19,17,40.48,2018-19


#### SAT

In [170]:
# SAT district records with ZERO 12th-grade test takers in the 2018-19 School Year
# (saved run: 38 districts spread over 28 counties).

# District rows reporting no SAT test takers
d_sat = district_sat.loc[district_sat['NumTSTTakr12'].eq(0)]
# One row per county, counting that county's zero-taker districts
d_c_sat = d_sat.groupby('CName')[['CDS']].count()

# Counties containing at least one district with ZERO students taking the SATs
print(f'Number of counties: {d_c_sat.shape[0]}')
# Districts with ZERO SAT test takers
print(f'Number of districts: {d_sat.shape[0]}')

# Display District-Record DataFrame
d_sat.loc[:, ['DName', 'CName', 'Enroll12', 'NumTSTTakr12']]

Number of counties: 28
Number of districts: 38


Unnamed: 0,DName,CName,Enroll12,NumTSTTakr12
2051,Big Sur Unified,Monterey,0,0
2067,Buckeye Union Elementary,El Dorado,25,0
2138,Sutter County Office of Education,Sutter,75,0
2140,Tuolumne County Superintendent of Schools,Tuolumne,34,0
2141,Yolo County Office of Education,Yolo,87,0
2163,San Luis Obispo County Office of Education,San Luis Obispo,186,0
2168,Mesa Union Elementary,Ventura,18,0
2193,SBE - Olive Grove Charter - Lompoc,Santa Barbara,36,0
2222,Santa Barbara County Office of Education,Santa Barbara,48,0
2229,Piner-Olivet Union Elementary,Sonoma,17,0


---

In [None]:
# ACT school records with ZERO test takers in the 2018-19 School Year.
# Author's recorded counts: 292 schools, in 192 districts, from 50 counties.

# School rows reporting no ACT test takers
s = school.loc[school['NumTstTakr'].eq(0)]
# One row per county name
s_c = s.groupby('CName')[['CDS']].count()
# One row per district name
s_d = s.groupby('DName')[['CDS']].count()

# Counties containing at least one zero-taker school
print(f'Number of counties: {s_c.shape[0]}')
# District names containing at least one zero-taker school
print(f'Number of districts: {s_d.shape[0]}')
# Schools with no ACT test takers
print(f'Number of schools: {s.shape[0]}')

# Display School-Record DataFrame
s.loc[:, ['SName','DName', 'CName', 'Enroll12', 'NumTstTakr']]

In [None]:
# ACT county records where only BETWEEN 1-14 students took the test
# (author's recorded count: 3 counties).
c = county.loc[county['NumTstTakr'].between(1, 14)]

# Display only the informative columns
c.loc[:, ['CDS', 'CCode', 'RType', 'CName', 'Enroll12', 'NumTstTakr']]

In [None]:
# ACT district records where only BETWEEN 1-14 students took the test
# (author's recorded counts: 109 districts, from 46 counties).

# District rows with 1-14 ACT test takers
d = district.loc[district['NumTstTakr'].between(1, 14)]
# One row per county, counting that county's matching districts
d_c = d.groupby('CName')[['CDS']].count()

# Counties containing at least one district with 1-14 test takers
print(f'Number of Counties: {d_c.shape[0]}')
# Districts with 1-14 test takers
print(f'Number of Districts: {d.shape[0]}')

# Display District-Record DataFrame
d.loc[:, ['DName', 'CName', 'Enroll12', 'NumTstTakr']]

In [None]:
# ACT school records where only BETWEEN 1-14 students took the test in the
# 2018-19 School Year.
# Author's recorded counts: 420 schools, in 257 districts, from 51 counties.

# School rows with 1-14 ACT test takers
s = school.loc[school['NumTstTakr'].between(1, 14)]
# One row per county name
s_c = s.groupby('CName')[['CDS']].count()
# One row per district name
s_d = s.groupby('DName')[['CDS']].count()

# Counties containing at least one school with 1-14 test takers
print(f'Number of Counties: {s_c.shape[0]}')
# District names containing at least one school with 1-14 test takers
print(f'Number of Districts: {s_d.shape[0]}')
# Schools with 1-14 test takers
print(f'Number of Schools: {s.shape[0]}')

# Display School-Record DataFrame
s.loc[:, ['SName','DName', 'CName', 'Enroll12', 'NumTstTakr']]

In [157]:
# Re-display the ACT state record so its 'NumTstTakr' / 'Enroll12' can be
# compared with the per-level totals computed in the next cells.
state

Unnamed: 0,CDS,CCode,CDCode,SCode,RType,SName,DName,CName,Enroll12,NumTstTakr,AvgScrRead,AvgScrEng,AvgScrMath,AvgScrSci,NumGE21,PctGE21,Year
2070,0,0,0,0.0,X,State of California,State of California,State of California,489650,17132,22,22,22,22,45466,55.0,2018-19


In [None]:
# Total ACT test takers per record type, printed in the order:
# county, school, district, state.
# Author's observations: state < district < school < county.
#   - More in schools than districts?
#   - What even is ~17K 'NumTstTakrs' in the entire state anyway??
# NOTE(review): a state figure that small next to the other levels suggests the
# column was cast to an integer type too narrow for it (16-bit tops out at 32,767).
act_taker_totals = [frame['NumTstTakr'].sum() for frame in (county, school, district, state)]
print(*act_taker_totals)

In [None]:
# Sum 'NumTstTakr' within each record type, then summarise those per-type
# totals with describe() to see how far the aggregation levels disagree.
# std dev ~ 32.8K
print(actdf.groupby('RType')['NumTstTakr'].sum().describe())

In [None]:
# Scratch arithmetic on hard-coded numbers: presumably the gap between two of
# the 'NumTstTakr' totals from the cells above -- TODO confirm which levels and
# compute the difference from the data instead.
82668-82654

In [None]:
# Scratch arithmetic on hard-coded numbers: presumably another gap between two
# 'NumTstTakr' totals -- TODO confirm which levels and compute from the data instead.
82668-82619

In [None]:
# Same per-record-type summary, this time for 12th-grade enrollment ('Enroll12'):
# sum within each 'RType', then describe() those totals.
# std dev ~ 22.9K
a = actdf.groupby('RType')['Enroll12'].sum().describe()
print(a)

In [None]:
# Total 12th-grade enrollment per record type, displayed as a tuple in the
# order (school, county, district, state).
# Author's note -- certainty of error: school < district < county = state,
# i.e. the aggregation levels do not all reconcile.
tuple(frame['Enroll12'].sum() for frame in (school, county, district, state))

In [None]:
# EDA: Create visual (TODO: turn this table into a plot)
# Display the number of School Districts in each county,
# counting ACT district-level records per county name.
district.groupby('CName')['DName'].count()

In [None]:
# Missing values per column in the ACT data.
# Notice that there are 58 counties in California, so the 58 NULL 'DName' entries are the
# county-level records, which aggregate all the districts in each county
actdf.isnull().sum()

In [None]:
# 2019 Californian SAT Scores: preview the first five rows
# A link to the data dictionary for the satdf-dataset is in the README
satdf.head()

In [None]:
# 2019 California School District Equitability Rankings: preview the first five rows
equitydf.head()

3. Check for any obvious issues with the observations (keep in mind the minimum & maximum possible values for each test/subtest).
4. Fix any errors you identified in steps 2-3.
5. Display the data types of each feature.
6. Fix any incorrect data types found in step 5.
    - Fix any individual values preventing other columns from being the appropriate type.
    - If your dataset has a column of percents (ex. '50%', '30.5%', etc.), use the function you wrote in Part 1 (coding challenges, number 3) to convert this to floats! *Hint*: use `.map()` or `.apply()`.
7. Rename Columns.
    - Column names should be all lowercase.
    - Column names should not contain spaces (underscores will suffice--this allows for using the `df.column_name` method to access columns in addition to `df['column_name']`).
    - Column names should be unique and informative.
8. Drop unnecessary rows (if needed).
9. Merge dataframes that can be merged.
10. Perform any additional cleaning that you feel is necessary.
11. Save your cleaned and merged dataframes as csv files.

#### Check for missing values

In [None]:
# Missing values per column in the ACT data.
# A link to the data dictionary for the actdf-dataset is in the README
# 522 District records have no school name
# (district- and county-level rows are aggregates, so 'SName' is empty for them)
actdf.isnull().sum()

In [None]:
# Missing values per column among the school-level ('S') ACT records only
school.isnull().sum()

In [None]:
# Missing values per column in the SAT data.
# A link to the data dictionary for the satdf-dataset is in the README
satdf.isnull().sum()

In [None]:
# The equity data: missing values per column
equitydf.isnull().sum()

In [None]:
# Display the equity-ranking table.
# NOTE(review): this renders the whole frame; consider .head() if it is large.
equitydf

In [None]:
# Build the district-level analysis table and save it:
#   1. split ACT/SAT records into county ('C') and district ('D') aggregates,
#   2. keep only the records that publish a benchmark percentage,
#   3. fix dtypes, trim and rename columns,
#   4. clean the equity-ranking table,
#   5. merge SAT + ACT + equity on district name and write './finaldf.csv'.
# Every frame is rebuilt from actdf / satdf / equitydf, and the equity cleaning
# tolerates already-cleaned input, so the cell can be re-run safely.

#RType == 'C' entries contain aggregate County info
act_countydf = actdf[actdf['RType'] == 'C'].copy()
sat_countydf = satdf[satdf['RType'] == 'C'].copy()

#RType == 'D' entries contain aggregate School District info
act_districtdf = actdf[actdf['RType'] == 'D'].copy()
sat_districtdf = satdf[satdf['RType'] == 'D'].copy()

#Keep only records whose benchmark percentage is present (records with no test
#takers have it missing). Despite the 'null_' prefix, these frames hold the
#NON-null records; the names are kept for compatibility.
null_district_actdf = act_districtdf[act_districtdf['PctGE21'].notna()]
null_district_satdf = sat_districtdf[sat_districtdf['PctBothBenchmark12'].notna()]
null_county_actdf = act_countydf[act_countydf['PctGE21'].notna()]
null_county_satdf = sat_countydf[sat_countydf['PctBothBenchmark12'].notna()]

#Drop records whose scores are masked with '*' (less than 15 test takers)
district_scores_actdf = null_district_actdf[null_district_actdf['PctGE21'] != '*'].copy()
district_scores_satdf = null_district_satdf[null_district_satdf['PctBothBenchmark12'] != '*'].copy()
county_scores_actdf = null_county_actdf[null_county_actdf['PctGE21'] != '*'].copy()
county_scores_satdf = null_county_satdf[null_county_satdf['PctBothBenchmark12'] != '*'].copy()

#Fix Incorrect Data Types: codes and head-counts -> int64, percentages -> float64
act_int = dict.fromkeys(['CCode', 'CDCode', 'Enroll12', 'NumTstTakr'], 'int64')
act_score_types = {**act_int, 'NumGE21': 'int64', 'PctGE21': 'float64'}
act_districtdf = act_districtdf.astype(act_int)
act_countydf = act_countydf.astype(act_int)
district_scores_actdf = district_scores_actdf.astype(act_score_types)
county_scores_actdf = county_scores_actdf.astype(act_score_types)

sat_int = dict.fromkeys(['CCode', 'CDCode', 'Enroll12', 'NumTSTTakr12'], 'int64')
sat_score_types = {**sat_int, 'TotNumBothBenchmark12': 'int64', 'PctBothBenchmark12': 'float64'}
sat_districtdf = sat_districtdf.astype(sat_int)
sat_countydf = sat_countydf.astype(sat_int)
district_scores_satdf = district_scores_satdf.astype(sat_score_types)
county_scores_satdf = county_scores_satdf.astype(sat_score_types)

#Remove excess columns and just focus on DISTRICT data for the sake of time,
#then Rename Columns. Each mapping lists the kept columns in output order.
base_names = {'CCode':'county_code', 'CDCode':'district_code', 'DName':'district_name',
              'CName':'county_name', 'Enroll12':'enrolled_seniors'}
act_names = {**base_names, 'NumTstTakr':'tested_seniors'}
act_score_names = {**act_names, 'NumGE21':'num_over_benchmark_act', 'PctGE21':'pct_over_benchmark_act'}
sat_names = {**base_names, 'NumTSTTakr12':'tested_seniors'}
sat_score_names = {**sat_names, 'TotNumBothBenchmark12':'num_over_benchmark_sat',
                   'PctBothBenchmark12':'pct_over_benchmark_sat'}

act_districtdf = act_districtdf[list(act_names)].rename(columns=act_names)
district_scores_actdf = district_scores_actdf[list(act_score_names)].rename(columns=act_score_names)
sat_districtdf = sat_districtdf[list(sat_names)].rename(columns=sat_names)
district_scores_satdf = district_scores_satdf[list(sat_score_names)].rename(columns=sat_score_names)

#Equity table: rename first (rename ignores labels that are already renamed)
equitydf = equitydf.rename(columns={'Rank*':'rank', 'School District':'district_name', 'Score':'score',
                                    'Expenditures for Public Elementary and Secondary Schools per Pupil':'expenditures_per_pupil',
                                    'Income by School District':'income'})

#Drop special characters '$' and ',' from equitable_county_rank.csv and cast to int.
#regex=False is required: '$' is a regex anchor and would otherwise match nothing useful.
for name in ['expenditures_per_pupil', 'income']:
    equitydf[name] = (equitydf[name].astype(str)
                      .str.replace('$', '', regex=False)
                      .str.replace(',', '', regex=False)
                      .astype('int64'))


def _normalized_name(names):
    """Return district names stripped of surrounding whitespace and lower-cased."""
    return names.str.strip().str.lower()


#Additional Cleaning
#Drop 'School District' from the equity district names so they can match the exam data
equitydf['district_name'] = _normalized_name(
    equitydf['district_name'].str.replace('School District', '', regex=False))

#Hov helped me merge by showing me str.strip() and giving insight into the underlying problem
#Normalize the key on copies, so the per-exam frames keep their original names
sat_for_merge = district_scores_satdf.assign(
    district_name=_normalized_name(district_scores_satdf['district_name']))
act_for_merge = district_scores_actdf.assign(
    district_name=_normalized_name(district_scores_actdf['district_name']))

#NOTE(review): district names are not guaranteed unique across counties; if two
#counties share a district name, this name-only merge pairs them -- verify.
standardized_test = sat_for_merge.merge(act_for_merge, on='district_name')
finaldf = standardized_test.merge(equitydf, on='district_name')

#Save your cleaned and merged dataframes as csv files.
finaldf.to_csv('./finaldf.csv', index=False)

#Summary statistics -- kept as the last expression so the notebook displays them
finaldf[['num_over_benchmark_sat', 'num_over_benchmark_act', 'pct_over_benchmark_sat',
         'pct_over_benchmark_act', 'rank', 'score']].describe()