## Data Import and Cleaning

In [None]:
# Imports:
# ...
import numpy as np
# Data Wrangling, Cleaning, Reading from file
import pandas as pd
# Hypothesis Testing
from random import randint
# Visualizations:
import matplotlib.pyplot as plt
# Visualizations:
import seaborn as sns

### Data Import & Cleaning

Import the datasets that you selected for this project and go through the following steps at a minimum. You are welcome to do further cleaning as you feel necessary:
1. Display the data: print the first 5 rows of each dataframe to your Jupyter notebook.
2. Check for missing values.
3. Check for any obvious issues with the observations (keep in mind the minimum & maximum possible values for each test/subtest).
4. Fix any errors you identified in steps 2-3.
5. Display the data types of each feature.
6. Fix any incorrect data types found in step 5.
    - Fix any individual values preventing other columns from being the appropriate type.
    - If your dataset has a column of percents (ex. '50%', '30.5%', etc.), use the function you wrote in Part 1 (coding challenges, number 3) to convert this to floats! *Hint*: use `.map()` or `.apply()`.
7. Rename Columns.
    - Column names should be all lowercase.
    - Column names should not contain spaces (underscores will suffice--this allows for using the `df.column_name` method to access columns in addition to `df['column_name']`).
    - Column names should be unique and informative.
8. Drop unnecessary rows (if needed).
9. Merge dataframes that can be merged.
10. Perform any additional cleaning that you feel is necessary.
11. Save your cleaned and merged dataframes as csv files.

In [None]:
# Code:
# Load the three raw source tables.
actdf = pd.read_csv('../data/act_2019_ca.csv')  # 2019 ACT scores in California by school
satdf = pd.read_csv('../data/sat_2019_ca.csv')  # 2019 SAT scores in California by school
equitydf = pd.read_csv('../data/equitable_county_rank.csv')  # 2019 California school district equitability ranking

# Preview the first five rows of every table.
for raw_table in (actdf, satdf, equitydf):
    print(raw_table.head())

# The final row of each score file is an empty record; remove it in place.
actdf.drop(index=actdf.index[-1:], inplace=True)
satdf.drop(index=satdf.index[-1:], inplace=True)

# Rows with RType 'C' hold the aggregate figures for a county.
act_countydf = actdf.loc[actdf['RType'].eq('C')].copy()
sat_countydf = satdf.loc[satdf['RType'].eq('C')].copy()

# Rows with RType 'D' hold the aggregate figures for a school district.
act_districtdf = actdf.loc[actdf['RType'].eq('D')].copy()
sat_districtdf = satdf.loc[satdf['RType'].eq('D')].copy()

# Build filtered views for the special cases in which no usable score is
# recorded for an entity.

# Step 1 - keep only records that have a benchmark percentage at all
# (per the original notes, a missing value means no seniors took the exam
# although more than 15 were enrolled).
null_district_actdf = act_districtdf[act_districtdf['PctGE21'].notnull()]
null_county_actdf = act_countydf[act_countydf['PctGE21'].notnull()]
null_district_satdf = sat_districtdf[sat_districtdf['PctBothBenchmark12'].notnull()]
null_county_satdf = sat_countydf[sat_countydf['PctBothBenchmark12'].notnull()]

# Step 2 - drop records whose percentage is the '*' placeholder
# (per the original notes, fewer than 15 enrolled seniors took the exam).
district_scores_actdf = null_district_actdf.loc[null_district_actdf['PctGE21'].ne('*')].copy()
county_scores_actdf = null_county_actdf.loc[null_county_actdf['PctGE21'].ne('*')].copy()
district_scores_satdf = null_district_satdf.loc[null_district_satdf['PctBothBenchmark12'].ne('*')].copy()
county_scores_satdf = null_county_satdf.loc[null_county_satdf['PctBothBenchmark12'].ne('*')].copy()

# Drop the special characters '$' and ',' from the money columns of
# equitable_county_rank.csv so the values can later be cast to integers.
# The replacement is done column-wise: it does not depend on the frame having
# a default 0..n-1 index and avoids a Python-level loop over every cell.
# regex=False makes '$' a literal character instead of the end-of-string anchor.
for money_col in (
    'Expenditures for Public Elementary and Secondary Schools per Pupil',
    'Income by School District',
):
    equitydf[money_col] = (
        equitydf[money_col]
        .str.replace(',', '', regex=False)
        .str.replace('$', '', regex=False)
    )

# Display datatypes
print(sat_districtdf.dtypes)
print(act_districtdf.dtypes)
print(district_scores_actdf.dtypes)
print(district_scores_satdf.dtypes)
print(equitydf.dtypes)

# Fix incorrect data types.
# Each column is cast once with astype(); the fixed-width dtype names keep the
# result identical across platforms.

# Money columns of the equity table: expected to hold digit-only strings here.
for name in ['Expenditures for Public Elementary and Secondary Schools per Pupil', 'Income by School District']:
    equitydf[name] = equitydf[name].astype('int64')

# Identifier and head-count columns shared by every ACT table.
for name in ['CCode', 'CDCode', 'Enroll12', 'NumTstTakr']:
    for act_table in (act_districtdf, act_countydf, district_scores_actdf, county_scores_actdf):
        act_table[name] = act_table[name].astype('int64')

# Identifier and head-count columns shared by every SAT table.
for name in ['CCode', 'CDCode', 'Enroll12', 'NumTSTTakr12']:
    for sat_table in (sat_districtdf, sat_countydf, district_scores_satdf, county_scores_satdf):
        sat_table[name] = sat_table[name].astype('int64')

# Benchmark columns are only converted in the filtered score tables, where the
# missing and '*' entries have already been removed.
for act_scores in (county_scores_actdf, district_scores_actdf):
    act_scores['NumGE21'] = act_scores['NumGE21'].astype('int64')
    act_scores['PctGE21'] = act_scores['PctGE21'].astype('float64')

for sat_scores in (county_scores_satdf, district_scores_satdf):
    sat_scores['TotNumBothBenchmark12'] = sat_scores['TotNumBothBenchmark12'].astype('int64')
    sat_scores['PctBothBenchmark12'] = sat_scores['PctBothBenchmark12'].astype('float64')

# Keep only the columns needed for the DISTRICT-level analysis and give them
# lowercase, underscore-separated names.

# Columns common to every district table, in output order.
district_id_columns = ['CCode', 'CDCode', 'DName', 'CName', 'Enroll12']

# Raw name -> clean name for the common columns.
district_id_renames = {
    'CCode': 'county_code',
    'CName': 'county_name',
    'CDCode': 'district_code',
    'DName': 'district_name',
    'Enroll12': 'enrolled_seniors',
}

# Test-specific additions; keys absent from a frame are simply ignored by rename().
act_renames = {
    **district_id_renames,
    'NumTstTakr': 'tested_seniors',
    'NumGE21': 'num_over_benchmark_act',
    'PctGE21': 'pct_over_benchmark_act',
}
sat_renames = {
    **district_id_renames,
    'NumTSTTakr12': 'tested_seniors',
    'TotNumBothBenchmark12': 'num_over_benchmark_sat',
    'PctBothBenchmark12': 'pct_over_benchmark_sat',
}

# ACT district tables
act_districtdf = act_districtdf[district_id_columns + ['NumTstTakr']].rename(columns=act_renames)
district_scores_actdf = district_scores_actdf[
    district_id_columns + ['NumTstTakr', 'NumGE21', 'PctGE21']
].rename(columns=act_renames)

# SAT district tables
sat_districtdf = sat_districtdf[district_id_columns + ['NumTSTTakr12']].rename(columns=sat_renames)
district_scores_satdf = district_scores_satdf[
    district_id_columns + ['NumTSTTakr12', 'TotNumBothBenchmark12', 'PctBothBenchmark12']
].rename(columns=sat_renames)

# Equity ranking table
equitydf = equitydf.rename(columns={
    'Rank*': 'rank',
    'School District': 'district_name',
    'Score': 'score',
    'Expenditures for Public Elementary and Secondary Schools per Pupil': 'expenditures_per_pupil',
    'Income by School District': 'income',
})

# Additional Cleaning
# The equity table is matched to the test tables by district name, so drop the
# 'School District' text from its names first (column-wise, literal match).
equitydf['district_name'] = equitydf['district_name'].str.replace('School District', '', regex=False)

# Hov helped me merge by showing me str.strip() and giving insight into the underlying problem
standardized_test = district_scores_satdf.merge(district_scores_actdf, on='district_name')

# A name-only join pairs every SAT row with every ACT row of the same name, so
# two different districts that share a name (e.g. in different counties) would
# be cross-matched. Keep only the pairs that refer to the same district code.
# NOTE(review): assumes the district code identifies a district consistently in
# both source files - confirm against the data dictionary.
standardized_test = standardized_test[
    standardized_test['district_code_x'] == standardized_test['district_code_y']
].reset_index(drop=True)

# Normalise whitespace and case on both sides so the names compare equal.
standardized_test['district_name'] = standardized_test['district_name'].str.strip()
standardized_test['district_name'] = standardized_test['district_name'].str.lower()
equitydf['district_name'] = equitydf['district_name'].str.strip()
equitydf['district_name'] = equitydf['district_name'].str.lower()
finaldf = standardized_test.merge(equitydf, on='district_name')

# Summary statistics; printed explicitly because a bare expression is only
# displayed by the notebook when it is the last statement of a cell.
print(finaldf[['district_name', 'num_over_benchmark_sat', 'num_over_benchmark_act', 'pct_over_benchmark_sat', 'pct_over_benchmark_act', 'rank', 'score']].describe())

# Save your cleaned and merged dataframes as csv files.
finaldf.to_csv('./finaldf.csv', index=False)