## Data Import and Cleaning

In [75]:
# Imports:
# ...
import numpy as np
# Data Wrangling, Cleaning, Reading from CSV file
import pandas as pd
# Hypothesis Testing 
from random import randint
# Vizualizations: 
import matplotlib.pyplot as plt
# Vizualizations: 
import seaborn as sns

#### Import Data Sets

In [76]:
# 2019 Californian ACT Scores
actdf = pd.read_csv('../data/act_2019_ca.csv') 
# 2019 Californian SAT Scores
satdf = pd.read_csv('../data/sat_2019_ca.csv')
# 2019 California School District Equitablility Rankings
equitydf = pd.read_csv('../data/equitable_county_rank.csv')

#### Display Raw Data

In [77]:
# 2019 Californian ACT Scores

# Drop the unnamed column filled with NaN's
actdf.drop(labels='Unnamed: 17',axis='columns', inplace=True)

# Drop the last row filled with NaN's
actdf.drop([len(actdf)-1], inplace=True)

# A link to the data dictionary for the actdf-dataset is in the README
actdf.head()

Unnamed: 0,CDS,CCode,CDCode,SCode,RType,SName,DName,CName,Enroll12,NumTstTakr,AvgScrRead,AvgScrEng,AvgScrMath,AvgScrSci,NumGE21,PctGE21,Year
0,33669930000000.0,33.0,3366993.0,129882.0,S,21st Century Learning Institute,Beaumont Unified,Riverside,18.0,0.0,,,,,,,2018-19
1,19642120000000.0,19.0,1964212.0,1995596.0,S,ABC Secondary (Alternative),ABC Unified,Los Angeles,58.0,0.0,,,,,,,2018-19
2,15637760000000.0,15.0,1563776.0,1530377.0,S,Abraham Lincoln Alternative,Southern Kern Unified,Kern,18.0,0.0,,,,,,,2018-19
3,43696660000000.0,43.0,4369666.0,4333795.0,S,Abraham Lincoln High,San Jose Unified,Santa Clara,463.0,53.0,23.0,22.0,22.0,23.0,34.0,64.15,2018-19
4,19647330000000.0,19.0,1964733.0,1935121.0,S,Abraham Lincoln Senior High,Los Angeles Unified,Los Angeles,226.0,19.0,21.0,20.0,23.0,22.0,11.0,57.89,2018-19


In [78]:
# 58 counties, 522 school districts, 1728 schools
actdf.groupby('RType').count()

Unnamed: 0_level_0,CDS,CCode,CDCode,SCode,SName,DName,CName,Enroll12,NumTstTakr,AvgScrRead,AvgScrEng,AvgScrMath,AvgScrSci,NumGE21,PctGE21,Year
RType,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
C,58,58,58,58,0,0,58,58,58,57,57,57,57,57,57,58
D,522,522,522,0,0,522,522,522,522,459,459,459,459,459,459,522
S,1728,1728,1728,1728,1728,1728,1728,1728,1728,1436,1436,1436,1436,1436,1436,1728
X,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1


In [79]:
# Disply the number of School Districts in each of the 58 counties
actdf[actdf['RType']=='D'].groupby('CName')['DName'].count()

CName
Alameda            17
Amador              2
Butte               8
Calaveras           3
Colusa              4
Contra Costa       11
Del Norte           2
El Dorado           5
Fresno             24
Glenn               5
Humboldt            8
Imperial            8
Inyo                3
Kern               13
Kings               7
Lake                5
Lassen              4
Los Angeles        63
Madera              6
Marin               5
Mariposa            1
Mendocino          10
Merced              8
Modoc               2
Mono                3
Monterey           10
Napa                4
Nevada              2
Orange             17
Placer              7
Plumas              1
Riverside          21
Sacramento         11
San Benito          2
San Bernardino     25
San Diego          29
San Francisco       2
San Joaquin        11
San Luis Obispo     8
San Mateo           9
Santa Barbara      13
Santa Clara        12
Santa Cruz          6
Shasta              7
Sierra              1
Sisk

ALPINE COUNTY:
* Due to the county's low population density, high school students generally transfer to other districts for grades 9-12, either Douglas County School District in Nevada or Lake Tahoe Unified School District in El Dorado County, California. https://alpinecoe.k12.ca.us/
* Alpine County Unified School District is the only district in Alpine County.

In [80]:
# Alpine -  California's least populous county ~1,100 founded in 1860's silver boom, 1960's ski resorts
actdf[actdf.CName =='Alpine']

Unnamed: 0,CDS,CCode,CDCode,SCode,RType,SName,DName,CName,Enroll12,NumTstTakr,AvgScrRead,AvgScrEng,AvgScrMath,AvgScrSci,NumGE21,PctGE21,Year
1103,2000000000000.0,2.0,0.0,0.0,C,,,Alpine,0.0,0.0,,,,,,,2018-19


In [81]:
# Alpine County has one school district, Alpine County Unified School District
actdf.iloc[1100].SName

nan

In [82]:
actdf[actdf.RType == 'C'].PctGE21

1064    49.79
1065    51.41
1066    43.28
1067    75.72
1068    59.69
1069    87.10
1070    80.00
1071    69.97
1072    80.48
1073    76.99
1074    64.71
1075    34.72
1076        *
1077    73.15
1078    45.53
1079    65.59
1080    24.47
1081    73.33
1082    67.86
1083    33.33
1084    73.83
1085    55.91
1086    53.52
1087    61.41
1088    41.54
1089    71.60
1090    70.69
1091    78.38
1092    64.73
1093    44.98
1094    20.00
1095    78.26
1096    62.20
1097    64.60
1098    46.88
1099    59.86
1100        *
1101    28.61
1102    55.56
1103      NaN
1104    34.62
1105        *
1106    67.74
1107    76.86
1108    50.00
1109    34.28
1110    34.77
1111    41.72
1112    69.47
1113    37.87
1114    62.82
1115    36.49
1116    48.42
1117    78.05
1118    52.41
1119    30.30
1120    75.00
1121    44.40
Name: PctGE21, dtype: object

In [57]:
#county_records['PctGE21'].isnull()

In [62]:
county_records = actdf[actdf.RType == 'C']
for percent in county_records.PctGE21:
    if percent == ('*'):
        print(percent)

*
*
*


In [84]:
#county_records.iloc[[:]]
# All rows, one column
for index in county_records.loc[ :, ['PctGE21']].index:
    if county_records.loc[ index, ['PctGE21']] == '*':
        print(index)

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [85]:
county_records

Unnamed: 0,CDS,CCode,CDCode,SCode,RType,SName,DName,CName,Enroll12,NumTstTakr,AvgScrRead,AvgScrEng,AvgScrMath,AvgScrSci,NumGE21,PctGE21,Year
1064,34000000000000.0,34.0,0.0,0.0,C,**,**,Sacramento,19540.0,3065.0,21,20,21,21,1526,49.79,2018-19
1065,39000000000000.0,39.0,0.0,0.0,C,**,**,San Joaquin,11778.0,1348.0,22,21,21,21,693,51.41,2018-19
1066,15000000000000.0,15.0,0.0,0.0,C,**,**,Kern,14229.0,1257.0,21,20,20,20,544,43.28,2018-19
1067,45000000000000.0,45.0,0.0,0.0,C,**,**,Shasta,2161.0,173.0,26,24,24,24,131,75.72,2018-19
1068,51000000000000.0,51.0,0.0,0.0,C,**,**,Sutter,1827.0,196.0,23,22,22,22,117,59.69,2018-19
1069,5000000000000.0,5.0,0.0,0.0,C,**,**,Calaveras,451.0,31.0,26,24,24,24,27,87.10,2018-19
1070,26000000000000.0,26.0,0.0,0.0,C,**,**,Mono,420.0,20.0,26,24,24,24,16,80.00,2018-19
1071,56000000000000.0,56.0,0.0,0.0,C,**,**,Ventura,10750.0,1552.0,25,24,24,24,1086,69.97,2018-19
1072,21000000000000.0,21.0,0.0,0.0,C,**,**,Marin,2647.0,753.0,27,26,25,25,606,80.48,2018-19
1073,9000000000000.0,9.0,0.0,0.0,C,**,**,El Dorado,2226.0,465.0,25,24,25,24,358,76.99,2018-19


In [83]:
# Fill 'NaN' score percentages with '**'
county_records.fillna(value={'PctGE21': '**'}, inplace=True)
# 
county_records.index[county_records['PctGE21']].tolist()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)


IndexError: arrays used as indices must be of integer (or boolean) type

In [64]:
for i in list(county_records.index):
    if county_records[['PctGE21']]

1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121


In [None]:
for county in actdf.CName

In [13]:
# Four counties are lacking test scores: Sierra, Plumas, Alpine, and Del Norte
no_score = [actdf.CName for county in ]
# In counties with less than 
actdf[actdf['DName'].isnull()]

Unnamed: 0,CDS,CCode,CDCode,SCode,RType,SName,DName,CName,Enroll12,NumTstTakr,AvgScrRead,AvgScrEng,AvgScrMath,AvgScrSci,NumGE21,PctGE21,Year
1064,34000000000000.0,34.0,0.0,0.0,C,,,Sacramento,19540.0,3065.0,21,20,21,21,1526,49.79,2018-19
1065,39000000000000.0,39.0,0.0,0.0,C,,,San Joaquin,11778.0,1348.0,22,21,21,21,693,51.41,2018-19
1066,15000000000000.0,15.0,0.0,0.0,C,,,Kern,14229.0,1257.0,21,20,20,20,544,43.28,2018-19
1067,45000000000000.0,45.0,0.0,0.0,C,,,Shasta,2161.0,173.0,26,24,24,24,131,75.72,2018-19
1068,51000000000000.0,51.0,0.0,0.0,C,,,Sutter,1827.0,196.0,23,22,22,22,117,59.69,2018-19
1069,5000000000000.0,5.0,0.0,0.0,C,,,Calaveras,451.0,31.0,26,24,24,24,27,87.10,2018-19
1070,26000000000000.0,26.0,0.0,0.0,C,,,Mono,420.0,20.0,26,24,24,24,16,80.00,2018-19
1071,56000000000000.0,56.0,0.0,0.0,C,,,Ventura,10750.0,1552.0,25,24,24,24,1086,69.97,2018-19
1072,21000000000000.0,21.0,0.0,0.0,C,,,Marin,2647.0,753.0,27,26,25,25,606,80.48,2018-19
1073,9000000000000.0,9.0,0.0,0.0,C,,,El Dorado,2226.0,465.0,25,24,25,24,358,76.99,2018-19


In [29]:
# Notice that there are 58 counties in California, so the 58 NULL 'DNAME' entries are aggregates of all the districts in each county
actdf.isnull().sum()

CDS             0
CCode           0
CDCode          0
SCode         522
RType           0
SName         580
DName          58
CName           0
Enroll12        0
NumTstTakr      0
AvgScrRead    356
AvgScrEng     356
AvgScrMath    356
AvgScrSci     356
NumGE21       356
PctGE21       356
Year            0
dtype: int64

In [4]:
# 2019 Californian SAT Scores
# A link to the data dictionary for the satdf-dataset is in the README
satdf.head()

Unnamed: 0,CDS,CCode,CDCode,SCode,RType,SName,DName,CName,Enroll12,NumTSTTakr12,...,NumERWBenchmark11,PctERWBenchmark11,NumMathBenchmark11,PctMathBenchmark11,TotNumBothBenchmark12,PctBothBenchmark12,TotNumBothBenchmark11,PctBothBenchmark11,Year,Unnamed: 25
0,6615981000000.0,6.0,661598.0,630046.0,S,Colusa Alternative Home,Colusa Unified,Colusa,18.0,0.0,...,,,,,,,,,2018-19,
1,6616061000000.0,6.0,661606.0,634758.0,S,Maxwell Sr High,Maxwell Unified,Colusa,29.0,10.0,...,*,*,*,*,*,*,*,*,2018-19,
2,19647330000000.0,19.0,1964733.0,1930924.0,S,Belmont Senior High,Los Angeles Unified,Los Angeles,206.0,102.0,...,42,24.14,12,6.90,14,13.73,11,6.32,2018-19,
3,19647330000000.0,19.0,1964733.0,1931476.0,S,Canoga Park Senior High,Los Angeles Unified,Los Angeles,227.0,113.0,...,97,35.27,37,13.45,18,15.93,35,12.73,2018-19,
4,19647330000000.0,19.0,1964733.0,1931856.0,S,Whitman Continuation,Los Angeles Unified,Los Angeles,18.0,14.0,...,*,*,*,*,*,*,*,*,2018-19,


In [5]:
# 2019 California School District Equitablility Rankings
equitydf.head()

Unnamed: 0,Rank*,School District,Score,Expenditures for Public Elementary and Secondary Schools per Pupil,Income by School District
0,1,Los Nietos School District,0.03,"$14,525","$63,516"
1,2,Gen Shafter Elementary School District,0.13,"$17,245","$49,167"
2,3,Browns Elementary School District,0.17,"$10,664","$83,942"
3,4,Pajaro Valley Unified School District,0.19,"$13,924","$66,867"
4,5,Bonita Unified School District,0.27,"$10,602","$84,202"


3. Check for any obvious issues with the observations (keep in mind the minimum & maximum possible values for each test/subtest).
4. Fix any errors you identified in steps 2-3.
5. Display the data types of each feature.
6. Fix any incorrect data types found in step 5.
    - Fix any individual values preventing other columns from being the appropriate type.
    - If your dataset has a column of percents (ex. '50%', '30.5%', etc.), use the function you wrote in Part 1 (coding challenges, number 3) to convert this to floats! *Hint*: use `.map()` or `.apply()`.
7. Rename Columns.
    - Column names should be all lowercase.
    - Column names should not contain spaces (underscores will suffice--this allows for using the `df.column_name` method to access columns in addition to `df['column_name']`).
    - Column names should be unique and informative.
8. Drop unnecessary rows (if needed).
9. Merge dataframes that can be merged.
10. Perform any additional cleaning that you feel is necessary.
11. Save your cleaned and merged dataframes as csv files.

#### Check for missing values

In [12]:
# A link to the data dictionary for the actdf-dataset is in the README
actdf.isnull().sum()

CDS               1
CCode             1
CDCode            1
SCode           523
RType             1
SName           581
DName            59
CName             1
Enroll12          1
NumTstTakr        1
AvgScrRead      357
AvgScrEng       357
AvgScrMath      357
AvgScrSci       357
NumGE21         357
PctGE21         357
Year              1
Unnamed: 17    2310
dtype: int64

In [13]:
# A link to the data dictionary for the satdf-dataset is in the README
satdf.isnull().sum()

CDS                         1
CCode                       1
CDCode                      1
SCode                       1
RType                       1
SName                     598
DName                      59
CName                       1
Enroll12                    1
NumTSTTakr12                1
NumERWBenchmark12         276
PctERWBenchmark12         276
NumMathBenchmark12        276
PctMathBenchmark12        276
Enroll11                    1
NumTSTTakr11                1
NumERWBenchmark11         311
PctERWBenchmark11         311
NumMathBenchmark11        311
PctMathBenchmark11        311
TotNumBothBenchmark12     276
PctBothBenchmark12        276
TotNumBothBenchmark11     311
PctBothBenchmark11        311
Year                        1
Unnamed: 25              2580
dtype: int64

In [11]:
# The equity data
equitydf.isnull().sum()

Rank*                                                                 0
School District                                                       0
Score                                                                 0
Expenditures for Public Elementary and Secondary Schools per Pupil    0
Income by School District                                             0
dtype: int64

In [None]:


#Drop the empty (last) row
actdf.drop(actdf.tail(1).index,inplace=True)
satdf.drop(satdf.tail(1).index,inplace=True)

#RType == 'C' entries contain aggregate County info
act_countydf = actdf[actdf['RType'] == 'C'].copy()
sat_countydf = satdf[satdf['RType'] == 'C'].copy()

#RType == 'D' entries contain aggregate School District info
act_districtdf = actdf[actdf['RType'] == 'D'].copy()
sat_districtdf = satdf[satdf['RType'] == 'D'].copy()

#Create filters for special cases where score records are not available(or don't exist)
#Filter out DISTRICT records where no seniors took exam, but more than 15 were enrolled
null_district_actdf = act_districtdf[act_districtdf['PctGE21'].isnull()==False]
null_district_satdf = sat_districtdf[sat_districtdf['PctBothBenchmark12'].isnull()==False]

#Filter out COUNTY records where no seniors took exam, but more than 15 were enrolled
null_county_actdf = act_countydf[act_countydf['PctGE21'].isnull()==False]
null_county_satdf = sat_countydf[sat_countydf['PctBothBenchmark12'].isnull()==False]

#Filter out DISTRICT records where less than 15 enrolled seniors took exam
district_scores_actdf = null_district_actdf[null_district_actdf['PctGE21']!='*'].copy()
district_scores_satdf = null_district_satdf[null_district_satdf['PctBothBenchmark12']!='*'].copy()

#Filter out COUNTY records where less than 15 enrolled seniors took exam
county_scores_actdf = null_county_actdf[null_county_actdf['PctGE21']!='*'].copy()
county_scores_satdf = null_county_satdf[null_county_satdf['PctBothBenchmark12']!='*'].copy()

#Drop special characters '$' and ',' from equitable_county_rank.csv
for row in range(len(equitydf.index)):
    equitydf.loc[row,['Expenditures for Public Elementary and Secondary Schools per Pupil']] \
        = equitydf['Expenditures for Public Elementary and Secondary Schools per Pupil'][row].replace(',','').replace('$','')
    equitydf.loc[row,['Income by School District']] = equitydf['Income by School District'][row].replace(',','').replace('$','')

    #Display datatypes
print(sat_districtdf.dtypes)
print(act_districtdf.dtypes)
print(district_scores_actdf.dtypes)
print(district_scores_satdf.dtypes)
print(equitydf.dtypes)

#Fix Incorrect Data Types
for name in ['Expenditures for Public Elementary and Secondary Schools per Pupil', 'Income by School District']:
    equitydf[name] = equitydf[name].apply(int).copy()

for name in ['CCode', 'CDCode', 'Enroll12', 'NumTstTakr']:
    act_districtdf[name] = act_districtdf[name].apply(int).copy()
    act_countydf[name] = act_countydf[name].apply(int).copy()
    district_scores_actdf[name] = district_scores_actdf[name].apply(int).copy()
    county_scores_actdf[name] = county_scores_actdf[name].apply(int).copy()
    
for name in ['CCode', 'CDCode', 'Enroll12', 'NumTSTTakr12']:
    sat_districtdf[name] = sat_districtdf[name].apply(int).copy()
    sat_countydf[name] = sat_countydf[name].apply(int).copy()
    district_scores_satdf[name] = district_scores_satdf[name].apply(int).copy()
    county_scores_satdf[name] = county_scores_satdf[name].apply(int).copy()

#Additional dtypes to fix for 'PctGE21'-(ACT data) & 'PctBothBenchmark12'-(SAT data)
county_scores_actdf['NumGE21'] = county_scores_actdf['NumGE21'].apply(int)
district_scores_actdf['NumGE21'] = district_scores_actdf['NumGE21'].apply(int)

county_scores_actdf['PctGE21'] = county_scores_actdf['PctGE21'].apply(float)
district_scores_actdf['PctGE21'] = district_scores_actdf['PctGE21'].apply(float)

county_scores_satdf['NumTSTTakr12'] = county_scores_satdf['NumTSTTakr12'].apply(int)
district_scores_satdf['NumTSTTakr12'] = district_scores_satdf['NumTSTTakr12'].apply(int)

county_scores_satdf['TotNumBothBenchmark12'] = county_scores_satdf['TotNumBothBenchmark12'].apply(int)
district_scores_satdf['TotNumBothBenchmark12'] = district_scores_satdf['TotNumBothBenchmark12'].apply(int)

county_scores_satdf['PctBothBenchmark12'] = county_scores_satdf['PctBothBenchmark12'].apply(float)
district_scores_satdf['PctBothBenchmark12'] = district_scores_satdf['PctBothBenchmark12'].apply(float)

district_scores_actdf['PctGE21'] = district_scores_actdf['PctGE21'].apply(float)

#Remove excess columns and just focus on DISTRICT data for the sake of time
act_districtdf=act_districtdf[['CCode', 'CDCode','DName', 'CName','Enroll12', 'NumTstTakr']]
district_scores_actdf=district_scores_actdf[['CCode', 'CDCode','DName', 'CName','Enroll12', 'NumTstTakr', 'NumGE21', 'PctGE21']]

sat_districtdf = sat_districtdf[['CCode', 'CDCode', 'DName', 'CName','Enroll12', 'NumTSTTakr12']]
district_scores_satdf = district_scores_satdf[['CCode', 'CDCode', 'DName', 'CName','Enroll12', 'NumTSTTakr12', 'TotNumBothBenchmark12', 'PctBothBenchmark12']]

#Rename Columns
act_districtdf = act_districtdf.rename(columns={'CCode':'county_code','CName':'county_name','CDCode':'district_code', 'DName':'district_name','Enroll12':'enrolled_seniors', 'NumTstTakr':'tested_seniors'})
district_scores_actdf = district_scores_actdf.rename(columns={'CCode':'county_code','CName':'county_name','CDCode':'district_code', 'DName':'district_name','Enroll12':'enrolled_seniors', 'NumTstTakr':'tested_seniors', 'NumGE21':'num_over_benchmark_act', 'PctGE21':'pct_over_benchmark_act'})

sat_districtdf = sat_districtdf.rename(columns={'CCode':'county_code','CName':'county_name', 'CDCode':'district_code', 'DName':'district_name','Enroll12':'enrolled_seniors', 'NumTSTTakr12':'tested_seniors'})
district_scores_satdf = district_scores_satdf.rename(columns={'CCode':'county_code','CName':'county_name', 'CDCode':'district_code', 'DName':'district_name','Enroll12':'enrolled_seniors', 'NumTSTTakr12':'tested_seniors', 'TotNumBothBenchmark12':'num_over_benchmark_sat', 'PctBothBenchmark12':'pct_over_benchmark_sat'})
    
equitydf = equitydf.rename(columns={'Rank*':'rank', 'School District':'district_name', 'Score':'score','Expenditures for Public Elementary and Secondary Schools per Pupil':'expenditures_per_pupil','Income by School District':'income'})

#Additional Cleaning
#ID equitable counties from respective districts
#Drop 'School District' from the string containing the School District name
for index in range(len(equitydf['district_name'])):
    equitydf.loc[ index, [ 'district_name' ] ]  = equitydf['district_name'][index].replace('School District','')
    
#Hov helped me merge by sowing me str.strip() and giving insiight into the underlying problem
standardized_test = district_scores_satdf.merge(district_scores_actdf, on='district_name')
standardized_test['district_name'] = standardized_test['district_name'].str.strip()
standardized_test['district_name'] = standardized_test['district_name'].str.lower()
equitydf['district_name'] = equitydf['district_name'].str.strip()
equitydf['district_name'] = equitydf['district_name'].str.lower()
finaldf = standardized_test.merge(equitydf, on='district_name')
finaldf[ ['district_name', 'num_over_benchmark_sat', 'num_over_benchmark_act','pct_over_benchmark_sat','pct_over_benchmark_act','rank', 'score']].describe()

#Save your cleaned and merged dataframes as csv files.
finaldf.to_csv('./finaldf.csv', index=False)