# Capstone 3 - Data pre-processing

### Table of contents

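1. Introduction
   - Import relevant libraries
   - Retrieve variables
2. Preparing to pre-process the data
   - Define a function `identify_missing_values`
   - Define a function `knn_impute`
3. Imputing missing values
   - Summary data (`combined_summary_df`)
   - Lower grades achievement data (`combined_lg_achievement_df`)
   - High school achievement data (`combined_hs_achievement_df`)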

## Introduction

### Import relevant libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.compose import ColumnTransformer

### Retrieve variables

In [2]:
# Reload variables that were persisted with IPython's %store magic
# (presumably by earlier notebooks in this project -- confirm they have been
# run first, otherwise the names below will not exist in this kernel).

# Retrieve original noise_data dataframe
%store -r noise_data

# Retrieve GeoPandas dataframes
%store -r districts_gdf 
%store -r schoolpoints_gdf

# Retrieve schools covered by sensor range
%store -r coverage_matrix

# Retrieve school demographic information
%store -r combined_summary_df

# Retrieve lower grades achievement information
%store -r combined_lg_achievement_df

# Retrieve high school achievement information
%store -r combined_hs_achievement_df

# NOTE: the next two groups are commented out, so these variables are NOT
# restored in this notebook.
# # Retrieve combined achievement data
# %store -r coverage_combined_achievement_df
# %store -r non_coverage_combined_achievement_df

# # Retrieve merged dataset with all metrics
# %store -r merged_coverage_df

# Retrieve school lists
%store -r elem_middle_schools
%store -r high_schools

## Preparing to pre-process the data

To simplify our pre-processing stage, we will write a few functions.

### Define a function `identify_missing_values`
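To start, we will define a function, `identify_missing_values`, that prints the number of missing values in each numerical column of a dataframe and returns the names of the numerical columns that contain at least one missing value.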

In [3]:
def identify_missing_values(df):
    """Report and return the numerical columns of ``df`` that contain nulls.

    Columns of dtype ``object``, ``category`` and ``bool`` are ignored. For
    every remaining column with at least one null value, a line of the form
    ``"<column>: <null count>"`` is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe to inspect. It is not modified.

    Returns
    -------
    list
        Names of the inspected columns that contain at least one null value,
        in the order they appear in ``df``.
    """
    # Everything that is not object/category/bool is treated as numerical
    skipped_dtypes = ['object', 'category', 'bool']
    candidate_columns = df.select_dtypes(exclude=skipped_dtypes).columns

    columns_with_gaps = []
    for column in candidate_columns:
        null_count = df[column].isnull().sum()
        if null_count <= 0:
            # Fully populated column: nothing to report
            continue
        print(f"{column}: {null_count}")
        columns_with_gaps.append(column)

    return columns_with_gaps

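### Define a function `knn_impute`
Next, we will define a function, `knn_impute`, that fills in the missing values of those numerical columns using scikit-learn's `KNNImputer` and returns a copy of the dataframe containing the imputed values.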

In [4]:
def knn_impute(df, n_neighbors=5, feature_columns=None):
    """Return a copy of ``df`` with missing numerical values filled by KNN.

    The numerical columns that contain missing values are found with
    ``identify_missing_values`` (which also prints the per-column counts) and
    imputed with scikit-learn's ``KNNImputer``. ``df`` itself is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe to impute.
    n_neighbors : int, default 5
        Number of neighbours passed to ``KNNImputer``.
    feature_columns : list of str, optional
        Extra numerical columns used only to find neighbours; they are never
        overwritten in the result. By default (``None``) neighbours are found
        using just the columns that themselves have missing values, which is
        the original behaviour. In that mode a row missing *all* of those
        columns has no defined distance to any other row, and ``KNNImputer``
        falls back to the column mean for it. Supplying complete predictor
        columns here avoids that fallback. Distances are computed on the raw
        values, so consider scaling columns of very different magnitude.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the imputed values written into the columns that
        had missing values. Columns with no observed value at all are left
        untouched (still all-NaN), since there is nothing to impute from.
    """
    print(f"{'-'*80}\nMissing numerical variables before imputing:")
    missing_numerical_variables = identify_missing_values(df)
    print(f"{'-'*80}")

    df_imputed = df.copy()

    # KNNImputer drops columns that have no observed value, which would make
    # the imputed array narrower than the column list; skip such columns.
    imputable_columns = [
        column for column in missing_numerical_variables
        if df[column].notna().any()
    ]

    # Nothing to impute: fitting KNNImputer on zero columns would raise.
    if not imputable_columns:
        return df_imputed

    # Optional predictor columns (deduplicated, and never all-NaN for the
    # same reason as above).
    predictor_columns = [
        column for column in (feature_columns or [])
        if column not in imputable_columns and df[column].notna().any()
    ]
    model_columns = imputable_columns + predictor_columns

    # Perform KNN imputation
    imputer = KNNImputer(n_neighbors=n_neighbors)
    imputed_values = imputer.fit_transform(df[model_columns])

    # Convert imputed data back to a dataframe aligned with the original index
    imputed_frame = pd.DataFrame(imputed_values, columns=model_columns,
                                 index=df.index)

    # Write back only the columns that originally had missing values
    df_imputed[imputable_columns] = imputed_frame[imputable_columns]

    return df_imputed

## Imputing missing values

### Summary data (`combined_summary_df`)

In [5]:
# Impute the missing numerical values of the school summary data with
# knn_impute (default n_neighbors=5); the original dataframe is left untouched.
combined_summary_imputed = knn_impute(combined_summary_df)
# Display the imputed dataframe as the cell output
combined_summary_imputed

--------------------------------------------------------------------------------
Missing numerical variables before imputing:
student_attendance_rate: 98
chronic_absence_pct: 108
teacher_attendance_rate: 705
--------------------------------------------------------------------------------


Unnamed: 0,DBN,school_name,enrollment,ell_pct,disability_pct,self_contained_pct,asian_pct,black_pct,hispanic_pct,white_pct,student_attendance_rate,chronic_absence_pct,teacher_attendance_rate,academic_year,coverage,grade_level
0,01M015,P.S. 015 Roberto Clemente,161,0.075,0.304,0.031,0.068,0.298,0.584,0.025,0.938000,0.215000,0.982000,2016-2017,False,lg
1,01M019,P.S. 019 Asher Levy,247,0.036,0.340,0.117,0.077,0.202,0.664,0.053,0.910000,0.324000,0.972000,2016-2017,False,lg
2,01M020,P.S. 020 Anna Silver,499,0.186,0.224,0.072,0.329,0.098,0.491,0.042,0.929000,0.256000,0.962000,2016-2017,False,lg
3,01M034,P.S. 034 Franklin D. Roosevelt,337,0.080,0.380,0.021,0.053,0.297,0.614,0.030,0.912000,0.305000,0.972000,2016-2017,False,lg
4,01M064,P.S. 064 Robert Simon,226,0.044,0.354,0.009,0.058,0.190,0.673,0.071,0.913000,0.346000,0.969000,2016-2017,False,lg
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5299,24Q550,High School for Arts and Business,812,0.095,0.158,0.001,0.085,0.079,0.765,0.054,0.912000,0.249000,0.965000,2018-2019,True,hs
5300,28Q220,P.S. 220 Edward Mandel,670,0.224,0.113,0.016,0.330,0.076,0.200,0.357,0.944000,0.162000,0.955000,2018-2019,True,lg
5301,84M174,Success Academy Union Square (Manhattan 1),619,0.031,0.195,0.039,0.186,0.221,0.258,0.283,0.915104,0.267778,0.963933,2018-2019,True,lg
5302,84M202,Great Oaks Charter School,215,0.042,0.302,0.028,0.084,0.256,0.619,0.028,0.915104,0.267778,0.963933,2018-2019,True,lg


### Lower grades achievement data (`combined_lg_achievement_df`)

In [6]:
# Impute the missing numerical values of the lower grades achievement data
# with knn_impute (default n_neighbors=5).
combined_lg_achievement_imputed = knn_impute(combined_lg_achievement_df)

# Backward-compatible alias: the result was originally stored under the
# misspelled name "achievemnet"; keep it so existing references still work.
combined_lg_achievemnet_imputed = combined_lg_achievement_imputed

# Display the imputed dataframe as the cell output
combined_lg_achievement_imputed

--------------------------------------------------------------------------------
Missing numerical variables before imputing:
achievement_score: 252
ela_proficient_pct: 12
ela_avg_proficiency: 12
ela_lowest_third_proficiency: 12
math_proficient_pct: 17
math_avg_proficiency: 17
math_lowest_third_proficiency: 17
attendance_90_plus_pct: 94
--------------------------------------------------------------------------------


Unnamed: 0,DBN,school_name,school_type,achievement_score,ela_proficient_pct,ela_avg_proficiency,ela_lowest_third_proficiency,math_proficient_pct,math_avg_proficiency,math_lowest_third_proficiency,attendance_90_plus_pct,ela_proficient_n,math_proficient_n,ela_lowest_third_n,math_lowest_third_n,academic_year,coverage,grade_level
0,01M015,P.S. 015 Roberto Clemente,Elementary,4.920,0.338,2.67,2.67,0.429,2.79,2.20,0.7850,68,70,19,18,2016-2017,False,lg
1,01M019,P.S. 019 Asher Levy,Elementary,4.230,0.486,2.85,2.35,0.481,3.01,2.24,0.6760,109,106,24,23,2016-2017,False,lg
2,01M020,P.S. 020 Anna Silver,Elementary,2.210,0.289,2.52,1.92,0.311,2.52,1.77,0.7440,218,219,44,45,2016-2017,False,lg
3,01M034,P.S. 034 Franklin D. Roosevelt,K-8,2.890,0.254,2.52,2.04,0.177,2.34,1.83,0.6950,240,232,74,68,2016-2017,False,lg
4,01M064,P.S. 064 Robert Simon,Elementary,3.770,0.286,2.56,2.08,0.321,2.54,1.90,0.6540,105,106,27,27,2016-2017,False,lg
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3836,24Q014,P.S. 014 Fairview,Elementary,2.850,0.288,2.52,1.94,0.324,2.57,1.82,0.8170,877,905,192,202,2018-2019,True,lg
3837,28Q220,P.S. 220 Edward Mandel,Elementary,2.050,0.468,2.89,1.99,0.480,2.91,1.93,0.8380,316,321,69,70,2018-2019,True,lg
3838,84M174,Success Academy Union Square (Manhattan 1),Elementary,4.860,0.925,3.75,3.48,0.977,4.14,3.92,0.9110,305,304,82,80,2018-2019,True,lg
3839,84M202,Great Oaks Charter School,Middle,3.658,0.261,2.55,2.11,0.311,2.53,1.90,0.6674,207,206,71,67,2018-2019,True,lg


### High school achievement data (`combined_hs_achievement_df`)

In [7]:
# Impute the missing numerical values of the high school achievement data
# with knn_impute (default n_neighbors=5); the original dataframe is left untouched.
combined_hs_achievement_imputed = knn_impute(combined_hs_achievement_df)
# Display the imputed dataframe as the cell output
combined_hs_achievement_imputed

--------------------------------------------------------------------------------
Missing numerical variables before imputing:
achievement_score: 93
grad_rate_4yr: 73
grad_rate_6yr: 206
regents_english: 28
regents_algebra: 91
regents_living_env: 137
regents_global: 151
regents_us_history: 166
college_prep_index: 74
college_ready_4yr: 74
college_ready_6yr: 206
postsec_enroll_6mo: 129
postsec_enroll_18mo: 206
credits_yr1: 35
credits_yr2: 53
credits_yr3: 66
attendance_90_plus_pct: 14
--------------------------------------------------------------------------------


Unnamed: 0,DBN,school_name,school_type,achievement_score,grad_rate_4yr,grad_rate_6yr,regents_english,regents_algebra,regents_living_env,regents_global,...,credits_yr1,credits_yr2,credits_yr3,grad_rate_n,regents_english_n,college_ready_n,attendance_90_plus_pct,academic_year,coverage,grade_level
0,01M292,Orchard Collegiate Academy,High School,3.72,0.658,0.708,76.5,67.1,65.60,59.4,...,0.868,0.824,0.879,38,43,38,0.552,2016-2017,False,hs
1,01M448,University Neighborhood High School,High School,4.45,0.897,0.855,73.1,72.5,72.60,71.8,...,0.945,0.941,0.903,78,146,78,0.756,2016-2017,False,hs
2,01M509,Marta Valle High School,High School,2.89,0.741,0.604,67.4,59.4,58.60,56.7,...,0.862,0.882,0.760,58,90,58,0.369,2016-2017,False,hs
3,01M539,"New Explorations into Science, Technology and ...",High School,4.23,0.973,0.992,88.1,78.2,80.90,90.6,...,0.913,0.986,0.962,183,62,183,0.898,2016-2017,False,hs
4,01M696,Bard High School Early College,High School,4.41,0.991,0.987,89.4,79.6,75.62,86.5,...,0.950,0.967,0.942,113,151,113,0.870,2016-2017,False,hs
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1458,13K674,"City Polytechnic High School of Engineering, A...",High School,2.52,0.782,0.909,68.2,58.9,64.50,78.0,...,0.835,0.747,0.702,110,112,110,0.643,2018-2019,True,hs
1459,14K610,Automotive High School,High School,2.90,0.767,0.650,65.3,59.2,65.70,60.8,...,0.760,0.713,0.573,116,132,116,0.459,2018-2019,True,hs
1460,15K592,Khalil Gibran International Academy,High School,2.93,0.767,0.706,61.3,57.1,57.40,58.2,...,0.797,0.712,0.653,73,96,73,0.538,2018-2019,True,hs
1461,15K656,Brooklyn High School of the Arts,High School,2.75,0.881,0.934,74.5,63.9,66.70,64.2,...,0.866,0.827,0.845,185,349,185,0.671,2018-2019,True,hs
