# Pre-processing & training

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [2]:
# Load the EDA output, then discard the 'Unnamed: 0' column if the file has one
# (errors='ignore' makes the drop a no-op when the column is absent).
df_import = pd.read_csv('../data/EDA_end.csv')
df_import = df_import.drop(columns=['Unnamed: 0'], errors='ignore')

#### One-hot encoding completed during EDA

In [3]:
# Preview the first five rows of the loaded frame
df_import.head(n=5)

Unnamed: 0,Mean Score Math,Mean Score EBRW,Median Family Income,Gend_Female,Gend_Male,Ethn_Asian,Ethn_Black,Ethn_Hispanic,Ethn_Two or More Races,Ethn_White,FRL_FRL Eligible,FRL_Not FRL Eligible,ELL_English Learners,ELL_Not English Learners,IEP_Students with IEPs,IEP_Students without IEPs
0,505.882353,560.588235,118620.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0
1,537.757009,562.056075,68454.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0
2,574.725275,564.395604,68454.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0
3,503.529412,517.058824,121412.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
4,560.444444,579.222222,121412.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0


In [4]:
# Fill any remaining gaps in the income column with that column's median.
# NOTE(review): the median is taken over the full frame, i.e. before the
# train/test split performed later, so test rows contribute to the fill value.
income_column = 'Median Family Income'
income_median = df_import[income_column].median()
df_import[income_column] = df_import[income_column].fillna(income_median)

In [5]:
# Train/test split & scaling

# The two score columns are the targets; every other column is a predictor.
target_cols = ['Mean Score Math', 'Mean Score EBRW']
# Only this predictor is standardised; the other predictor columns are left as loaded.
income_col = ['Median Family Income']

# Create training & testing sets
X = df_import.drop(target_cols, axis=1)
y = df_import[target_cols]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=137)

# train_test_split returns row selections of X / y. Take explicit copies so the
# column assignments below write to independent frames instead of triggering
# pandas' SettingWithCopyWarning.
X_train, X_test = X_train.copy(), X_test.copy()
y_train, y_test = y_train.copy(), y_test.copy()

scaler_X = StandardScaler()
scaler_y = StandardScaler()

# Fit the X scaler on the training rows only, then transform the training data
scaler_X.fit(X_train[income_col].values)
X_train[income_col] = scaler_X.transform(X_train[income_col].values)

# Transform the test X data with the statistics learned from the training rows
X_test[income_col] = scaler_X.transform(X_test[income_col].values)

# Fit & transform y training data
y_cols = y.columns
scaler_y.fit(y_train)
y_train[y_cols] = scaler_y.transform(y_train)

# Transform y test data with the training-set statistics.
# NOTE(review): the targets are stored in standardised units from here on, so
# scaler_y is needed to map values back to score points; nothing in the cells
# shown persists it -- confirm downstream code does not need it.
y_test[y_cols] = scaler_y.transform(y_test)

In [6]:
# Re-attach each split's predictors to its targets, aligned on the row index,
# with the target columns first. how='left' is DataFrame.join's default,
# written out here for clarity.
training_data, test_data = (
    targets.join(features, how='left')
    for targets, features in ((y_train, X_train), (y_test, X_test))
)

In [7]:
# Write both splits to CSV. The row index is written too (to_csv default),
# so it appears as the first, unnamed column of each file.
for frame, destination in (
    (training_data, '../data/training_data.csv'),
    (test_data, '../data/test_data.csv'),
):
    frame.to_csv(destination)