# Pre-processing & training

In [16]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

### Demographic-based scores

In [17]:
# Load the one-hot-encoded demographic scores and drop the 'Unnamed: 0'
# column if the file contains one.
demog_csv = '../data/demographic_scores_onehot.csv'
df_import = pd.read_csv(demog_csv).drop(columns=['Unnamed: 0'], errors='ignore')

One-hot encoding was already completed during EDA, so the categorical columns arrive as indicator columns.

In [18]:
# Preview the first five rows of the imported frame
df_import.head(5)

Unnamed: 0,Mean Score Math,Mean Score EBRW,Median Family Income,Gend_Female,Gend_Male,Ethn_Asian,Ethn_Black,Ethn_Hispanic,Ethn_Two or More Races,Ethn_White,FRL_FRL Eligible,FRL_Not FRL Eligible,ELL_English Learners,ELL_Not English Learners,IEP_Students with IEPs,IEP_Students without IEPs
0,505.882353,560.588235,118620.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0
1,537.757009,562.056075,68454.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0
2,574.725275,564.395604,68454.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0
3,503.529412,517.058824,121412.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
4,560.444444,579.222222,121412.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0


In [19]:
# Count missing values per column
df_import.isnull().sum()

Mean Score Math              0
Mean Score EBRW              0
Median Family Income         0
Gend_Female                  0
Gend_Male                    0
Ethn_Asian                   0
Ethn_Black                   0
Ethn_Hispanic                0
Ethn_Two or More Races       0
Ethn_White                   0
FRL_FRL Eligible             0
FRL_Not FRL Eligible         0
ELL_English Learners         0
ELL_Not English Learners     0
IEP_Students with IEPs       0
IEP_Students without IEPs    0
dtype: int64

In [20]:
# Fill any missing values that were missed during EDA (currently disabled: the null check above reports none)
#df_import['Median Family Income'] = df_import['Median Family Income'].fillna(df_import['Median Family Income'].median())

In [21]:
# Train/test split & scaling

# Separate the two mean-score targets from the predictor columns
target_cols = ['Mean Score Math', 'Mean Score EBRW']
income_cols = ['Median Family Income']
X = df_import.drop(columns=target_cols)
y = df_import[target_cols]

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=137)

# The split frames are derived from X; take explicit copies so the column
# assignments below write to independent frames and do not trigger
# pandas' SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

# Only the income column is scaled; the y values are deliberately left unscaled
scaler_X = StandardScaler()

# Fit the scaler on the training data only, then transform the training data
X_train[income_cols] = scaler_X.fit_transform(X_train[income_cols].values)

# Transform the testing data with the statistics learned from the training data
X_test[income_cols] = scaler_X.transform(X_test[income_cols].values)

In [22]:
# Re-attach each target frame to its predictors (index-aligned join, targets
# first) for both the training split and the testing split.
training_data, test_data = (
    targets.join(features)
    for targets, features in ((y_train, X_train), (y_test, X_test))
)

In [23]:
# Write the demographic training and testing sets to CSV
for frame, destination in (
    (training_data, '../data/demog_training_data.csv'),
    (test_data, '../data/demog_testing_data.csv'),
):
    frame.to_csv(destination)

### PSAT/SAT score data

In [24]:
# Load the aggregated PSAT/SAT data and drop the 'Unnamed: 0' column if present
psat_sat = pd.read_csv('../data/SAT_PSAT_aggregated.csv').drop(['Unnamed: 0'], axis=1, errors='ignore')

# Fix dtypes after data import: identifiers and names become strings,
# scores, income and population become floats
psat_sat = psat_sat.astype({'District Number': 'str',
                 'District Name': 'str',
                 'School Number': 'str',
                 'School Name': 'str',
                 '2018 EBRW Mean SAT': 'float64',
                 '2018 Math Mean SAT': 'float64',
                 '2018 Overall Mean Score SAT': 'float64',
                 '2017 EBRW Mean PSAT10': 'float64',
                 '2017 Math Mean PSAT10': 'float64',
                 '2017 Overall Mean Score PSAT10': 'float64',
                 'Zip Code': 'str',
                 'Median Family Income': 'float64',
                 'Population': 'float64'})

# Remove only a literal trailing '.0' from the zip codes. str.strip('.0') would
# treat its argument as a *set* of characters and strip every leading/trailing
# '.' and '0', corrupting values such as '80020.0' (-> '8002').
# NOTE(review): presumably the '.0' suffix comes from the column being parsed
# as float before the str cast - confirm against the raw CSV.
psat_sat['Zip Code'] = psat_sat['Zip Code'].str.replace(r'\.0$', '', regex=True)

In [25]:
# Count missing values per column
psat_sat.isnull().sum()

District Number                    0
District Name                      0
School Number                      0
School Name                        0
2018 EBRW Mean SAT                 0
2018 Math Mean SAT                 0
2018 Overall Mean Score SAT        0
2017 EBRW Mean PSAT10              0
2017 Math Mean PSAT10              0
2017 Overall Mean Score PSAT10     0
Zip Code                           0
Median Family Income              10
Population                        10
dtype: int64

Dropping zip code, income & population from this dataset, since only scores are being compared here.

In [26]:
# Remove the non-score columns. Reassigning (instead of inplace=True) and
# ignoring already-missing columns keeps this cell safe to re-run: a second
# in-place drop of the same columns would raise KeyError.
psat_sat = psat_sat.drop(columns=['Zip Code', 'Median Family Income', 'Population'], errors='ignore')

#### Scores-only data do not need to be scaled.

In [27]:
# Show the column labels currently present in psat_sat
psat_sat.columns

Index(['District Number', 'District Name', 'School Number', 'School Name',
       '2018 EBRW Mean SAT', '2018 Math Mean SAT',
       '2018 Overall Mean Score SAT', '2017 EBRW Mean PSAT10',
       '2017 Math Mean PSAT10', '2017 Overall Mean Score PSAT10'],
      dtype='object')

In [28]:
# Train/test split (no scaling is applied to the scores-only data)

# The 2017 PSAT10 means are the predictors; the 2018 SAT means are the targets
psat_feature_cols = [
    '2017 EBRW Mean PSAT10',
    '2017 Math Mean PSAT10',
    '2017 Overall Mean Score PSAT10',
]
sat_target_cols = [
    '2018 EBRW Mean SAT',
    '2018 Math Mean SAT',
    '2018 Overall Mean Score SAT',
]
X = psat_sat[psat_feature_cols]
y = psat_sat[sat_target_cols]

# Hold out 20% of the rows for testing, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=137, test_size=0.20)

In [29]:
# Re-attach each target frame to its predictors (index-aligned join, targets
# first) for both the training split and the testing split.
training_data, test_data = (
    targets.join(features)
    for targets, features in ((y_train, X_train), (y_test, X_test))
)

In [30]:
# Write the PSAT/SAT training and testing sets to CSV
for frame, destination in (
    (training_data, '../data/psat_training_data.csv'),
    (test_data, '../data/psat_testing_data.csv'),
):
    frame.to_csv(destination)