In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostRegressor 
from yellowbrick.regressor import residuals_plot
%matplotlib inline

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler,PolynomialFeatures
from feature_engine.encoding import OneHotEncoder
from feature_engine.wrappers import SklearnTransformerWrapper
from sklearn.metrics import mean_squared_error

In [2]:
pd.set_option("display.max_columns", 99)
df = pd.read_csv("../data/mergeA_for_modeling.csv")
df.head(n=3)


Unnamed: 0,Year,STNAM,FIPST,LEAID,LEANM,NCESSCH,SCHNAM,ALL_COHORT_,zip_mailing,zip_location,latitude,urban_centric_locale,county_code,school_level,school_type,lowest_grade_offered,highest_grade_offered,title_i_status,title_i_eligible,charter,magnet,virtual,teachers_fte,free_lunch,reduced_price_lunch,free_or_reduced_price_lunch,enrollment,ungrade_cedp,Math_Pct_Part,Rla_Pct_Part,ALL_RATE_
0,2012,ALASKA,2,200001,Lower Kuskokwim School District,20000100208,Bethel Regional High School,75,99559.0,99559.0,60.802584,41.0,2050.0,4.0,1.0,6.0,12.0,5.0,1.0,0.0,0.0,0.0,32.849998,210.0,28.0,238.0,506.0,0.0,97.5,97.5,52.0
1,2012,ALASKA,2,200001,Lower Kuskokwim School District,20000100318,Bethel Alternative Boarding School,28,99559.0,99559.0,60.79596,33.0,2050.0,3.0,4.0,8.0,12.0,5.0,1.0,0.0,0.0,0.0,4.0,58.0,0.0,58.0,61.0,0.0,90.0,90.0,10.0
2,2012,ALASKA,2,200006,Mount Edgecumbe High School Agency,20000600558,Mt. Edgecumbe High School,71,99835.0,99835.0,57.05181,41.0,2220.0,3.0,1.0,9.0,12.0,5.0,1.0,0.0,0.0,0.0,22.5,224.0,35.0,259.0,398.0,0.0,97.5,97.5,97.5


In [3]:
# Subset to columns that we believe have unique information in them; e.g. removing ID columns, names as strings
# Categoricals like LEAID, zip_location, and county_code like have some useful information but may have too many categories (1000s) to dummy encode.
# I am including the county_code in this test.
X = df.copy()
X.drop(["STNAM", "LEANM", "NCESSCH", "SCHNAM", "zip_mailing",
       "LEAID", "zip_location"], inplace=True, axis=1)

y = X.pop("ALL_RATE_")

# Tag categoricals so that they can be treated properly by the modeling packages
numeric_cols = ["Rla_Pct_Part", "Math_Pct_Part", "enrollment", "free_lunch", "reduced_price_lunch", "free_or_reduced_price_lunch",
                "teachers_fte", "lowest_grade_offered", "highest_grade_offered", "latitude", "ALL_COHORT_", "Year"]
for col in numeric_cols:
    assert col in X.columns
categoricals = list(set(X.columns.tolist()) - set(numeric_cols))

# Convert categoricals to the pandas type 'category'
for col in categoricals:
    X[col] = X[col].astype("category")
X.dtypes

In [6]:
# Create train-test split
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X, y, test_size=0.25,stratify=y, random_state=42)


In [7]:
Xtrain.head(n=2)


Unnamed: 0,Year,FIPST,ALL_COHORT_,latitude,urban_centric_locale,county_code,school_level,school_type,lowest_grade_offered,highest_grade_offered,title_i_status,title_i_eligible,charter,magnet,virtual,teachers_fte,free_lunch,reduced_price_lunch,free_or_reduced_price_lunch,enrollment,ungrade_cedp,Math_Pct_Part,Rla_Pct_Part
52993,2016,26,333,42.625651,21.0,26125.0,3.0,1.0,8.0,12.0,6.0,0.0,0.0,0.0,0.0,89.559998,198.0,53.0,251.0,1413.0,0.0,99.5,99.5
14348,2013,11,205,38.944,11.0,11001.0,3.0,1.0,9.0,12.0,5.0,1.0,0.0,0.0,0.0,43.0,435.0,0.0,435.0,438.0,0.0,97.5,97.5


In [8]:
# Show impact of our stratification on y. 
print(ytrain.describe() - ytest.describe())
print(ytrain.describe() - df.ALL_RATE_.describe())


count    43642.000000
mean        -0.015211
std          0.018880
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max          0.000000
Name: ALL_RATE_, dtype: float64
count   -21822.000000
mean        -0.003803
std          0.004829
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max          0.000000
Name: ALL_RATE_, dtype: float64


# AdaBoost Regression

In [12]:
from sklearn.tree import DecisionTreeRegressor
param_grid = {
    'ada__random_state': [42],
}
scaler = SklearnTransformerWrapper(transformer=StandardScaler(),
                                    variables=numeric_cols)
pipe = Pipeline(steps=[("scaler", scaler), ("onehot", OneHotEncoder(drop_last=True, variables=categoricals)),
                        ("ada", AdaBoostRegressor(DecisionTreeRegressor(max_depth=20)))])
pipe.fit(Xtrain,ytrain)
# AdaBoost default estimator is DecisionTreeRegressor with max_depth=3
# 13.91 rmse on test with default parameters and no scaling
# 13.88 rmse on test with default parameters and scaling
# 11.32 rmse on test with max_depth=10 (took 9 minutes)

  X[f"{feature}_{category}"] = np.where(X[feature] == category, 1, 0)
  X[f"{feature}_{category}"] = np.where(X[feature] == category, 1, 0)
  X[f"{feature}_{category}"] = np.where(X[feature] == category, 1, 0)
  X[f"{feature}_{category}"] = np.where(X[feature] == category, 1, 0)
  X[f"{feature}_{category}"] = np.where(X[feature] == category, 1, 0)
  X[f"{feature}_{category}"] = np.where(X[feature] == category, 1, 0)
  X[f"{feature}_{category}"] = np.where(X[feature] == category, 1, 0)
  X[f"{feature}_{category}"] = np.where(X[feature] == category, 1, 0)
  X[f"{feature}_{category}"] = np.where(X[feature] == category, 1, 0)
  X[f"{feature}_{category}"] = np.where(X[feature] == category, 1, 0)
  X[f"{feature}_{category}"] = np.where(X[feature] == category, 1, 0)
  X[f"{feature}_{category}"] = np.where(X[feature] == category, 1, 0)
  X[f"{feature}_{category}"] = np.where(X[feature] == category, 1, 0)
  X[f"{feature}_{category}"] = np.where(X[feature] == category, 1, 0)
  X[f"{feature}_{cat

KeyboardInterrupt: 

* Tune parameters
* Run model on train and test and sets
*  Visualize
*  Feature Importance

In [10]:
# Check model on test set 
(pipe.predict(Xtest)-ytest).describe()[2]


  X[f"{feature}_{category}"] = np.where(X[feature] == category, 1, 0)
  X[f"{feature}_{category}"] = np.where(X[feature] == category, 1, 0)
  X[f"{feature}_{category}"] = np.where(X[feature] == category, 1, 0)
  X[f"{feature}_{category}"] = np.where(X[feature] == category, 1, 0)
  X[f"{feature}_{category}"] = np.where(X[feature] == category, 1, 0)
  X[f"{feature}_{category}"] = np.where(X[feature] == category, 1, 0)
  X[f"{feature}_{category}"] = np.where(X[feature] == category, 1, 0)
  X[f"{feature}_{category}"] = np.where(X[feature] == category, 1, 0)
  X[f"{feature}_{category}"] = np.where(X[feature] == category, 1, 0)
  X[f"{feature}_{category}"] = np.where(X[feature] == category, 1, 0)
  X[f"{feature}_{category}"] = np.where(X[feature] == category, 1, 0)
  X[f"{feature}_{category}"] = np.where(X[feature] == category, 1, 0)
  X[f"{feature}_{category}"] = np.where(X[feature] == category, 1, 0)
  X[f"{feature}_{category}"] = np.where(X[feature] == category, 1, 0)
  X[f"{feature}_{cat

11.32149164891223

# Check best parameters on the test set

In [None]:
# Validate the model on the train and test sets
import warnings
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

pipe2 = Pipeline()
pipe2.fit(Xtrain,ytrain)

In [None]:
# Validate the model on the train and test sets
from sklearn.metrics import r2_score
train_residuals = pipe2.predict(Xtrain) - ytrain
plt.hist(train_residuals,density=True)
plt.show()
test_residuals = pipe2.predict(Xtest) - ytest
plt.hist(test_residuals,color="orange",density=True)
plt.show()
print("train rmsd = ", train_residuals.describe()[2])
print("test rmsd = ", test_residuals.describe()[2])
print("test r2 = ", r2_score(ytest, pipe2.predict(Xtest)))