https://towardsdatascience.com/simple-model-stacking-explained-and-automated-1b54e4357916

In [1]:
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
import xgboost as xgb

randomstate =42

# svr = SVR(gamma = 'scale', kernel = 'linear', C=10, epsilon=.05)
ridge = Ridge(random_state = randomstate, tol=1e-3, solver='auto')
neighbor = KNeighborsRegressor(n_neighbors = 11)
linreg = LinearRegression()
xgbr = xgb.XGBRegressor(n_estimators=1000, eval_metric='mae', max_depth = 7,eta = .1, min_child_weight = 5, 
                        colsample_bytree = .4, reg_lambda = 50)

In [2]:
# svr_yhat, ridge_yhat, neighbor_yhat, linreg_yhat, xgbr_yhat = [], [], [], [], []

ridge_yhat, neighbor_yhat, linreg_yhat, xgbr_yhat = [], [], [], []

In [3]:
# models_dict = {'SVR' : [svr, svr_yhat], 
#                 'Ridge' : [ridge, ridge_yhat],  
#                 'KNN' : [neighbor, neighbor_yhat], 
#                 'Linear Regression' : [linreg, linreg_yhat], 
#                 'XGB' : [xgbr, xgbr_yhat]}


models_dict = {'Ridge' : [ridge, ridge_yhat],  
                'KNN' : [neighbor, neighbor_yhat], 
                'Linear Regression' : [linreg, linreg_yhat], 
                'XGB' : [xgbr, xgbr_yhat]}

In [4]:
from sklearn.model_selection import KFold
def train_oof_predictions(x, y, models, verbose=True):
    '''Function to perform Out-Of-Fold predictions on train data
    returns re-ordered predictors x, re-ordered target y, and model dictionary with filled predictors
    Parameters:
    x: training predictors
    y: training targets
    models: dictionary of models in form of model name : [instantiated model, predictors list]
    verbose: if True, prints status update as the function works
    '''
    
    # instantiate a KFold with 10 splits
    kfold = KFold(n_splits=10, shuffle=True, random_state=randomstate)
    
    # prepare lists to hold the re-ordered x and y values
    data_x, data_y  = [], []
    
    # run the following block for each of the 10 kfold splits
    for train_ix, test_ix in kfold.split(x, y):
    
        if verbose: print("\nStarting a new fold\n")
    
        if verbose: print("Creating splits")
        #create this fold's training and test sets
        train_X, test_X = x[train_ix], x[test_ix] 
        train_y, test_y = y[train_ix], y[test_ix]
    
        if verbose: print("Adding x and y to lists\n")
        # add the data that is used in this fold to the re-ordered lists
        data_x.extend(test_X)
        data_y.extend(test_y)
    
        # run each model on this kfold and add the predictors to the model's running predictors list
        for item in models:
            
            label = item # get label for reporting purposes
            model = models[item][0] # get the model to use on the kfold
        
            # fit and make predictions 
            if verbose: print("Running",label,"on this fold")
            model.fit(train_X, train_y) # fit to the train set for the kfold
            predictions = model.predict(test_X) # fit on the out-of-fold set
            models[item][1].extend(predictions) # add predictions to the model's running predictors list
    
    return data_x, data_y, models

In [5]:
import numpy as np
import pandas as pd
df = pd.read_csv('kc_house_data.csv')
df.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3,1.0,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,180000.0,2,1.0,770,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000.0,4,3.0,1960,5000,1.0,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000.0,3,2.0,1680,8080,1.0,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


In [6]:
df_cols = list(df.columns)
x_cols = df_cols[3:]

X = df[x_cols]
y = df.price

In [7]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [8]:
X_train = np.array(X_train)
X_test = np.array(X_test)
y_train = np.array(y_train)
y_test = np.array(y_test)

In [9]:
X_train.shape, y_train.shape

((14480, 18), (14480,))

In [10]:
data_x, data_y, trained_models = train_oof_predictions(X_train, y_train, models_dict)


Starting a new fold

Creating splits
Adding x and y to lists

Running Ridge on this fold
Running KNN on this fold
Running Linear Regression on this fold
Running XGB on this fold

Starting a new fold

Creating splits
Adding x and y to lists

Running Ridge on this fold
Running KNN on this fold
Running Linear Regression on this fold
Running XGB on this fold

Starting a new fold

Creating splits
Adding x and y to lists

Running Ridge on this fold
Running KNN on this fold
Running Linear Regression on this fold
Running XGB on this fold

Starting a new fold

Creating splits
Adding x and y to lists

Running Ridge on this fold
Running KNN on this fold
Running Linear Regression on this fold
Running XGB on this fold

Starting a new fold

Creating splits
Adding x and y to lists

Running Ridge on this fold
Running KNN on this fold
Running Linear Regression on this fold
Running XGB on this fold

Starting a new fold

Creating splits
Adding x and y to lists

Running Ridge on this fold
Running KNN on 

In [11]:
len(data_x), len(data_y)

(14480, 14480)

In [14]:
len(trained_models['Ridge'][1])

14480

In [15]:
from sklearn.model_selection import cross_validate
def model_selector(X, y, meta_model, models_dict, model_label, verbose=True):
    
    """ 
    Perform a forward model selection based on MAE improvement
    Parameters:
        X - baseline X_train with all features
        y - baseline y_train with all targets
        meta_model - meta_model to be trained
        models_dict - dictionary of models in format of model name : [model object, out-of-fold predictions]
        label - the label for the current meta model
        verbose - whether to print the sequence of inclusions(True recommended)
    Returns: list of selected models, best MAE 
    """
    print("\n\nRunning model selector for ", model_label)
    included_models = []
     
    while True:
        changed=False
        
        # forward step
        
        if verbose: print("\nNEW ROUND - Setting up score charts")
        excluded_models = list(set(models_dict.keys())-set(included_models)) # make a list of the current excluded_models
        if verbose: print("Included models: {}".format(included_models))
        if verbose: print("Exluded models: {}".format(excluded_models))
        new_mae = pd.Series(index=excluded_models) # make a series where the index is the current excluded_models
        
        current_meta_x = np.array(X)
        
        if len(included_models) > 0:
            for included in included_models:
                included = np.array(models_dict[included][1]).reshape((len(models_dict[included][1]), 1))
                current_meta_x = np.hstack((current_meta_x, included))
# score the current model
        scores = cross_validate(meta_model, current_meta_x, y, cv=5, n_jobs=-1, scoring=('neg_mean_absolute_error'))
        starting_mae = round(scores['test_score'].mean(),3)
        if verbose: print("Starting mae: {}\n".format(starting_mae))
        
       
        for excluded in excluded_models:  # for each item in the excluded_models list:
            
            new_yhat = np.array(models_dict[excluded][1]).reshape(-1, 1) # get the current item's predictions
            meta_x = np.hstack((current_meta_x, new_yhat)) # add the predictions to the meta set
            
            # score the current item
            scores = cross_validate(meta_model, meta_x, y, cv=5, n_jobs=-1, scoring=('neg_mean_absolute_error'))
            mae = round(scores['test_score'].mean(),3)
            if verbose: print("{} score: {}".format(excluded, mae))
            
            new_mae[excluded] = mae # append the mae to the series field
        
        best_mae = new_mae.max() # evaluate best mae of the excluded_models in this round
        if verbose: print("Best mae: {}\n".format(best_mae))
        
        if best_mae > starting_mae:  # if the best mae is better than the initial mae
            best_feature = new_mae.idxmax()  # define this as the new best feature
            included_models.append(str(best_feature)) # append this model name to the included list
            changed=True # flag that we changed it
            if verbose: print('Add  {} with mae {}\n'.format(best_feature, best_mae))
        else: changed = False
        
        if not changed:
            break
            
    print(model_label, "model optimized")
    print('resulting models:', included_models)
    print('MAE:', starting_mae)
    
    return included_models, starting_mae

In [16]:
# Set up a scoring dictionary to hold the model stack selector results
scores = {}
scores['Model'] = []
scores['MAE'] = []
scores['Included'] = []
# Run the model stack selector for each model in our trained_models
for model in trained_models:
    
    meta_model = trained_models[model][0]
    resulting_models, best_mae = model_selector(data_x, data_y,  meta_model, trained_models, model, verbose=True)
    
    scores['Model'].append(model)
    scores['MAE'].append(best_mae)
    scores['Included'].append(resulting_models)



Running model selector for  Ridge

NEW ROUND - Setting up score charts
Included models: []
Exluded models: ['XGB', 'KNN', 'Ridge', 'Linear Regression']


  new_mae = pd.Series(index=excluded_models) # make a series where the index is the current excluded_models


Starting mae: -124169.423

XGB score: -64433.946
KNN score: -119485.059
Ridge score: -123794.234
Linear Regression score: -123797.774
Best mae: -64433.946

Add  XGB with mae -64433.946


NEW ROUND - Setting up score charts
Included models: ['XGB']
Exluded models: ['KNN', 'Ridge', 'Linear Regression']


  new_mae = pd.Series(index=excluded_models) # make a series where the index is the current excluded_models


Starting mae: -64433.946

KNN score: -64450.187
Ridge score: -64370.612
Linear Regression score: -64370.858
Best mae: -64370.612

Add  Ridge with mae -64370.612


NEW ROUND - Setting up score charts
Included models: ['XGB', 'Ridge']
Exluded models: ['KNN', 'Linear Regression']


  new_mae = pd.Series(index=excluded_models) # make a series where the index is the current excluded_models


Starting mae: -64370.612

KNN score: -64387.58
Linear Regression score: -64484.397
Best mae: -64387.58

Ridge model optimized
resulting models: ['XGB', 'Ridge']
MAE: -64370.612


Running model selector for  KNN

NEW ROUND - Setting up score charts
Included models: []
Exluded models: ['XGB', 'KNN', 'Ridge', 'Linear Regression']


  new_mae = pd.Series(index=excluded_models) # make a series where the index is the current excluded_models


Starting mae: -154433.867

XGB score: -67353.509
KNN score: -160385.572
Ridge score: -112965.726
Linear Regression score: -113078.278
Best mae: -67353.509

Add  XGB with mae -67353.509


NEW ROUND - Setting up score charts
Included models: ['XGB']
Exluded models: ['KNN', 'Ridge', 'Linear Regression']


  new_mae = pd.Series(index=excluded_models) # make a series where the index is the current excluded_models


Starting mae: -67353.509

KNN score: -67626.213
Ridge score: -67442.159
Linear Regression score: -67453.334
Best mae: -67442.159

KNN model optimized
resulting models: ['XGB']
MAE: -67353.509


Running model selector for  Linear Regression

NEW ROUND - Setting up score charts
Included models: []
Exluded models: ['XGB', 'KNN', 'Ridge', 'Linear Regression']


  new_mae = pd.Series(index=excluded_models) # make a series where the index is the current excluded_models


Starting mae: -124198.632

XGB score: -64433.445
KNN score: -119512.664
Ridge score: -123994.427
Linear Regression score: -123985.853
Best mae: -64433.445

Add  XGB with mae -64433.445


NEW ROUND - Setting up score charts
Included models: ['XGB']
Exluded models: ['KNN', 'Ridge', 'Linear Regression']


  new_mae = pd.Series(index=excluded_models) # make a series where the index is the current excluded_models


Starting mae: -64433.445

KNN score: -64449.292
Ridge score: -65065.914
Linear Regression score: -65060.896
Best mae: -64449.292

Linear Regression model optimized
resulting models: ['XGB']
MAE: -64433.445


Running model selector for  XGB

NEW ROUND - Setting up score charts
Included models: []
Exluded models: ['XGB', 'KNN', 'Ridge', 'Linear Regression']


  new_mae = pd.Series(index=excluded_models) # make a series where the index is the current excluded_models


Starting mae: -64499.895

XGB score: -66437.885
KNN score: -64622.99
Ridge score: -64851.695
Linear Regression score: -64432.921
Best mae: -64432.921

Add  Linear Regression with mae -64432.921


NEW ROUND - Setting up score charts
Included models: ['Linear Regression']
Exluded models: ['XGB', 'KNN', 'Ridge']


  new_mae = pd.Series(index=excluded_models) # make a series where the index is the current excluded_models


Starting mae: -64432.921

XGB score: -66938.753
KNN score: -64518.79
Ridge score: -64715.154
Best mae: -64518.79

XGB model optimized
resulting models: ['Linear Regression']
MAE: -64432.921


In [17]:
scores

{'Model': ['Ridge', 'KNN', 'Linear Regression', 'XGB'],
 'MAE': [-64370.612, -67353.509, -64433.445, -64432.921],
 'Included': [['XGB', 'Ridge'], ['XGB'], ['XGB'], ['Linear Regression']]}

In [15]:
# Look at the scores of our model combinations
best_model = pd.DataFrame(scores).reset_index(drop=True)
best_model.sort_values('MAE', ascending=False)

Unnamed: 0,Model,MAE,Included
0,Ridge,-64370.612,"[XGB, Ridge]"
3,XGB,-64432.921,[Linear Regression]
2,Linear Regression,-64433.445,[XGB]
1,KNN,-67353.509,[XGB]


In [26]:
# Check our meta model on the original train/test set only
# Instantiate the chosen meta model

from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

# meta_model =  Ridge(random_state = randomstate, tol=1e-3, solver='auto')
meta_model = xgb.XGBRegressor(n_estimators=1000, eval_metric='mae', max_depth = 7,eta = .1, min_child_weight = 5, 
                        colsample_bytree = .4, reg_lambda = 50)

meta_model.fit(X_train, y_train)
predictions = meta_model.predict(X_test)
# pred_exp = np.exp(predictions)
# actual = np.exp(y_test)

pred_exp = predictions
actual = y_test

print("MAE: ",int(mean_absolute_error(pred_exp, actual)))
print("RMSE:",int(np.sqrt(mean_squared_error(pred_exp, actual))))
print(("R2:",r2_score(pred_exp, actual)*100))

MAE:  67260
RMSE: 129894
('R2:', 87.27728795793273)


In [43]:
print("Fitting Models")
xgbr.fit(X_train, y_train)
ridge.fit(X_train, y_train)

Fitting Models


In [44]:
yhat_predics = [trained_models['XGB'][1], trained_models['Ridge'][1]]

In [45]:
def create_meta_dataset(data_x, items):
    '''Function that takes in a data set and list of predictions, and forges into one dataset
    parameters:
    data_x - original data set
    items - list of predictions
    returns: stacked data set
    '''
    
    meta_x = data_x
    
    for z in items:
        z = np.array(z).reshape((len(z), 1))
        meta_x = np.hstack((meta_x, z))
        
    return meta_x

In [46]:
# create the meta data set using the oof predictions
meta_X_train = create_meta_dataset(data_x, yhat_predics)

In [53]:
final_models = [xgbr, ridge]

In [48]:
def stack_prediction(X_test, final_models): 
    '''takes in a test set and a list of fitted models.
    Fits each model in the list on the test set and stores it in a predictions list. Then sends the test set and the predictions to the create_meta_dataset to be combined
    Returns: combined meta test set
    Parameters:
    X_test - testing dataset
    final_models - list of fitted models
    '''
    predictions = []
    
    for item in final_models:
        print(item)
        preds = item.predict(X_test).reshape(-1,1)
        predictions.append(preds)
    
    meta_X = create_meta_dataset(X_test, predictions)
        
    return meta_X

In [49]:
final_models

[XGBRegressor(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=0.4, early_stopping_rounds=None,
              enable_categorical=False, eta=0.1, eval_metric='mae',
              feature_types=None, gamma=None, gpu_id=None, grow_policy=None,
              importance_type=None, interaction_constraints=None,
              learning_rate=None, max_bin=None, max_cat_threshold=None,
              max_cat_to_onehot=None, max_delta_step=None, max_depth=7,
              max_leaves=None, min_child_weight=5, missing=nan,
              monotone_constraints=None, n_estimators=1000, n_jobs=None,
              num_parallel_tree=None, predictor=None, ...),
 sklearn.linear_model._ridge.Ridge]

In [54]:
final_models[1].predict(X_test)

array([ 462273.95632038,  748176.32004847, 1239464.50520646, ...,
        535675.25812558,  468373.17571305,  330218.07853849])

In [55]:
meta_X_test = stack_prediction(X_test, final_models)

XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=0.4, early_stopping_rounds=None,
             enable_categorical=False, eta=0.1, eval_metric='mae',
             feature_types=None, gamma=None, gpu_id=None, grow_policy=None,
             importance_type=None, interaction_constraints=None,
             learning_rate=None, max_bin=None, max_cat_threshold=None,
             max_cat_to_onehot=None, max_delta_step=None, max_depth=7,
             max_leaves=None, min_child_weight=5, missing=nan,
             monotone_constraints=None, n_estimators=1000, n_jobs=None,
             num_parallel_tree=None, predictor=None, ...)
Ridge(random_state=42, tol=0.001)


In [42]:
meta_model

In [57]:
# fit the meta model to the Train meta dataset
# There is no data leakage in the meta dataset since we did all of our predictions out-of-sample!
meta_model.fit(meta_X_train, data_y)
# predict on the meta test set
predictions = meta_model.predict(meta_X_test)
# pred_exp = np.exp(predictions)
# actual = np.exp(y_test)

pred_exp = predictions
actual = y_test
print("MAE: ",int(mean_absolute_error(pred_exp, actual)))
print("RMSE:",int(np.sqrt(mean_squared_error(pred_exp, actual))))
print(("R2:",r2_score(pred_exp, actual)*100))

MAE:  65850
RMSE: 127566
('R2:', 88.1856146658057)


In [58]:
(67260-65850)/67260

0.02096342551293488

In [60]:
(67260-64370)/67260

0.04296758846268213