https://towardsdatascience.com/simple-model-stacking-explained-and-automated-1b54e4357916

In [68]:
import pandas as pd
import numpy as np

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

In [69]:
RANDOM_STATE : int = 42
N_SAMPLES : int = 1000
N_FEATURES : int = 25
N_CLASSES : int = 2
N_CLUSTERS_PER_CLASS : int = 2
    
FEATURE_NAME_PREFIX : str = "Feature"
TARGET_NAME : str = "Target"
    
N_SPLITS : int = 5
    
np.set_printoptions(suppress=True)

In [70]:
def make_classification_dataframe(n_samples : int = 10000, n_features : int = 25, n_classes : int = 2, n_clusters_per_class : int = 2, feature_name_prefix : str = "Feature", target_name : str = "Target", random_state : int = 42) -> pd.DataFrame:
    X, y = make_classification(n_samples=n_samples, n_features=n_features, n_classes=n_classes, n_informative = n_classes * n_clusters_per_class, random_state=random_state)

    feature_names = [feature_name_prefix + " " + str(v) for v in np.arange(1, n_features+1)]
    return pd.concat([pd.DataFrame(X, columns=feature_names), pd.DataFrame(y, columns=[target_name])], axis=1)

df_data = make_classification_dataframe(n_samples=N_SAMPLES, n_features=N_FEATURES, n_classes=N_CLASSES, n_clusters_per_class=N_CLUSTERS_PER_CLASS, feature_name_prefix=FEATURE_NAME_PREFIX, target_name=TARGET_NAME, random_state=RANDOM_STATE)

X = df_data.drop([TARGET_NAME], axis=1)
y = df_data[TARGET_NAME]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

df_data.head()

Unnamed: 0,Feature 1,Feature 2,Feature 3,Feature 4,Feature 5,Feature 6,Feature 7,Feature 8,Feature 9,Feature 10,...,Feature 17,Feature 18,Feature 19,Feature 20,Feature 21,Feature 22,Feature 23,Feature 24,Feature 25,Target
0,0.474489,-0.635241,0.853401,0.173065,-0.343114,0.693941,0.669566,-0.184548,0.876434,-0.122793,...,0.698702,1.763173,1.109425,2.113207,0.467081,-0.110596,-1.303475,1.433103,0.332273,1
1,0.726083,0.377495,-0.237514,-1.347979,0.691946,-0.961006,-0.425396,1.085357,-0.232414,2.590092,...,0.148723,-1.660917,-1.221744,-1.140272,0.777911,-0.181927,-2.712449,-0.212154,1.050325,1
2,2.203006,-1.189548,0.591983,2.339432,-1.252321,-1.22626,-0.850051,0.242036,-0.919765,-1.554612,...,-0.399935,0.081384,0.695738,-0.438492,-0.075429,-0.373356,0.903945,-0.915866,2.264526,0
3,1.825892,0.962322,-1.924548,2.032966,0.64247,0.038507,-0.176127,-0.080422,-1.986481,1.077307,...,-0.911353,-1.846082,-0.387748,-0.574282,0.085952,-0.139388,-0.367477,1.661867,1.356306,0
4,-0.740266,-1.005705,2.550129,-0.077865,-0.75419,-0.832578,2.645702,-0.676791,-0.500167,-1.799172,...,-0.183032,-1.192227,-2.112299,-1.387602,0.09734,-0.362738,-1.446197,0.174605,0.405741,0


In [4]:
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
import xgboost as xgb

svr = SVR(gamma = 'scale', kernel = 'linear', C=10, epsilon=.05)
ridge = Ridge(random_state = RANDOM_STATE, tol=1e-3, normalize=False, solver='auto')
neighbor = KNeighborsRegressor(n_neighbors = 11)
linreg = LinearRegression()
xgbr = xgb.XGBRegressor(n_estimators=1000, eval_metric='mae', max_depth = 7,eta = .1, min_child_weight = 5, colsample_bytree = .4, reg_lambda = 50)

In [5]:
svr_yhat, ridge_yhat, neighbor_yhat, linreg_yhat, xgbr_yhat = [], [], [], [], []

In [6]:
models_dict = {'SVR' : [svr, svr_yhat], 
                'Ridge' : [ridge, ridge_yhat],  
                'KNN' : [neighbor, neighbor_yhat], 
                'Linear Regression' : [linreg, linreg_yhat], 
                'XGB' : [xgbr, xgbr_yhat]}

In [7]:
from sklearn.model_selection import KFold
def train_oof_predictions(x, y, models, verbose=True):
    '''Function to perform Out-Of-Fold predictions on train data
    returns re-ordered predictors x, re-ordered target y, and model dictionary with filled predictors
    Parameters:
    x: training predictors
    y: training targets
    models: dictionary of models in form of model name : [instantiated model, predictors list]
    verbose: if True, prints status update as the function works
    '''
    
    # instantiate a KFold with 10 splits
    kfold = KFold(n_splits=10, shuffle=True, random_state=RANDOM_STATE)
    
    # prepare lists to hold the re-ordered x and y values
    data_x, data_y  = [], []
    
    # run the following block for each of the 10 kfold splits
    for train_ix, test_ix in kfold.split(x, y):
    
        if verbose: print("\nStarting a new fold\n")
    
        if verbose: print("Creating splits")
        #create this fold's training and test sets
        train_X, test_X = x[train_ix], x[test_ix] 
        train_y, test_y = y[train_ix], y[test_ix]
    
        if verbose: print("Adding x and y to lists\n")
        # add the data that is used in this fold to the re-ordered lists
        data_x.extend(test_X)
        data_y.extend(test_y)
    
        # run each model on this kfold and add the predictors to the model's running predictors list
        for item in models:
            
            label = item # get label for reporting purposes
            model = models[item][0] # get the model to use on the kfold
        
            # fit and make predictions 
            if verbose: print("Running",label,"on this fold")
            model.fit(train_X, train_y) # fit to the train set for the kfold
            predictions = model.predict(test_X) # fit on the out-of-fold set
            models[item][1].extend(predictions) # add predictions to the model's running predictors list
    
    return data_x, data_y, models

In [8]:
data_x, data_y, trained_models = train_oof_predictions(X_train.to_numpy(), y_train.to_numpy(), models_dict)


Starting a new fold

Creating splits
Adding x and y to lists

Running SVR on this fold
Running Ridge on this fold
Running KNN on this fold
Running Linear Regression on this fold
Running XGB on this fold

Starting a new fold

Creating splits
Adding x and y to lists

Running SVR on this fold
Running Ridge on this fold
Running KNN on this fold
Running Linear Regression on this fold
Running XGB on this fold

Starting a new fold

Creating splits
Adding x and y to lists

Running SVR on this fold
Running Ridge on this fold
Running KNN on this fold
Running Linear Regression on this fold
Running XGB on this fold

Starting a new fold

Creating splits
Adding x and y to lists

Running SVR on this fold
Running Ridge on this fold
Running KNN on this fold
Running Linear Regression on this fold
Running XGB on this fold

Starting a new fold

Creating splits
Adding x and y to lists

Running SVR on this fold
Running Ridge on this fold
Running KNN on this fold
Running Linear Regression on this fold
Runni

In [17]:
len(models_dict["SVR"][1])

670

In [22]:
X_train.shape

(670, 25)

In [20]:
np.array(data_x).shape

(670, 25)

In [23]:
X_train

Unnamed: 0,Feature 1,Feature 2,Feature 3,Feature 4,Feature 5,Feature 6,Feature 7,Feature 8,Feature 9,Feature 10,...,Feature 16,Feature 17,Feature 18,Feature 19,Feature 20,Feature 21,Feature 22,Feature 23,Feature 24,Feature 25
703,1.452914,-1.353484,-1.163976,0.968981,-1.431659,0.131271,0.599146,0.761622,0.729353,-2.185309,...,1.140494,0.310544,-1.682558,-1.108564,0.513573,-0.636843,-0.357920,0.136553,0.385657,0.425407
311,-0.026298,0.457762,1.465424,-2.759468,-0.913735,0.301109,-0.016188,-1.244752,0.678920,1.465526,...,0.538965,0.870532,1.210138,1.179334,-0.065954,-0.918112,0.532466,2.013169,-1.892037,-4.899604
722,1.308747,1.315285,-1.332165,-1.434710,-1.814415,0.381065,-0.854355,2.929520,1.850419,-0.575638,...,0.062815,-0.507557,-0.821438,-0.620693,2.018031,-1.411207,-0.488684,0.162036,-0.185922,1.127646
629,2.121438,-0.053315,-1.237468,-0.985284,0.864597,-0.164172,-0.054883,0.510647,-0.070962,0.231803,...,-0.739468,1.125721,-2.342692,-2.261676,-0.303016,-0.042695,-1.137224,-0.815994,-1.217919,1.808162
0,0.474489,-0.635241,0.853401,0.173065,-0.343114,0.693941,0.669566,-0.184548,0.876434,-0.122793,...,-1.651879,0.698702,1.763173,1.109425,2.113207,0.467081,-0.110596,-1.303475,1.433103,0.332273
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,1.469225,1.055448,2.877638,0.681128,0.429016,-2.261338,-0.923210,-0.638213,0.453167,-0.864700,...,0.847131,0.636355,-1.103350,-1.067578,1.075032,0.626674,-0.280988,-2.282302,1.225845,1.054445
270,-0.659869,-0.170539,0.487380,2.200257,0.499990,0.536917,0.410675,0.288056,-0.399134,0.802790,...,0.329497,0.046068,0.643001,0.883895,0.395130,-0.272830,0.504402,0.408425,-1.002254,0.332843
860,-0.161236,1.344682,0.385137,-1.071777,-1.704147,-2.258499,0.395391,-1.217547,1.184787,0.268065,...,1.419269,0.609337,-2.528484,-0.663315,0.644555,-0.524068,1.366133,0.925256,0.585381,1.400540
435,-0.644577,1.164906,1.477994,-1.803274,-0.829726,-0.711305,0.048050,-0.531037,0.555132,-1.891129,...,0.771607,-0.995648,-1.311984,-2.075613,-1.273738,0.855759,0.223919,0.317258,-0.445050,-1.515388


In [24]:
data_x

[array([ 1.30874656,  1.31528475, -1.33216498, -1.43470959, -1.81441495,
         0.38106524, -0.85435451,  2.92952022,  1.85041851, -0.57563837,
         0.36513031,  0.9118272 ,  0.02958122,  2.50311526, -0.51567344,
         0.06281534, -0.50755673, -0.82143814, -0.62069258,  2.01803103,
        -1.41120692, -0.48868385,  0.16203594, -0.18592165,  1.12764577]),
 array([ 0.52550872, -1.42688067, -1.35171802,  4.60327387, -0.24853547,
        -1.32163957, -0.79204502, -1.77185854, -0.20356143,  0.5497501 ,
         1.81699735,  1.4288266 , -2.07545067, -4.49701502,  2.10625786,
        -0.94756643,  0.98015412, -0.02545996,  0.06373002,  0.08017001,
        -1.08532119, -0.77961623,  0.58308102,  1.33148652, -2.3902779 ]),
 array([ 0.40789385,  0.86099262,  1.33865263, -1.15471716,  1.29234451,
         1.31077671, -0.25228734, -1.45070945,  0.6266158 ,  0.81019461,
        -1.29444643, -0.10391293, -1.47196242, -1.29100254, -0.69997593,
        -0.46939816, -0.27991052,  0.57905479, 

In [26]:
from sklearn.model_selection import cross_validate
def model_selector(X, y, meta_model, models_dict, model_label, verbose=True):
    
    """ 
    Perform a forward model selection based on MAE improvement
    Parameters:
        X - baseline X_train with all features
        y - baseline y_train with all targets
        meta_model - meta_model to be trained
        models_dict - dictionary of models in format of model name : [model object, out-of-fold predictions]
        label - the label for the current meta model
        verbose - whether to print the sequence of inclusions(True recommended)
    Returns: list of selected models, best MAE 
    """
    print("\n\nRunning model selector for ", model_label)
    included_models = []
     
    while True:
        changed=False
        
        # forward step
        
        if verbose: print("\nNEW ROUND - Setting up score charts")
        excluded_models = list(set(models_dict.keys())-set(included_models)) # make a list of the current excluded_models
        if verbose: print("Included models: {}".format(included_models))
        if verbose: print("Exluded models: {}".format(excluded_models))
        new_mae = pd.Series(index=excluded_models) # make a series where the index is the current excluded_models
        
        current_meta_x = np.array(X)
        
        if len(included_models) > 0:
            for included in included_models:
                included = np.array(models_dict[included][1]).reshape((len(models_dict[included][1]), 1))
                current_meta_x = np.hstack((current_meta_x, included))
# score the current model
        scores = cross_validate(meta_model, current_meta_x, y, cv=5, n_jobs=-1, scoring=('neg_mean_absolute_error'))
        starting_mae = round(scores['test_score'].mean(),3)
        if verbose: print("Starting mae: {}\n".format(starting_mae))
        
       
        for excluded in excluded_models:  # for each item in the excluded_models list:
            
            new_yhat = np.array(models_dict[excluded][1]).reshape(-1, 1) # get the current item's predictions
            meta_x = np.hstack((current_meta_x, new_yhat)) # add the predictions to the meta set
            
            # score the current item
            scores = cross_validate(meta_model, meta_x, y, cv=5, n_jobs=-1, scoring=('neg_mean_absolute_error'))
            mae = round(scores['test_score'].mean(),3)
            if verbose: print("{} score: {}".format(excluded, mae))
            
            new_mae[excluded] = mae # append the mae to the series field
        
        best_mae = new_mae.max() # evaluate best mae of the excluded_models in this round
        if verbose: print("Best mae: {}\n".format(best_mae))
        
        if best_mae > starting_mae:  # if the best mae is better than the initial mae
            best_feature = new_mae.idxmax()  # define this as the new best feature
            included_models.append(str(best_feature)) # append this model name to the included list
            changed=True # flag that we changed it
            if verbose: print('Add  {} with mae {}\n'.format(best_feature, best_mae))
        else: changed = False
        
        if not changed:
            break
            
    print(model_label, "model optimized")
    print('resulting models:', included_models)
    print('MAE:', starting_mae)
    
    return included_models, starting_mae

In [32]:
# Set up a scoring dictionary to hold the model stack selector results
scores = {}
scores['Model'] = []
scores['MAE'] = []
scores['Included'] = []
# Run the model stack selector for each model in our trained_models
for label, model in trained_models.items():
    
    meta_model = trained_models[label][0]
    resulting_models, best_mae = model_selector(data_x, data_y,  meta_model, trained_models, label, verbose=True)
    
    scores['Model'].append(model)
    scores['MAE'].append(best_mae)
    scores['Included'].append(resulting_models)



Running model selector for  SVR

NEW ROUND - Setting up score charts
Included models: []
Exluded models: ['Ridge', 'XGB', 'KNN', 'Linear Regression', 'SVR']


  new_mae = pd.Series(index=excluded_models) # make a series where the index is the current excluded_models


Starting mae: -0.418

Ridge score: -0.325
XGB score: -0.213
KNN score: -0.251
Linear Regression score: -0.326
SVR score: -0.347
Best mae: -0.213

Add  XGB with mae -0.213


NEW ROUND - Setting up score charts
Included models: ['XGB']
Exluded models: ['Ridge', 'SVR', 'KNN', 'Linear Regression']
Starting mae: -0.213

Ridge score: -0.199
SVR score: -0.204
KNN score: -0.211
Linear Regression score: -0.199
Best mae: -0.199

Add  Ridge with mae -0.199


NEW ROUND - Setting up score charts
Included models: ['XGB', 'Ridge']
Exluded models: ['SVR', 'KNN', 'Linear Regression']
Starting mae: -0.199

SVR score: -0.2
KNN score: -0.199
Linear Regression score: -0.199
Best mae: -0.199

SVR model optimized
resulting models: ['XGB', 'Ridge']
MAE: -0.199


Running model selector for  Ridge

NEW ROUND - Setting up score charts
Included models: []
Exluded models: ['Ridge', 'XGB', 'KNN', 'Linear Regression', 'SVR']
Starting mae: -0.415

Ridge score: -0.377
XGB score: -0.223
KNN score: -0.263
Linear Regress

In [33]:
# Look at the scores of our model combinations
best_model = pd.DataFrame(scores).reset_index(drop=True)
best_model.sort_values('MAE', ascending=False)

Unnamed: 0,Model,MAE,Included
0,"[SVR(C=10, epsilon=0.05, kernel='linear'), [0....",-0.199,"[XGB, Ridge]"
4,"[XGBRegressor(base_score=0.5, booster='gbtree'...",-0.2,"[XGB, Ridge, KNN]"
1,"[Ridge(random_state=42), [0.5445160804585562, ...",-0.203,"[XGB, Ridge, Linear Regression, KNN, SVR]"
3,"[LinearRegression(), [0.5445946790521577, 0.20...",-0.203,"[XGB, Linear Regression, KNN]"
2,"[KNeighborsRegressor(n_neighbors=11), [0.63636...",-0.259,"[XGB, KNN]"


In [39]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [40]:
# Check our meta model on the original train/test set only
# Instantiate the chosen meta model
meta_model =  SVR(gamma = 'scale', kernel = 'linear', C=10, epsilon=.05)
meta_model.fit(X_train, y_train)
predictions = meta_model.predict(X_test)
pred_exp = np.exp(predictions)
actual = np.exp(y_test)
print("MAE: ",int(mean_absolute_error(pred_exp, actual)))
print("RMSE:",int(np.sqrt(mean_squared_error(pred_exp, actual))))
print(("R2:",r2_score(pred_exp, actual)*100))

MAE:  0
RMSE: 0
('R2:', -46.08516979175021)


In [42]:
print("Fitting Models")
linreg.fit(X_train, y_train)
xgbr.fit(X_train, y_train)
neighbor.fit(X_train, y_train)

Fitting Models


KNeighborsRegressor(n_neighbors=11)

In [43]:
yhat_predics = [trained_models['XGB'][1], trained_models['Linear Regression'][1], trained_models['KNN'][1]]

In [44]:
def create_meta_dataset(data_x, items):
    '''Function that takes in a data set and list of predictions, and forges into one dataset
    parameters:
    data_x - original data set
    items - list of predictions
    returns: stacked data set
    '''
    
    meta_x = data_x
    
    for z in items:
        z = np.array(z).reshape((len(z), 1))
        meta_x = np.hstack((meta_x, z))
        
    return meta_x

In [45]:
# create the meta data set using the oof predictions
meta_X_train = create_meta_dataset(data_x, yhat_predics)

In [52]:
np.array(yhat_predics).shape

(3, 670)

In [49]:
X_train.shape

(670, 25)

In [48]:
meta_X_train.shape

(670, 28)

In [54]:
final_models = [xgbr, linreg, neighbor]

In [57]:
def stack_prediction(X_test, final_models): 
    '''takes in a test set and a list of fitted models.
    Fits each model in the list on the test set and stores it in a predictions list. Then sends the test set and the predictions to the create_meta_dataset to be combined
    Returns: combined meta test set
    Parameters:
    X_test - testing dataset
    final_models - list of fitted models
    '''
    predictions = []
    
    for item in final_models:
        print(item)
        preds = item.predict(X_test).reshape(-1,1)
        predictions.append(preds)
    
    meta_X = create_meta_dataset(X_test, predictions)
        
    return meta_X

In [58]:
meta_X_test = stack_prediction(X_test, final_models)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.4, eta=0.1,
             eval_metric='mae', gamma=0, gpu_id=-1, importance_type='gain',
             interaction_constraints='', learning_rate=0.100000001,
             max_delta_step=0, max_depth=7, min_child_weight=5, missing=nan,
             monotone_constraints='()', n_estimators=1000, n_jobs=8,
             num_parallel_tree=1, random_state=0, reg_alpha=0, reg_lambda=50,
             scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)
LinearRegression()
KNeighborsRegressor(n_neighbors=11)


In [61]:
meta_X_test.shape

(330, 28)

In [67]:
# fit the meta model to the Train meta dataset
# There is no data leakage in the meta dataset since we did all of our predictions out-of-sample!
meta_model.fit(meta_X_train, data_y)
# predict on the meta test set
predictions = meta_model.predict(meta_X_test)
pred_exp = np.exp(predictions)
actual = np.exp(y_test)
print("MAE: ",int(mean_absolute_error(pred_exp, actual)))
print("RMSE:",int(np.sqrt(mean_squared_error(pred_exp, actual))))
print(("R2:",r2_score(pred_exp, actual)*100))

MAE:  0
RMSE: 0
('R2:', 61.992026991666194)
