# Stacking

In [24]:
import pandas as pd
import numpy as np
import pickle
import time

from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, RandomizedSearchCV, GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.metrics import f1_score, plot_confusion_matrix, confusion_matrix

from imblearn.pipeline import Pipeline as imbPipeline
from imblearn.over_sampling import SMOTE

from category_encoders import TargetEncoder

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the pre-split feature and target tables, each indexed by building_id.
X_train, X_test, y_train, y_test = (
    pd.read_csv(csv_path, index_col='building_id')
    for csv_path in (
        'Data/X_train.csv',
        'Data/X_test.csv',
        'Data/y_train.csv',
        'Data/y_test.csv',
    )
)

In [3]:
# Flatten the target frames into 1-D arrays.
y_train, y_test = (
    target_frame.values.ravel() for target_frame in (y_train, y_test)
)

In [21]:
# Carve a 10% sample out of the training data; the other 90% is the leftover.
(X_train_sample, X_train_leftover,
 y_train_sample, y_train_leftover) = train_test_split(
    X_train, y_train, test_size=0.9, random_state=42
)

### Divide Columns and drop unimportant features

As before, I'll divide the columns for the Pipeline and drop unimportant features.

In [4]:
# Group the feature columns by the kind of preprocessing they need.

# Flag columns: every column whose name starts with 'has'.
binary_cols = [name for name in X_train.columns if name.startswith('has')]

# Categorical columns: everything stored with object dtype.
cat_cols = list(X_train.select_dtypes(include='object').columns)

# Numeric count/measurement columns, listed explicitly.
integer_cols = [
    'count_floors_pre_eq',
    'age',
    'area_percentage',
    'height_percentage',
    'count_families',
]

# Geographic id columns, listed explicitly.
geo_cols = ['geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id']

all_cols = geo_cols + cat_cols + integer_cols + binary_cols

In [5]:
# Binary features kept for modelling: drop every 'has_secondary*' flag, then
# add back the single summary flag 'has_secondary_use'.
#
# The filter builds a new list instead of calling .remove() on the list being
# iterated: removing during iteration makes the loop skip the element that
# follows each removal, which left some 'has_secondary*' columns in place.
binary_cols_dropped = [
    col for col in binary_cols if not col.startswith('has_secondary')
]
binary_cols_dropped.append('has_secondary_use')

# Categorical features kept for modelling: everything except the two columns
# dropped below. list.remove raises ValueError if a name is missing, which
# surfaces a misspelled or absent column immediately.
cat_cols_dropped = cat_cols.copy()
cat_cols_dropped.remove('legal_ownership_status')
cat_cols_dropped.remove('plan_configuration')

### Pipelines

The plan is to use Random Forest and KNN as the base estimators and LogisticRegression as the final estimator in the stacked model. First, I need to create pipelines for the column transformations used by RF and KNN.

In [8]:
def log_transform(x):
    """Return the element-wise natural log of ``x + 1``.

    Uses ``np.log1p`` rather than ``np.log(x + 1)`` so that values of ``x``
    close to zero keep their precision instead of being rounded away when 1
    is added.

    Parameters
    ----------
    x : scalar or array-like
        Values to transform; anything ``np.log1p`` accepts.

    Returns
    -------
    Same container kind as NumPy produces for ``x`` (scalar, ndarray, or a
    pandas object when one is passed in), holding ``log(1 + x)``.
    """
    return np.log1p(x)

In [11]:
# Create the initial pipelines: one-hot encoding, target encoding, and
# log transform followed by standard scaling.
ohe_pipe = imbPipeline(steps=[
    ('ohe', OneHotEncoder(handle_unknown='ignore', sparse=False)),
])
target_pipe = imbPipeline(steps=[
    ('target', TargetEncoder(cols=geo_cols)),
])

# Wrap log_transform so it can be used as a pipeline step, then scale its output.
function_transformer = FunctionTransformer(log_transform)
integer_pipe = imbPipeline(steps=[
    ('function', function_transformer),
    ('ss', StandardScaler()),
])

In [16]:
# Random Forest preprocessing: binary and integer columns pass through
# unchanged, categorical columns are one-hot encoded, and geo columns are
# target encoded.
rf_preprocessor = ColumnTransformer(transformers=[
    ('binary', 'passthrough', binary_cols_dropped),
    ('categorical', ohe_pipe, cat_cols_dropped),
    ('integer', 'passthrough', integer_cols),
    ('geo', target_pipe, geo_cols),
])

# Full Random Forest pipeline: preprocessing followed by the classifier.
rf_pipe = imbPipeline(steps=[
    ('rf_preprocessor', rf_preprocessor),
    (
        'rf',
        RandomForestClassifier(
            max_depth=50,
            min_samples_split=10,
            random_state=42,
            n_jobs=-1,
        ),
    ),
])

In [17]:
# KNN preprocessing: binary columns pass through unchanged, categorical
# columns are one-hot encoded, geo columns are target encoded, and integer
# columns go through the log-transform + scaling pipeline.
knn_preprocessor = ColumnTransformer(transformers=[
    ('binary', 'passthrough', binary_cols_dropped),
    ('categorical', ohe_pipe, cat_cols_dropped),
    ('geo', target_pipe, geo_cols),
    ('integer', integer_pipe, integer_cols),
])

# Full KNN pipeline: preprocessing followed by the classifier.
knn_pipe = imbPipeline(steps=[
    ('knn_preprocessor', knn_preprocessor),
    ('knn', KNeighborsClassifier(n_neighbors=75, p=1, n_jobs=-1)),
])

In [14]:
#logreg_preprocessor = ColumnTransformer([
#    ('binary', 'passthrough', binary_cols_dropped),
#    ('categorical', ohe_pipe, cat_cols_dropped),
#    ('geo', target_pipe, geo_cols),
#    ('integer', integer_pipe, integer_cols),
#])

### Create the Stacking Classifier

In [25]:
# Final estimator that combines the outputs of the two pipelines below.
final_estimator = LogisticRegression(
    C=0.01,
    solver='saga',
    random_state=42,
    n_jobs=-1,
)

# Named estimators fed into the stack: the Random Forest and KNN pipelines.
estimators = [
    ('rf', rf_pipe),
    ('knn', knn_pipe),
]

stacked = StackingClassifier(
    estimators=estimators,
    final_estimator=final_estimator,
    verbose=2,
    n_jobs=-1,
)

### Fit the stacked models

In [None]:
# Fit the stacked model on the full training set and report how long it took.
# time.perf_counter() is a monotonic, high-resolution clock, so the measured
# interval is not affected by system clock adjustments during the fit.
start = time.perf_counter()
stacked.fit(X_train, y_train)
end = time.perf_counter()
print(f'Runtime: {end-start} seconds')

In [None]:
# Predict on the held-out test set and compute the micro-averaged F1 score.
stacked_preds = stacked.predict(X_test)
stacked_score = f1_score(
    y_true=y_test,
    y_pred=stacked_preds,
    average='micro',
)
# Bare expression so the notebook displays the score as the cell output.
stacked_score