# House Prices 6: Feature Selection
This notebook is heavily inspired by [Feature Engineering and Selection: A Practical Approach for Predictive Models](http://www.feat.engineering/). 

In [house-prices-5-interactions](https://www.kaggle.com/code/danielmcampos/house-prices-5-interactions), brute-force interaction terms combined with ElasticNet were used to select features implicitly. This notebook investigates explicit methods of feature engineering:
1. Transformations.
2. Interactions.

And feature selection:
1. Filter methods.
2. Wrapper methods.


In [None]:
# NOTE(review): the wildcard import presumably supplies the project helpers used
# below (load_data, group_features, summary, report, ...) as well as `np`; none
# of these names is imported explicitly in the visible cells -- confirm against
# house_price_utils.
from house_price_utils import *

# Notebook-wide setup helper; its behaviour is not visible from this notebook.
setup_notebook()

# Data

In [None]:
# Load the two datasets returned by load_data(), then put the columns of the
# first one in sorted order so column positions are deterministic.
data, data_test = load_data()
data = data.loc[:, sorted(data.columns)]

# Separate the target column from the feature matrix.
y = data["SalePrice"]
X_raw = data.drop(columns="SalePrice")

In [None]:
from scipy import stats
from sklearn.compose import TransformedTargetRegressor, make_column_transformer
from sklearn import linear_model
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import (
    SelectKBest,
    r_regression,
    f_regression,
    chi2,
    VarianceThreshold,
)
from sklearn.preprocessing import (
    StandardScaler,
    OneHotEncoder,
    OrdinalEncoder,
    PolynomialFeatures,
)

# Feature Grouping

In [None]:
# Group the raw feature columns by kind; later cells read the "continuous",
# "nominal" and "ordinal" entries of the result.
features = group_features(X_raw)

In [None]:
# Number of columns in each feature group; reused later as the `k` budget of
# the SelectKBest steps.
feature_counts = {group: len(columns) for group, columns in features.items()}
feature_counts

# CV Split

In [None]:
# Hold out 20% of the rows for validation; the fixed seed keeps the shuffled
# split reproducible across runs.
X_train, X_val, y_train, y_val = train_test_split(
    X_raw, y, test_size=0.2, shuffle=True, random_state=42
)

# Preprocessor

In [None]:
# Continuous columns: fill gaps with the column mean, then standardise.
continuous_transformer = make_pipeline(
    SimpleImputer(strategy="mean"),
    StandardScaler(),
)

# Nominal columns: fill gaps with the most frequent value, then one-hot encode.
# Categories not seen during fit are ignored rather than raising.
nominal_transformer = make_pipeline(
    SimpleImputer(strategy="most_frequent"), OneHotEncoder(handle_unknown="ignore")
)

# Ordinal columns: fill gaps with the most frequent value, then map categories
# to integer codes; categories not seen during fit are encoded as -1.
ordinal_transformer = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OrdinalEncoder(unknown_value=-1, handle_unknown="use_encoded_value"),
)

# Route each feature group through its own transformer.
preprocessor = make_column_transformer(
    (continuous_transformer, features["continuous"]),
    (nominal_transformer, features["nominal"]),
    (ordinal_transformer, features["ordinal"]),
)

In [None]:
# Cross-validation results, keyed by model name; filled in as models are evaluated.
scores = dict()

# Base Model
Linear Regression is used as the base model because it is more prone to over-fitting on irrelevant features.

In [None]:
# Baseline: the shared preprocessor followed by a plain linear regression.
base_model = make_pipeline(
    preprocessor,
    linear_model.LinearRegression(),
)

In [None]:
# 5-fold CV on the training split. The scorer returns negated MAE, so the sign
# is flipped to store positive errors.
scores["base_model"] = -cross_val_score(
    estimator=base_model,
    X=X_train,
    y=y_train,
    scoring="neg_mean_absolute_error",
    cv=5,
)

In [None]:
# Show the CV scores collected so far (summary() is a project helper whose
# output format is not visible from this notebook).
summary(scores)

# Add Square Root Transforms and Select Constrained Features

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin


class SqrtFeatureExpander(BaseEstimator, TransformerMixin):
    """Append a signed square root of every column to the input matrix.

    For each input column ``x`` the output holds two adjacent columns: ``x``
    itself followed by ``sign(x) * sqrt(|x|)``. The signed form keeps the
    transform defined for negative values. The transformer learns nothing
    from the data, so ``fit`` is a no-op.
    """

    def fit(self, X, y=None):
        """Do nothing and return ``self``; present for pipeline compatibility."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` with a signed-sqrt companion after each column.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Numeric input. Anything ``np.asarray`` can convert is accepted.
        y : ignored

        Returns
        -------
        numpy.ndarray of shape (n_samples, 2 * n_features)
            Columns are interleaved as ``[x0, sqrt(x0), x1, sqrt(x1), ...]``.

        Raises
        ------
        ValueError
            If ``X`` is not two-dimensional.
        """
        # Convert up front: iterating ``X.T`` of a pandas DataFrame yields
        # labels rather than column values, which would silently produce
        # meaningless output.
        values = np.asarray(X)
        if values.ndim != 2:
            raise ValueError(
                f"Expected a 2D array, got an array with {values.ndim} dimension(s)."
            )

        signed_roots = np.sign(values) * np.sqrt(np.abs(values))

        # Interleave originals (even positions) and roots (odd positions).
        # Pre-allocating also handles inputs with zero columns.
        n_rows, n_cols = values.shape
        expanded = np.empty(
            (n_rows, 2 * n_cols), dtype=np.result_type(values, signed_roots)
        )
        expanded[:, 0::2] = values
        expanded[:, 1::2] = signed_roots
        return expanded

In [None]:
# Sanity-check SqrtFeatureExpander on the continuous columns: the row count
# must be unchanged and the column count must grow.
sqrt_expander = make_column_transformer(
    (
        make_pipeline(
            SimpleImputer(strategy="mean"),
            StandardScaler(),
            SqrtFeatureExpander(),
        ),
        features["continuous"],
    )
)

X_power = sqrt_expander.fit_transform(X_train)

shapes = (X_power.shape, X_train[features["continuous"]].shape)
print(shapes[0], shapes[1])
(rows_out, cols_out), (rows_in, cols_in) = shapes
assert rows_out == rows_in
assert cols_out > cols_in

In [None]:
# Continuous branch: impute, add signed-sqrt companions, standardise, then keep
# as many columns as there were original continuous features.
#
# f_regression is used as the score because it ranks features by the strength
# of their linear relationship with the target regardless of sign. The signed
# Pearson r from r_regression would give strongly *negative* predictors the
# lowest scores, and SelectKBest (which keeps the highest scores) would drop them.
continuous_transformer_2 = make_pipeline(
    SimpleImputer(strategy="mean"),
    SqrtFeatureExpander(),
    StandardScaler(),
    SelectKBest(
        f_regression,
        k=feature_counts["continuous"],
    ),
)
preprocessor_2 = make_column_transformer(
    (continuous_transformer_2, features["continuous"]),
    (nominal_transformer, features["nominal"]),
    (ordinal_transformer, features["ordinal"]),
)
model_2 = make_pipeline(preprocessor_2, linear_model.LinearRegression())

In [None]:
# 5-fold CV for model_2; the sign flip turns negated MAE into positive MAE.
scores["model_2"] = -cross_val_score(
    estimator=model_2,
    X=X_train,
    y=y_train,
    scoring="neg_mean_absolute_error",
    cv=5,
)
summary(scores)

# Filter & Select Nominal Features

In [None]:
# Nominal branch: impute, one-hot encode, then keep the best-scoring indicator
# columns. The budget is half the number of *original* nominal columns, which
# is far fewer than the number of one-hot columns.
#
# f_regression replaces chi2 as the score: chi2 is a classification statistic
# and would treat every distinct SalePrice as its own class. f_regression
# scores each indicator against the continuous target and accepts the sparse
# one-hot output.
nominal_transformer_2 = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore"),
    SelectKBest(score_func=f_regression, k=int(0.5 * feature_counts["nominal"])),
)
preprocessor_3 = make_column_transformer(
    (continuous_transformer_2, features["continuous"]),
    (nominal_transformer_2, features["nominal"]),
    (ordinal_transformer, features["ordinal"]),
)
model_3 = make_pipeline(preprocessor_3, linear_model.LinearRegression())

In [None]:
# 5-fold CV for model_3; the sign flip turns negated MAE into positive MAE.
scores["model_3"] = -cross_val_score(
    estimator=model_3,
    X=X_train,
    y=y_train,
    scoring="neg_mean_absolute_error",
    cv=5,
)
summary(scores)

# Interaction Search

In [None]:
# Continuous branch for the interaction models: impute, add signed-sqrt
# companions and filter. Scaling is not done here; StandardScaler is applied
# after the interaction expansion.
#
# f_regression is used instead of r_regression in both selectors below.
# SelectKBest keeps the highest scores, so the signed Pearson r would discard
# the features with the strongest negative correlation to the target.
continuous_transformer_3 = make_pipeline(
    SimpleImputer(strategy="mean"),
    SqrtFeatureExpander(),
    SelectKBest(
        f_regression,
        k=feature_counts["continuous"],
    ),
)
preprocessor_4 = make_column_transformer(
    (continuous_transformer_3, features["continuous"]),
    (nominal_transformer_2, features["nominal"]),
    (ordinal_transformer, features["ordinal"]),
)

# Expand to all pairwise interaction terms, standardise, then keep as many
# columns as there are grouped input features before the linear fit.
model_4 = make_pipeline(
    preprocessor_4,
    PolynomialFeatures(2, include_bias=False, interaction_only=True),
    StandardScaler(),
    SelectKBest(f_regression, k=int(sum(feature_counts.values()))),
    linear_model.LinearRegression(),
)

In [None]:
# 5-fold CV for model_4; the sign flip turns negated MAE into positive MAE.
scores["model_4"] = -cross_val_score(
    estimator=model_4,
    X=X_train,
    y=y_train,
    scoring="neg_mean_absolute_error",
    cv=5,
)
summary(scores)

# Intrinsic Search with ElasticNet

In [None]:
# Pairwise-interaction feature space built on preprocessor_4, with no explicit
# filter step: the cross-validated ElasticNet penalty does the selection.
model_5 = make_pipeline(
    preprocessor_4,
    PolynomialFeatures(degree=2, interaction_only=True, include_bias=False),
    StandardScaler(),
    linear_model.ElasticNetCV(cv=5, n_jobs=4),
)

In [None]:
# 5-fold CV for model_5; the sign flip turns negated MAE into positive MAE.
scores["model_5"] = -cross_val_score(
    estimator=model_5, X=X_train, y=y_train, scoring="neg_mean_absolute_error", cv=5
)
summary(scores)

# Recursive Feature Elimination

In [None]:
from sklearn.feature_selection import RFE


# Recursive feature elimination over the standardised pairwise-interaction
# features; a linear regression drives the elimination until as many features
# remain as there are grouped input columns.
model_6 = make_pipeline(
    preprocessor_4,
    PolynomialFeatures(degree=2, interaction_only=True, include_bias=False),
    StandardScaler(),
    RFE(
        n_features_to_select=int(sum(feature_counts.values())),
        estimator=linear_model.LinearRegression(),
    ),
)

In [None]:
# model_6.fit(X_train, y_train)
# report(model_6, X_train, y_train, X_val, y_val)

## Observations
* Super slow. Took ~30 minutes to run.
* Train MAE = 14322.15, Test MAE = 21565.44

# Stepwise Selection

## Forward Selection

In [None]:
from sklearn.feature_selection import SequentialFeatureSelector

In [None]:
# Forward stepwise selection: keep adding features while each addition improves
# the 2-fold CV score (negated MAE) by at least `tol`.
#
# n_features_to_select="auto" is set explicitly because the `tol` stopping rule
# is only honoured in that mode, and "auto" is not the default in every
# scikit-learn release that supports `tol`.
fwd_selector = SequentialFeatureSelector(
    linear_model.LinearRegression(),
    n_features_to_select="auto",
    tol=50,
    direction="forward",
    scoring="neg_mean_absolute_error",
    cv=2,
    n_jobs=3,
)

### Using Base Model Features

In [None]:
# Base-model features, filtered by forward stepwise selection before the final
# linear fit.
model_7 = make_pipeline(preprocessor, fwd_selector, linear_model.LinearRegression())

In [None]:
# 5-fold CV for model_7; the sign flip turns negated MAE into positive MAE.
scores["model_7"] = -cross_val_score(
    estimator=model_7, X=X_train, y=y_train, scoring="neg_mean_absolute_error", cv=5
)
summary(scores)

In [None]:
# Fit on the whole training split, then pass the fitted model plus the training
# and validation data to the project helper report().
# NOTE(review): the pipeline was built from the module-level `preprocessor` and
# `fwd_selector` objects, so this fit presumably leaves those shared objects
# fitted as well -- confirm before reusing them elsewhere.
model_7.fit(X_train, y_train)
report(model_7, X_train, y_train, X_val, y_val)

In [None]:
# Names of the features kept by forward selection, read from the fitted
# pipeline with its final regressor sliced off.
model_7[:-1].get_feature_names_out()

In [None]:
# Selector step of model_7: number of features it received vs. number it kept.
m = model_7.steps[1][1]
(m.n_features_in_, m.n_features_to_select_)

### Using Interaction Features

In [None]:
# Forward stepwise selection over the standardised pairwise-interaction features.
model_7i = make_pipeline(
    preprocessor_4,
    PolynomialFeatures(degree=2, interaction_only=True, include_bias=False),
    StandardScaler(),
    fwd_selector,
    linear_model.LinearRegression(),
)

In [None]:
# 5-fold CV for model_7i; the sign flip turns negated MAE into positive MAE.
scores["model_7i"] = -cross_val_score(
    estimator=model_7i, X=X_train, y=y_train, scoring="neg_mean_absolute_error", cv=5
)
summary(scores)

## Backward Selection

In [None]:
# Backward stepwise selection: start from all features and keep removing them
# while the 2-fold CV score (negated MAE) changes by no worse than `tol`; the
# negative tolerance accepts a small loss in exchange for a simpler model.
#
# n_features_to_select="auto" is set explicitly because the `tol` stopping rule
# is only honoured in that mode, and "auto" is not the default in every
# scikit-learn release that supports `tol`.
bwd_selector = SequentialFeatureSelector(
    linear_model.LinearRegression(),
    n_features_to_select="auto",
    tol=-50,
    direction="backward",
    scoring="neg_mean_absolute_error",
    cv=2,
    n_jobs=3,
)

### Using Base Model Features

In [None]:
# Base-model features, pruned by backward stepwise selection before the final
# linear fit.
model_8 = make_pipeline(preprocessor, bwd_selector, linear_model.LinearRegression())

In [None]:
# 5-fold CV for model_8; the sign flip turns negated MAE into positive MAE.
scores["model_8"] = -cross_val_score(
    estimator=model_8, X=X_train, y=y_train, scoring="neg_mean_absolute_error", cv=5
)
summary(scores)

### Using Interaction Features

In [None]:
# Backward stepwise selection over the standardised pairwise-interaction features.
model_8i = make_pipeline(
    preprocessor_4,
    PolynomialFeatures(degree=2, interaction_only=True, include_bias=False),
    StandardScaler(),
    bwd_selector,
    linear_model.LinearRegression(),
)

In [None]:
# 5-fold CV for model_8i; the sign flip turns negated MAE into positive MAE.
scores["model_8i"] = -cross_val_score(
    estimator=model_8i, X=X_train, y=y_train, scoring="neg_mean_absolute_error", cv=5
)
summary(scores)

# Evaluation

In [None]:
# model.fit(X_train, y_train)

In [None]:
# report(model, X_train, y_train, X_val, y_val)

# Observations
...

In [None]:
# fig = residual_plots(y_val, model.predict(X_val))

# Summary
...

# Submit

In [None]:
# model.fit(X_raw, y)
# print(f"Full data MAE = {evaluate(model, X_raw, y):.2f}")

In [None]:
# create_submission(model, data_test)

In [None]:
# Manual Kaggle submission, kept disabled by the constant-False guard.
# NOTE(review): `regressor` is not defined in any visible cell, so enabling this
# block as-is would presumably raise NameError when the f-string is evaluated.
if False:
    import kaggle

    # Arguments as written; presumably the file to upload, the submission
    # message and the competition identifier -- confirm against the kaggle API.
    result = kaggle.api.competition_submit(
        "submission.csv",
        f"{regressor} with log transform of target, all interactions",
        "home-data-for-ml-course",
    )
    print(result)