# Feature Engineering

This notebook demonstrates key feature engineering techniques:
- Numerical transformations (scaling, binning, polynomial features)
- Categorical encoding (target, frequency, hash encoding)
- Date/time features (cyclical encoding, lag features)
- Text features (TF-IDF, basic statistics)
- Feature selection (filter, wrapper, embedded methods)
- End-to-end pipeline with feature engineering

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import (
    StandardScaler, MinMaxScaler, RobustScaler,
    PowerTransformer, QuantileTransformer,
    OneHotEncoder, OrdinalEncoder, PolynomialFeatures,
    FunctionTransformer, KBinsDiscretizer
)
from sklearn.feature_selection import (
    mutual_info_regression, f_regression, SelectKBest, RFECV
)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import LassoCV
from sklearn.metrics import mean_squared_error, r2_score

# Seed NumPy's legacy global random state so that the np.random.* draws in the
# cells below repeat on every "Restart & Run All". The scikit-learn estimators
# in this notebook do not rely on this; they receive random_state explicitly.
np.random.seed(42)
print('Libraries loaded successfully')

## 1. Numerical Transformations

In [None]:
# Compare different scalers on skewed data
# A single right-skewed column (exponential draw), shaped (n_samples, 1) as
# the scikit-learn transformers expect.
data = np.random.exponential(scale=2, size=1000).reshape(-1, 1)

# Panel title -> transformer; None means "plot the raw values untouched".
scalers = {
    'Original': None,
    'StandardScaler': StandardScaler(),
    'MinMaxScaler': MinMaxScaler(),
    'RobustScaler': RobustScaler(),
    'PowerTransformer': PowerTransformer(method='yeo-johnson'),
    'QuantileTransformer': QuantileTransformer(output_distribution='normal'),
}

# One histogram per entry, laid out on a 2x3 grid in dictionary order.
fig, axes = plt.subplots(2, 3, figsize=(15, 8))
for panel, label in zip(axes.flat, scalers):
    transformer = scalers[label]
    if transformer is None:
        values = data
    else:
        values = transformer.fit_transform(data)
    panel.hist(values, bins=50, edgecolor='black', alpha=0.7)
    panel.set_title(label)
plt.tight_layout()
plt.show()

In [None]:
# Interaction and polynomial features
house_data = pd.DataFrame({
    'length': [10, 15, 20, 12, 18],
    'width': [8, 10, 12, 9, 11],
    'floors': [1, 2, 1, 2, 3],
})

# Hand-written interactions: area = length * width, volume = area * floors.
house_data = house_data.assign(
    area=lambda d: d['length'] * d['width'],
    volume=lambda d: d['area'] * d['floors'],
)

# Degree-2 expansion of the two base columns (squares and the cross term,
# without the constant bias column).
base_cols = ['length', 'width']
poly = PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)
X_poly = poly.fit_transform(house_data[base_cols])
poly_names = poly.get_feature_names_out(base_cols)
print('Polynomial feature names:', poly_names)
print(pd.DataFrame(X_poly, columns=poly_names))

## 2. Categorical Encoding

In [None]:
# Target encoding with smoothing
n = 1000
city_names = ['NYC', 'LA', 'Chicago', 'Houston', 'Phoenix']
df = pd.DataFrame({
    'city': np.random.choice(city_names, n),
    'target': np.random.binomial(1, 0.3, n),
})

# Re-draw the target for two cities with their own positive rates so the
# categories carry different signal (NYC first, then LA).
for city, rate in {'NYC': 0.7, 'LA': 0.2}.items():
    mask = df['city'] == city
    df.loc[mask, 'target'] = np.random.binomial(1, rate, mask.sum())

# Smoothed target encoding: a count-weighted blend of each city's mean with
# the global mean; `smoothing` acts as the weight of the global prior.
global_mean = df['target'].mean()
smoothing = 10

by_city = df.groupby('city')['target']
stats = pd.DataFrame({'mean': by_city.mean(), 'count': by_city.count()})
weighted_sum = stats['count'] * stats['mean'] + smoothing * global_mean
stats['encoded'] = weighted_sum / (stats['count'] + smoothing)
print('Target encoding with smoothing:')
print(stats)

## 3. Temporal Feature Engineering

In [None]:
# Cyclical encoding for hours
hours = pd.DataFrame({'hour': range(24)})
angle = 2 * np.pi * hours['hour'] / 24  # hour mapped onto one full turn
hours['hour_sin'] = np.sin(angle)
hours['hour_cos'] = np.cos(angle)

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax_linear, ax_circle = axes

# Left panel: the raw hour value plotted against its row index.
ax_linear.plot(hours['hour'], marker='o')
ax_linear.set_title('Linear: hour 23 and 0 are far apart')
ax_linear.set_xlabel('Index')
ax_linear.set_ylabel('Hour')

# Right panel: the (sin, cos) pair places every hour on the unit circle;
# every third hour is labelled.
scatter = ax_circle.scatter(hours['hour_sin'], hours['hour_cos'], c=hours['hour'], cmap='twilight')
for i in range(0, 24, 3):
    point = (hours.loc[i, 'hour_sin'], hours.loc[i, 'hour_cos'])
    ax_circle.annotate(f'{i}h', point)
ax_circle.set_title('Cyclical: hour 23 and 0 are adjacent')
ax_circle.set_xlabel('sin(hour)')
ax_circle.set_ylabel('cos(hour)')
ax_circle.set_aspect('equal')
plt.colorbar(scatter, ax=ax_circle, label='Hour')
plt.tight_layout()
plt.show()

In [None]:
# Lag and rolling features
# Synthetic daily series: constant level + one sine cycle over 365 days + noise.
dates = pd.date_range('2023-01-01', '2023-12-31', freq='D')
day_index = np.arange(len(dates))
seasonal = 20 * np.sin(day_index * 2 * np.pi / 365)
noise = np.random.normal(0, 10, len(dates))
sales = 100 + seasonal + noise
ts = pd.DataFrame({'date': dates, 'sales': sales})

# Lag features: the value observed `lag` rows earlier
for lag in [1, 7, 14]:
    ts[f'lag_{lag}'] = ts['sales'].shift(lag)

# Rolling features (shift first to prevent leakage): every window is built
# from the series shifted by one row, so it ends at the previous row and
# never includes the current row's own value.
previous_sales = ts['sales'].shift(1)
for window in [7, 14, 30]:
    trailing = previous_sales.rolling(window)
    ts[f'rolling_mean_{window}'] = trailing.mean()
    ts[f'rolling_std_{window}'] = trailing.std()

print('Lag and rolling features (last 5 rows):')
preview_cols = ['date', 'sales', 'lag_1', 'lag_7', 'rolling_mean_7', 'rolling_std_7']
ts[preview_cols].tail()

## 4. Feature Selection

In [None]:
# Load California Housing dataset
housing = fetch_california_housing(as_frame=True)
X, y = housing.data, housing.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Mutual Information scores (filter method: scored per feature, no model)
mi_scores = mutual_info_regression(X_train, y_train, random_state=42)
mi_df = pd.Series(mi_scores, index=X_train.columns).sort_values(ascending=True)

# Random Forest feature importance (embedded method, tree-based)
rf = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
rf.fit(X_train, y_train)
rf_imp = pd.Series(rf.feature_importances_, index=X_train.columns).sort_values(ascending=True)

# Lasso importance (embedded method, linear).
# A Lasso coefficient is expressed per unit of its feature, and the L1 penalty
# acts on the coefficients directly. Their magnitudes are therefore only
# comparable across features when all features share a common scale, so the
# inputs are standardized before fitting. The scaler lives in a pipeline so
# that predictions on raw features go through the same transformation.
lasso_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('lasso', LassoCV(cv=5, random_state=42)),
])
lasso_pipeline.fit(X_train, y_train)
# `lasso` stays the fitted LassoCV estimator (coef_, alpha_, ...). It was
# trained on standardized inputs, so use `lasso_pipeline` for predictions.
lasso = lasso_pipeline.named_steps['lasso']
lasso_imp = pd.Series(np.abs(lasso.coef_), index=X_train.columns).sort_values(ascending=True)

# Side-by-side horizontal bar charts, one per scoring method.
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
mi_df.plot(kind='barh', ax=axes[0], color='steelblue')
axes[0].set_title('Mutual Information')
rf_imp.plot(kind='barh', ax=axes[1], color='forestgreen')
axes[1].set_title('Random Forest Importance')
lasso_imp.plot(kind='barh', ax=axes[2], color='coral')
axes[2].set_title('Lasso |Coefficient| (standardized features)')
plt.tight_layout()
plt.show()

## 5. End-to-End Pipeline

In [None]:
# Custom feature creation function
def create_housing_features(X):
    """Return a new frame: ``X`` plus four ratio features.

    Parameters
    ----------
    X : pandas.DataFrame
        Must contain the columns ``MedInc``, ``AveRooms``, ``AveBedrms``,
        ``Population`` and ``AveOccup``.

    Returns
    -------
    pandas.DataFrame
        The input columns followed by:

        - ``rooms_per_household``: ``AveRooms / AveOccup``
        - ``bedrooms_ratio``: ``AveBedrms / AveRooms``
        - ``population_density``: ``Population / AveOccup``
        - ``income_per_room``: ``MedInc / AveRooms``

        The input frame itself is not modified.
    """
    # Denominators are floored at 0.1 so zero or near-zero values cannot
    # produce infinite or exploding ratios.
    occupancy = X['AveOccup'].clip(lower=0.1)
    rooms = X['AveRooms'].clip(lower=0.1)
    # NOTE(review): the feature names describe intent only loosely; e.g.
    # 'population_density' is Population / AveOccup, not a per-area figure.
    # Confirm the naming before relying on it for interpretation.
    return X.assign(
        rooms_per_household=X['AveRooms'] / occupancy,
        bedrooms_ratio=X['AveBedrms'] / rooms,
        population_density=X['Population'] / occupancy,
        income_per_room=X['MedInc'] / rooms,
    )

# Wrap the feature builder so it can run as the first pipeline step.
feature_creator = FunctionTransformer(create_housing_features, validate=False)

# Column groups routed to each transformer; any column not listed is dropped.
power_cols = ['MedInc', 'Population', 'AveOccup']
standard_cols = [
    'HouseAge', 'AveRooms', 'AveBedrms', 'Latitude', 'Longitude',
    'rooms_per_household', 'bedrooms_ratio', 'population_density', 'income_per_room',
]
preprocessor = ColumnTransformer(
    transformers=[
        ('power', PowerTransformer(method='yeo-johnson'), power_cols),
        ('standard', StandardScaler(), standard_cols),
    ],
    remainder='drop',
)

# Both pipelines use the same model settings so that the comparison differs
# only in the feature processing.
gbr_params = dict(n_estimators=200, max_depth=5, learning_rate=0.1, random_state=42)
cv_settings = dict(cv=5, scoring='r2')

pipeline = Pipeline(steps=[
    ('features', feature_creator),
    ('preprocessor', preprocessor),
    ('model', GradientBoostingRegressor(**gbr_params)),
])

# Cross-validation
cv_scores = cross_val_score(pipeline, X_train, y_train, **cv_settings)
print(f'Engineered CV R²: {cv_scores.mean():.4f} (+/- {cv_scores.std():.4f})')

# Baseline (no feature engineering)
baseline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('model', GradientBoostingRegressor(**gbr_params)),
])
baseline_scores = cross_val_score(baseline, X_train, y_train, **cv_settings)
print(f'Baseline CV R²:   {baseline_scores.mean():.4f} (+/- {baseline_scores.std():.4f})')
print(f'Improvement:       {cv_scores.mean() - baseline_scores.mean():.4f}')