# Day 25: Feature Engineering

## Foundation Drilling

In [1]:
# Standard Week 3 data load (use in all Foundation Drilling)
from sklearn.datasets import fetch_california_housing
import pandas as pd

housing = fetch_california_housing(as_frame=True)
df = housing.frame
X = df.drop('MedHouseVal', axis=1)
y = df['MedHouseVal']

# Verify
print(df.shape)  # (20640, 9)
print(df.columns.tolist())

(20640, 9)
['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude', 'MedHouseVal']


In [2]:
# Task 1: Create a dictionary with 3 keys
# Use .get() to access an existing key
# Use .get() to access a non-existing key with a default
# Compare to what happens with regular dict['missing_key']

fam = {"Dad": 42, "Clebo": 16, "Henry": 11}

print(fam.get('Dad', 'Unknown'))
print(fam.get('Delta', 'Unknown'))


42
Unknown


In [3]:
# Task 2: Given a list of words, count occurrences using .get()
words = ['apple', 'banana', 'apple', 'cherry', 'banana', 'apple']
counts = {}
# Build counts dict using: counts[word] = counts.get(word, 0) + 1
# Print the final counts

for word in words:
    counts[word] = counts.get(word, 0) + 1

print(counts)

{'apple': 3, 'banana': 2, 'cherry': 1}


### Part 2: Week 2 Classification Metrics Reinforcement

**Focus:** Confusion matrix + all 4 metrics (reinforcing previous week - Day 1 rule)

In [4]:
# Task: From memory, write the complete classification metrics workflow
# 1. Import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
# 2. Assume you have y_test and predictions
# 3. Create confusion matrix, extract TN, FP, FN, TP
# 4. Calculate all 4 metrics
# 5. Print formatted results

# Verbal check:
# - Precision answers: "Of positive PREDICTIONS, how many were correct?"
# - Recall answers: "Of actual POSITIVES, how many did we catch?"
# - When would you prioritize recall over precision?

# from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
# from sklearn.model_selection import train_test_split

# cm = confusion_matrix(y_test, predictions)

# tn, fp, fn, tp = cm.ravel()

# accuracy = accuracy_score(y_test, predictions)
# precision = precision_score(y_test, predictions)
# recall = recall_score(y_test, predictions)
# f1 = f1_score(y_test, predictions)

# print(f"Accuracy Score: {accuracy:.3f}")
# print(f"Precision Score: {precision:.3f}")
# print(f"Recall Score: {recall:.3f}")
# print(f"F1 Score: {f1:.3f}")

# I would prioritize Recall over precision, when a False Negative is more costly than a False Positive.

### Part 3: Visualization Practice

**Focus:** Actual vs Predicted scatter plot (regression refresher for Week 3)

In [5]:
# Task: Create actual vs predicted scatter plot pattern
# This is Week 1 regression skill - refreshing before feature engineering

# Pattern:
# plt.scatter(y_test, predictions, alpha=0.5)
# plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--')
# plt.xlabel('Actual')
# plt.ylabel('Predicted')
# plt.title('Actual vs Predicted')

# If you have a model fitted, create the plot
# If not, just write the code pattern from memory

# plt.figure(figsize=(8,5))
# plt.scatter(y_test, predictions, alpha=0.5)
# plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--')
# plt.xlabel('Actual')
# plt.ylabel('Predicted')
# plt.title('Actual vs Predicted')

## BLOCK 0: Data Setup (Clean Dataset)

**California Housing - No cleaning needed:**

In [6]:
# Standard Week 3 data load
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load data
housing = fetch_california_housing(as_frame=True)
df = housing.frame

# Quick inspection
print(f"Shape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")
print(f"Target: MedHouseVal (median house value in $100k)")
df.head(2).T

Shape: (20640, 9)
Columns: ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude', 'MedHouseVal']
Target: MedHouseVal (median house value in $100k)


Unnamed: 0,0,1
MedInc,8.3252,8.3014
HouseAge,41.0,21.0
AveRooms,6.984127,6.238137
AveBedrms,1.02381,0.97188
Population,322.0,2401.0
AveOccup,2.555556,2.109842
Latitude,37.88,37.86
Longitude,-122.23,-122.22
MedHouseVal,4.526,3.585


## BLOCK 1: Baseline Model (No Feature Engineering)
**Goal: Build a model with RAW features only. This gives us the performance bar to beat with engineered features.**

**Task sequence - attempt each, then report results:**

**Task 1: Create baseline model**

In [7]:
# Separate features and target
X = df.drop('MedHouseVal', axis=1)
y = df['MedHouseVal']

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit baseline model
baseline_model = LinearRegression()
baseline_model.fit(X_train, y_train)
baseline_preds = baseline_model.predict(X_test)

# Calculate metrics
baseline_r2 = r2_score(y_test, baseline_preds)
baseline_mae = mean_absolute_error(y_test, baseline_preds)
baseline_rmse = np.sqrt(mean_squared_error(y_test, baseline_preds))

print("=== BASELINE MODEL (Raw Features) ===")
print(f"R² Score: {baseline_r2:.4f}")
print(f"MAE: ${baseline_mae * 100000:,.0f}")  # Convert to actual dollars
print(f"RMSE: ${baseline_rmse * 100000:,.0f}")


=== BASELINE MODEL (Raw Features) ===
R² Score: 0.5758
MAE: $53,320
RMSE: $74,558


## BLOCK 2: Feature Engineering - Creating New Features

**New Pattern — Ratio features:**

In [8]:
# Basic pattern
# df['new_feature'] = df['column1'] / df['column2']

# Real example
# df['rooms_per_household'] = df['AveRooms'] / df['AveOccup']
# Translation: "Normalize rooms by occupancy for fair comparison across different household sizes"

# Handling division issues
# df['safe_ratio'] = df['col1'] / df['col2'].replace(0, np.nan)
# Translation: "Replace zeros with NaN to avoid infinity"


In [10]:
# Feature 1: Bedrooms per room (what fraction of rooms are bedrooms?)
df['bedroom_ratio'] = df['AveBedrms'] / df['AveRooms']

# Feature 2: Rooms per person (how spacious per occupant?)
df['rooms_per_person'] = df['AveRooms'] / df['AveOccup']

# Feature 3: Your choice - create one more that makes domain sense
# Ideas: population density, income per room, location interactions
# Feature 3: Location interaction
df['lat_long_interaction'] = df['Latitude'] * df['Longitude']

# Verify no NaN or inf created
print(df[['bedroom_ratio', 'rooms_per_person', 'lat_long_interaction']].describe())
print(f"NaN counts:\\n{df[['bedroom_ratio', 'rooms_per_person', 'lat_long_interaction']].isna().sum()}")


       bedroom_ratio  rooms_per_person  lat_long_interaction
count   20640.000000      20640.000000          20640.000000
mean        0.213075          1.976970          -4264.448003
std         0.058023          1.146020            323.904337
min         0.100000          0.002547          -5207.673000
25%         0.175426          1.522382          -4601.026500
50%         0.203181          1.937936          -4060.278900
75%         0.239834          2.296090          -4004.349775
max         1.000000         55.222222          -3753.968400
NaN counts:\nbedroom_ratio           0
rooms_per_person        0
lat_long_interaction    0
dtype: int64


## BLOCK 3: Engineered Features Model

In [11]:
# Create feature set WITH engineered features
feature_cols = ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population',
                'AveOccup', 'Latitude', 'Longitude',
                'bedroom_ratio', 'rooms_per_person', 'lat_long_interaction']  # Add your feature name

X_engineered = df[feature_cols]
y = df['MedHouseVal']

# Fresh split (same random_state for fair comparison)
X_train_eng, X_test_eng, y_train, y_test = train_test_split(
    X_engineered, y, test_size=0.2, random_state=42
)

# Fit model with engineered features
eng_model = LinearRegression()
eng_model.fit(X_train_eng, y_train)
eng_preds = eng_model.predict(X_test_eng)

# Calculate metrics
eng_r2 = r2_score(y_test, eng_preds)
eng_mae = mean_absolute_error(y_test, eng_preds)
eng_rmse = np.sqrt(mean_squared_error(y_test, eng_preds))

print("=== ENGINEERED FEATURES MODEL ===")
print(f"R² Score: {eng_r2:.4f}")
print(f"MAE: ${eng_mae * 100000:,.0f}")
print(f"RMSE: ${eng_rmse * 100000:,.0f}")


=== ENGINEERED FEATURES MODEL ===
R² Score: 0.6525
MAE: $48,581
RMSE: $67,476


In [16]:
print("=== COMPARISON ===")
print(f"R² Change: {eng_r2 - baseline_r2:+.4f} ({'improved' if eng_r2 > baseline_r2 else 'worse'})")
print(f"MAE Change: ${(eng_mae - baseline_mae) * 100000:+,.0f} ({'improved' if eng_mae < baseline_mae else 'worse'})")
print(f"RMSE Change: ${(eng_rmse - baseline_rmse) * 100000:+,.0f} ({'improved' if eng_rmse < baseline_rmse else 'worse'})")


=== COMPARISON ===
R² Change: +0.0768 (improved)
MAE Change: $-4,739 (improved)
RMSE Change: $-7,082 (improved)


In [17]:
# Which features matter most?
feature_importance = pd.DataFrame({
    'feature': feature_cols,
    'coefficient': eng_model.coef_
}).sort_values('coefficient', key=abs, ascending=False)

print("=== FEATURE IMPORTANCE (by coefficient magnitude) ===")
print(feature_importance)


=== FEATURE IMPORTANCE (by coefficient magnitude) ===
                 feature  coefficient
8          bedroom_ratio     4.148810
7              Longitude    -0.690368
3              AveBedrms    -0.522658
6               Latitude     0.514924
9       rooms_per_person     0.465323
0                 MedInc     0.414377
2               AveRooms    -0.063191
1               HouseAge     0.010115
10  lat_long_interaction     0.007728
5               AveOccup    -0.001540
4             Population     0.000028
