# Day 26: Feature Selection + Advanced Engineering

## Foundation Drilling

In [1]:
# Standard Week 3 data load (use in all Foundation Drilling)
from sklearn.datasets import fetch_california_housing
import pandas as pd

# Fetch the dataset as pandas objects; `.frame` carries the features and the target together.
housing = fetch_california_housing(as_frame=True)
df = housing.frame

# Features = every column except the target; target = the MedHouseVal column on its own.
X = df.drop(columns=['MedHouseVal'])
y = df['MedHouseVal']

# Sanity check: dimensions first, then the column names
print(df.shape)  # (20640, 9)
print(list(df.columns))

(20640, 9)
['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude', 'MedHouseVal']


### Part 1: Python Fundamentals & Libraries

**Focus:** sorted() with key parameter (new pattern)

In [4]:
# New pattern: sorting with a custom key
# General form:
#   sorted(iterable, key=function, reverse=False)
# Read it as: "order the elements by whatever `function` returns for each one."
#
# Illustration:
#   words = ['banana', 'pie', 'Washington', 'a']
#   sorted(words, key=len)  ->  ['a', 'pie', 'banana', 'Washington']

# Task 1: Sort a list of numbers by their absolute value (key=abs)
numbers = [-5, 2, -1, 8, -3]

# Copy, then sort the copy in place: same result as sorted(numbers, key=abs),
# and `numbers` itself keeps its original order.
numbers_sorted = list(numbers)
numbers_sorted.sort(key=abs)
print(numbers_sorted)

[-1, 2, -3, -5, 8]


In [10]:
# Task 2: Sort a list of dictionaries by a specific key
# Pattern: sorted(iterable, key=lambda item: item['key_name'])
people = [
    {'name': 'Alice', 'age': 30},
    {'name': 'Bob', 'age': 25},
    {'name': 'Carol', 'age': 35},
]

# Youngest first: the key function pulls the 'age' entry out of each record.
people_sorted = sorted(people, key=lambda record: record['age'])
print(people_sorted)

[{'name': 'Bob', 'age': 25}, {'name': 'Alice', 'age': 30}, {'name': 'Carol', 'age': 35}]


In [12]:
# Task 3: Sort DataFrame columns by their mean value (advanced)
# Hint: df[sorted(df.columns, key=lambda col: df[col].mean())]
#       (indexing df with the sorted names returns the frame with columns in that order)

# Look up every column's mean once, then order the column names by that number (ascending).
col_means = {col: df[col].mean() for col in df.columns}
sorted_cols = sorted(df.columns, key=col_means.get)
print(sorted_cols)

['Longitude', 'AveBedrms', 'MedHouseVal', 'AveOccup', 'MedInc', 'AveRooms', 'HouseAge', 'Latitude', 'Population']


### Part 2: Feature Engineering Patterns (learned Day 25)

**Focus:** Creating ratio features and new columns

In [13]:
# Task: Using California Housing data, create these features from memory:

# 1. Bedroom ratio (bedrooms as fraction of total rooms)
df['bedroom_ratio'] = df['AveBedrms'] / df['AveRooms']
# Translation: "What fraction of the rooms are bedrooms?" (0.15 means 15%)

# 2. Rooms per person (rooms divided by household size)
df['rooms_per_person'] = df['AveRooms'] / df['AveOccup']
# Translation: "How many rooms per person in the household?"

# Checkpoint: Verify these match what you did on Day 25
print(df[['AveRooms', 'AveBedrms', 'AveOccup', 'bedroom_ratio', 'rooms_per_person']].head())

# Check for NaN/inf
ratio_features = df[['bedroom_ratio', 'rooms_per_person']]
print(f"\nNaN counts:\n{ratio_features.isna().sum()}")

# isna() does NOT flag +/-inf, and a zero denominator yields inf rather than NaN,
# so the infinite case needs its own explicit check.
inf_counts = ratio_features.isin([float('inf'), float('-inf')]).sum()
assert not inf_counts.any(), f"inf values found in ratio features:\n{inf_counts}"


   AveRooms  AveBedrms  AveOccup  bedroom_ratio  rooms_per_person
0  6.984127   1.023810  2.555556       0.146591          2.732919
1  6.238137   0.971880  2.109842       0.155797          2.956685
2  8.288136   1.073446  2.802260       0.129516          2.957661
3  5.817352   1.073059  2.547945       0.184458          2.283154
4  6.281853   1.081081  2.181467       0.172096          2.879646

NaN counts:
bedroom_ratio       0
rooms_per_person    0
dtype: int64


In [14]:
# Notes on Math Logic

# Pattern
# "thing_per_unit" = thing / unit

# Examples
# miles_per_hour = miles / hours          # 60 miles / 2 hours = 30 mph
# price_per_square_foot = price / sqft    # $300,000 / 1,500 sqft = $200/sqft
# rooms_per_person = rooms / people       # 6 rooms / 3 people = 2 rooms per person

# Feature 1: Bedroom Ratio

# ❌ INCORRECT VERSION (backwards)
# df['bedroom_ratio'] = df['AveRooms'] / df['AveBedrms']
# This gives: "rooms per bedroom"
# Example: 5 rooms / 2 bedrooms = 2.5 rooms per bedroom
# Interpretation: "For every bedroom, there are 2.5 total rooms" (confusing!)

# ✅ CORRECT VERSION
# df['bedroom_ratio'] = df['AveBedrms'] / df['AveRooms']
# This gives: "bedrooms as fraction of total rooms"
# Example: 2 bedrooms / 5 rooms = 0.4 = 40%
# Interpretation: "40% of the rooms are bedrooms" (clear!)

# Feature 2: Rooms Per Person

# ❌ INCORRECT VERSION (backwards)
# df['rooms_per_person'] = df['AveOccup'] / df['AveRooms']
# This gives: "people per room"
# Example: 3 people / 5 rooms = 0.6 people per room
# Interpretation: "Each room has 0.6 people" (spacious = LOW value... backwards!)

# ✅ CORRECT VERSION
# df['rooms_per_person'] = df['AveRooms'] / df['AveOccup']
# This gives: "rooms per person"
# Example: 5 rooms / 3 people = 1.67 rooms per person
# Interpretation: "Each person has 1.67 rooms" (spacious = HIGH value... correct!)