# In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.metrics import median_absolute_error

# 1. Data Loading
# Fetch the California housing dataset and put the feature matrix and the
# target (stored as 'MedHouseVal') into a single DataFrame.
data = fetch_california_housing()
df = pd.DataFrame(data.data, columns=data.feature_names).assign(
    MedHouseVal=data.target,
)

# 2. Feature Engineering
def haversine_distance(lat1, lon1, lat2, lon2):
    """Return the great-circle distance in kilometres between two points.

    All coordinates are given in decimal degrees. Each argument may be a
    scalar or an array-like (list, NumPy array, pandas Series); NumPy
    broadcasting applies, so a whole column of points can be measured
    against a single reference point.

    Args:
        lat1: Latitude(s) of the first point(s), in degrees.
        lon1: Longitude(s) of the first point(s), in degrees.
        lat2: Latitude(s) of the second point(s), in degrees.
        lon2: Longitude(s) of the second point(s), in degrees.

    Returns:
        The distance(s) in kilometres, using a spherical Earth of radius
        6371 km. The result has the broadcast shape of the inputs.
    """
    earth_radius_km = 6371

    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat, dlon = lat2 - lat1, lon2 - lon1

    # Haversine term: the squared half-chord length on the unit sphere.
    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2

    # Floating-point rounding can push `a` marginally above 1 for
    # (near-)antipodal points, which would make arcsin return NaN.
    a = np.minimum(a, 1.0)

    return 2 * np.arcsin(np.sqrt(a)) * earth_radius_km

# Ratios
# NOTE: per the dataset description, AveRooms and AveBedrms are already
# per-household averages and AveOccup is the average number of household
# members. The column names below are kept as-is because the feature list
# refers to them; the trailing comments state what each value actually holds.
df['rooms_per_household'] = df['AveRooms'] / df['AveOccup']  # rooms per person
df['bedrooms_ratio'] = df['AveBedrms'] / df['AveRooms']  # share of rooms that are bedrooms
df['population_per_household'] = df['Population'] / df['AveOccup']  # approximate number of households

# Geographic Clusters
# Assign each row to one of 10 KMeans clusters computed from latitude and
# longitude only.
# NOTE(review): the clustering is fit on every row, including rows that later
# land in the test split; fit on the training rows only if a strict hold-out
# evaluation is required.
kmeans = KMeans(n_clusters=10, random_state=42, n_init=10)
df['location_cluster'] = kmeans.fit_predict(df[['Latitude', 'Longitude']])

# Distance to Major Cities
# (latitude, longitude) in decimal degrees for San Francisco, Los Angeles and
# San Diego; distances are in kilometres.
SF, LA, SD = (37.7749, -122.4194), (34.0522, -118.2437), (32.7157, -117.1611)
df['distance_to_SF'] = haversine_distance(df['Latitude'], df['Longitude'], SF[0], SF[1])
df['distance_to_LA'] = haversine_distance(df['Latitude'], df['Longitude'], LA[0], LA[1])
df['distance_to_SD'] = haversine_distance(df['Latitude'], df['Longitude'], SD[0], SD[1])
df['distance_to_nearest_city'] = df[['distance_to_SF', 'distance_to_LA', 'distance_to_SD']].min(axis=1)

# Income Categories
# Bucket MedInc into three ordinal bands: 0 = up to 2.5, 1 = (2.5, 4.5],
# 2 = above 4.5. include_lowest=True keeps a value of exactly 0 in band 0;
# without it that value becomes NaN and the integer cast fails.
df['income_category_num'] = pd.cut(
    df['MedInc'],
    bins=[0, 2.5, 4.5, np.inf],
    labels=[0, 1, 2],
    include_lowest=True,
).astype(int)

# 3. Data Splitting
# Columns fed to the models: the eight raw dataset columns first, then the
# engineered ratio, cluster, distance and income-band features.
features = [
    "MedInc",
    "HouseAge",
    "AveRooms",
    "AveBedrms",
    "Population",
    "AveOccup",
    "Latitude",
    "Longitude",
    "rooms_per_household",
    "bedrooms_ratio",
    "population_per_household",
    "location_cluster",
    "distance_to_SF",
    "distance_to_LA",
    "distance_to_SD",
    "distance_to_nearest_city",
    "income_category_num",
]

# Feature matrix and regression target.
X, y = df[features], df['MedHouseVal']

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# 4. Lyfe Estimate Engine
def calculate_mdape(y_true, y_pred):
    """Return the median absolute percentage error (MdAPE), in percent.

    Both inputs are converted to float arrays and compared element by
    element in positional order, so lists, tuples, NumPy arrays and pandas
    Series are all accepted and pandas index labels play no part in the
    pairing.

    Args:
        y_true: Actual values (array-like).
        y_pred: Predicted values (array-like), broadcastable against
            ``y_true``.

    Returns:
        The median of ``|y_true - y_pred| / |y_true|`` multiplied by 100.
        Entries where ``y_true`` is zero give an infinite (or NaN) percentage
        error and are not filtered out.
    """
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)

    # Zero actuals yield inf/NaN entries; silence the NumPy warnings so the
    # function stays quiet, and leave those entries in the median.
    with np.errstate(divide='ignore', invalid='ignore'):
        pct_error = np.abs((actual - predicted) / actual)

    return np.median(pct_error) * 100

def train_lyfe_estimate(X_tr, y_tr):
    """Fit the point-estimate model and the two quantile-bound models.

    Args:
        X_tr: Training feature matrix.
        y_tr: Training target values.

    Returns:
        A tuple ``(m_low, m_point, m_high)`` of fitted ``XGBRegressor``
        models: the 0.1-quantile model, the squared-error point model and
        the 0.9-quantile model.
    """
    def _fit_quantile_model(alpha):
        # Quantile regressor for one bound of the range; everything except
        # the tree count and objective is left at the library defaults.
        bound_model = xgb.XGBRegressor(
            n_estimators=250,
            objective='reg:quantileerror',
            quantile_alpha=alpha,
        )
        bound_model.fit(X_tr, y_tr)
        return bound_model

    # Point estimate: squared-error objective with explicit depth and learning rate.
    point_model = xgb.XGBRegressor(
        n_estimators=250,
        max_depth=9,
        learning_rate=0.07,
        objective='reg:squarederror',
    )
    point_model.fit(X_tr, y_tr)

    # Range bounds: lower (10th percentile) first, then upper (90th percentile).
    lower_model = _fit_quantile_model(0.1)
    upper_model = _fit_quantile_model(0.9)

    return lower_model, point_model, upper_model

# 5. Deployment
# Train the three models on the training split.
m_low, m_point, m_high = train_lyfe_estimate(X_train, y_train)

# Final Results
# One row per test sample: the actual value next to the lower bound, point
# estimate and upper bound predicted for it.
lyfe_results = y_test.to_frame('Actual').assign(
    Lyfe_Low=m_low.predict(X_test),
    Lyfe_Estimate=m_point.predict(X_test),
    Lyfe_High=m_high.predict(X_test),
)

# Report the median absolute percentage error of the point estimate, then the full table.
print(f"Settlyfe MdAPE: {calculate_mdape(y_test, lyfe_results['Lyfe_Estimate']):.2f}%")
print(lyfe_results)