In [8]:
import pandas as pd
import numpy as np

def clean_settlyfe_data(df):
    """
    Cleans physical anomalies as per Settlyfe integrity principles.
    """
    # Filter impossible bedroom counts mentioned in documentation 
    #df = df[df['BedroomAbvGr'] < 10]
    
    # Filter extreme outliers in square footage to stabilize the baseline
    #df = df[df['GrLivArea'] < 4000]

    df = df.drop(columns=['Alley'])
    
    # Handle missing values: use 'None' for categorical and 0 for numerical
    # For Ames, many 'NA' values actually mean the feature doesn't exist (e.g., No Pool)
    df['PoolQC'] = df['PoolQC'].fillna('NA')
    df['GarageType'] = df['GarageType'].fillna('NA')
    
    return df

In [9]:
def engineer_temporal_features(df):
    """
    Implements cyclical encoding and long-term trend analysis[cite: 55, 231].
    """
    # 1. Seasonal Cycles (Sine/Cosine Transformation)
    df['month_sin'] = np.sin(2 * np.pi * df['MoSold'] / 12)
    df['month_cos'] = np.cos(2 * np.pi * df['MoSold'] / 12)
    
    # 2. Market Trend: Number of months since data start
    min_year = df['YrSold'].min()
    df['market_trend'] = (df['YrSold'] - min_year) * 12 + df['MoSold']
    
    # 3. Property Age: Captures appreciation/depreciation potential
    df['house_age'] = df['YrSold'] - df['YearBuilt']
    
    return df

In [10]:
def engineer_geospatial_features(df):
    """
    Prepares neighborhood data as a proxy for multi-scale spatial grids[cite: 46, 243].
    """
    # For Phase 1, we use One-Hot Encoding for Neighborhoods [cite: 243]
    # In Phase 2, this would be replaced by H3 Hexagon ID Embeddings [cite: 46]
    df = pd.get_dummies(df, columns=['Neighborhood'], prefix='geo_block')
    
    return df

In [11]:
def encode_physical_attributes(df):
    """
    Converts qualitative assessments into numeric values for regression[cite: 64, 110].
    """
    # Mapping Ordinal Qualities (ExterQual, KitchenQual, etc.)
    quality_map = {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'NA': 0}
    
    ordinal_cols = ['ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 
                    'HeatingQC', 'KitchenQual', 'FireplaceQu']
    
    for col in ordinal_cols:
        if col in df.columns:
            df[col] = df[col].map(quality_map).fillna(0)
            
    # Nominal Features (One-Hot Encoding)
    nominal_cols = ['Foundation', 'Heating', 'CentralAir', 'GarageType']
    df = pd.get_dummies(df, columns=nominal_cols, drop_first=True)
    
    return df

In [12]:
# Load the dataset
train_df = pd.read_csv('train.csv')

# Execute Settlyfe Data Engineering Principles
processed_df = (train_df.pipe(clean_settlyfe_data)
                        .pipe(engineer_temporal_features)
                        .pipe(engineer_geospatial_features)
                        .pipe(encode_physical_attributes))

# Target Variable: SalePrice
y = processed_df['SalePrice']
X = processed_df.drop(columns=['SalePrice', 'Id'])

print(f"Features prepared: {X.shape[1]}")
print(f"Market Trend Range: {X['market_trend'].min()} to {X['market_trend'].max()} months")

Features prepared: 119
Market Trend Range: 1 to 55 months


In [13]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import train_test_split

# --- 1. Model Training Function ---
def train_valuation_engine(X_train, y_train):
    """
    Trains the three models required for the LyfeEstimate point + range UI.
    """
    # Point Estimate Model (Median prediction)
    model_mid = RandomForestRegressor(n_estimators=100, random_state=42)
    model_mid.fit(X_train, y_train)
    
    # Lower Bound Model (alpha=0.1 means 10th percentile)
    model_low = GradientBoostingRegressor(loss='quantile', alpha=0.1, n_estimators=100)
    model_low.fit(X_train, y_train)
    
    # Upper Bound Model (alpha=0.9 means 90th percentile)
    model_high = GradientBoostingRegressor(loss='quantile', alpha=0.9, n_estimators=100)
    model_high.fit(X_train, y_train)
    
    return model_low, model_mid, model_high

# --- 2. Deployment / Prediction Logic ---
# Splitting your Ames 'train.csv' data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the engine
low_m, mid_m, high_m = train_valuation_engine(X_train, y_train)

# Generate the results for the UI
predictions = pd.DataFrame({
    'Actual_Price': y_test,
    'LyfeEstimate_Low': low_m.predict(X_test),
    'LyfeEstimate_Point': mid_m.predict(X_test),
    'LyfeEstimate_High': high_m.predict(X_test)
})

# Display a sample (prices in USD)
print(predictions.head())

ValueError: could not convert string to float: 'RL'