In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb

import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_tweedie_deviance

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn import linear_model as lm
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression, RFE
from sklearn.inspection import permutation_importance
from sklearn.preprocessing import LabelEncoder

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score, mean_absolute_error
import lightgbm as lgb

import ssl
import urllib.request

# Create unverified SSL context to handle certificate issues
ssl._create_default_https_context = ssl._create_unverified_context

The cell below outputs the data files

In [2]:
from data_load import Data
load = Data()
load.HOMES_FOR_SALE()
load.INCOME()
load.HOMICIDES()
load.POPULATION()
load.ZIP_COUNTY()

  from .autonotebook import tqdm as notebook_tqdm


KeyboardInterrupt: 

In [None]:
from Final_Data_Output import Final_Data as FD
FD = FD()
Master_df = FD.Merge_all(min_price=1000, max_bed=12, max_bath=10)
Master_df.shape

  df['Count'] = df['Count'].fillna(0)


(1083406, 55)

The below function is new and for prepping all features in the dataframe for feature selection

In [None]:
# Add this cell after loading Master_df but before train_test_split
def prepare_features_for_selection(df):
    """Prepare all features including engineered ones for selection"""
    
    # Handle categorical variables with label encoding
    le = LabelEncoder()
    categorical_cols = ['affordability_score', 'value_tier', 'family_suitability', 
                       'area_prosperity', 'safety_score', 'lifestyle_type', 
                       'expansion_potential', 'buyer_profile', 'state_market']
    
    df_encoded = df.copy()
    for col in categorical_cols:
        if col in df_encoded.columns:
            df_encoded[col + '_encoded'] = le.fit_transform(df_encoded[col].astype(str))
    
    # Define all potential features
    all_features = [
        # Original features
        'bed', 'bath', 'house_size', 'zip_code', 'acre_lot', 'Household_AGI', 'Total_Pop',
        
        # Engineered numeric features
        'price_to_income_ratio', 'price_per_sqft', 'total_rooms', 'bed_bath_ratio',
        'space_per_room', 'economic_health', 'population_density', 'lot_to_house_ratio',
        'affordability_numeric', 'value_numeric', 'safety_numeric', 'recommendation_score',
        'log_income', 'log_house_size',
        
        # Encoded categorical features
        'affordability_score_encoded', 'value_tier_encoded', 'family_suitability_encoded',
        'area_prosperity_encoded', 'safety_score_encoded', 'lifestyle_type_encoded',
        'expansion_potential_encoded', 'buyer_profile_encoded', 'state_market_encoded'
    ]
    
    # Filter to only include features that exist in the dataframe
    available_features = [f for f in all_features if f in df_encoded.columns]
    
    return df_encoded, available_features

# Prepare the data
Master_df_encoded, all_feature_columns = prepare_features_for_selection(Master_df)
Master_df_encoded.dropna(axis=0, inplace=True)

print(f"Total available features: {len(all_feature_columns)}")
print("Available features:", all_feature_columns)

Total available features: 30
Available features: ['bed', 'bath', 'house_size', 'zip_code', 'acre_lot', 'Household_AGI', 'Total_Pop', 'price_to_income_ratio', 'price_per_sqft', 'total_rooms', 'bed_bath_ratio', 'space_per_room', 'economic_health', 'population_density', 'lot_to_house_ratio', 'affordability_numeric', 'value_numeric', 'safety_numeric', 'recommendation_score', 'log_income', 'log_house_size', 'affordability_score_encoded', 'value_tier_encoded', 'family_suitability_encoded', 'area_prosperity_encoded', 'safety_score_encoded', 'lifestyle_type_encoded', 'expansion_potential_encoded', 'buyer_profile_encoded', 'state_market_encoded']


The below function is new and for analyzing the missing data each of the csvs

In [None]:
# REPLACE Cell #6 - Fix the missing data analysis function
def analyze_missing_data(df_name, df):
    """Comprehensive missing data analysis"""
    print(f"\n=== MISSING DATA ANALYSIS: {df_name} ===")
    print(f"Dataset shape: {df.shape}")
    
    # Count missing values
    missing_counts = df.isnull().sum()
    missing_percent = (missing_counts / len(df)) * 100
    
    # Create summary
    missing_summary = pd.DataFrame({  # FIX: Use consistent variable name
        'Column': missing_counts.index,
        'Missing_Count': missing_counts.values,
        'Missing_Percent': missing_percent.values
    })
    
    # Filter only columns with missing data - FIXED
    missing_summary = missing_summary[missing_summary['Missing_Count'] > 0].sort_values('Missing_Percent', ascending=False)
    
    if len(missing_summary) > 0:
        print("Columns with missing data:")
        print(missing_summary.to_string(index=False))
        
        # Show data types for missing columns
        print("\nData types of columns with missing data:")
        for col in missing_summary['Column']:
            print(f"{col}: {df[col].dtype}")
            
        return missing_summary
    else:
        print("✅ No missing values found!")
        return pd.DataFrame()


In [None]:
# Houses data
houses_df = pd.read_csv('Houses_Sold.csv')
houses_missing = analyze_missing_data("Houses_Sold", houses_df)



=== MISSING DATA ANALYSIS: Houses_Sold ===
Dataset shape: (2226382, 13)
Columns with missing data:
        Column  Missing_Count  Missing_Percent
prev_sold_date         734297        32.981627
    house_size         568484        25.533983
          bath         511771        22.986666
           bed         481317        21.618797
      acre_lot         325589        14.624130
        street          10866         0.488056
   brokered_by           4533         0.203604
         price           1541         0.069215
          city           1407         0.063197
      zip_code            299         0.013430
         state              8         0.000359

Data types of columns with missing data:
prev_sold_date: object
house_size: float64
bath: float64
bed: float64
acre_lot: float64
street: float64
brokered_by: float64
price: float64
city: object
zip_code: float64
state: object


In [None]:
# Income data
income_df = pd.read_csv('Income_ZipCode.csv')
income_missing = analyze_missing_data("Income_ZipCode", income_df)


=== MISSING DATA ANALYSIS: Income_ZipCode ===
Dataset shape: (166131, 166)
✅ No missing values found!


In [None]:
# Population data
pop_df = pd.read_csv('zip_pop.csv')
pop_missing = analyze_missing_data("Zip_Pop", pop_df)



=== MISSING DATA ANALYSIS: Zip_Pop ===
Dataset shape: (33772, 3)
✅ No missing values found!


In [None]:
# Homicides data
homicides_df = pd.read_csv('HHS_homicides.csv')
homicides_missing = analyze_missing_data("HHS_homicides", homicides_df)



=== MISSING DATA ANALYSIS: HHS_homicides ===
Dataset shape: (132000, 13)
Columns with missing data:
        Column  Missing_Count  Missing_Percent
TTM_Date_Range         113142        85.713636
     Rate_M_CI          61871        46.871970

Data types of columns with missing data:
TTM_Date_Range: object
Rate_M_CI: object


In [None]:
# County mapping data
county_df = pd.read_csv('mapping_County.csv')
county_missing = analyze_missing_data("mapping_County", county_df)


=== MISSING DATA ANALYSIS: mapping_County ===
Dataset shape: (54559, 9)
✅ No missing values found!


In [None]:
# NEW Cell #6 - Add comprehensive feature selection
def comprehensive_feature_selection(X, y, feature_names, target_features=15):
    """Apply multiple feature selection methods and compare results"""
    
    feature_importance_methods = {}
    
    # 1. Correlation with target
    correlations = {}
    for i, feature in enumerate(feature_names):
        correlations[feature] = abs(np.corrcoef(X.iloc[:, i], y)[0, 1])
    
    # Sort by correlation
    correlation_ranking = sorted(correlations.items(), key=lambda x: x[1], reverse=True)
    feature_importance_methods['Correlation'] = [f[0] for f in correlation_ranking[:target_features]]
    
    # 2. Univariate Statistical Tests
    selector_f = SelectKBest(score_func=f_regression, k=target_features)
    X_f_selected = selector_f.fit_transform(X, y)
    f_scores = selector_f.scores_
    f_ranking = [(feature_names[i], f_scores[i]) for i in selector_f.get_support(indices=True)]
    f_ranking.sort(key=lambda x: x[1], reverse=True)
    feature_importance_methods['F_Regression'] = [f[0] for f in f_ranking]
    
    # 3. Mutual Information
    selector_mi = SelectKBest(score_func=mutual_info_regression, k=target_features)
    X_mi_selected = selector_mi.fit_transform(X, y)
    mi_scores = selector_mi.scores_
    mi_ranking = [(feature_names[i], mi_scores[i]) for i in selector_mi.get_support(indices=True)]
    mi_ranking.sort(key=lambda x: x[1], reverse=True)
    feature_importance_methods['Mutual_Info'] = [f[0] for f in mi_ranking]
    
    # 4. Tree-based Feature Importance (Random Forest)
    rf_selector = RandomForestRegressor(n_estimators=100, random_state=42)
    rf_selector.fit(X, y)
    rf_importance = [(feature_names[i], importance) for i, importance in enumerate(rf_selector.feature_importances_)]
    rf_importance.sort(key=lambda x: x[1], reverse=True)
    feature_importance_methods['Random_Forest'] = [f[0] for f in rf_importance[:target_features]]
    
    # 5. Recursive Feature Elimination with XGBoost
    xgb_selector = xgb.XGBRegressor(n_estimators=100, random_state=42)
    rfe_selector = RFE(xgb_selector, n_features_to_select=target_features)
    rfe_selector.fit(X, y)
    rfe_features = [feature_names[i] for i in range(len(feature_names)) if rfe_selector.support_[i]]
    feature_importance_methods['RFE_XGBoost'] = rfe_features
    
    return feature_importance_methods

# Apply feature selection
X_all = Master_df_encoded[all_feature_columns]
y = Master_df_encoded['price']

feature_methods = comprehensive_feature_selection(X_all, y, all_feature_columns)

# Display results
for method, features in feature_methods.items():
    print(f"\n{method} Top Features:")
    for i, feature in enumerate(features[:10], 1):
        print(f"{i:2d}. {feature}")

  c /= stddev[:, None]
  c /= stddev[None, :]



Correlation Top Features:
 1. price_to_income_ratio
 2. price_per_sqft
 3. Household_AGI
 4. bath
 5. affordability_numeric
 6. log_income
 7. value_numeric
 8. total_rooms
 9. log_house_size
10. recommendation_score

F_Regression Top Features:
 1. price_to_income_ratio
 2. price_per_sqft
 3. Household_AGI
 4. bath
 5. log_income
 6. value_numeric
 7. total_rooms
 8. log_house_size
 9. recommendation_score
10. buyer_profile_encoded

Mutual_Info Top Features:
 1. price_per_sqft
 2. price_to_income_ratio
 3. zip_code
 4. value_numeric
 5. log_income
 6. Household_AGI
 7. buyer_profile_encoded
 8. value_tier_encoded
 9. Total_Pop
10. recommendation_score

Random_Forest Top Features:
 1. price_per_sqft
 2. house_size
 3. log_house_size
 4. price_to_income_ratio
 5. buyer_profile_encoded
 6. space_per_room
 7. population_density
 8. lot_to_house_ratio
 9. bed_bath_ratio
10. zip_code

RFE_XGBoost Top Features:
 1. bed
 2. bath
 3. house_size
 4. acre_lot
 5. Household_AGI
 6. Total_Pop
 7. 

In [None]:
# NEW Cell #7 - Create feature consensus and final feature set
def create_feature_consensus(feature_methods, min_votes=2):
    """Create consensus features based on how many methods selected them"""
    
    feature_votes = {}
    
    # Count votes for each feature
    for method, features in feature_methods.items():
        for feature in features:
            feature_votes[feature] = feature_votes.get(feature, 0) + 1
    
    # Filter features by minimum votes
    consensus_features = {feature: votes for feature, votes in feature_votes.items() 
                         if votes >= min_votes}
    
    # Sort by votes
    consensus_ranking = sorted(consensus_features.items(), key=lambda x: x[1], reverse=True)
    
    return consensus_ranking, feature_votes

def propose_final_feature_set(consensus_features, target_size=12):
    """Propose final feature set based on consensus and domain knowledge"""
    
    # Core features (always include these for house price prediction)
    core_features = ['house_size', 'bed', 'bath', 'price_per_sqft', 'Household_AGI']
    
    # High consensus features (4+ votes)
    high_consensus = [f[0] for f in consensus_features if f[1] >= 4]
    
    # Medium consensus features (3 votes)
    medium_consensus = [f[0] for f in consensus_features if f[1] == 3]
    
    # Start with core features that exist
    final_features = [f for f in core_features if f in all_feature_columns]
    
    # Add high consensus features (if not already included)
    for feature in high_consensus:
        if feature not in final_features:
            final_features.append(feature)
    
    # Add medium consensus features until we reach target size
    for feature in medium_consensus:
        if len(final_features) >= target_size:
            break
        if feature not in final_features:
            final_features.append(feature)
    
    return final_features[:target_size]

# Create consensus and final feature set
consensus_features, all_votes = create_feature_consensus(feature_methods, min_votes=2)
final_feature_set = propose_final_feature_set(consensus_features, target_size=12)

print("PROPOSED FINAL FEATURE SET:")
print("=" * 40)
for i, feature in enumerate(final_feature_set, 1):
    votes = all_votes.get(feature, 0)
    print(f"{i:2d}. {feature:<25} (Selected by {votes}/5 methods)")

PROPOSED FINAL FEATURE SET:
 1. house_size                (Selected by 5/5 methods)
 2. bed                       (Selected by 3/5 methods)
 3. bath                      (Selected by 4/5 methods)
 4. price_per_sqft            (Selected by 5/5 methods)
 5. Household_AGI             (Selected by 5/5 methods)
 6. price_to_income_ratio     (Selected by 5/5 methods)
 7. buyer_profile_encoded     (Selected by 5/5 methods)
 8. log_income                (Selected by 4/5 methods)
 9. value_numeric             (Selected by 4/5 methods)
10. log_house_size            (Selected by 4/5 methods)
11. total_rooms               (Selected by 3/5 methods)
12. recommendation_score      (Selected by 3/5 methods)


In [None]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_tweedie_deviance

In [None]:
# Master_df.dropna(axis=0, inplace=True)
# X = Master_df[['bed', 'bath', 'house_size', 'zip_code', 'acre_lot', 'Household_AGI', 'Total_Pop']]
# y = Master_df['price']
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# REPLACE Cell #7 - New train/test split with selected features
# Use the selected features instead of the original limited set

X = Master_df_encoded[final_feature_set]  # Changed from limited feature set
y = Master_df_encoded['price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Training with {len(final_feature_set)} selected features:")
print(final_feature_set)

In [None]:
XGReg = xgb_reg = xgb.XGBRegressor(
    objective='reg:tweedie',
    tweedie_variance_power=1.75, # Choose a value between 1 and 2 for overdispersed data
    n_estimators=1000,
    learning_rate=0.1,
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)
XGReg.fit(X_train, y_train)
y_pred = XGReg.predict(X_test)

In [None]:
# Evaluate using the Mean Tweedie Deviance
tweedie_deviance = mean_tweedie_deviance(y_test, y_pred, power=1.5)
print(f"Mean Tweedie Deviance: {tweedie_deviance:.4f}")

null_tweedie_deviance = mean_tweedie_deviance(y_test, [y_train.mean()]*len(y_test), power=1.5)
print(f"Null Model Mean Tweedie Deviance: {null_tweedie_deviance:.4f}")
print(f"Percent Deviance Explained: {(1-tweedie_deviance/null_tweedie_deviance)*100:.4f}")

Mean Tweedie Deviance: 63.5866
Null Model Mean Tweedie Deviance: 560.0368
Percent Deviance Explained: 88.6460


In [None]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score, mean_absolute_error
import lightgbm as lgb

In [None]:
# 1. Linear Regression
lr = LinearRegression()
lr.fit(X_train, y_train)
lr_pred = lr.predict(X_test)

# 2. Ridge Regression (handles multicollinearity)
ridge = Ridge(alpha=1.0)
ridge.fit(X_train, y_train)
ridge_pred = ridge.predict(X_test)

# 3. Lasso Regression (feature selection)
lasso = Lasso(alpha=1.0)
lasso.fit(X_train, y_train)
lasso_pred = lasso.predict(X_test)

# 4. Elastic Net (combines Ridge + Lasso)
elastic = ElasticNet(alpha=1.0, l1_ratio=0.5)
elastic.fit(X_train, y_train)
elastic_pred = elastic.predict(X_test)

# 5. Random Forest
rf = RandomForestRegressor(n_estimators=100, random_state=42, max_depth=10)
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)

# 6. Gradient Boosting
gb = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
gb.fit(X_train, y_train)
gb_pred = gb.predict(X_test)

# 7. LightGBM (fast gradient boosting)
lgb_reg = lgb.LGBMRegressor(n_estimators=100, random_state=42)
lgb_reg.fit(X_train, y_train)
lgb_pred = lgb_reg.predict(X_test)

# 8. Support Vector Regression
svr = SVR(kernel='rbf', C=1000, gamma='scale')
svr.fit(X_train, y_train)
svr_pred = svr.predict(X_test)

KeyboardInterrupt: 

In [None]:
def evaluate_models(models_dict, X_test, y_test, y_train):
    """Evaluate multiple models and return comparison metrics"""
    results = []
    
    for name, predictions in models_dict.items():
        mse = mean_squared_error(y_test, predictions)
        mae = mean_absolute_error(y_test, predictions)
        r2 = r2_score(y_test, predictions)
        tweedie = mean_tweedie_deviance(y_test, predictions, power=1.5)
        
        results.append({
            'Model': name,
            'MSE': mse,
            'MAE': mae,
            'R²': r2,
            'Tweedie_Deviance': tweedie
        })
    
    return pd.DataFrame(results).sort_values('R²', ascending=False)

# Compare all models
models_predictions = {
    'XGBoost': y_pred,
    'Linear Regression': lr_pred,
    'Ridge': ridge_pred,
    'Lasso': lasso_pred,
    'Elastic Net': elastic_pred,
    'Random Forest': rf_pred,
    'Gradient Boosting': gb_pred,
    'LightGBM': lgb_pred,
    'SVR': svr_pred
}

comparison_df = evaluate_models(models_predictions, X_test, y_test, y_train)
print(comparison_df)

In [None]:
# 9. Neural Network (MLPRegressor)
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler

# Scale features for neural network
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

mlp = MLPRegressor(hidden_layer_sizes=(100, 50), max_iter=500, random_state=42)
mlp.fit(X_train_scaled, y_train)
mlp_pred = mlp.predict(X_test_scaled)

# 10. Polynomial Features + Linear Regression
from sklearn.preprocessing import PolynomialFeatures

poly_features = PolynomialFeatures(degree=2, include_bias=False)
X_train_poly = poly_features.fit_transform(X_train)
X_test_poly = poly_features.transform(X_test)

poly_lr = LinearRegression()
poly_lr.fit(X_train_poly, y_train)
poly_pred = poly_lr.predict(X_test_poly)