In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn import linear_model as lm

The cell below runs the project's data loaders, which print the path of each data file as it is loaded (and any download progress).

In [2]:
from data_load import Data
# Instantiate the project-local loader and call one method per source dataset.
# In the recorded run this cell printed "Loading: <path>" lines and downloaded
# one dataset from Kaggle.
# NOTE(review): the return values are discarded here — presumably each method
# stores or caches its data for the later merge step; confirm in data_load.py.
load = Data()
load.HOMES_FOR_SALE()
load.INCOME()
load.HOMICIDES()
load.POPULATION()
load.ZIP_COUNTY()

  from .autonotebook import tqdm as notebook_tqdm


Loading: C:\Users\elang\.cache\kagglehub\datasets\ahmedshahriarsakib\usa-real-estate-dataset\versions\25\realtor-data.zip.csv
Downloading from https://www.kaggle.com/api/v1/datasets/download/danofer/zipcodes-county-fips-crosswalk?dataset_version_number=1...


100%|██████████| 184k/184k [00:00<00:00, 2.20MB/s]

Extracting files...
Loading: C:\Users\elang\.cache\kagglehub\datasets\danofer\zipcodes-county-fips-crosswalk\versions\1\ZIP-COUNTY-FIPS_2017-06.csv





In [None]:
# Import the class under its own name so that `FD` only ever refers to the
# instance. Previously the alias `FD` was bound to the class and then rebound
# to an instance of it, which made the name mean two different things.
from Final_Data_Output import Final_Data

FD = Final_Data()

# Merge_all returns the frame that the rest of the notebook keeps as Master_df.
# NOTE(review): the keyword names suggest the filters bound listing price,
# bedroom count and bathroom count — confirm in Final_Data_Output.py.
Master_df = FD.Merge_all(min_price=1000, max_bed=12, max_bath=10)

In [None]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_tweedie_deviance

In [None]:
# Build the modelling frame and split it into train/test sets.
#
# NOTE(review): `House_Income_Pop` was never assigned in an earlier cell, so
# this cell raised NameError on a fresh kernel. Master_df is the only frame
# built above, so it is used as the source here — confirm that it carries the
# columns listed below.
FEATURE_COLUMNS = ['bed', 'bath', 'house_size', 'zip_code', 'acre_lot', 'Household_AGI', 'Total_Pop']
TARGET_COLUMN = 'price'

# Drop every row that has a missing value in any column. The non-inplace form
# leaves Master_df untouched, so re-running this cell gives the same result.
House_Income_Pop = Master_df.dropna(axis=0)

X = House_Income_Pop[FEATURE_COLUMNS]
y = House_Income_Pop[TARGET_COLUMN]

# 80/20 split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# XGBoost regressor with a Tweedie objective, fit on the training split and
# used to predict the held-out test split.
xgb_params = dict(
    objective='reg:tweedie',
    tweedie_variance_power=1.75,  # Choose a value between 1 and 2 for overdispersed data
    n_estimators=1000,
    learning_rate=0.1,
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
)

xgb_reg = xgb.XGBRegressor(**xgb_params)
XGReg = xgb_reg  # both names refer to the same estimator object

XGReg.fit(X_train, y_train)
y_pred = XGReg.predict(X_test)

In [None]:
# Score the XGBoost test predictions with the mean Tweedie deviance, then
# compare against a baseline that always predicts the training-set mean price.
# NOTE(review): the metric uses power=1.5 while the XGBoost cell sets
# tweedie_variance_power=1.75 — confirm the mismatch is intentional.
tweedie_deviance = mean_tweedie_deviance(y_test, y_pred, power=1.5)
print(f"Mean Tweedie Deviance: {tweedie_deviance:.4f}")

# Constant prediction vector: one copy of the training mean per test row.
baseline_pred = np.full(len(y_test), y_train.mean())
null_tweedie_deviance = mean_tweedie_deviance(y_test, baseline_pred, power=1.5)
print(f"Null Model Mean Tweedie Deviance: {null_tweedie_deviance:.4f}")

# Share of the baseline deviance removed by the model, as a percentage.
pct_deviance_explained = (1 - tweedie_deviance / null_tweedie_deviance) * 100
print(f"Percent Deviance Explained: {pct_deviance_explained:.4f}")

Mean Tweedie Deviance: 60.0778
Null Model Mean Tweedie Deviance: 335.8737
Percent Deviance Explained: 82.1130


In [None]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score, mean_absolute_error
import lightgbm as lgb

In [None]:
# Fit eight baseline regressors on the same training split and keep each one's
# test-set predictions in `<name>_pred` for the comparison cell.
# Each estimator's `fit` returns the estimator itself, so construction and
# fitting are chained on one line.

# 1. Ordinary least squares
lr = LinearRegression().fit(X_train, y_train)
lr_pred = lr.predict(X_test)

# 2. Ridge regression (L2 penalty; handles multicollinearity)
ridge = Ridge(alpha=1.0).fit(X_train, y_train)
ridge_pred = ridge.predict(X_test)

# 3. Lasso regression (L1 penalty; performs feature selection)
lasso = Lasso(alpha=1.0).fit(X_train, y_train)
lasso_pred = lasso.predict(X_test)

# 4. Elastic Net (equal mix of the L1 and L2 penalties)
elastic = ElasticNet(alpha=1.0, l1_ratio=0.5).fit(X_train, y_train)
elastic_pred = elastic.predict(X_test)

# 5. Random forest
rf = RandomForestRegressor(n_estimators=100, random_state=42, max_depth=10).fit(X_train, y_train)
rf_pred = rf.predict(X_test)

# 6. Gradient boosting (scikit-learn)
gb = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, random_state=42).fit(X_train, y_train)
gb_pred = gb.predict(X_test)

# 7. LightGBM (fast gradient boosting)
lgb_reg = lgb.LGBMRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)
lgb_pred = lgb_reg.predict(X_test)

# 8. Support vector regression with an RBF kernel
# NOTE(review): SVR is fit on the raw, unscaled features. RBF kernels are
# sensitive to feature scale and SVR training scales poorly with row count —
# consider a StandardScaler pipeline and/or a subsample.
svr = SVR(kernel='rbf', C=1000, gamma='scale').fit(X_train, y_train)
svr_pred = svr.predict(X_test)

In [None]:
def evaluate_models(models_dict, X_test, y_test, y_train, tweedie_power=1.5):
    """Score several models' predictions against the same test targets.

    Parameters
    ----------
    models_dict : dict
        Maps a display name to that model's predictions for the test set.
    X_test : any
        Unused; kept so existing calls keep working.
    y_test : array-like
        True target values for the test set.
    y_train : any
        Unused; kept so existing calls keep working.
    tweedie_power : float, default 1.5
        ``power`` passed to ``mean_tweedie_deviance``.

    Returns
    -------
    pandas.DataFrame
        One row per model with columns ``Model``, ``MSE``, ``MAE``, ``R²`` and
        ``Tweedie_Deviance``, sorted by ``R²`` in descending order.
        ``Tweedie_Deviance`` is NaN for a model whose predictions are outside
        the domain of the deviance.
    """
    results = []

    for name, predictions in models_dict.items():
        # These run first, so shape or length mismatches surface here rather
        # than being masked by the Tweedie guard below.
        mse = mean_squared_error(y_test, predictions)
        mae = mean_absolute_error(y_test, predictions)
        r2 = r2_score(y_test, predictions)

        # mean_tweedie_deviance raises ValueError when the inputs fall outside
        # the domain for the chosen power (for 1 <= power < 2, predictions must
        # be strictly positive). Unconstrained regressors can predict values
        # <= 0, so record NaN for that model instead of aborting the comparison.
        try:
            tweedie = mean_tweedie_deviance(y_test, predictions, power=tweedie_power)
        except ValueError:
            tweedie = np.nan

        results.append({
            'Model': name,
            'MSE': mse,
            'MAE': mae,
            'R²': r2,
            'Tweedie_Deviance': tweedie
        })

    return pd.DataFrame(results).sort_values('R²', ascending=False)

# Pair each model's display name with its test-set predictions, in the order
# the models were trained, then score them all in one table.
models_predictions = dict([
    ('XGBoost', y_pred),
    ('Linear Regression', lr_pred),
    ('Ridge', ridge_pred),
    ('Lasso', lasso_pred),
    ('Elastic Net', elastic_pred),
    ('Random Forest', rf_pred),
    ('Gradient Boosting', gb_pred),
    ('LightGBM', lgb_pred),
    ('SVR', svr_pred),
])

comparison_df = evaluate_models(
    models_dict=models_predictions,
    X_test=X_test,
    y_test=y_test,
    y_train=y_train,
)
print(comparison_df)

In [None]:
# Two further models: a scaled-input neural network and a degree-2 polynomial
# linear regression. Both leave their test-set predictions in `<name>_pred`.
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler

# 9. Neural Network (MLPRegressor)
# Learn the scaling statistics on the training split only, then apply the same
# transform to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

mlp = MLPRegressor(hidden_layer_sizes=(100, 50), max_iter=500, random_state=42).fit(X_train_scaled, y_train)
mlp_pred = mlp.predict(X_test_scaled)

# 10. Polynomial Features + Linear Regression
# Expand the raw (unscaled) features to all degree-2 terms, without a bias column.
poly_features = PolynomialFeatures(degree=2, include_bias=False).fit(X_train)
X_train_poly = poly_features.transform(X_train)
X_test_poly = poly_features.transform(X_test)

poly_lr = LinearRegression().fit(X_train_poly, y_train)
poly_pred = poly_lr.predict(X_test_poly)