In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Load the imputed dataset written by the exploratory-analysis step.
df_filled = pd.read_csv("../exploratory_data_analysis/imputed.csv")

# Column the models are trained to predict.
target = 'taxvaluedollarcnt'

# Features are all numeric columns minus the target and the two other
# columns listed here.
cols_to_exclude = [target, 'parcelid', 'logerror']
numeric_columns = df_filled.select_dtypes(include=[np.number]).columns
features = [name for name in numeric_columns if name not in cols_to_exclude]

X, y = df_filled[features], df_filled[target]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest baseline; the fixed seed makes the fit repeatable.
rf_model = RandomForestRegressor(
    n_estimators=100,
    max_depth=20,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42,
    n_jobs=-1  # -1 asks scikit-learn to use every available processor
)

rf_model.fit(X_train, y_train)

y_pred_rf = rf_model.predict(X_test)

# Compute every metric once so that each printed line is derived from the
# same numbers. In particular the "RMSE as % of avg price" line must use the
# RMSE of *this* fit rather than a value pasted in from an earlier run.
rf_r2 = r2_score(y_test, y_pred_rf)
rf_rmse = np.sqrt(mean_squared_error(y_test, y_pred_rf))
rf_mae = mean_absolute_error(y_test, y_pred_rf)
rf_mape = np.mean(np.abs((y_test - y_pred_rf) / y_test)) * 100
avg_price = y_test.mean()

# stats on accuracy
print("Random Forest Evaluation:")
print(f"R² Score: {rf_r2:.4f}")
print(f"RMSE: ${rf_rmse:,.2f}")
print(f"MAE: ${rf_mae:,.2f}")
print(f"MAPE: {rf_mape:.2f}%")
print(f"Average property price: ${avg_price:,.0f}")
print(f"RMSE as % of avg price: {(rf_rmse / avg_price) * 100:.2f}%")

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

# Read the imputed dataset from the exploratory-analysis directory.
df_filled = pd.read_csv("../exploratory_data_analysis/imputed.csv")

target = 'taxvaluedollarcnt'
cols_to_exclude = [target, 'parcelid', 'logerror']

# Walk the numeric columns in frame order and keep those not excluded.
features = []
for column in df_filled.select_dtypes(include=[np.number]).columns:
    if column not in cols_to_exclude:
        features.append(column)

X = df_filled[features]
y = df_filled[target]

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# The scaler is fitted on the training rows only and then applied to the
# test rows, so no test-set statistics enter the fit.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

average_price = y_test.mean()
print(f"Average property price: ${average_price:,.0f}")

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

# --- Data: imputed dataset from the exploratory-analysis step ---
df_filled = pd.read_csv("../exploratory_data_analysis/imputed.csv")

# --- Target and feature columns ---
target = 'taxvaluedollarcnt'
cols_to_exclude = [target, 'parcelid', 'logerror']
features = list(
    filter(
        lambda name: name not in cols_to_exclude,
        df_filled.select_dtypes(include=[np.number]).columns,
    )
)

X, y = df_filled[features], df_filled[target]

# --- Train/test split: 20% held out, fixed seed ---
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# --- Feature scaling ---
# Fitted on the training rows only, then applied unchanged to the test rows.
# The scaled copies feed the linear models; tree-based models below are
# trained on the unscaled frames.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Enhanced evaluation function
def evaluate_model(y_actual, y_pred, model_name):
    """Print and return regression metrics for one model's predictions.

    Parameters
    ----------
    y_actual : array-like
        True target values (list, NumPy array or pandas Series).
    y_pred : array-like
        Predicted values, in the same row order as ``y_actual``.
    model_name : str
        Label used in the printed header and stored in the returned record.

    Returns
    -------
    dict
        Keys ``'model'``, ``'R2'``, ``'RMSE'``, ``'MAE'`` and ``'MAPE'``.
        ``'MAPE'`` is a percentage computed over the rows whose actual value
        is non-zero; it is NaN when every actual value is zero.
    """
    # Work on flat float arrays so values are paired by position. Subtracting
    # two pandas Series aligns them on index labels instead, which yields NaN
    # when the actuals carry a shuffled index and the predictions do not.
    actual = np.asarray(y_actual, dtype=float).ravel()
    predicted = np.asarray(y_pred, dtype=float).ravel()

    mae = mean_absolute_error(actual, predicted)
    mse = mean_squared_error(actual, predicted)
    rmse = np.sqrt(mse)
    r2 = r2_score(actual, predicted)

    # Percentage error is undefined where the actual value is zero; leaving
    # those rows in would turn the whole average into inf.
    nonzero = actual != 0
    if nonzero.any():
        relative_error = (actual[nonzero] - predicted[nonzero]) / actual[nonzero]
        mape = np.mean(np.abs(relative_error)) * 100
    else:
        mape = float('nan')

    print(f"\n{model_name} Performance:")
    print(f"  R² Score: {r2:.4f}")
    print(f"  RMSE: ${rmse:,.2f}")
    print(f"  MAE: ${mae:,.2f}")
    print(f"  MAPE: {mape:.2f}%")
    print(f"  RMSE as % of avg price: {(rmse / actual.mean()) * 100:.2f}%")

    return {'model': model_name, 'R2': r2, 'RMSE': rmse, 'MAE': mae, 'MAPE': mape}

# Store results for comparison: one metrics dict per model, in training order.
results = []

print(f"Average property price: ${y_test.mean():,.0f}")
print("="*60)

# Every candidate is instantiated under its own name so the fitted estimator
# stays available after the training loop.
lr = LinearRegression()
ridge = Ridge(alpha=1.0, random_state=42)
rf_model = RandomForestRegressor(
    n_estimators=100,
    max_depth=20,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42,
    n_jobs=-1
)
gbr = GradientBoostingRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=5,
    random_state=42
)
lgbm = LGBMRegressor(
    n_estimators=100,
    learning_rate=0.1,
    random_state=42,
    verbose=-1
)
xgb = XGBRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=5,
    random_state=42
)

# (label, estimator, training features, test features).
# The two linear models are given the standardized features; the tree-based
# models are trained on the unscaled features because they don't need scaling.
candidates = [
    ("Linear Regression", lr, X_train_scaled, X_test_scaled),
    ("Ridge Regression", ridge, X_train_scaled, X_test_scaled),
    ("Random Forest", rf_model, X_train, X_test),
    ("Gradient Boosting", gbr, X_train, X_test),
    ("LightGBM", lgbm, X_train, X_test),
    ("XGBoost", xgb, X_train, X_test),
]

# Fit, predict and evaluate each candidate with one shared code path instead
# of six hand-copied stanzas.
predictions = {}
for label, estimator, train_features, test_features in candidates:
    print(f"\n--- Training {label} ---")
    estimator.fit(train_features, y_train)
    predictions[label] = estimator.predict(test_features)
    results.append(evaluate_model(y_test, predictions[label], label))

# Per-model prediction arrays, kept under their individual names.
y_lr_pred = predictions["Linear Regression"]
y_ridge_pred = predictions["Ridge Regression"]
y_rf_pred = predictions["Random Forest"]
y_gbr_pred = predictions["Gradient Boosting"]
y_lgbm_pred = predictions["LightGBM"]
y_xgb_pred = predictions["XGBoost"]

# Summary comparison: one row per model, best R² first.
print("\n" + "="*60)
print("MODEL COMPARISON SUMMARY")
print("="*60)
results_df = pd.DataFrame(results)
results_df = results_df.sort_values('R2', ascending=False)
print(results_df.to_string(index=False))

# Visualize comparison: a 2x2 grid with one horizontal bar chart per metric.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# (metric column, x-axis label, panel title, bar colour), listed in the
# row-major order in which `axes.flat` yields the four panels.
panels = [
    ('R2', 'R² Score', 'Model Comparison: R² Score', 'skyblue'),
    ('RMSE', 'RMSE ($)', 'Model Comparison: RMSE (lower is better)', 'salmon'),
    ('MAE', 'MAE ($)', 'Model Comparison: MAE (lower is better)', 'lightgreen'),
    ('MAPE', 'MAPE (%)', 'Model Comparison: MAPE (lower is better)', 'gold'),
]

for ax, (metric, xlabel, title, color) in zip(axes.flat, panels):
    ax.barh(results_df['model'], results_df[metric], color=color)
    ax.set_xlabel(xlabel)
    ax.set_title(title)
    ax.grid(axis='x', alpha=0.3)

plt.tight_layout()
plt.savefig('model_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

# Best model: the first row after sorting by R² in descending order.
best_model = results_df.iloc[0]
print(f"\n🏆 Best Model: {best_model['model']}")
print(f"   R² Score: {best_model['R2']:.4f}")
print(f"   RMSE: ${best_model['RMSE']:,.2f}")

ModuleNotFoundError: No module named 'lightgbm'