In [66]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

In [67]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [68]:
df = pd.read_csv("/content/drive/MyDrive/Satellite_Imagery_Property_Valuation/data/train_with_cnn_features.csv")

# Load CNN embeddings
X_img = np.load("/content/drive/MyDrive/Satellite_Imagery_Property_Valuation/data/cnn_image_features.npy")

print(df.shape, X_img.shape)

(16209, 22) (16209, 512)


### Feature Engineering

In [69]:
df['price']

Unnamed: 0,price
0,12.501142
1,12.409018
2,12.206078
3,12.772806
4,12.354497
...,...
16204,12.842652
16205,12.899097
16206,13.262127
16207,12.409018


In [92]:
y = df['price'].values
y

array([12.50114236, 12.40901757, 12.20607765, ..., 13.26212706,
       12.40901757, 12.66033109])

In [71]:
tabular_features = [
    'bedrooms', 'bathrooms', 'sqft_living',
    'floors', 'waterfront', 'view',
    'condition', 'grade',
    'sqft_living15', 'sqft_lot15',
    'lat', 'long'
]

X_tab = df[tabular_features].values

In [72]:
X_tab

array([[ 4.00000e+00,  2.25000e+00,  1.81000e+03, ...,  9.24000e+03,
         4.74362e+01, -1.22187e+02],
       [ 3.00000e+00,  2.50000e+00,  1.60000e+03, ...,  3.60500e+03,
         4.74034e+01, -1.22187e+02],
       [ 4.00000e+00,  2.50000e+00,  1.72000e+03, ...,  7.45500e+03,
         4.72704e+01, -1.22313e+02],
       ...,
       [ 3.00000e+00,  2.50000e+00,  2.12000e+03, ...,  2.65000e+03,
         4.76810e+01, -1.22032e+02],
       [ 1.00000e+00,  7.50000e-01,  3.80000e+02, ...,  1.50000e+04,
         4.74810e+01, -1.22323e+02],
       [ 4.00000e+00,  2.50000e+00,  3.13000e+03, ...,  5.99700e+03,
         4.73837e+01, -1.22099e+02]])

In [73]:
from sklearn.preprocessing import StandardScaler

scaler_tab = StandardScaler()
X_tab_scaled = scaler_tab.fit_transform(X_tab)
joblib.dump(
    scaler_tab,
    "/content/drive/MyDrive/Satellite_Imagery_Property_Valuation/data/scaler_tab.pkl"
)

['/content/drive/MyDrive/Satellite_Imagery_Property_Valuation/data/scaler_tab.pkl']

In [74]:
scaler_img = StandardScaler()
X_img_scaled = scaler_img.fit_transform(X_img)
joblib.dump(
    scaler_img,
    "/content/drive/MyDrive/Satellite_Imagery_Property_Valuation/data/scaler_img.pkl"
)

['/content/drive/MyDrive/Satellite_Imagery_Property_Valuation/data/scaler_img.pkl']

In [75]:
#applying pca will help in reducing variance
from sklearn.decomposition import PCA

pca = PCA(n_components=50, random_state=42)
X_img_pca = pca.fit_transform(X_img_scaled)

print("Explained variance:", pca.explained_variance_ratio_.sum())

Explained variance: 0.6812646


In [76]:
X_fusion = np.hstack([X_tab_scaled, X_img_pca])
print(X_fusion.shape)

(16209, 62)


In [96]:
#train test split
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(
    X_fusion, y, test_size=0.2, random_state=42
)

In [99]:
X_train.shape , X_val.shape

((12967, 62), (3242, 62))

### Training and Evaluation

In [100]:
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

def evaluate_model(model, X_train, y_train, X_val, y_val):
    model.fit(X_train, y_train)
    preds = model.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, preds))
    r2 = r2_score(y_val, preds)
    return rmse, r2

In [102]:
!pip install xgboost
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor

lr = LinearRegression()
rmse_lr, r2_lr = evaluate_model(lr, X_train, y_train, X_val, y_val)
ridge = Ridge(alpha=1.0)
rmse_ridge, r2_ridge = evaluate_model(ridge, X_train, y_train, X_val, y_val)
lasso = Lasso(alpha=0.001)
rmse_lasso, r2_lasso = evaluate_model(lasso, X_train, y_train, X_val, y_val)
gbr = GradientBoostingRegressor(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=4,
    random_state=42
)
rmse_gbr, r2_gbr = evaluate_model(gbr, X_train, y_train, X_val, y_val)
xgb = XGBRegressor(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)
rmse_xgb, r2_xgb = evaluate_model(xgb, X_train, y_train, X_val, y_val)



In [104]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(
    n_estimators=200,
    max_depth=15,
    random_state=42,
    n_jobs=-1
)

rmse_rf, r2_rf = evaluate_model(rf, X_train, y_train, X_val, y_val)

In [105]:
results = pd.DataFrame({
    "Model": [
        "Linear Regression",
        "Ridge Regression",
        "Lasso Regression",
        "Random Forest",
        "Gradient Boosting",
        "XGBoost"
    ],
    "RMSE": [
        rmse_lr,
        rmse_ridge,
        rmse_lasso,
        rmse_rf,
        rmse_gbr,
        rmse_xgb
    ],
    "R² Score": [
        r2_lr,
        r2_ridge,
        r2_lasso,
        r2_rf,
        r2_gbr,
        r2_xgb
    ]
})

results.sort_values(by="RMSE")

Unnamed: 0,Model,RMSE,R² Score
5,XGBoost,0.172476,0.892199
4,Gradient Boosting,0.179169,0.883671
3,Random Forest,0.187487,0.872618
0,Linear Regression,0.246411,0.779968
1,Ridge Regression,0.246412,0.779967
2,Lasso Regression,0.246424,0.779945


### comparison with only tablular data presentation

In [83]:
tabular_features = [
    'bedrooms', 'bathrooms', 'sqft_living',
    'floors', 'waterfront', 'view',
    'condition', 'grade',
    'sqft_living15', 'sqft_lot15',
    'lat', 'long'
]

X_tab = df[tabular_features].values

In [84]:
y = np.log1p(df['price'].values)

In [85]:
from sklearn.model_selection import train_test_split

X_tab_train, X_tab_val, y_train, y_val = train_test_split(
    X_tab, y, test_size=0.2, random_state=42
)

In [86]:
from sklearn.preprocessing import StandardScaler

scaler_tab = StandardScaler()
X_tab_train_scaled = scaler_tab.fit_transform(X_tab_train)
X_tab_val_scaled = scaler_tab.transform(X_tab_val)

In [87]:
from sklearn.metrics import mean_squared_error, r2_score

def evaluate_model(model, X_train, y_train, X_val, y_val):
    model.fit(X_train, y_train)
    preds = model.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, preds))
    r2 = r2_score(y_val, preds)
    return rmse, r2

In [88]:
lr_tab = LinearRegression()
rmse_lr_tab, r2_lr_tab = evaluate_model(
    lr_tab, X_tab_train_scaled, y_train, X_tab_val_scaled, y_val
)
ridge_tab = Ridge(alpha=1.0)
rmse_ridge_tab, r2_ridge_tab = evaluate_model(
    ridge_tab, X_tab_train_scaled, y_train, X_tab_val_scaled, y_val
)
lasso_tab = Lasso(alpha=0.001)
rmse_lasso_tab, r2_lasso_tab = evaluate_model(
    lasso_tab, X_tab_train_scaled, y_train, X_tab_val_scaled, y_val
)
rf_tab = RandomForestRegressor(
    n_estimators=300,
    max_depth=15,
    random_state=42,
    n_jobs=-1
)

rmse_rf_tab, r2_rf_tab = evaluate_model(
    rf_tab, X_tab_train, y_train, X_tab_val, y_val
)
gbr_tab = GradientBoostingRegressor(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=4,
    random_state=42
)

rmse_gbr_tab, r2_gbr_tab = evaluate_model(
    gbr_tab, X_tab_train, y_train, X_tab_val, y_val
)
xgb_tab = XGBRegressor(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

rmse_xgb_tab, r2_xgb_tab = evaluate_model(
    xgb_tab, X_tab_train, y_train, X_tab_val, y_val
)

In [89]:
tabular_results = pd.DataFrame({
    "Model": [
        "Linear Regression",
        "Ridge Regression",
        "Lasso Regression",
        "Random Forest",
        "Gradient Boosting",
        "XGBoost"
    ],
    "RMSE": [
        rmse_lr_tab,
        rmse_ridge_tab,
        rmse_lasso_tab,
        rmse_rf_tab,
        rmse_gbr_tab,
        rmse_xgb_tab
    ],
    "R² Score": [
        r2_lr_tab,
        r2_ridge_tab,
        r2_lasso_tab,
        r2_rf_tab,
        r2_gbr_tab,
        r2_xgb_tab
    ]
})

tabular_results.sort_values(by="RMSE")

Unnamed: 0,Model,RMSE,R² Score
5,XGBoost,0.012044,0.89507
4,Gradient Boosting,0.012462,0.887674
3,Random Forest,0.012706,0.883222
0,Linear Regression,0.018625,0.749087
1,Ridge Regression,0.018625,0.749087
2,Lasso Regression,0.018884,0.742053


In [103]:
xgb.fit(X_fusion, y)

joblib.dump(
    xgb,
    "/content/drive/MyDrive/Satellite_Imagery_Property_Valuation/data/xgb.pkl"
)

['/content/drive/MyDrive/Satellite_Imagery_Property_Valuation/data/xgb.pkl']

## Predictions for test dataset