In [None]:
import os
import cv2
import numpy as np
import pandas as pd
import tensorflow as tf
import xgboost as xgb

from tensorflow.keras.applications import EfficientNetB0
from tensorflow.keras.applications.efficientnet import preprocess_input
from tensorflow.keras.layers import ( GlobalAveragePooling2D, Dense, Dropout,
    BatchNormalization, RandomFlip, RandomRotation)

from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

# imports the image fetcher script 
from data_fetcher import fetch_images_for_dataframe


In [None]:
# --- File locations used throughout the notebook ---
IMAGE_DIR = "satellite_images"
TRAIN_PATH, TEST_PATH = "train(1).xlsx", "test.xlsx"
OUTPUT_PATH = "fusion_price_predictions.csv"

# Read the training and test spreadsheets (in that order).
df_train, df_test = (pd.read_excel(path) for path in (TRAIN_PATH, TEST_PATH))

# Run the project-local fetcher once per split, training first.
# NOTE(review): presumably it writes one image per row into IMAGE_DIR
# (a later cell lists that directory for .png files) -- confirm in data_fetcher.
for _split_name, _split_df in (("training", df_train), ("test", df_test)):
    fetch_images_for_dataframe(_split_df, IMAGE_DIR)
    print(f"images for {_split_name} data fetched")


In [None]:
# Map "<file name without extension>" -> path for every .png in IMAGE_DIR.
# os.path.splitext strips only the final extension, so a file name that
# happens to contain ".png" somewhere else keeps the rest of its stem intact.
image_map = {
    os.path.splitext(f)[0]: os.path.join(IMAGE_DIR, f)
    for f in os.listdir(IMAGE_DIR) if f.endswith(".png")
}

# String form of the id; this is the lookup key into image_map.
df_train["id_str"] = df_train["id"].astype(str)
# Keep only the training rows that have an image on disk.
df_train = df_train[df_train["id_str"].isin(image_map.keys())].reset_index(drop=True)

# Fail here with a readable message instead of deep inside model fitting.
if df_train.empty:
    raise RuntimeError(
        f"No training row has a matching .png in {IMAGE_DIR!r}; nothing to train on."
    )

# One image path per remaining training row, in row order.
train_image_paths = np.array([image_map[i] for i in df_train["id_str"]])

# Target (log price): log1p here is inverted with expm1 at prediction time.
y_log = np.log1p(df_train["price"].values)

# Tabular features: every numeric column except identifiers, date and target;
# missing values are replaced by 0.
cols_to_drop = ["id", "id_str", "date", "price"]
X_train_tab = (
    df_train.drop(columns=[c for c in cols_to_drop if c in df_train.columns])
            .select_dtypes(include=[np.number])
            .fillna(0)
            .values
)


In [None]:
# Hyper-parameters shared by the final model and the per-fold models below.
XGB_PARAMS = dict(
    n_estimators=1500,
    learning_rate=0.02,
    max_depth=6,
    n_jobs=-1,
    random_state=42,
)
N_RESIDUAL_FOLDS = 5

# Final tabular model: fitted on every training row and used for test-time
# predictions.
tabular_model = xgb.XGBRegressor(**XGB_PARAMS)
tabular_model.fit(X_train_tab, y_log)

# Compute residuals for CNN training.
# The residuals are taken from OUT-OF-FOLD predictions: each row is predicted
# by a model that never saw it. In-sample predictions of a large boosted
# ensemble are over-optimistic, which would shrink the residuals toward zero
# and make them unlike the errors the CNN has to correct on unseen rows.
n_folds = min(N_RESIDUAL_FOLDS, len(y_log))
if n_folds < 2:
    # Too few rows to hold anything out; fall back to in-sample predictions.
    train_preds_log = tabular_model.predict(X_train_tab)
else:
    # Balanced random fold assignment: shuffled positions modulo n_folds.
    fold_of_row = np.random.default_rng(42).permutation(len(y_log)) % n_folds
    train_preds_log = np.empty(len(y_log), dtype=np.float64)
    for fold in range(n_folds):
        held_out = fold_of_row == fold
        fold_model = xgb.XGBRegressor(**XGB_PARAMS)
        fold_model.fit(X_train_tab[~held_out], y_log[~held_out])
        train_preds_log[held_out] = fold_model.predict(X_train_tab[held_out])

residuals = y_log - train_preds_log

In [None]:
class ImageGenerator(tf.keras.utils.Sequence):
    """Keras ``Sequence`` that yields ``(images, labels)`` batches from image paths.

    Each image is read with OpenCV, converted from BGR to RGB, resized to
    ``image_size`` and passed through ``preprocess_input``. A path that cannot
    be read yields an all-zero image, so one bad file does not abort a run.

    Args:
        paths: Sequence of image file paths (list or array).
        labels: Sequence of targets, one per path.
        batch_size: Number of samples per batch.
        image_size: ``(width, height)`` every image is resized to.
        **kwargs: Forwarded to the Keras base class.

    Raises:
        ValueError: If ``paths`` and ``labels`` differ in length.
    """

    def __init__(self, paths, labels, batch_size=32, image_size=(224, 224), **kwargs):
        # The base-class initializer must run so Keras can set up its own state.
        super().__init__(**kwargs)
        # Arrays (not lists) are required for the fancy indexing in __getitem__.
        self.paths = np.asarray(paths)
        self.labels = np.asarray(labels)
        if len(self.paths) != len(self.labels):
            raise ValueError(
                f"paths and labels must have the same length, "
                f"got {len(self.paths)} and {len(self.labels)}"
            )
        self.batch_size = batch_size
        self.image_size = tuple(image_size)
        self.indices = np.arange(len(self.paths))

    def __len__(self):
        """Number of batches per epoch (the last batch may be smaller)."""
        return int(np.ceil(len(self.paths) / self.batch_size))

    def __getitem__(self, idx):
        """Return batch ``idx`` as ``(float32 images, labels)``."""
        batch_idx = self.indices[idx*self.batch_size:(idx+1)*self.batch_size]
        batch_paths = self.paths[batch_idx]
        batch_labels = self.labels[batch_idx]

        width, height = self.image_size
        # Pre-allocated as float32 so every batch has the same dtype whether or
        # not it contains an unreadable image; unreadable slots stay all-zero.
        images = np.zeros((len(batch_paths), height, width, 3), dtype=np.float32)
        for row, p in enumerate(batch_paths):
            img = cv2.imread(str(p))
            if img is None:
                continue
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
            img = cv2.resize(img, (width, height))
            images[row] = preprocess_input(img)

        return images, np.array(batch_labels)

def build_cnn():
    """Build and compile the image regression model.

    Architecture: random flip/rotation augmentation -> EfficientNetB0
    (ImageNet weights, no classification head, all but the last 50 layers
    frozen) -> global average pooling -> BatchNormalization -> Dense(64, relu)
    -> Dropout(0.5) -> Dense(1, linear).

    Returns:
        A compiled ``tf.keras.Model`` taking ``(224, 224, 3)`` images and
        producing one value per image, trained with MSE and Adam(1e-4).
    """
    augmentation = tf.keras.Sequential([
        RandomFlip("horizontal_and_vertical"),
        RandomRotation(0.2)
    ])

    backbone = EfficientNetB0(weights="imagenet", include_top=False, input_shape=(224, 224, 3))

    # Freeze everything except the last 50 backbone layers.
    for layer in backbone.layers[:-50]:
        layer.trainable = False

    inputs = tf.keras.Input(shape=(224, 224, 3))
    x = augmentation(inputs)
    # training=False keeps the backbone's BatchNormalization layers on their
    # stored statistics and its dropout inactive. Hard-coding training=True
    # would keep both active during predict() as well, making predictions
    # depend on batch composition and vary between runs. The unfrozen layers'
    # weights are still updated by fit().
    x = backbone(x, training=False)
    x = GlobalAveragePooling2D()(x)
    x = BatchNormalization()(x)
    x = Dense(64, activation="relu")(x)
    x = Dropout(0.5)(x)
    outputs = Dense(1, activation="linear")(x)

    model = Model(inputs, outputs)
    model.compile(optimizer=Adam(1e-4), loss="mse")
    return model


In [None]:
# Seed Python, NumPy and TensorFlow RNGs so weight initialisation, dropout,
# augmentation and batch order are repeatable across Restart & Run All.
# (Some GPU kernels may still be non-deterministic.)
tf.keras.utils.set_random_seed(42)

# The CNN is trained on the tabular model's residuals, not on raw prices.
train_gen = ImageGenerator(train_image_paths, residuals, batch_size=32)

cnn = build_cnn()
cnn.fit(train_gen, epochs=6, verbose=1)


In [None]:
# String form of the id; used later to look images up in image_map.
df_test["id_str"] = df_test["id"].astype(str)

test_ids = df_test["id"].values

# The tabular model was fitted on a bare NumPy array, so features are matched
# purely by position. Rebuild the exact training column list and reindex the
# test frame to it, so a missing, extra or re-ordered test column cannot
# silently shift features.
train_feature_cols = (
    df_train.drop(columns=[c for c in cols_to_drop if c in df_train.columns])
            .select_dtypes(include=[np.number])
            .columns
)

X_test_tab = (
    df_test.reindex(columns=train_feature_cols)      # same columns, same order
           .apply(pd.to_numeric, errors="coerce")    # non-numeric cells -> NaN
           .fillna(0)                                # same fill value as training
           .values
)


In [None]:
# Tabular baseline (log price) for every test row.
test_tabular_log = tabular_model.predict(X_test_tab)

# True for test rows whose id has an entry in image_map.
has_image_mask = df_test["id_str"].isin(image_map.keys()).values

# CNN correction per test row; stays 0 wherever no image is available.
cnn_corrections = np.zeros(len(df_test))

if has_image_mask.any():
    test_image_paths = np.array(
        [image_map[key] for key in df_test.loc[has_image_mask, "id_str"]]
    )

    # Labels are unused at prediction time; zeros only satisfy the generator.
    test_gen = ImageGenerator(
        test_image_paths,
        np.zeros(len(test_image_paths)),
        batch_size=64,
    )
    cnn_preds = cnn.predict(test_gen).flatten()

    # Write the corrections back into the rows that have images.
    cnn_corrections[has_image_mask] = cnn_preds

# Blend: tabular log-price plus half of the CNN correction, then undo log1p.
final_log_prices = test_tabular_log + 0.5 * cnn_corrections
final_prices = np.expm1(final_log_prices)

In [None]:
# Two-column result table: original test id and the blended price prediction.
submission = pd.DataFrame({"id": test_ids}).assign(predicted_price=final_prices)

# Write without the index so the CSV holds only the two columns.
submission.to_csv(OUTPUT_PATH, index=False)
print(f"Predictions saved to {OUTPUT_PATH}")