In [1]:
import pandas as pd
import numpy as np
import os
import tensorflow as tf
from tensorflow.keras.applications import EfficientNetB4
from tensorflow.keras.applications.efficientnet import preprocess_input
from tensorflow.keras.preprocessing import image
from tqdm.notebook import tqdm

2026-01-04 18:36:43.705836: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1767551803.916616      17 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1767551803.974393      17 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1767551804.464090      17 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1767551804.464143      17 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1767551804.464146      17 computation_placer.cc:177] computation placer alr

In [2]:
# Read the tabular train/test splits in one pass; the left-hand names are
# bound in the same order as the paths are listed.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/property-val-dataset/train.csv",
        "/kaggle/input/property-val-dataset/test.csv",
    )
)

# Row counts (train first, then test), one per line.
print(len(train_df), len(test_df), sep="\n")

16209
5404


In [3]:
from pathlib import Path

# Count the .jpg files present in the train and test image folders.
# `folder` is left bound to the test folder afterwards, as before.
for folder in (
    Path("/kaggle/input/property-val-dataset-01/property_images_train_zoom_18"),
    Path("/kaggle/input/property-val-dataset-01/property_images_test_zoom_18"),
):
    print(sum(1 for _jpg in folder.glob("*.jpg")))

16110
5396


In [4]:
# --- 1. LOAD PRE-TRAINED MODEL ---
print("Loading EfficientNetB4 model...")
# The network is used purely as a feature extractor:
#   include_top=False -> the classifier head is removed
#   pooling='avg'     -> output is one flat vector per image (1792 values)
base_model = EfficientNetB4(
    include_top=False,
    pooling='avg',
    weights='imagenet',
)

# --- 2. IMAGE PREPROCESSING FUNCTION ---
def load_and_preprocess_images(image_ids, folder_path, target_size=None):
    """Load ``<folder_path>/<id>.jpg`` for each id and build a model-ready batch.

    Loading is best-effort: ids whose file is missing or cannot be read are
    skipped, but (unlike a silent ``pass``) they are reported in a single
    ``RuntimeWarning`` per call so data problems stay visible.

    Args:
        image_ids: Iterable of ids; each id maps to the file ``f"{id}.jpg"``.
        folder_path: Directory that contains the image files.
        target_size: Size passed to ``image.load_img``. When omitted, the
            module-level ``IMG_SIZE`` is used (original behaviour).

    Returns:
        ``(batch_array, valid_indices)`` where ``batch_array`` is the stacked
        images after ``preprocess_input`` and ``valid_indices`` are the
        positions within ``image_ids`` that were loaded successfully.
        Returns ``(None, [])`` when no image could be loaded.

    Raises:
        NameError: if ``target_size`` is omitted and ``IMG_SIZE`` has not been
            defined yet (previously this was swallowed and every image was
            silently dropped).
    """
    import warnings

    batch_images = []
    valid_indices = []
    skipped = []  # (image id, reason) for everything that was not loaded

    for i, img_id in enumerate(image_ids):
        img_path = os.path.join(folder_path, f"{img_id}.jpg")

        if not os.path.exists(img_path):
            skipped.append((img_id, "file not found"))
            continue

        # Resolve the default outside the try-block so that a missing
        # IMG_SIZE raises instead of being mistaken for an unreadable image.
        if target_size is None:
            target_size = IMG_SIZE

        try:
            img = image.load_img(img_path, target_size=target_size)
            img_array = image.img_to_array(img)
        except Exception as exc:  # best-effort: keep going, but record why
            skipped.append((img_id, f"{type(exc).__name__}: {exc}"))
            continue

        batch_images.append(img_array)
        valid_indices.append(i)

    if skipped:
        preview = ", ".join(f"{sid} ({why})" for sid, why in skipped[:5])
        warnings.warn(
            f"{len(skipped)} image(s) skipped in {folder_path!r}; first: {preview}",
            RuntimeWarning,
            stacklevel=2,
        )

    if not batch_images:
        return None, []

    # EfficientNet specific preprocessing
    batch_array = preprocess_input(np.array(batch_images))

    return batch_array, valid_indices

# --- 3. FEATURE EXTRACTION LOOP ---
def extract_features(df, folder_path):
    """Embed the images referenced by ``df['id']`` with ``base_model``.

    The ids are walked in consecutive chunks of ``BATCH_SIZE``. Each chunk is
    loaded via ``load_and_preprocess_images`` and pushed through
    ``base_model.predict``; ids that the loader did not return are simply
    absent from the result.

    Args:
        df: DataFrame with an ``id`` column.
        folder_path: Directory holding ``<id>.jpg`` files.

    Returns:
        dict mapping id -> feature vector (one row of the predict output).
    """
    ids = df['id'].tolist()
    id_to_vector = {}

    chunk_starts = range(0, len(ids), BATCH_SIZE)
    for lo in tqdm(chunk_starts, desc="Extracting Features"):
        # Slicing clamps at the end of the list, so the last chunk may be short.
        chunk = ids[lo:lo + BATCH_SIZE]

        batch, kept_positions = load_and_preprocess_images(chunk, folder_path)
        if batch is None:
            continue  # nothing in this chunk could be loaded

        vectors = base_model.predict(batch, verbose=0)

        # Row k of `vectors` belongs to the id at chunk[kept_positions[k]].
        id_to_vector.update(
            (chunk[pos], vec) for pos, vec in zip(kept_positions, vectors)
        )

    return id_to_vector

Loading EfficientNetB4 model...


2026-01-04 18:36:59.192481: E external/local_xla/xla/stream_executor/cuda/cuda_platform.cc:51] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


Downloading data from https://storage.googleapis.com/keras-applications/efficientnetb4_notop.h5
[1m71686520/71686520[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 0us/step


In [5]:
# train data

# Settings read as globals by load_and_preprocess_images / extract_features.
IMG_SIZE = (380, 380)           # target_size handed to image.load_img
BATCH_SIZE = 32                 # Lower this to 16 if you get OOM (Out of Memory)
IMG_FOLDER = "/kaggle/input/property-val-dataset-01/property_images_train_zoom_18"  # Ensure this matches your download folder

print("Processing Training Images...")
train_features_map = extract_features(train_df, IMG_FOLDER)

# The {id: vector} dict is stored as a pickled object; it is read back later
# with np.load(..., allow_pickle=True).item().
np.save('train_image_features_b4.npy', train_features_map)

Processing Training Images...


Extracting Features:   0%|          | 0/507 [00:00<?, ?it/s]

In [6]:
# test data

IMG_FOLDER = "/kaggle/input/property-val-dataset-01/property_images_test_zoom_18"  # Ensure this matches your download folder
BATCH_SIZE = 32                 # Lower this to 16 if you get OOM (Out of Memory)
IMG_SIZE = (380, 380)           # EfficientNetB4 optimal resolution

# This cell embeds the *test* images; the progress message previously said
# "Training", which made the two extraction runs indistinguishable in the log.
print("Processing Test Images...")
test_features_map = extract_features(test_df, IMG_FOLDER)

# The {id: vector} dict is stored as a pickled object; it is read back later
# with np.load(..., allow_pickle=True).item().
np.save('test_image_features_b4.npy', test_features_map)

Processing Training Images...


Extracting Features:   0%|          | 0/169 [00:00<?, ?it/s]

In [7]:
# Report how many training ids ended up with an embedding.
n_train_encoded = len(train_features_map)
print("Extraction Complete!", f"Encoded {n_train_encoded} training images.", sep="\n")

Extraction Complete!
Encoded 16110 training images.


In [8]:
import xgboost as xgb
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [9]:
# Load the EfficientNet embeddings we saved.
# Each file holds a pickled {id: vector} dict wrapped in a 0-d object array,
# hence allow_pickle=True and .item().
# NOTE(review): unpickling can execute arbitrary code - only load .npy files
# that this notebook produced itself.
train_img_dict, test_img_dict = (
    np.load(npy_path, allow_pickle=True).item()
    for npy_path in ('train_image_features_b4.npy', 'test_image_features_b4.npy')
)

In [10]:
# Sanity check: container type and number of embedded training ids.
print(type(train_img_dict), len(train_img_dict), sep="\n")

<class 'dict'>
16110


In [11]:
# --- 2. PREPROCESS TABULAR DATA (Reusing your Phase 1 logic) ---
def preprocess_tabular(df):
    """Derive date/age features and drop non-feature columns.

    Works on a copy, so the caller's frame is not modified.

    Added columns (in this order): ``sale_year``, ``sale_month``,
    ``day_of_month``, ``day_of_week``, ``effective_built_year``, ``house_age``.

    Args:
        df: DataFrame with at least ``date``, ``yr_built`` and
            ``yr_renovated`` columns. ``id`` and ``price`` are optional.

    Returns:
        A new DataFrame without ``id``, ``date`` and ``price`` (whichever of
        them were present).
    """
    df = df.copy()

    # Date stuff
    df['date'] = pd.to_datetime(df['date'])
    sale_date = df['date'].dt
    df['sale_year'] = sale_date.year
    df['sale_month'] = sale_date.month
    df['day_of_month'] = sale_date.day
    df['day_of_week'] = sale_date.dayofweek

    # Age stuff: use the renovation year when there is one (> 0), otherwise
    # the build year. Vectorized with Series.where instead of a row-wise
    # apply, which calls a Python lambda once per row.
    renovated = df['yr_renovated']
    df['effective_built_year'] = renovated.where(renovated > 0, df['yr_built'])
    df['house_age'] = df['sale_year'] - df['effective_built_year']

    # Drop non-features; errors='ignore' lets frames without 'price' through.
    cols_to_drop = ['id', 'date', 'price']  # , 'yr_built', 'yr_renovated'
    return df.drop(columns=cols_to_drop, errors='ignore')

In [12]:
# Prepare Base Features
X_tab_train = train_df.pipe(preprocess_tabular)
#X_tab_test = preprocess_tabular(test_df)

# Target (Log Scale): log(1 + price); predictions are mapped back with expm1.
y = train_df['price'].pipe(np.log1p)

In [13]:
# --- 3. ALIGN & PREPARE IMAGE VECTORS ---
def get_image_matrix(df, img_dict, dim=1792):
    """Build a feature matrix whose rows follow the order of ``df['id']``.

    Args:
        df: DataFrame with an ``id`` column.
        img_dict: Mapping id -> 1-D feature vector.
        dim: Length of the zero vector used for ids absent from ``img_dict``;
            it should equal the length of the stored vectors.

    Returns:
        2-D array with one row per row of ``df``. Ids without an entry in
        ``img_dict`` get an all-zero row. An empty ``df`` yields an array of
        shape ``(0, dim)`` (previously a 1-D ``(0,)`` array, which downstream
        2-D consumers such as a scaler cannot accept).

    Side effects:
        Prints the number of processed rows and of missing images.
    """
    zero_row = np.zeros(dim)  # shared placeholder; np.array() copies it per row
    rows = []
    missing_count = 0

    for house_id in df['id']:
        if house_id in img_dict:
            rows.append(img_dict[house_id])
        else:
            rows.append(zero_row)
            missing_count += 1

    print(f"Processed {len(df)} rows. Missing images: {missing_count}")

    if not rows:
        # Keep the result two-dimensional even when there is nothing to stack.
        return np.empty((0, dim))
    return np.array(rows)

# Stack the training embeddings in the same row order as train_df.
print("Aligning Training Images...")
X_img_train_raw = get_image_matrix(df=train_df, img_dict=train_img_dict)

#print("Aligning Test Images...")
#X_img_test_raw = get_image_matrix(test_df, test_img_dict)

Aligning Training Images...
Processed 16209 rows. Missing images: 0


In [14]:
# --- 4. DIMENSIONALITY REDUCTION (PCA) ---
# We standardize first because PCA is sensitive to scale
# NOTE(review): the scaler and PCA are fitted on every training row before the
# hold-out split made later in the notebook, so validation rows influence the
# projection - consider fitting after the split.
scaler_pca = StandardScaler()
X_img_train_scaled = scaler_pca.fit_transform(X_img_train_raw)
#X_img_test_scaled = scaler_pca.transform(X_img_test_raw)

# Fraction of variance to retain. The log message is derived from this single
# value; it used to say "90%" while the code actually kept 80%.
PCA_VARIANCE_TO_KEEP = 0.80
print(f"Fitting PCA to keep {PCA_VARIANCE_TO_KEEP:.0%} variance...")
pca = PCA(n_components=PCA_VARIANCE_TO_KEEP, random_state=42)
X_img_train_pca = pca.fit_transform(X_img_train_scaled)
#X_img_test_pca = pca.transform(X_img_test_scaled)

# Report the real input width instead of a hard-coded 1792.
print(f"PCA reduced dimensions from {X_img_train_scaled.shape[1]} -> {X_img_train_pca.shape[1]} components")

Fitting PCA to keep 90% variance...
PCA reduced dimensions from 1792 -> 161 components


In [15]:
# --- 5. FUSION (CONCATENATION) ---
# Wrap the PCA output in a DataFrame with one named column per component.
n_pca_components = X_img_train_pca.shape[1]
pca_cols = [f"img_pca_{i}" for i in range(n_pca_components)]

df_img_train = pd.DataFrame(X_img_train_pca, columns=pca_cols)
#df_img_test = pd.DataFrame(X_img_test_pca, columns=pca_cols)

# concat(axis=1) aligns on the index, so give the tabular frame a fresh
# 0..n-1 index to match the one df_img_train was created with.
X_tab_train = X_tab_train.reset_index(drop=True)
#X_tab_test.reset_index(drop=True, inplace=True)

# Tabular columns first, image components after.
X_final_train = pd.concat([X_tab_train, df_img_train], axis="columns")
#X_final_test = pd.concat([X_tab_test, df_img_test], axis=1)

print(f"Final Feature Count: {X_final_train.shape[1]}")

Final Feature Count: 185


In [16]:
# Split for validation: 20% of the training rows are held out.
# X_test / y_test are this hold-out, not the separate test_df.
X_train, X_test, y_train, y_test = train_test_split(
    X_final_train,
    y,
    random_state=42,
    test_size=0.2,
)


In [17]:
# Fixed hyper-parameters for the final XGBoost regressor.
best_params = dict(
    # boosting schedule
    learning_rate=0.006632121481442682,
    # tree shape
    grow_policy='lossguide',
    max_depth=9,
    # row / column sampling
    subsample=0.7435900096401388,
    colsample_bytree=0.8056094584069656,
    # regularisation
    reg_alpha=1.0950340353169523e-08,
    reg_lambda=2.9173376875434175e-07,
    min_child_weight=3,
    gamma=1.4042292694295272e-05,
    # histogram / leaf limits
    max_bin=512,
    max_leaves=33,
    # objective and bookkeeping
    objective='reg:squarederror',
    tree_method='auto',
    eval_metric='rmse',
    n_estimators=2500,
)


# 3. Instantiate the Final Model
final_model = xgb.XGBRegressor(**best_params)

In [18]:
from sklearn.metrics import mean_squared_error, r2_score
def eval(y_test, y_pred):
    """Print MSE, RMSE and R2 of ``y_pred`` against ``y_test``.

    Returns ``None``; the metrics are only printed.

    NOTE(review): this name shadows Python's builtin ``eval`` for the rest of
    the notebook; it is kept unchanged because existing cells call it.
    """
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print("MSE : ", mse)
    print("RMSE : ", mse ** 0.5)  # square root of the MSE printed above
    print("R2 : ", r2)

# Train on the log1p(price) target and predict the hold-out rows.
final_model.fit(X_train, y_train)
log_predictions = final_model.predict(X_test)

# Undo the log1p transform on both sides so the metrics are in price units.
real_predictions, real_y_test = np.expm1(log_predictions), np.expm1(y_test)
eval(real_y_test, real_predictions)

MSE :  12989795341.379099
RMSE :  113972.7833361066
R2 :  0.8964862603047792


In [None]:
# Number of rows in the fused training matrix.
X_final_train.shape[0]

16209