In [2]:
!pip install xgboost


Collecting xgboost
  Downloading xgboost-3.1.2-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-3.1.2-py3-none-win_amd64.whl (72.0 MB)
   ---------------------------------------- 0.0/72.0 MB ? eta -:--:--
   ---------------------------------------- 0.3/72.0 MB ? eta -:--:--
   ---------------------------------------- 0.8/72.0 MB 2.4 MB/s eta 0:00:30
    --------------------------------------- 1.3/72.0 MB 2.5 MB/s eta 0:00:29
   - -------------------------------------- 1.8/72.0 MB 2.7 MB/s eta 0:00:26
   - -------------------------------------- 2.6/72.0 MB 3.0 MB/s eta 0:00:23
   - -------------------------------------- 3.4/72.0 MB 3.2 MB/s eta 0:00:22
   -- ------------------------------------- 4.5/72.0 MB 3.4 MB/s eta 0:00:21
   -- ------------------------------------- 5.2/72.0 MB 3.5 MB/s eta 0:00:20
   --- ------------------------------------ 6.3/72.0 MB 3.7 MB/s eta 0:00:18
   ---- ----------------------------------- 7.3/72.0 MB 3.9 MB/s eta 0:00:17
   ---- ------------

In [3]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor


In [4]:
X = pd.read_csv("data/train_features.csv")
y = pd.read_csv("data/train_target.csv").values.ravel()
X_test = pd.read_csv("data/test_features.csv")

print(X.shape, y.shape, X_test.shape)


(16209, 384) (16209,) (5404, 384)


In [5]:
X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42
)


In [6]:
tabular_model = XGBRegressor(
    n_estimators=400,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="reg:squarederror",
    random_state=42
)

tabular_model.fit(X_train, y_train)


In [7]:
val_preds = tabular_model.predict(X_val)

rmse = mean_squared_error(y_val, val_preds, squared=False)
r2 = r2_score(y_val, val_preds)

print("Baseline Tabular Model")
print("RMSE:", rmse)
print("R² Score:", r2)


Baseline Tabular Model
RMSE: 0.1636837538743402
R² Score: 0.9029099006737175




In [8]:
import joblib
joblib.dump(tabular_model, "tabular_baseline_model.pkl")


['tabular_baseline_model.pkl']

In [10]:
!pip install torch

Collecting torch
  Downloading torch-2.9.1-cp312-cp312-win_amd64.whl.metadata (30 kB)
Collecting sympy>=1.13.3 (from torch)
  Downloading sympy-1.14.0-py3-none-any.whl.metadata (12 kB)
Downloading torch-2.9.1-cp312-cp312-win_amd64.whl (110.9 MB)
   ---------------------------------------- 0.0/110.9 MB ? eta -:--:--
   ---------------------------------------- 0.3/110.9 MB ? eta -:--:--
   ---------------------------------------- 0.5/110.9 MB 1.9 MB/s eta 0:01:00
   ---------------------------------------- 1.0/110.9 MB 2.1 MB/s eta 0:00:53
    --------------------------------------- 1.6/110.9 MB 2.3 MB/s eta 0:00:49
    --------------------------------------- 2.4/110.9 MB 2.5 MB/s eta 0:00:44
   - -------------------------------------- 2.9/110.9 MB 2.7 MB/s eta 0:00:41
   - -------------------------------------- 3.7/110.9 MB 2.8 MB/s eta 0:00:39
   - -------------------------------------- 4.7/110.9 MB 3.0 MB/s eta 0:00:36
   -- ------------------------------------- 5.8/110.9 MB 3.3 MB/s 

In [12]:
!pip install torchvision

Collecting torchvision
  Downloading torchvision-0.24.1-cp312-cp312-win_amd64.whl.metadata (5.9 kB)
Downloading torchvision-0.24.1-cp312-cp312-win_amd64.whl (4.3 MB)
   ---------------------------------------- 0.0/4.3 MB ? eta -:--:--
   ---------------------------------------- 0.0/4.3 MB ? eta -:--:--
   ---- ----------------------------------- 0.5/4.3 MB 1.9 MB/s eta 0:00:03
   ------- -------------------------------- 0.8/4.3 MB 1.9 MB/s eta 0:00:02
   ------------ --------------------------- 1.3/4.3 MB 2.0 MB/s eta 0:00:02
   ------------------- -------------------- 2.1/4.3 MB 2.3 MB/s eta 0:00:01
   ------------------------ --------------- 2.6/4.3 MB 2.4 MB/s eta 0:00:01
   ------------------------------- -------- 3.4/4.3 MB 2.7 MB/s eta 0:00:01
   ---------------------------------------- 4.3/4.3 MB 2.9 MB/s eta 0:00:00
Installing collected packages: torchvision
Successfully installed torchvision-0.24.1


In [13]:
import os
import pandas as pd
import numpy as np

from PIL import Image
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models


In [19]:
train_img_df = pd.read_csv("data/train_with_images.csv")
print("Train with images shape:", train_img_df.shape)


Train with images shape: (50, 22)


In [20]:
test_img_df = pd.read_csv("data/test_with_images.csv")
print("Test with images shape:", test_img_df.shape)


Test with images shape: (2, 21)


In [21]:
image_transforms = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]
    )
])


In [22]:
class PropertyImageDataset(Dataset):
    def __init__(self, dataframe, transform=None):
        self.df = dataframe.reset_index(drop=True)
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        img_path = self.df.loc[idx, "image_path"]
        image = Image.open(img_path).convert("RGB")

        if self.transform:
            image = self.transform(image)

        return image


In [24]:
import os

def find_image_path(image_id, base_dir):
    for ext in ["png", "jpg", "jpeg"]:
        path = os.path.join(base_dir, f"{image_id}.{ext}")
        if os.path.exists(path):
            return path
    return None


In [26]:
import pandas as pd

train = pd.read_csv("data/train.csv")

image_paths = []
valid_rows = []

for _, row in train.iterrows():
    img_path = find_image_path(row["id"], "images/train")
    if img_path is not None:
        valid_rows.append(row)
        image_paths.append(img_path)

train_img = pd.DataFrame(valid_rows)
train_img["image_path"] = image_paths

print("Final train rows with images:", len(train_img))


Final train rows with images: 50


In [27]:
train_img.to_csv("data/train_with_images.csv", index=False)


In [28]:
train_img_df = pd.read_csv("data/train_with_images.csv")
print(train_img_df.head())


         id             date   price  bedrooms  bathrooms  sqft_living  \
0  16000545  20150312T000000  250000         4       1.00         1320   
1  42000130  20140924T000000  600000         5       4.50         4440   
2   1200019  20140508T000000  647500         4       1.75         2060   
3  34001304  20150410T000000  480000         5       2.25         2240   
4  34000005  20140618T000000  343566         2       1.00         1100   

   sqft_lot  floors  waterfront  view  ...  sqft_above  sqft_basement  \
0     11212     1.0           0     0  ...        1320              0   
1      9784     2.0           0     0  ...        4440              0   
2     26036     1.0           0     0  ...        1160            900   
3      5500     1.0           0     0  ...        1490            750   
4      4200     1.0           0     0  ...        1100              0   

   yr_built  yr_renovated  zipcode      lat     long  sqft_living15  \
0      1914             0    98002  47.3098 -

In [30]:
train_image_dataset = PropertyImageDataset(
    train_img_df,
    transform=image_transforms
)

train_image_loader = DataLoader(
    train_image_dataset,
    batch_size=32,
    shuffle=True
)

images = next(iter(train_image_loader))
print(images.shape)


torch.Size([32, 3, 224, 224])


In [31]:
import torch
from torchvision import models

# Use GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# Load pretrained ResNet18
resnet = models.resnet18(pretrained=True)

# Remove final classification layer
resnet.fc = torch.nn.Identity()

resnet = resnet.to(device)
resnet.eval()


Using device: cpu




Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to C:\Users\Chhavikash Raj/.cache\torch\hub\checkpoints\resnet18-f37072fd.pth


100%|██████████| 44.7M/44.7M [00:07<00:00, 5.98MB/s]


ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

In [32]:
import numpy as np
from tqdm import tqdm

def extract_image_features(dataloader, model):
    features = []

    with torch.no_grad():
        for images in tqdm(dataloader):
            images = images.to(device)
            outputs = model(images)
            features.append(outputs.cpu().numpy())

    features = np.vstack(features)
    return features


In [33]:
train_image_features = extract_image_features(
    train_image_loader,
    resnet
)

print("Train image feature shape:", train_image_features.shape)


100%|██████████| 2/2 [00:01<00:00,  1.88it/s]

Train image feature shape: (50, 512)





In [34]:
np.save("data/train_image_features.npy", train_image_features)


In [35]:
import pandas as pd
import numpy as np

# Load image-aligned train data
train_img_df = pd.read_csv("data/train_with_images.csv")

# Load full processed tabular features
X_tab = pd.read_csv("data/train_features.csv")
y_full = pd.read_csv("data/train_target.csv").values.ravel()

# Align tabular rows using id
X_tab_img = X_tab.loc[train_img_df.index].reset_index(drop=True)
y_img = y_full[train_img_df.index]

print(X_tab_img.shape, y_img.shape)


(50, 384) (50,)


In [36]:
X_img = np.load("data/train_image_features.npy")

print("Image features shape:", X_img.shape)


Image features shape: (50, 512)


In [39]:
X_fused = np.concatenate(
    [X_tab_img.values, X_img],
    axis=1
)

print("Final fused feature shape:", X_fused.shape)


Final fused feature shape: (50, 896)


In [40]:
X_fused = np.concatenate(
    [X_tab_img.values, X_img],
    axis=1
)

print("Final fused feature shape:", X_fused.shape)


Final fused feature shape: (50, 896)


In [41]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor

X_train, X_val, y_train, y_val = train_test_split(
    X_fused,
    y_img,
    test_size=0.2,
    random_state=42
)

multimodal_model = XGBRegressor(
    n_estimators=500,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="reg:squarederror",
    random_state=42
)

multimodal_model.fit(X_train, y_train)


In [42]:
val_preds = multimodal_model.predict(X_val)

rmse = mean_squared_error(y_val, val_preds, squared=False)
r2 = r2_score(y_val, val_preds)

print("Multimodal Model Results")
print("RMSE:", rmse)
print("R²:", r2)


Multimodal Model Results
RMSE: 0.1769562602384845
R²: 0.6473656331540378




In [43]:
import numpy as np
import pandas as pd


In [44]:
test = pd.read_csv("data/test.csv")
X_test_tab = pd.read_csv("data/test_features.csv")

print(test.shape, X_test_tab.shape)


(5404, 20) (5404, 384)


In [45]:
import os

test_image_ids = set(
    int(f.split(".")[0])
    for f in os.listdir("images/test")
    if f.endswith((".png", ".jpg"))
)

test["has_image"] = test["id"].isin(test_image_ids)

print(test["has_image"].value_counts())


has_image
False    5402
True        2
Name: count, dtype: int64


In [46]:
tabular_test_preds_log = tabular_model.predict(X_test_tab)


In [48]:
# Predict using tabular model for all test rows
final_preds_log = tabular_model.predict(X_test_tab)

# Convert back from log-scale
final_preds = np.expm1(final_preds_log)

submission = pd.DataFrame({
    "id": test["id"],
    "predicted_price": final_preds
})

submission.to_csv("enrollno_final.csv", index=False)


In [50]:
tabular_test_preds_log = tabular_model.predict(X_test_tab)


In [51]:
import numpy as np

final_preds = np.expm1(tabular_test_preds_log)


In [52]:
submission = pd.DataFrame({
    "id": test["id"],
    "predicted_price": final_preds
})


In [53]:
submission.to_csv("enrollno_final.csv", index=False)


In [54]:
print(submission.head())
print(submission.shape)


           id  predicted_price
0  2591820310     3.579433e+05
1  7974200820     8.270059e+05
2  7701450110     1.094641e+06
3  9522300010     2.005054e+06
4  9510861140     7.353151e+05
(5404, 2)
