In [23]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split

# 1. Loading the Data
data = np.load('/kaggle/input/data-for-model-training-and-testing/dataset.npz')
ids_df = pd.read_csv('/kaggle/input/data-for-model-training-and-testing/test_ids.csv')

# Model Training - Tabular Data

In [24]:
X_train_full = data['X_train']
y_train_full = data['y_train']
X_test_submit = data['x_test']  # This is for the final submission (unseen data)

# 2.Local Validation Set
# We hold back 20% of our training data to measure accuracy
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full, y_train_full, 
    test_size=0.2, 
    random_state=42
)

print(f"Training on:   {X_train.shape} samples")
print(f"Validating on: {X_val.shape} samples")

Training on:   (12967, 18) samples
Validating on: (3242, 18) samples


In [25]:
# 3. Initialize the Model
model = xgb.XGBRegressor(
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    n_jobs=-1,            # Use all CPU cores
    random_state=42
)

# 4. Train with Early Stopping
# This automatically stops if validation error stops dropping
eval_set = [(X_train, y_train), (X_val, y_val)]

model.fit(
    X_train, y_train,
    eval_set=eval_set,
    verbose=100  # Print progress every 100 rounds
)

[0]	validation_0-rmse:349592.54685	validation_1-rmse:343051.15282
[100]	validation_0-rmse:91508.42621	validation_1-rmse:125263.61970
[200]	validation_0-rmse:74581.65104	validation_1-rmse:119568.82088
[300]	validation_0-rmse:65484.51287	validation_1-rmse:117907.76656
[400]	validation_0-rmse:58709.92005	validation_1-rmse:117363.64951
[500]	validation_0-rmse:53549.87199	validation_1-rmse:116918.42573
[600]	validation_0-rmse:49414.45755	validation_1-rmse:116785.66793
[700]	validation_0-rmse:45467.44412	validation_1-rmse:116654.85252
[800]	validation_0-rmse:42127.67437	validation_1-rmse:116590.62021
[900]	validation_0-rmse:39161.68013	validation_1-rmse:116496.44823
[999]	validation_0-rmse:36537.00487	validation_1-rmse:116523.57755


In [26]:
from sklearn.metrics import mean_squared_error, r2_score

# 1. Check Local Accuracy (Validation Set)
val_predictions = model.predict(X_val)

# Calculate Metrics
mse = mean_squared_error(y_val, val_predictions)
rmse = np.sqrt(mse)
r2 = r2_score(y_val, val_predictions)

print("="*40)
print(f"MODEL PERFORMANCE (XGBoost)")
print(f"Validation RMSE:     ${rmse:,.2f}")
print(f"Validation R¬≤ Score: {r2:.4f}")
print("="*40)

# Interpretation
print(f"This means your model explains {r2*100:.2f}% of the price variation.")
print(f"On average, predictions are off by approx ${rmse:,.0f}.")

MODEL PERFORMANCE (XGBoost)
Validation RMSE:     $116,523.57
Validation R¬≤ Score: 0.8918
This means your model explains 89.18% of the price variation.
On average, predictions are off by approx $116,524.


In [31]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torchvision.models as models
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from PIL import Image
import os
import time

# ==========================================
# 1. CONFIGURATION & PATHS
# ==========================================
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
BATCH_SIZE = 32
LEARNING_RATE = 1e-4
EPOCHS = 20

# Paths (Updated based on your inputs)
IMAGE_DIR = '/kaggle/input/image-data'
PROCESSED_DATA_PATH = '/kaggle/input/data-for-model-training-and-testing/dataset.npz'
TEST_IDS_PATH = '/kaggle/input/data-for-model-training-and-testing/test_ids.csv'
# Note: Double check this path if it fails. It might be 'train.xlsx'
ORIGINAL_DATA_PATH = '/kaggle/input/dataset-for-model/train(1).xlsx' 

print(f"üöÄ Running on {DEVICE}")

# ==========================================
# 2. DATA LOADING & ALIGNMENT
# ==========================================
print("\n--- Step 1: Loading Data ---")

# A. Load Numerical Data
data = np.load(PROCESSED_DATA_PATH)
X_train_full = data['X_train']  # Check casing if error (e.g. 'X_train' vs 'x_train')
y_train_full = data['y_train']
X_test_submission = data['x_test']

print(f"Numerical Data Loaded: {X_train_full.shape} samples")

# B. Load Original Data to Recover IDs
# We need IDs to find the matching image for each row of numbers
try:
    raw_df = pd.read_excel(ORIGINAL_DATA_PATH)
except FileNotFoundError:
    print(f"‚ö†Ô∏è File not found at {ORIGINAL_DATA_PATH}. Searching for valid path...")
    for root, dirs, files in os.walk('/kaggle/input'):
        for file in files:
            if 'train' in file and '.xlsx' in file:
                ORIGINAL_DATA_PATH = os.path.join(root, file)
                print(f"‚úÖ Found correct path: {ORIGINAL_DATA_PATH}")
                raw_df = pd.read_excel(ORIGINAL_DATA_PATH)
                break

# C. Align IDs with X_train_full
# If X_train_full has 16209 rows, we need 16209 IDs.
if len(raw_df) == len(X_train_full):
    print("‚úÖ ID Count matches Data Count. Using IDs directly.")
    train_ids_full = raw_df['id'].values
else:
    print(f"‚ö†Ô∏è Size mismatch (DF: {len(raw_df)} vs X: {len(X_train_full)}). Re-splitting IDs...")
    # This assumes your preprocessing used random_state=42
    _, _, _, _, train_ids_full, _ = train_test_split(
        raw_df, raw_df['price'], raw_df['id'], test_size=0.2, random_state=42
    )

# D. Create Validation Split
# We split everything (X, y, and IDs) together so they stay perfectly aligned
X_train, X_val, y_train, y_val, train_ids, val_ids = train_test_split(
    X_train_full, 
    y_train_full, 
    train_ids_full, 
    test_size=0.2, 
    random_state=42
)

# E. Log-Transform Targets (CRITICAL for RMSE)
y_train_log = np.log1p(y_train)
y_val_log = np.log1p(y_val)

print(f"Final Training Set: {X_train.shape} samples")
print(f"Final Validation Set: {X_val.shape} samples")

# ==========================================
# 3. DATASET CLASS & LOADERS
# ==========================================
class RealEstateDataset(Dataset):
    def __init__(self, features, ids, image_dir, targets=None, transform=None, mode='train'):
        self.features = torch.tensor(features, dtype=torch.float32)
        self.ids = [str(i) for i in ids] # Ensure string for filenames
        self.image_dir = image_dir
        self.transform = transform
        self.mode = mode
        
        if mode == 'train':
            self.targets = torch.tensor(targets, dtype=torch.float32).view(-1, 1)
        else:
            self.targets = None

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        # 1. Load Image
        img_id = self.ids[idx]
        img_path = os.path.join(self.image_dir, f"{img_id}.jpg")
        
        try:
            image = Image.open(img_path).convert('RGB')
            if self.transform:
                image = self.transform(image)
        except (FileNotFoundError, OSError):
            image = torch.zeros((3, 224, 224), dtype=torch.float32) # Fallback

        # 2. Get Numbers
        tab_data = self.features[idx]
        
        if self.mode == 'train':
            return image, tab_data, self.targets[idx]
        else:
            return image, tab_data

# Transforms
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Create Loaders
train_ds = RealEstateDataset(X_train, train_ids, IMAGE_DIR, y_train_log, transform, 'train')
val_ds = RealEstateDataset(X_val, val_ids, IMAGE_DIR, y_val_log, transform, 'train')

train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, num_workers=2)
val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False, num_workers=2)

# ==========================================
# 4. MODEL ARCHITECTURE
# ==========================================
class MultimodalNet(nn.Module):
    def __init__(self, num_tabular_features):
        super(MultimodalNet, self).__init__()
        
        # Image Branch
        self.cnn = models.resnet18(pretrained=True)
        self.cnn.fc = nn.Sequential(
            nn.Linear(self.cnn.fc.in_features, 128),
            nn.ReLU(),
            nn.Dropout(0.3)
        )
        
        # Tabular Branch
        self.tabular_mlp = nn.Sequential(
            nn.Linear(num_tabular_features, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU()
        )
        
        # Fusion
        self.fusion_head = nn.Sequential(
            nn.Linear(128 + 32, 64),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(64, 1)
        )

    def forward(self, image, tab_data):
        img_embed = self.cnn(image)
        tab_embed = self.tabular_mlp(tab_data)
        combined = torch.cat((img_embed, tab_embed), dim=1)
        return self.fusion_head(combined)

model = MultimodalNet(num_tabular_features=X_train.shape[1]).to(DEVICE)
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
criterion = nn.MSELoss()

# ==========================================
# 5. TRAINING LOOP
# ==========================================
print("\n--- Step 2: Training Model ---")
best_rmse = float('inf')

for epoch in range(EPOCHS):
    start_time = time.time()
    model.train()
    train_loss = 0
    
    # Train
    for images, tabs, targets in train_loader:
        images, tabs, targets = images.to(DEVICE), tabs.to(DEVICE), targets.to(DEVICE)
        
        optimizer.zero_grad()
        outputs = model(images, tabs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        
    # Validate
    model.eval()
    actuals = []
    predictions = []
    
    with torch.no_grad():
        for images, tabs, targets in val_loader:
            images, tabs = images.to(DEVICE), tabs.to(DEVICE)
            
            # Predict (Log Scale)
            log_preds = model(images, tabs)
            
            # Convert back to Dollars
            real_preds = torch.expm1(log_preds).cpu().numpy().flatten()
            real_targets = torch.expm1(targets).cpu().numpy().flatten()
            
            predictions.extend(real_preds)
            actuals.extend(real_targets)
            
    # Metrics
    mse = np.mean((np.array(actuals) - np.array(predictions)) ** 2)
    val_rmse = np.sqrt(mse)
    val_r2 = r2_score(actuals, predictions)
    
    epoch_time = time.time() - start_time
    print(f"Epoch {epoch+1}/{EPOCHS} [{epoch_time:.0f}s] | RMSE: ${val_rmse:,.0f} | R¬≤: {val_r2:.4f}")
    
    if val_rmse < best_rmse:
        best_rmse = val_rmse
        torch.save(model.state_dict(), 'best_multimodal_model.pth')
        print(f"   >>> üíæ Saved New Best Model!")

print(f"Training Complete. Best RMSE: ${best_rmse:,.0f}")

üöÄ Running on cuda

--- Step 1: Loading Data ---
Numerical Data Loaded: (16209, 18) samples
‚úÖ ID Count matches Data Count. Using IDs directly.
Final Training Set: (12967, 18) samples
Final Validation Set: (3242, 18) samples





--- Step 2: Training Model ---
Epoch 1/20 [50s] | RMSE: $403,626 | R¬≤: -0.2982
   >>> üíæ Saved New Best Model!
Epoch 2/20 [49s] | RMSE: $349,163 | R¬≤: 0.0285


KeyboardInterrupt: 

In [32]:
# Using XGBoost model to make the final predictions

In [33]:
# 3. Initialize the Model
model = xgb.XGBRegressor(
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    n_jobs=-1,            # Use all CPU cores
    random_state=42
)

# 4. Train with Early Stopping
# This automatically stops if validation error stops dropping
eval_set = [(X_train, y_train), (X_val, y_val)]

model.fit(
    X_train, y_train,
    eval_set=eval_set,
    verbose=100  # Print progress every 100 rounds
)

[0]	validation_0-rmse:349592.54685	validation_1-rmse:343051.15282
[100]	validation_0-rmse:91508.42621	validation_1-rmse:125263.61970
[200]	validation_0-rmse:74581.65104	validation_1-rmse:119568.82088
[300]	validation_0-rmse:65484.51287	validation_1-rmse:117907.76656
[400]	validation_0-rmse:58709.92005	validation_1-rmse:117363.64951
[500]	validation_0-rmse:53549.87199	validation_1-rmse:116918.42573
[600]	validation_0-rmse:49414.45755	validation_1-rmse:116785.66793
[700]	validation_0-rmse:45467.44412	validation_1-rmse:116654.85252
[800]	validation_0-rmse:42127.67437	validation_1-rmse:116590.62021
[900]	validation_0-rmse:39161.68013	validation_1-rmse:116496.44823
[999]	validation_0-rmse:36537.00487	validation_1-rmse:116523.57755


In [34]:
id = pd.read_csv('/kaggle/input/data-for-model-training-and-testing/test_ids.csv')

In [35]:
predicted_price = model.predict(X_test_submit)

In [36]:
predicted_price = pd.DataFrame(predicted_price, columns=['predicted_price'])

In [38]:
d = pd.concat([id,predicted_price],axis=1)

In [39]:
d

Unnamed: 0,id,predicted_price
0,2591820310,3.748754e+05
1,7974200820,8.809516e+05
2,7701450110,1.088302e+06
3,9522300010,2.050641e+06
4,9510861140,7.544586e+05
...,...,...
5399,7732500270,6.431563e+05
5400,3856903515,6.638189e+05
5401,2557000400,2.778444e+05
5402,4386700135,2.013106e+06


In [37]:
# Making the submission csv

In [40]:
import os

os.makedirs('/kaggle/working/results', exist_ok=True)
d.to_csv('/kaggle/working/results/predictions.csv', index=False)