# Data Processing

In [1]:
from google.colab import files
files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"heartking","key":"YOUR_KEY"}'}

In [2]:
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

!kaggle competitions download -c house-prices-advanced-regression-techniques
!unzip house-prices-advanced-regression-techniques.zip
!rm -rf *.zip

Downloading house-prices-advanced-regression-techniques.zip to /content
  0% 0.00/199k [00:00<?, ?B/s]
100% 199k/199k [00:00<00:00, 624MB/s]
Archive:  house-prices-advanced-regression-techniques.zip
  inflating: data_description.txt    
  inflating: sample_submission.csv   
  inflating: test.csv                
  inflating: train.csv               


In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.decomposition import PCA

df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

df_output = df_train['SalePrice']
df_train = df_train.drop('SalePrice', axis = 1)
df_id = df_test['Id']
df_train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,2,2008,WD,Normal
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,5,2007,WD,Normal
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,9,2008,WD,Normal
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,2,2006,WD,Abnorml
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,12,2008,WD,Normal


In [4]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# --- 1) object 타입인 열들을 category로 변환 (너가 한 방식) ---
for col in df_train.columns:
    if df_train[col].dtype == 'object':
        df_train[col] = df_train[col].astype('category')
        # test에도 같은 컬럼이 있다면 category로 바꿔준다 (없는 경우는 무시)
        if col in df_test.columns:
            df_test[col] = df_test[col].astype('category')

# --- 2) 범주형 / 수치형 컬럼 목록 생성 ---
cat_cols = list(df_train.select_dtypes(include=['category']).columns)
num_cols = list(df_train.select_dtypes(exclude=['category', 'object']).columns)

# --- 3) OneHotEncoder 생성 (sklearn 버전 호환 처리) ---
try:
    ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    # 구버전 sklearn (sparse 대신 사용)
    ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')

# --- 4) ColumnTransformer로 범주형만 OHE 적용, 숫자형은 passthrough ---
col_transformer = ColumnTransformer(
    transformers=[
        ('ohe', ohe, cat_cols)
    ],
    remainder='passthrough'  # 숫자형 컬럼은 그대로 통과
)

# --- 5) Pipeline (여기서는 단순히 transformer만 사용) ---
pipeline = Pipeline(steps=[('col_tr', col_transformer)])

# --- 6) Fit on train and transform both train/test ---
# (reset_index to avoid index alignment issues when concatenating)
X_train_trans = pipeline.fit_transform(df_train)
X_test_trans = pipeline.transform(df_test)

# --- 7) 컬럼 이름 재생성 ---
# ohe로 생성된 컬럼 이름들
# Fix: Access the fitted OHE from the pipeline after it has been fitted
fitted_ohe = pipeline.named_steps['col_tr'].named_transformers_['ohe']
ohe_feature_names = fitted_ohe.get_feature_names_out(cat_cols)

# ColumnTransformer의 remainder='passthrough'로 넘어간 숫자 컬럼들의 순서가
# transform 결과의 뒤쪽에 붙으므로, 전체 컬럼명을 아래처럼 만든다.
all_feature_names = list(ohe_feature_names) + num_cols

# --- 8) 결과를 DataFrame으로 변환 (원래 인덱스 유지) ---
df_train_encoded = pd.DataFrame(X_train_trans, columns=all_feature_names, index=df_train.index)
df_test_encoded  = pd.DataFrame(X_test_trans,  columns=all_feature_names, index=df_test.index)

# --- 9) 타입 정리 (OHE 결과는 float이므로 필요시 int로 변환 가능) ---
# 예: 모든 OHE 컬럼을 int로 바꾸려면 아래 주석 해제
# ohe_cols_set = set(ohe_feature_names)
# for c in df_train_encoded.columns:
#     if c in ohe_cols_set:
#         df_train_encoded[c] = df_train_encoded[c].astype(int)
#         df_test_encoded[c]  = df_test_encoded[c].astype(int)

# --- 10) NaN 보호 (일반적으로 필요 없지만 안전 차원에서) ---
df_train_encoded = df_train_encoded.fillna(0)
df_test_encoded  = df_test_encoded.fillna(0)

# --- 11) 확인 ---
print("Train encoded shape:", df_train_encoded.shape)
print("Test  encoded shape:", df_test_encoded.shape)
df_train_encoded.head()

df_train = df_train_encoded
df_test = df_test_encoded


Train encoded shape: (1460, 304)
Test  encoded shape: (1459, 304)


# PCA + Linear Regression

In [None]:
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

df_train = df_train_encoded
df_test = df_test_encoded

pca = PCA(n_components = 10)
df_train = pca.fit_transform(df_train)
df_test = pca.transform(df_test)

model = LinearRegression()
model.fit(df_train, df_output) # fit_transform is for transformers, use fit for models

mse = mean_squared_error(df_output, model.predict(df_train))

print(mse)

predictions = model.predict(df_test)

2056892751.8303564


In [None]:
sales = pd.DataFrame({'Id': df_id, 'SalePrice': predictions})
sales.to_csv('submission.csv', index = False)

In [None]:
!kaggle competitions submit -c house-prices-advanced-regression-techniques -f submission.csv -m "Message"

100% 33.7k/33.7k [00:00<00:00, 86.8kB/s]
Successfully submitted to House Prices - Advanced Regression Techniques

# Deep Learning

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from tqdm import tqdm

class MLP(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(MLP, self).__init__()
        self.layer1 = nn.Linear(input_dim, 512)
        self.layer2 = nn.Linear(512, 256)
        self.layer3 = nn.Linear(256, 128)
        self.layer4 = nn.Linear(128, 64)
        self.layer5 = nn.Linear(64, output_dim)

    def forward(self, x):
        x = F.relu(self.layer1(x))
        x = F.relu(self.layer2(x))
        x = F.relu(self.layer3(x))
        x = F.relu(self.layer4(x))
        x = self.layer5(x)
        return x

device = 'cuda' if torch.cuda.is_available() else 'cpu'
batch_size = 64
num_epochs = 1000

model = MLP(input_dim=304, output_dim=1).to(device)   # <- 모델을 device로 이동
optimizer = AdamW(model.parameters(), lr=0.001, weight_decay = 0.9)
loss_fn = nn.MSELoss()

# 데이터 준비 (이미 하신 것과 동일)
train_x, test_x, train_y, test_y = train_test_split(df_train_encoded, df_output, test_size=0.2)

train_x = torch.tensor(train_x.values, dtype=torch.float32).to(device)
train_y = torch.tensor(train_y.values, dtype=torch.float32).unsqueeze(1).to(device)

test_x = torch.tensor(test_x.values, dtype=torch.float32).to(device)
test_y = torch.tensor(test_y.values, dtype=torch.float32).unsqueeze(1).to(device)

train_dataset = torch.utils.data.TensorDataset(train_x, train_y)
test_dataset = torch.utils.data.TensorDataset(test_x, test_y)

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

best_test_loss = float('inf')
best_model_path = 'best_model.pth'  # 원하면 경로/이름 변경

for epoch in range(num_epochs):
    model.train()
    pbar = tqdm(train_loader, desc=f'Epoch {epoch+1}/{num_epochs}', leave=False)
    running_loss = 0.0
    for data, target in pbar:
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        outputs = model(data)
        loss = loss_fn(outputs, target)
        loss.backward()
        optimizer.step()

        running_loss += loss.item() * data.size(0)
        pbar.set_postfix({'train_batch_loss': loss.item()})


    # 전체 train loss (optional)
    epoch_train_loss = running_loss / len(train_loader.dataset)

    # validation (test) loss 계산
    model.eval()
    with torch.no_grad():
        val_outputs = model(test_x)
        test_loss = loss_fn(val_outputs, test_y)

    # 개선되면 체크포인트 저장
    if test_loss.item() < best_test_loss:
        best_test_loss = test_loss.item()
        torch.save({
            'epoch': epoch + 1,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'test_loss': best_test_loss
        }, best_model_path)
        print(f'>>> New best model saved at epoch {epoch+1}, test_loss={best_test_loss:.6f}')

    if epoch % 10 == 0:
        # 출력 (정수로 변환하지 말고 실수로 출력)
        print('-----------------------------------------')
        print(f'Epoch {epoch+1} Train Loss: {epoch_train_loss:.6f} , Best Test Loss: {best_test_loss:.6f}')
        print('-----------------------------------------')

# --- 학습 종료 후: best model 불러오기 및 적용 ---
checkpoint = torch.load(best_model_path, map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])
model.to(device)
model.eval()

with torch.no_grad():
    final_val_outputs = model(test_x)
    final_test_loss = loss_fn(final_val_outputs, test_y)
print(f'Loaded best model (saved test_loss={checkpoint["test_loss"]:.6f}) -> current test loss: {final_test_loss.item():.6f}')



>>> New best model saved at epoch 1, test_loss=30073659392.000000
-----------------------------------------
Epoch 1 Train Loss: 37010203185.095894 , Best Test Loss: 30073659392.000000
-----------------------------------------




>>> New best model saved at epoch 2, test_loss=8103959040.000000




>>> New best model saved at epoch 3, test_loss=6545766912.000000




>>> New best model saved at epoch 4, test_loss=4874540544.000000




>>> New best model saved at epoch 5, test_loss=3771742720.000000




>>> New best model saved at epoch 6, test_loss=3304635136.000000




>>> New best model saved at epoch 8, test_loss=2926203136.000000




>>> New best model saved at epoch 9, test_loss=2778224640.000000




>>> New best model saved at epoch 10, test_loss=2604680960.000000




>>> New best model saved at epoch 11, test_loss=2509603328.000000
-----------------------------------------
Epoch 11 Train Loss: 2844888211.287671 , Best Test Loss: 2509603328.000000
-----------------------------------------




>>> New best model saved at epoch 12, test_loss=2332642048.000000




>>> New best model saved at epoch 13, test_loss=2184872448.000000




>>> New best model saved at epoch 14, test_loss=2111525760.000000




>>> New best model saved at epoch 16, test_loss=1954228736.000000




>>> New best model saved at epoch 17, test_loss=1894012160.000000




>>> New best model saved at epoch 18, test_loss=1883103744.000000




>>> New best model saved at epoch 19, test_loss=1684545152.000000




>>> New best model saved at epoch 20, test_loss=1649464576.000000




-----------------------------------------
Epoch 21 Train Loss: 2123477340.931507 , Best Test Loss: 1649464576.000000
-----------------------------------------




>>> New best model saved at epoch 23, test_loss=1569601664.000000




>>> New best model saved at epoch 26, test_loss=1565118208.000000




>>> New best model saved at epoch 31, test_loss=1552212352.000000
-----------------------------------------
Epoch 31 Train Loss: 2059135132.054795 , Best Test Loss: 1552212352.000000
-----------------------------------------




>>> New best model saved at epoch 32, test_loss=1531382912.000000




>>> New best model saved at epoch 37, test_loss=1518036224.000000




-----------------------------------------
Epoch 41 Train Loss: 1953997284.821918 , Best Test Loss: 1518036224.000000
-----------------------------------------




>>> New best model saved at epoch 45, test_loss=1488643968.000000




>>> New best model saved at epoch 46, test_loss=1463857408.000000




>>> New best model saved at epoch 49, test_loss=1434919040.000000




-----------------------------------------
Epoch 51 Train Loss: 2011515321.863014 , Best Test Loss: 1434919040.000000
-----------------------------------------




-----------------------------------------
Epoch 61 Train Loss: 1936046446.465753 , Best Test Loss: 1434919040.000000
-----------------------------------------




>>> New best model saved at epoch 62, test_loss=1430921600.000000




-----------------------------------------
Epoch 71 Train Loss: 1804364401.972603 , Best Test Loss: 1430921600.000000
-----------------------------------------




>>> New best model saved at epoch 79, test_loss=1428928384.000000




>>> New best model saved at epoch 81, test_loss=1417841024.000000
-----------------------------------------
Epoch 81 Train Loss: 1729459299.068493 , Best Test Loss: 1417841024.000000
-----------------------------------------




>>> New best model saved at epoch 90, test_loss=1417036672.000000




>>> New best model saved at epoch 91, test_loss=1401637760.000000
-----------------------------------------
Epoch 91 Train Loss: 1664817943.671233 , Best Test Loss: 1401637760.000000
-----------------------------------------




>>> New best model saved at epoch 100, test_loss=1394235008.000000




-----------------------------------------
Epoch 101 Train Loss: 1649193219.506849 , Best Test Loss: 1394235008.000000
-----------------------------------------




>>> New best model saved at epoch 106, test_loss=1393786752.000000




-----------------------------------------
Epoch 111 Train Loss: 1582471422.246575 , Best Test Loss: 1393786752.000000
-----------------------------------------




>>> New best model saved at epoch 116, test_loss=1392831232.000000




>>> New best model saved at epoch 117, test_loss=1366732800.000000




-----------------------------------------
Epoch 121 Train Loss: 1655800188.493151 , Best Test Loss: 1366732800.000000
-----------------------------------------




-----------------------------------------
Epoch 131 Train Loss: 1541050322.410959 , Best Test Loss: 1366732800.000000
-----------------------------------------




>>> New best model saved at epoch 137, test_loss=1353824000.000000




>>> New best model saved at epoch 139, test_loss=1316862208.000000




-----------------------------------------
Epoch 141 Train Loss: 1646369213.369863 , Best Test Loss: 1316862208.000000
-----------------------------------------




-----------------------------------------
Epoch 151 Train Loss: 1676249834.958904 , Best Test Loss: 1316862208.000000
-----------------------------------------




-----------------------------------------
Epoch 161 Train Loss: 1389132204.273973 , Best Test Loss: 1316862208.000000
-----------------------------------------




-----------------------------------------
Epoch 171 Train Loss: 1384814739.287671 , Best Test Loss: 1316862208.000000
-----------------------------------------




-----------------------------------------
Epoch 181 Train Loss: 1358761075.726027 , Best Test Loss: 1316862208.000000
-----------------------------------------




>>> New best model saved at epoch 188, test_loss=1309993984.000000




-----------------------------------------
Epoch 191 Train Loss: 1613967852.712329 , Best Test Loss: 1309993984.000000
-----------------------------------------




-----------------------------------------
Epoch 201 Train Loss: 1707890617.863014 , Best Test Loss: 1309993984.000000
-----------------------------------------




-----------------------------------------
Epoch 211 Train Loss: 1441432967.452055 , Best Test Loss: 1309993984.000000
-----------------------------------------




-----------------------------------------
Epoch 221 Train Loss: 1362486890.958904 , Best Test Loss: 1309993984.000000
-----------------------------------------




>>> New best model saved at epoch 229, test_loss=1260300800.000000




-----------------------------------------
Epoch 231 Train Loss: 1387269586.410959 , Best Test Loss: 1260300800.000000
-----------------------------------------




-----------------------------------------
Epoch 241 Train Loss: 1376280498.849315 , Best Test Loss: 1260300800.000000
-----------------------------------------




-----------------------------------------
Epoch 251 Train Loss: 1321043950.465753 , Best Test Loss: 1260300800.000000
-----------------------------------------




-----------------------------------------
Epoch 261 Train Loss: 1714204933.260274 , Best Test Loss: 1260300800.000000
-----------------------------------------




-----------------------------------------
Epoch 271 Train Loss: 1457447945.643836 , Best Test Loss: 1260300800.000000
-----------------------------------------




-----------------------------------------
Epoch 281 Train Loss: 1476519918.904109 , Best Test Loss: 1260300800.000000
-----------------------------------------




-----------------------------------------
Epoch 291 Train Loss: 1361534000.219178 , Best Test Loss: 1260300800.000000
-----------------------------------------




>>> New best model saved at epoch 296, test_loss=1251115392.000000




-----------------------------------------
Epoch 301 Train Loss: 1320639530.082192 , Best Test Loss: 1251115392.000000
-----------------------------------------




-----------------------------------------
Epoch 311 Train Loss: 1319585311.561644 , Best Test Loss: 1251115392.000000
-----------------------------------------




-----------------------------------------
Epoch 321 Train Loss: 1314200670.246575 , Best Test Loss: 1251115392.000000
-----------------------------------------




-----------------------------------------
Epoch 331 Train Loss: 1469621248.876712 , Best Test Loss: 1251115392.000000
-----------------------------------------




-----------------------------------------
Epoch 341 Train Loss: 1381945628.931507 , Best Test Loss: 1251115392.000000
-----------------------------------------




>>> New best model saved at epoch 344, test_loss=1239507712.000000




-----------------------------------------
Epoch 351 Train Loss: 1398385613.150685 , Best Test Loss: 1239507712.000000
-----------------------------------------




-----------------------------------------
Epoch 361 Train Loss: 1315627884.712329 , Best Test Loss: 1239507712.000000
-----------------------------------------




-----------------------------------------
Epoch 371 Train Loss: 1447088677.260274 , Best Test Loss: 1239507712.000000
-----------------------------------------




-----------------------------------------
Epoch 381 Train Loss: 1422209074.849315 , Best Test Loss: 1239507712.000000
-----------------------------------------




-----------------------------------------
Epoch 391 Train Loss: 1288358503.013699 , Best Test Loss: 1239507712.000000
-----------------------------------------




-----------------------------------------
Epoch 401 Train Loss: 1417786883.506849 , Best Test Loss: 1239507712.000000
-----------------------------------------




>>> New best model saved at epoch 411, test_loss=1228289408.000000
-----------------------------------------
Epoch 411 Train Loss: 1392562434.630137 , Best Test Loss: 1228289408.000000
-----------------------------------------




-----------------------------------------
Epoch 421 Train Loss: 1532057415.890411 , Best Test Loss: 1228289408.000000
-----------------------------------------




-----------------------------------------
Epoch 431 Train Loss: 1409068831.561644 , Best Test Loss: 1228289408.000000
-----------------------------------------




-----------------------------------------
Epoch 441 Train Loss: 1554846166.794521 , Best Test Loss: 1228289408.000000
-----------------------------------------




-----------------------------------------
Epoch 451 Train Loss: 1366178447.780822 , Best Test Loss: 1228289408.000000
-----------------------------------------




-----------------------------------------
Epoch 461 Train Loss: 1346007957.041096 , Best Test Loss: 1228289408.000000
-----------------------------------------




-----------------------------------------
Epoch 471 Train Loss: 1349118639.342466 , Best Test Loss: 1228289408.000000
-----------------------------------------




>>> New best model saved at epoch 473, test_loss=1223459584.000000




-----------------------------------------
Epoch 481 Train Loss: 1373044076.712329 , Best Test Loss: 1223459584.000000
-----------------------------------------




-----------------------------------------
Epoch 491 Train Loss: 1522689123.945205 , Best Test Loss: 1223459584.000000
-----------------------------------------




-----------------------------------------
Epoch 501 Train Loss: 1320182824.328767 , Best Test Loss: 1223459584.000000
-----------------------------------------




-----------------------------------------
Epoch 511 Train Loss: 1464873186.191781 , Best Test Loss: 1223459584.000000
-----------------------------------------




-----------------------------------------
Epoch 521 Train Loss: 1267494452.602740 , Best Test Loss: 1223459584.000000
-----------------------------------------




-----------------------------------------
Epoch 531 Train Loss: 1295185697.315068 , Best Test Loss: 1223459584.000000
-----------------------------------------




-----------------------------------------
Epoch 541 Train Loss: 1338821453.150685 , Best Test Loss: 1223459584.000000
-----------------------------------------




-----------------------------------------
Epoch 551 Train Loss: 1291341400.547945 , Best Test Loss: 1223459584.000000
-----------------------------------------




-----------------------------------------
Epoch 561 Train Loss: 1359065501.808219 , Best Test Loss: 1223459584.000000
-----------------------------------------




-----------------------------------------
Epoch 571 Train Loss: 1607902787.506849 , Best Test Loss: 1223459584.000000
-----------------------------------------




-----------------------------------------
Epoch 581 Train Loss: 1447134883.068493 , Best Test Loss: 1223459584.000000
-----------------------------------------




-----------------------------------------
Epoch 591 Train Loss: 1407320132.383562 , Best Test Loss: 1223459584.000000
-----------------------------------------




-----------------------------------------
Epoch 601 Train Loss: 1579277568.000000 , Best Test Loss: 1223459584.000000
-----------------------------------------




-----------------------------------------
Epoch 611 Train Loss: 1580221618.849315 , Best Test Loss: 1223459584.000000
-----------------------------------------




-----------------------------------------
Epoch 621 Train Loss: 1485189478.575342 , Best Test Loss: 1223459584.000000
-----------------------------------------




-----------------------------------------
Epoch 631 Train Loss: 1302302968.986301 , Best Test Loss: 1223459584.000000
-----------------------------------------




-----------------------------------------
Epoch 641 Train Loss: 1278567213.589041 , Best Test Loss: 1223459584.000000
-----------------------------------------




-----------------------------------------
Epoch 651 Train Loss: 1206550741.917808 , Best Test Loss: 1223459584.000000
-----------------------------------------




-----------------------------------------
Epoch 661 Train Loss: 1385598362.301370 , Best Test Loss: 1223459584.000000
-----------------------------------------




-----------------------------------------
Epoch 671 Train Loss: 1358910779.616438 , Best Test Loss: 1223459584.000000
-----------------------------------------




>>> New best model saved at epoch 674, test_loss=1207021568.000000




-----------------------------------------
Epoch 681 Train Loss: 1252771008.876712 , Best Test Loss: 1207021568.000000
-----------------------------------------




-----------------------------------------
Epoch 691 Train Loss: 1484416653.150685 , Best Test Loss: 1207021568.000000
-----------------------------------------




-----------------------------------------
Epoch 701 Train Loss: 1534281062.575342 , Best Test Loss: 1207021568.000000
-----------------------------------------




-----------------------------------------
Epoch 711 Train Loss: 1230746550.356164 , Best Test Loss: 1207021568.000000
-----------------------------------------




-----------------------------------------
Epoch 721 Train Loss: 1240351697.095891 , Best Test Loss: 1207021568.000000
-----------------------------------------




-----------------------------------------
Epoch 731 Train Loss: 1739901296.219178 , Best Test Loss: 1207021568.000000
-----------------------------------------




-----------------------------------------
Epoch 741 Train Loss: 1482025044.164384 , Best Test Loss: 1207021568.000000
-----------------------------------------




-----------------------------------------
Epoch 751 Train Loss: 1471508130.191781 , Best Test Loss: 1207021568.000000
-----------------------------------------




-----------------------------------------
Epoch 761 Train Loss: 1491852683.835616 , Best Test Loss: 1207021568.000000
-----------------------------------------




-----------------------------------------
Epoch 771 Train Loss: 1561858773.917808 , Best Test Loss: 1207021568.000000
-----------------------------------------




-----------------------------------------
Epoch 781 Train Loss: 1324439855.342466 , Best Test Loss: 1207021568.000000
-----------------------------------------




-----------------------------------------
Epoch 791 Train Loss: 1483389822.246575 , Best Test Loss: 1207021568.000000
-----------------------------------------




-----------------------------------------
Epoch 801 Train Loss: 1436597449.643836 , Best Test Loss: 1207021568.000000
-----------------------------------------




-----------------------------------------
Epoch 811 Train Loss: 1456202133.041096 , Best Test Loss: 1207021568.000000
-----------------------------------------




>>> New best model saved at epoch 814, test_loss=1188809344.000000




-----------------------------------------
Epoch 821 Train Loss: 1305830459.616438 , Best Test Loss: 1188809344.000000
-----------------------------------------




-----------------------------------------
Epoch 831 Train Loss: 1367391772.931507 , Best Test Loss: 1188809344.000000
-----------------------------------------




-----------------------------------------
Epoch 841 Train Loss: 1307588108.273973 , Best Test Loss: 1188809344.000000
-----------------------------------------




-----------------------------------------
Epoch 851 Train Loss: 1552798235.178082 , Best Test Loss: 1188809344.000000
-----------------------------------------




-----------------------------------------
Epoch 861 Train Loss: 1331259259.178082 , Best Test Loss: 1188809344.000000
-----------------------------------------




-----------------------------------------
Epoch 871 Train Loss: 1370849153.753425 , Best Test Loss: 1188809344.000000
-----------------------------------------




-----------------------------------------
Epoch 881 Train Loss: 1560303824.657534 , Best Test Loss: 1188809344.000000
-----------------------------------------




-----------------------------------------
Epoch 891 Train Loss: 1723565354.082192 , Best Test Loss: 1188809344.000000
-----------------------------------------




-----------------------------------------
Epoch 901 Train Loss: 1450632661.041096 , Best Test Loss: 1188809344.000000
-----------------------------------------




-----------------------------------------
Epoch 911 Train Loss: 1389608014.904109 , Best Test Loss: 1188809344.000000
-----------------------------------------




-----------------------------------------
Epoch 921 Train Loss: 1395618296.109589 , Best Test Loss: 1188809344.000000
-----------------------------------------




-----------------------------------------
Epoch 931 Train Loss: 1358099862.794521 , Best Test Loss: 1188809344.000000
-----------------------------------------




-----------------------------------------
Epoch 941 Train Loss: 1501331610.301370 , Best Test Loss: 1188809344.000000
-----------------------------------------




-----------------------------------------
Epoch 951 Train Loss: 1511693299.726027 , Best Test Loss: 1188809344.000000
-----------------------------------------




-----------------------------------------
Epoch 961 Train Loss: 1520231737.863014 , Best Test Loss: 1188809344.000000
-----------------------------------------




-----------------------------------------
Epoch 971 Train Loss: 1664119993.863014 , Best Test Loss: 1188809344.000000
-----------------------------------------




-----------------------------------------
Epoch 981 Train Loss: 1402498830.904109 , Best Test Loss: 1188809344.000000
-----------------------------------------




-----------------------------------------
Epoch 991 Train Loss: 1652560327.890411 , Best Test Loss: 1188809344.000000
-----------------------------------------




Loaded best model (saved test_loss=1188809344.000000) -> current test loss: 1188809344.000000


In [None]:
submit_x = torch.tensor(df_test_encoded.values, dtype=torch.float32)
pred = model(submit_x)
sales = pd.DataFrame({'Id': df_id, 'SalePrice': pred.detach().numpy().squeeze()})

sales.to_csv('submission.csv', index = False)
!kaggle competitions submit -c house-prices-advanced-regression-techniques -f submission.csv -m "Message"

100% 21.1k/21.1k [00:00<00:00, 56.7kB/s]
Successfully submitted to House Prices - Advanced Regression Techniques

# XGBoost

In [5]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.6.0-py3-none-any.whl.metadata (17 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.10.1-py3-none-any.whl.metadata (11 kB)
Downloading optuna-4.6.0-py3-none-any.whl (404 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m404.7/404.7 kB[0m [31m34.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.10.1-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, optuna
Successfully installed colorlog-6.10.1 optuna-4.6.0


In [None]:
!pip install optuna-integration[xgboost]

Collecting optuna-integration[xgboost]
  Downloading optuna_integration-4.6.0-py3-none-any.whl.metadata (12 kB)
Downloading optuna_integration-4.6.0-py3-none-any.whl (99 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.1/99.1 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: optuna-integration
Successfully installed optuna-integration-4.6.0


In [None]:
from sklearn.model_selection import RandomizedSearchCV, KFold, train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error

df_output = np.log1p(df_output)
X_train, X_valid, y_train, y_valid = train_test_split(df_train, df_output, test_size=0.2)

In [None]:
import optuna
import xgboost as xgb
from optuna.integration import XGBoostPruningCallback
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np

def objective(trial):
    param = {
        'verbosity': 0,
        'objective': 'reg:squarederror',
        'tree_method': 'hist',
        'eval_metric': 'rmse', # Fix: Move eval_metric to model constructor parameters
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2, log=True),
        'n_estimators': 2000,
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.3, 1.0),
        'gamma': trial.suggest_float('gamma', 0.0, 5.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 5.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.5, 5.0),
    }

    kf = KFold(n_splits=5, shuffle=True)
    rmse_scores = []
    for train_idx, val_idx in kf.split(X_train):
        X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

        early_stopping_callback = xgb.callback.EarlyStopping(rounds=100, save_best=True, maximize=False)
        pruning_callback = XGBoostPruningCallback(trial, "validation_0-rmse")
        model = xgb.XGBRegressor(**param, callbacks = [early_stopping_callback, pruning_callback])

        # Fix: Use xgb.callback.EarlyStopping for early stopping and combine with PruningCallbac
        pruning_callback = XGBoostPruningCallback(trial, "validation_0-rmse")

        model.fit(
            X_tr, y_tr,
            eval_set=[(X_val, y_val)],
            verbose=False # Keep verbose for the fit method itself
        )

        # Fix: Predict using the best iteration found by early stopping
        preds = model.predict(X_val, iteration_range=(0, model.best_iteration))
        rmse = np.sqrt(mean_squared_error(y_val, preds))
        rmse_scores.append(rmse)

    return np.mean(rmse_scores)

# Study 생성 및 최적화 (pruner 활성화 권장)
study = optuna.create_study(direction='minimize',
                            sampler=optuna.samplers.TPESampler(seed=42),
                            pruner=optuna.pruners.MedianPruner(n_startup_trials=5, n_warmup_steps=0))
study.optimize(objective, n_trials=160, n_jobs=1)  # n_jobs 병렬실행은 objective 내부에서 XGBoost가 병렬 사용하므로 보통 1 권장

print("Best params:", study.best_trial.params)
print("Best value (CV rmse):", study.best_value)

# === 최종 모델 학습 및 test 예측 ===
# 1) 최적 파라미터로 모델 생성 (n_estimators는 크게 두고 early stopping으로 조절)
best_params = study.best_trial.params
final_params = best_params.copy()
final_params.update({
    'verbosity': 0,
    'objective': 'reg:squarederror',
    'tree_method': 'hist',
    'eval_metric': 'rmse', # Fix: Add eval_metric to final_params
    'n_estimators': 5000,   # 넉넉히 줘도 early stopping으로 멈춤
    'random_state': 42
})

final_early_stopping_callback = xgb.callback.EarlyStopping(rounds=100, save_best=True, maximize=False)
final_model = xgb.XGBRegressor(**final_params, callbacks = [final_early_stopping_callback])

# 2) 간단한 validation split을 둬서 early stopping 사용
X_tr_full, X_val_for_es, y_tr_full, y_val_for_es = train_test_split(
    X_train, y_train, test_size=0.2)

# Fix: Use xgb.callback.EarlyStopping for final training

final_model.fit(
    X_tr_full, y_tr_full,
    eval_set=[(X_val_for_es, y_val_for_es)],
    verbose=True  # 최종 학습은 출력 켜서 best_iteration 확인 가능
)

# 3) test 예측 및 역변환 (y는 log1p 되어있다고 가정)
# Fix: Predict using the best iteration found by early stopping
preds_test = final_model.predict(df_test, iteration_range=(0, final_model.best_iteration))
preds = np.expm1(preds_test)  # 역변환

# preds는 제출용 예측값 (numpy array)


[1;30;43m스트리밍 출력 내용이 길어서 마지막 5000줄이 삭제되었습니다.[0m
[I 2025-11-24 15:33:06,758] Trial 115 finished with value: 0.16192754553320804 and parameters: {'max_depth': 6, 'learning_rate': 0.16382625620428443, 'min_child_weight': 3, 'subsample': 0.7127801753761248, 'colsample_bytree': 0.9503310098401173, 'gamma': 0.5017307573871219, 'reg_alpha': 1.7170311841028691, 'reg_lambda': 4.583609028816648}. Best is trial 76 with value: 0.14631343729372473.
[I 2025-11-24 15:33:06,977] Trial 116 pruned. Trial was pruned at iteration 0.
[I 2025-11-24 15:33:07,193] Trial 117 pruned. Trial was pruned at iteration 0.
[I 2025-11-24 15:33:07,306] Trial 118 pruned. Trial was pruned at iteration 0.
[I 2025-11-24 15:33:07,409] Trial 119 pruned. Trial was pruned at iteration 0.
[I 2025-11-24 15:33:12,824] Trial 120 finished with value: 0.15170762802657853 and parameters: {'max_depth': 3, 'learning_rate': 0.16720948778612832, 'min_child_weight': 2, 'subsample': 0.6872139701941938, 'colsample_bytree': 0.39592636393203

Best params: {'max_depth': 6, 'learning_rate': 0.18818067933305652, 'min_child_weight': 6, 'subsample': 0.8140116552609121, 'colsample_bytree': 0.9059524240669696, 'gamma': 0.009724372216110666, 'reg_alpha': 0.6527904077984563, 'reg_lambda': 2.4554916236470854}
Best value (CV rmse): 0.1452783691079242
[0]	validation_0-rmse:0.32780
[1]	validation_0-rmse:0.28786
[2]	validation_0-rmse:0.25800
[3]	validation_0-rmse:0.23427
[4]	validation_0-rmse:0.21490
[5]	validation_0-rmse:0.20024
[6]	validation_0-rmse:0.18888
[7]	validation_0-rmse:0.17847
[8]	validation_0-rmse:0.17144
[9]	validation_0-rmse:0.16483
[10]	validation_0-rmse:0.16127
[11]	validation_0-rmse:0.15762
[12]	validation_0-rmse:0.15380
[13]	validation_0-rmse:0.15014
[14]	validation_0-rmse:0.14731
[15]	validation_0-rmse:0.14576
[16]	validation_0-rmse:0.14454
[17]	validation_0-rmse:0.14288
[18]	validation_0-rmse:0.14107
[19]	validation_0-rmse:0.13973
[20]	validation_0-rmse:0.13837
[21]	validation_0-rmse:0.13764
[22]	validation_0-rmse:0.

In [None]:
sales = pd.DataFrame({'Id': df_id, 'SalePrice': preds})

sales.to_csv('submission.csv', index = False)

!kaggle competitions submit -c house-prices-advanced-regression-techniques -f submission.csv -m "Message"

100% 21.1k/21.1k [00:00<00:00, 35.2kB/s]
400 Client Error: Bad Request for url: https://www.kaggle.com/api/v1/competitions/submissions/submit/house-prices-advanced-regression-techniques


# CatBoost

In [4]:
!pip install catboost
!pip install optuna
!pip install bayesian-optimization

Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8
Collecting optuna
  Downloading optuna-4.6.0-py3-none-any.whl.metadata (17 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.10.1-py3-none-any.whl.metadata (11 kB)
Downloading optuna-4.6.0-py3-none-any.whl (404 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m404.7/404.7 kB[0m [31m26.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.10.1-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, optuna
Successfully installed colorlog-6.10.1 optuna-4.6.0
Collecting bayesian-optimization
  Downloading bayesian_optimization-3.1.0-py3-none-any.whl.metadata (11 kB)
Col

In [5]:
import pandas as pd
from catboost import CatBoostRegressor
from bayes_opt import BayesianOptimization

# Make copies to avoid modifying global variables if they are used elsewhere
df_train_cb = df_train.copy()
df_test_cb = df_test.copy()

# Identify categorical features and fill NaNs
categorical_feature_names = []
for col in df_train_cb.columns:
    if df_train_cb[col].dtype == 'object':
        categorical_feature_names.append(col)
        # Fill NaN values in categorical columns with a placeholder string
        df_train_cb[col] = df_train_cb[col].fillna('None').astype(str)
        if col in df_test_cb.columns:
            df_test_cb[col] = df_test_cb[col].fillna('None').astype(str)
    else:
        # Fill NaN values in numerical columns with 0 for simplicity.
        # This is consistent with how `df_train_encoded` was handled earlier.
        df_train_cb[col] = df_train_cb[col].fillna(0)
        if col in df_test_cb.columns:
            df_test_cb[col] = df_test_cb[col].fillna(0)
print(categorical_feature_names)

['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition']


In [16]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from catboost import CatBoostRegressor
from bayes_opt import BayesianOptimization
import numpy as np
from tqdm import tqdm

# 가정: X, y 준비돼 있음

X, y = df_train_cb, df_output

def cb_eval(n_estimators, depth, learning_rate, subsample, l2_leaf_reg):
    # bayes_opt은 float로 값을 받고, 필요시 int로 변환
    params = {
        'n_estimators': int(n_estimators),
        'depth': int(depth),
        'learning_rate': float(learning_rate),
        'subsample': float(subsample),
        'l2_leaf_reg': float(l2_leaf_reg),
        'verbose': False,
        'loss_function': 'RMSE',
        'cat_features': categorical_feature_names,
        'random_state': 42
    }
    # K-Fold CV
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    rmse_list = []
    for train_idx, val_idx in kf.split(X):
        X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]

        model = CatBoostRegressor(**params)
        model.fit(X_tr, y_tr)
        preds = model.predict(X_val)
        rmse = mean_squared_error(y_val, preds)
        rmse_list.append(rmse)
    # 평균 RMSE (최소화 대상)
    return -np.sqrt(np.mean(rmse_list))  # maximize()이기 때문에 음수 부호

# Bayesian Optimization 객체 생성
pbounds = {
    'n_estimators': (450, 700),
    'depth': (3, 10),
    'learning_rate': (0.01, 0.3),
    'subsample': (0.5, 1.0),
    'l2_leaf_reg': (1e-8, 10.0),
}

optimizer = BayesianOptimization(
    f=cb_eval,
    pbounds=pbounds,
    random_state=42,
)

optimizer.maximize(init_points=5, n_iter=40)

print("Best params:", optimizer.max)


|   iter    |  target   | n_esti... |   depth   | learni... | subsample | l2_lea... |
-------------------------------------------------------------------------------------
| [39m1        [39m | [39m-29145.03[39m | [39m543.63502[39m | [39m9.6550001[39m | [39m0.2222782[39m | [39m0.7993292[39m | [39m1.5601864[39m |
| [35m2        [39m | [35m-29092.92[39m | [35m488.99863[39m | [35m3.4065852[39m | [35m0.2611910[39m | [35m0.8005575[39m | [35m7.0807257[39m |
| [39m3        [39m | [39m-31150.97[39m | [39m455.14612[39m | [39m9.7893689[39m | [39m0.2514083[39m | [39m0.6061695[39m | [39m1.8182496[39m |
| [35m4        [39m | [35m-28256.39[39m | [35m495.85112[39m | [35m5.1296957[39m | [35m0.1621793[39m | [35m0.7159725[39m | [35m2.9122914[39m |
| [39m5        [39m | [39m-29422.10[39m | [39m602.96322[39m | [39m3.9764570[39m | [39m0.0947219[39m | [39m0.6831809[39m | [39m4.5606998[39m |
| [39m6        [39m | [39m-28425.19[39m | [

In [17]:
tuned_params = optimizer.max['params'].copy()
tuned_params['n_estimators'] = int(tuned_params['n_estimators'])
tuned_params['depth'] = int(tuned_params['depth'])

model = CatBoostRegressor(**tuned_params, cat_features=categorical_feature_names)
model.fit(df_train_cb, df_output)

preds_test = model.predict(df_test_cb)

submit = pd.DataFrame({'Id': df_id, 'SalePrice': preds_test})
submit.to_csv('submission.csv', index = False)

0:	learn: 74718.6655732	total: 16.6ms	remaining: 11.2s
1:	learn: 70871.4127839	total: 31.8ms	remaining: 10.7s
2:	learn: 66803.2661057	total: 48ms	remaining: 10.7s
3:	learn: 63764.7076406	total: 62.1ms	remaining: 10.4s
4:	learn: 60873.3875132	total: 77.4ms	remaining: 10.3s
5:	learn: 58049.8581013	total: 91.8ms	remaining: 10.2s
6:	learn: 55408.7484084	total: 109ms	remaining: 10.3s
7:	learn: 52840.3619715	total: 123ms	remaining: 10.2s
8:	learn: 50554.8361758	total: 136ms	remaining: 10s
9:	learn: 48758.6946349	total: 150ms	remaining: 9.97s
10:	learn: 46826.2121157	total: 167ms	remaining: 10.1s
11:	learn: 45102.2278301	total: 182ms	remaining: 10s
12:	learn: 43422.1697794	total: 196ms	remaining: 9.93s
13:	learn: 42036.1778293	total: 209ms	remaining: 9.83s
14:	learn: 40860.6061356	total: 225ms	remaining: 9.85s
15:	learn: 39863.9970440	total: 239ms	remaining: 9.79s
16:	learn: 38775.8236108	total: 253ms	remaining: 9.77s
17:	learn: 37668.7709808	total: 268ms	remaining: 9.77s
18:	learn: 36601.622

In [18]:
!kaggle competitions submit -c house-prices-advanced-regression-techniques -f submission.csv -m "Message"

100% 33.6k/33.6k [00:01<00:00, 21.3kB/s]
Successfully submitted to House Prices - Advanced Regression Techniques

# XGBoost + CatBoost + LightGBM Ensemble

In [7]:
!pip install catboost
!pip install lightgbm



In [8]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error

from catboost import CatBoostRegressor
from xgboost import XGBRegressor
import lightgbm as lgb

# -------------------------  FIX: Reload original data for proper categorical feature handling
# 1) 설정 및 컬럼 분리
# -------------------------
# Reload original dataframes to retain original dtypes for CatBoost
df_train_orig = pd.read_csv('train.csv')
df_test_orig = pd.read_csv('test.csv')
df_output_orig = df_train_orig['SalePrice']
df_train_orig = df_train_orig.drop('SalePrice', axis=1)

pred_x = df_test_orig.copy()
# Apply log1p to the target as done in the XGBoost section
df_output_log = np.log1p(df_output_orig)

categorical_cols = df_train_orig.select_dtypes(include=['object', 'category']).columns.tolist()
numeric_cols = [c for c in df_train_orig.columns if c not in categorical_cols]

print("categorical_cols:", categorical_cols)
print("numeric_cols:", numeric_cols)

# -------------------------
# 2) 데이터 준비 (train_test_split 제거)
# -------------------------
# Use full training data for training
train_data_for_catboost = df_train_orig.copy()
train_target = df_output_log.copy()

# Data for final submission prediction
pred_data_for_catboost = pred_x.copy()

# -------------------------
# 3) 준비: CatBoost용 데이터
# -------------------------
# Fill NaN values in numerical columns with 0
if numeric_cols:
    train_data_for_catboost[numeric_cols] = train_data_for_catboost[numeric_cols].fillna(0)
    pred_data_for_catboost[numeric_cols] = pred_data_for_catboost[numeric_cols].fillna(0)

# For categorical columns, fill NaNs with 'None' string
if categorical_cols:
    for col in categorical_cols:
        train_data_for_catboost[col] = train_data_for_catboost[col].fillna('None').astype(str)
        pred_data_for_catboost[col] = pred_data_for_catboost[col].fillna('None').astype(str)


# -------------------------
# 4) 준비: XGBoost and LightGBM용 파이프라인 (One-Hot encoding)
# -------------------------
# Use the original df_train_orig for preprocessing pipeline for XGBoost and LightGBM
# Note: LightGBM can handle categorical features directly, but for consistency with XGBoost and
# to simplify the stacking input, we'll use the preprocessed data here.
numeric_transformer = SimpleImputer(strategy='constant', fill_value=0)
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_cols),
        ('cat', categorical_transformer, categorical_cols)
    ],
    remainder='drop',
    sparse_threshold=0
)

# fit transformer on full training data and transform both training and prediction data
X_train_processed_for_tree_models = preprocessor.fit_transform(df_train_orig)
X_pred_processed_for_tree_models = preprocessor.transform(pred_x)

# -------------------------
# 5) 모델 학습
# -------------------------
# CatBoost: cat_features 목록(열 이름)을 넘김
cat_model = CatBoostRegressor(
    iterations=500, depth=6, loss_function='RMSE', verbose=0, random_seed=42
)
cat_model.fit(train_data_for_catboost, train_target, cat_features=categorical_cols)

# XGBoost: 배열 입력
xgb_model = XGBRegressor(
    n_estimators=500, max_depth=6, objective='reg:squarederror', random_state=42
)
xgb_model.fit(X_train_processed_for_tree_models, train_target)

# LightGBM: 배열 입력
lgbm_model = lgb.LGBMRegressor(
    n_estimators=500, max_depth=6, objective='regression_l1', random_state=42
)
lgbm_model.fit(X_train_processed_for_tree_models, train_target)

# -------------------------
# 6) 예측 및 스태킹(선형회귀) - 훈련 데이터에 대한 RMSE 계산
# -------------------------
# Get predictions for the full training set
cat_preds_train_full = cat_model.predict(train_data_for_catboost)
xgb_preds_train_full = xgb_model.predict(X_train_processed_for_tree_models)
lgbm_preds_train_full = lgbm_model.predict(X_train_processed_for_tree_models)

# Stacking input for training the stacker
X_stack_train = np.column_stack((cat_preds_train_full, xgb_preds_train_full, lgbm_preds_train_full))

stacker = LinearRegression()
stacker.fit(X_stack_train, train_target) # Fit stacker on combined train predictions and actual train_y

# Calculate RMSE on the training set (as requested)
final_preds_log_train_full = stacker.predict(X_stack_train)
final_preds_train_full = np.expm1(final_preds_log_train_full)

rmse = np.sqrt(mean_squared_error(np.expm1(train_target), final_preds_train_full)) # Compare with original scale for RMSE
print("Ensemble RMSE on full training data:", rmse)

# Evaluate individual model performance on the raw train_target (inverse transformed)
cat_rmse = np.sqrt(mean_squared_error(np.expm1(train_target), np.expm1(cat_preds_train_full)))
xgb_rmse = np.sqrt(mean_squared_error(np.expm1(train_target), np.expm1(xgb_preds_train_full)))
lgbm_rmse = np.sqrt(mean_squared_error(np.expm1(train_target), np.expm1(lgbm_preds_train_full)))
print(f"CatBoost RMSE on full training data: {cat_rmse:.6f}, XGB RMSE on full training data: {xgb_rmse:.6f}, LightGBM RMSE on full training data: {lgbm_rmse:.6f}")

# -------------------------
# 7) 실제 Kaggle 제출용 test set에 대한 예측
# -------------------------
cat_preds_submission = cat_model.predict(pred_data_for_catboost)
xgb_preds_submission = xgb_model.predict(X_pred_processed_for_tree_models)
lgbm_preds_submission = lgbm_model.predict(X_pred_processed_for_tree_models)

X_stack_submission = np.column_stack((cat_preds_submission, xgb_preds_submission, lgbm_preds_submission))
submit_pred = np.expm1(stacker.predict(X_stack_submission))

df_test = pd.read_csv('test.csv') # Re-read for df_id if needed
df_id = df_test['Id']

submit = pd.DataFrame({'Id': df_id, 'SalePrice': submit_pred})
submit.to_csv('submission.csv', index = False)

categorical_cols: ['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition']
numeric_cols: ['Id', 'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea'



In [9]:
!kaggle competitions submit -c house-prices-advanced-regression-techniques -f submission.csv -m "Message"

100% 33.6k/33.6k [00:01<00:00, 21.7kB/s]
Successfully submitted to House Prices - Advanced Regression Techniques