# 문제/목표 정의
- 분석 유형: 지도학습 회귀(Regression)
- 목표 변수(y): UCI 회귀 데이터셋의 연속형 타깃(예: median_house_value, energy, concrete_strength 등)
- 입력 변수(X): 수치형 + (있다면) 범주형 피처
- 비즈니스 성공 지표: 보통 RMSE/MAE(낮을수록 좋음), 보조로 R²(높을수록 좋음)

# 데이터 점검 체크리스트
- 결측치: 수치형은 median, 범주형은 most_frequent로 대체
- 범주형 유무: object/category 있으면 OneHotEncoder 필요
- 스케일링: 선형/거리기반 모델은 스케일링 중요(트리계열은 덜 민감)
- 누수(leakage): 타깃에서 파생된 컬럼/미래정보 컬럼 없는지 확인
- 평가 방식: train/validation split + K-fold CV(이 노트북에서는 타깃을 구간화해 StratifiedKFold 사용)로 안정적인 성능 확인

# Colab 실행 코드
- 아래 코드는 UCI Real Estate Valuation 데이터셋(id=477, `ucimlrepo`로 로드)을 사용한 회귀 템플릿입니다.
- 데이터만 바꾸면 다른 UCI 회귀도 거의 그대로 적용 가능

# 데이터 가져오기

In [None]:
!pip install ucimlrepo

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.7-py3-none-any.whl.metadata (5.5 kB)
Downloading ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.7


## Introductory Paper
- 참고 : [Building real estate valuation models with comparative approach through case-based reasoning](https://dl.acm.org/doi/abs/10.1016/j.asoc.2018.01.029)

In [None]:
# pandas is imported here because this cell uses `pd` and the notebook's
# shared import cell only appears further down.
import pandas as pd
from ucimlrepo import fetch_ucirepo

# Fetch the UCI dataset with repository id 477 (Real Estate Valuation).
real_estate_valuation = fetch_ucirepo(id=477)

# Feature table and target table as returned by ucimlrepo.
X = real_estate_valuation.data.features
y = real_estate_valuation.data.targets

print("X type:", type(X), "shape:", X.shape)
print("y type:", type(y), "shape:", getattr(y, "shape", None))
print("X columns:", X.columns.tolist())
print("y columns:", y.columns.tolist() if isinstance(y, pd.DataFrame) else None)

# Reduce the target to one named Series. The prints above already allow for a
# non-DataFrame target, so handle that case here as well instead of assuming
# `.iloc[:, 0]` / `.columns` exist.
if isinstance(y, pd.DataFrame):
    y_series = y.iloc[:, 0].copy()
else:
    y_series = pd.Series(y).copy()
if y_series.name is None:
    # A column label is required to attach the target to `df` below.
    y_series.name = "target"

# Merge features and target into one working frame.
df = X.copy()
df[y_series.name] = y_series

print("\nMerged df shape:", df.shape)

X type: <class 'pandas.core.frame.DataFrame'> shape: (414, 6)
y type: <class 'pandas.core.frame.DataFrame'> shape: (414, 1)
X columns: ['X1 transaction date', 'X2 house age', 'X3 distance to the nearest MRT station', 'X4 number of convenience stores', 'X5 latitude', 'X6 longitude']
y columns: ['Y house price of unit area']

Merged df shape: (414, 7)


In [None]:
# Preview the first rows of the merged features + target frame.
display(df.head())

Unnamed: 0,X1 transaction date,X2 house age,X3 distance to the nearest MRT station,X4 number of convenience stores,X5 latitude,X6 longitude,Y house price of unit area
0,2012.917,32.0,84.87882,10,24.98298,121.54024,37.9
1,2012.917,19.5,306.5947,9,24.98034,121.53951,42.2
2,2013.583,13.3,561.9845,5,24.98746,121.54391,47.3
3,2013.5,13.3,561.9845,5,24.98746,121.54391,54.8
4,2012.833,5.0,390.5684,5,24.97937,121.54245,43.1


In [None]:
# Rename the raw UCI columns to short snake_case names
# (expects `df` to exist from an earlier cell).
rename_map = dict(
    [
        ("X1 transaction date", "transaction_date"),
        ("X2 house age", "house_age_years"),
        ("X3 distance to the nearest MRT station", "mrt_distance_m"),
        ("X4 number of convenience stores", "convenience_stores"),
        ("X5 latitude", "latitude"),
        ("X6 longitude", "longitude"),
        ("Y house price of unit area", "price_per_unit_area"),
    ]
)

# Only pass along mappings whose source column is actually present (safe).
existing_renames = {
    old_name: rename_map[old_name]
    for old_name in df.columns
    if old_name in rename_map
}
df = df.rename(columns=existing_renames)

print(df.columns.tolist())
df.head()

['transaction_date', 'house_age_years', 'mrt_distance_m', 'convenience_stores', 'latitude', 'longitude', 'price_per_unit_area']


Unnamed: 0,transaction_date,house_age_years,mrt_distance_m,convenience_stores,latitude,longitude,price_per_unit_area
0,2012.917,32.0,84.87882,10,24.98298,121.54024,37.9
1,2012.917,19.5,306.5947,9,24.98034,121.53951,42.2
2,2013.583,13.3,561.9845,5,24.98746,121.54391,47.3
3,2013.5,13.3,561.9845,5,24.98746,121.54391,54.8
4,2012.833,5.0,390.5684,5,24.97937,121.54245,43.1


In [None]:
# numpy is imported here because this cell uses `np` and the notebook's
# shared import cell only appears further down.
import numpy as np

# `transaction_date` is a fractional year (e.g. 2012.917): the integer part is
# the year, the fractional part encodes the position inside that year.
year_floor = np.floor(df["transaction_date"])
df["trans_year"] = year_floor.astype(int)

frac = (df["transaction_date"] - year_floor).clip(0, 0.999999)

# The fractions are multiples of 1/12 rounded to three decimals, so frac * 12
# lands slightly above or below an integer (0.583 * 12 = 6.996). Truncating
# mapped 2013.5 and 2013.583 to the same month and skipped other months;
# rounding to the nearest integer keeps every distinct fraction distinct.
# The upper clip keeps the result in 1..12 when frac is close to 1.
# NOTE(review): the UCI dataset page reportedly gives 2013.250 as March 2013,
# which would make this 1-based value one higher than the calendar month —
# confirm before treating it as an exact calendar month.
df["trans_month_approx"] = (frac * 12).round().astype(int).clip(upper=11) + 1

df.head()

Unnamed: 0,transaction_date,house_age_years,mrt_distance_m,convenience_stores,latitude,longitude,price_per_unit_area,trans_year,trans_month_approx
0,2012.917,32.0,84.87882,10,24.98298,121.54024,37.9,2012,12
1,2012.917,19.5,306.5947,9,24.98034,121.53951,42.2,2012,12
2,2013.583,13.3,561.9845,5,24.98746,121.54391,47.3,2013,7
3,2013.5,13.3,561.9845,5,24.98746,121.54391,54.8,2013,7
4,2012.833,5.0,390.5684,5,24.97937,121.54245,43.1,2012,10


## 데이터셋 설명
- 참고 : [Real Estate Valuation](https://archive.ics.uci.edu/dataset/477/real+estate+valuation+data+set)

# 라이브러리 가져오기

In [None]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split, KFold, cross_validate
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, make_scorer
from xgboost import XGBRegressor
from sklearn.linear_model import Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor
from sklearn.model_selection import StratifiedKFold, cross_validate

# Single seed reused by the train/validation split, the CV splitter and the
# model constructors in later cells.
RANDOM_STATE = 42
# Also seed NumPy's global random generator with the same value.
np.random.seed(RANDOM_STATE)

# Name of the regression target column in `df` (after the column renaming).
target_col = "price_per_unit_area"

# 시각화

In [None]:
# ---------------------------------------------
# [Cell 4] EDA (at least two Plotly figures)
# ---------------------------------------------
target_col = "price_per_unit_area"

# Figure 1: histogram of the regression target.
hist_kwargs = dict(
    x=target_col,
    nbins=50,
    title="Target distribution: price_per_unit_area",
)
fig1 = px.histogram(df, **hist_kwargs)
fig1.show()

# Figure 2: heatmap of pairwise correlations between the numeric columns.
numeric_df = df.select_dtypes(include=[np.number])
corr = numeric_df.corr(numeric_only=True)
fig2 = px.imshow(corr, title="Correlation heatmap (numeric)")
fig2.show()

# 데이터셋 분리

In [None]:
# ---------------------------------------------
# [Cell 5] Train/validation split (20% held out)
# ---------------------------------------------
y = df[target_col].copy()
X = df.drop(columns=[target_col])

split_parts = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)
X_train_raw, X_val_raw, y_train, y_val = split_parts

print("X_train_raw:", X_train_raw.shape, "X_val_raw:", X_val_raw.shape)

X_train_raw: (331, 8) X_val_raw: (83, 8)


# 모델 만들기

## 평가함수/스코어러

In [None]:
def rmse(y_true, y_pred):
    """Return the root mean squared error of `y_pred` against `y_true` as a float."""
    return float(np.sqrt(mean_squared_error(y_true, y_pred)))


# With greater_is_better=False, make_scorer sign-flips the metric, so this
# scorer still returns -RMSE (higher is better). Passing the named function
# instead of a negating lambda keeps the scorer picklable.
neg_rmse_scorer = make_scorer(rmse, greater_is_better=False)

## 전처리 모델 개발

In [None]:
# 1) Preprocessing: numeric branch only (every column of the current df is numeric).

num_cols = list(X_train_raw.columns)

# Median imputation followed by standardisation.
numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
]
numeric_branch = Pipeline(steps=numeric_steps)

# Columns not listed in `num_cols` are dropped.
preprocess = ColumnTransformer(
    transformers=[("num", numeric_branch, num_cols)],
    remainder="drop",
)

## StratifiedKFold for Regression
- y를 binning해서 stratify

In [None]:
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=RANDOM_STATE)


def _strata_ok(labels, n_required):
    """Return True if `labels` has at least `n_required` bins, each with at least `n_required` rows."""
    counts = pd.Series(labels).value_counts()
    return bool(len(counts) >= n_required and counts.min() >= n_required)


# Start from 10 quantile bins, but never request more bins than there are
# distinct target values.
q_start = min(10, int(pd.Series(y_train).nunique()))

q = None       # number of quantile bins finally used (None if quantile binning failed)
y_bins = None  # integer bin label per training row (None if no valid binning exists)

# Try progressively coarser quantile bins until every bin holds at least
# n_splits rows, which is needed for a stratified split into n_splits folds.
for q_try in range(q_start, 1, -1):
    candidate = pd.qcut(y_train, q=q_try, labels=False, duplicates="drop")
    if _strata_ok(candidate, n_splits):
        q, y_bins = q_try, candidate
        break

if y_bins is None:
    # Fallback: equal-width bins, accepted only if they pass the same check
    # (previously they were used unchecked and `q` was reported as 1).
    candidate = pd.cut(y_train, bins=n_splits, labels=False)
    if _strata_ok(candidate, n_splits):
        y_bins = candidate

if y_bins is not None:
    cv_splits = list(skf.split(X_train_raw, y_bins))
    print("q used:", q, "| #bins:", pd.Series(y_bins).nunique())
    print("min bin count:", pd.Series(y_bins).value_counts().min())
else:
    # No binning yields usable strata: use an unstratified shuffled K-fold
    # rather than stratifying on bins that are too small.
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=RANDOM_STATE)
    cv_splits = list(kf.split(X_train_raw))
    print("No valid stratification bins found; using plain KFold.")

q used: 10 | #bins: 10
min bin count: 32


## 모델 후보군 코드

In [None]:
# Hyperparameters for the XGBoost candidate, kept separate for readability.
xgb_params = dict(
    n_estimators=2000,
    learning_rate=0.03,
    max_depth=4,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.0,
    reg_lambda=1.0,
    random_state=RANDOM_STATE,
    n_jobs=-1,
    objective="reg:squarederror",
    tree_method="hist",
)

# Candidate regressors keyed by the short name used in tables and plot titles.
models = {
    "Ridge": Ridge(alpha=10.0, random_state=RANDOM_STATE),
    "KNN": KNeighborsRegressor(n_neighbors=15),
    "HGBR": HistGradientBoostingRegressor(random_state=RANDOM_STATE),
    "RF": RandomForestRegressor(
        n_estimators=600,
        random_state=RANDOM_STATE,
        n_jobs=-1,
    ),
    # XGBoost must be included.
    "XGB": XGBRegressor(**xgb_params),
}

# CV로 모델 비교


In [None]:
# Imported locally: the notebook's import cell does not bring in `clone`.
from sklearn.base import clone

# Metrics evaluated on every fold; neg_rmse is negated back to RMSE below.
scoring = {"neg_rmse": neg_rmse_scorer, "r2": "r2"}

rows = []   # one summary row per model
pipes = {}  # model name -> unfitted preprocessing + model pipeline

for name, model in models.items():
    # Give each pipeline its own copy of the preprocessor. Sharing one
    # instance would mean that fitting any pipeline later silently overwrites
    # the preprocessing state used by all the others.
    pipe = Pipeline(steps=[
        ("preprocess", clone(preprocess)),
        ("model", model),
    ])
    pipes[name] = pipe

    scores = cross_validate(
        pipe,
        X_train_raw, y_train,
        cv=cv_splits,
        scoring=scoring,
        n_jobs=-1,
    )

    # The scorer returns -RMSE, so flip the sign once for reporting.
    fold_rmse = -scores["test_neg_rmse"]
    rows.append({
        "model": name,
        "RMSE(CV)": np.mean(fold_rmse),
        "RMSE_std": np.std(fold_rmse),
        "R2(CV)": np.mean(scores["test_r2"]),
    })

# Rank the models by mean CV RMSE (lowest first).
cv_df = pd.DataFrame(rows).sort_values("RMSE(CV)").reset_index(drop=True)
display(cv_df)

best_name = cv_df.loc[0, "model"]
best_pipe = pipes[best_name]
print("Best model by CV RMSE:", best_name)

Unnamed: 0,model,RMSE(CV),RMSE_std,R2(CV)
0,RF,7.704147,1.599783,0.68329
1,HGBR,8.13456,1.820234,0.645748
2,XGB,8.207559,1.706097,0.640414
3,KNN,8.860914,1.421284,0.578159
4,Ridge,9.16842,1.552135,0.548566


Best model by CV RMSE: RF


# 베스트 모델 학습 + 검증 평가

In [None]:
# Fit the CV winner on the full training split and predict the validation split.
pred = best_pipe.fit(X_train_raw, y_train).predict(X_val_raw)

# Validation metrics for the selected model.
mae, rmse_val, r2 = (
    float(mean_absolute_error(y_val, pred)),
    rmse(y_val, pred),
    float(r2_score(y_val, pred)),
)

print(f"\nVAL metrics ({best_name}): MAE={mae:.4f} | RMSE={rmse_val:.4f} | R2={r2:.4f}")


VAL metrics (RF): MAE=3.8843 | RMSE=5.5970 | R2=0.8133


# Plotly 진단 (최소 2개)

In [None]:
# =================================================
# 6) Plotly diagnostics (at least two figures)
# =================================================
df_sc = pd.DataFrame({"y_true": np.array(y_val), "y_pred": pred})

# Diagonal reference line spanning the observed range of true values.
true_lo = df_sc["y_true"].min()
true_hi = df_sc["y_true"].max()
ideal_line = go.Scatter(
    x=[true_lo, true_hi],
    y=[true_lo, true_hi],
    mode="lines",
    name="ideal",
)

# Figure 1: predicted vs. actual on the validation split.
fig_pred = px.scatter(
    df_sc,
    x="y_true",
    y="y_pred",
    title=f"Predicted vs Actual (val) - {best_name}",
)
fig_pred.add_trace(ideal_line)
fig_pred.show()

# Figure 2: residuals (true minus predicted) against the prediction.
df_sc["residual"] = df_sc["y_true"] - df_sc["y_pred"]
fig_res = px.scatter(
    df_sc,
    x="y_pred",
    y="residual",
    title=f"Residuals vs Prediction (val) - {best_name}",
)
fig_res.add_hline(y=0)
fig_res.show()

In [None]:
def plot_true_vs_pred_train_val_thesis(model_name, pipe, X_train_raw, y_train, X_val_raw, y_val):
    """Fit `pipe` and show side-by-side true-vs-predicted scatter plots for train and validation.

    Args:
        model_name: Label shown in the figure title.
        pipe: Estimator or pipeline exposing `fit` and `predict`. It is
            refitted in place on the training split.
        X_train_raw: Training features.
        y_train: Training targets.
        X_val_raw: Validation features.
        y_val: Validation targets.

    Returns:
        None. The figure is displayed via `fig.show()`.
    """
    # Imported locally: `make_subplots` is not brought in by the notebook's
    # import cell, so the original call raised NameError.
    from plotly.subplots import make_subplots

    pipe.fit(X_train_raw, y_train)

    # One entry per panel, in column order:
    # (title prefix, true values, predictions, marker opacity).
    panels = [
        ("train   ", y_train, pipe.predict(X_train_raw), 0.70),
        ("val     ", y_val, pipe.predict(X_val_raw), 0.80),
    ]

    # Per-split metrics go into the subplot titles.
    titles = []
    for prefix, y_true, y_hat, _ in panels:
        mae_split = float(mean_absolute_error(y_true, y_hat))
        rmse_split = rmse(y_true, y_hat)
        r2_split = float(r2_score(y_true, y_hat))
        titles.append(
            f"{prefix}MAE={mae_split:.3f}  RMSE={rmse_split:.3f}  R²={r2_split:.3f}"
        )

    # Shared axis range over all true and predicted values, so both panels
    # use the same scale and the diagonal is comparable.
    stacked = np.concatenate(
        [np.array(values) for _, y_true, y_hat, _ in panels for values in (y_true, y_hat)]
    )
    vmin = float(stacked.min())
    vmax = float(stacked.max())

    fig = make_subplots(
        rows=1, cols=2,
        subplot_titles=titles,
        horizontal_spacing=0.10,
    )

    axis_common = dict(
        range=[vmin, vmax],
        showline=True,
        linewidth=1,
        mirror=True,
        ticks="outside",
        ticklen=4,
        showgrid=True,
        gridwidth=0.5,
        zeroline=False,
    )

    for col, (_, y_true, y_hat, opacity) in enumerate(panels, start=1):
        # Fairly opaque markers with a thin outline so the points stay crisp.
        fig.add_trace(
            go.Scatter(
                x=y_true, y=y_hat,
                mode="markers",
                marker=dict(size=5, opacity=opacity, symbol="circle", line=dict(width=0.7)),
                hoverinfo="skip",
            ),
            row=1, col=col,
        )
        # Identity line: points on it are predicted exactly.
        fig.add_trace(
            go.Scatter(
                x=[vmin, vmax], y=[vmin, vmax],
                mode="lines",
                line=dict(width=2),
                hoverinfo="skip",
                showlegend=False,
            ),
            row=1, col=col,
        )
        fig.update_xaxes(title_text="true", row=1, col=col, **axis_common)
        fig.update_yaxes(title_text="predict", row=1, col=col, **axis_common)

    fig.update_layout(
        template="simple_white",
        title=dict(text=f"True vs Predict — {model_name}", x=0.5),
        height=460,
        margin=dict(l=30, r=30, t=80, b=30),
        showlegend=False,
        font=dict(family="Times New Roman", size=14),
    )
    fig.show()


# =========================
# 8) Draw the train/validation plot for every model
#    (iterate over a subset of `pipes` to plot only selected models).
#    Each call refits the given pipeline on the training split.
# =========================
for name, pipe in pipes.items():
    plot_true_vs_pred_train_val_thesis(name, pipe, X_train_raw, y_train, X_val_raw, y_val)