<a href="https://colab.research.google.com/github/guupiii/ESAA/blob/main/Poverty_Prediction_Challenge_misung.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Mount Google Drive at /content/drive (Colab only); the CSV paths read
# further down in this notebook all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# DATA

X : Household survey features → 설명 변수

y₁ : Household consumption labels → 1인당 하루 소비액 (USD/day)

y₂ : Survey-level poverty rate labels → 설문 전체 빈곤율 (여러 빈곤선 threshold)

< 전체 순서 >

```
X + y1  → 가구 소비 모델 학습
예측된 소비 → 빈곤율 계산
계산된 빈곤율 ↔ y2 비교 (검증)
```



In [None]:
# Read the three training tables in one pass:
#   X  - household survey features
#   y1 - household-level ground truth (merged onto X in the next step)
#   y2 - survey-level rate ground truth (its column names define the thresholds later)
X, y1, y2 = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/drive/MyDrive/MLData/poverty/train_hh_features.csv',
        '/content/drive/MyDrive/MLData/poverty/train_hh_gt.csv',
        '/content/drive/MyDrive/MLData/poverty/train_rates_gt.csv',
    )
)

In [None]:
# Household features of the test surveys; the submission files are built from these rows.
test = pd.read_csv('/content/drive/MyDrive/MLData/poverty/test_hh_features.csv')

# **1. 데이터 전처리**

# Processing

< 전처리 순서 >


```
X, y1 merge
split
X / y 분리
결측치 처리 fit, transform
```



## X, y1 merge

In [None]:
# Attach the household ground truth to its feature row.
# validate="one_to_one" makes pandas raise if a (survey_id, hhid) key is
# duplicated on either side, so the join cannot silently multiply rows.
train = pd.merge(
    X,
    y1,
    how="inner",
    on=["survey_id", "hhid"],
    validate="one_to_one",
)

## split

- train_test_split을 쓰려면, 각 행이 서로 독립이고 아무 행이나 섞어도 괜찮아야 함
- 하지만, 같은 survey_id 안에 있는 여러 행(가구)들이 여러 열 값 전반에서 서로 유사함
- 즉, 같은 group에 속한 행들은 train과 validation에 동시에 존재하지 않도록 같은 쪽으로 split되어야 함

In [None]:
from sklearn.model_selection import GroupShuffleSplit

In [None]:
# Split by survey_id so that all households of one survey land on the same
# side (train or validation), never on both.
gss = GroupShuffleSplit(test_size=0.2, random_state=42)
split_iter = gss.split(train, groups=train["survey_id"])

# Only the first generated split is used.
tr_idx, val_idx = next(split_iter)

train_df, val_df = (train.iloc[rows] for rows in (tr_idx, val_idx))

## X / y 분리

In [None]:
ID_COLS = ["survey_id", "hhid"]
TARGET = "cons_ppp17"

# Everything that is neither an identifier nor the target is a model feature.
non_feature_cols = ID_COLS + [TARGET]

# Training part
X_tr_raw = train_df.drop(columns=non_feature_cols).copy()
y_tr = train_df[TARGET].copy()

# Validation part
X_val_raw = val_df.drop(columns=non_feature_cols).copy()
y_val = val_df[TARGET].copy()

## test data set

In [None]:
# Keep the identifiers aside for the submission files; the rest are features.
test_id = test[ID_COLS].copy()
X_test_raw = test.drop(columns=ID_COLS).copy()

# reindex() below silently creates an all-NaN column for every training
# feature that the test file lacks. Report such columns instead of letting
# them slip through unnoticed into imputation and prediction.
missing_in_test = [c for c in X_tr_raw.columns if c not in X_test_raw.columns]
if missing_in_test:
    print(
        f"[warn] {len(missing_in_test)} train feature(s) are missing from test "
        f"and will be all-NaN after reindex: {missing_in_test}"
    )

# The test features must have exactly the same columns, in the same order,
# as X_tr_raw built from the training data.
X_test_raw = X_test_raw.reindex(columns=X_tr_raw.columns)

### 결측 처리 방법 정리



| 변수                | 성격  | 코멘트              |
| ----------------- | --- | ---------------- |
| `sector1d`        | 범주형 | Missing이라는 하나의 카테고리로 유지 |
| `dweltyp`         | 범주형 | Missing이라는 하나의 카테고리로 유지 |
| `employed`        | 범주형 | Not employed로 처리 |
| `educ_max`        | 범주형 | Missing이라는 하나의 카테고리로 유지 |
| `consumed`        | 더미형 | 0으로 처리 |
| `share_secondary` | 수치형 | train 기준 전체 중앙값으로 대치 |
| `utl_exp_ppp17`   | 수치형 | train 기준 survey_id별 중앙값으로 대치 (해당 survey_id가 train에 없으면 train 전체 중앙값으로 대치) |

## 결측 채우기

In [None]:
# 가공본 만들기 (원본 보존)
X_tr   = X_tr_raw.copy()
X_val  = X_val_raw.copy()
X_test = X_test_raw.copy()

In [None]:
col = "sector1d"

# Missing values become their own explicit "Missing" category in every split.
for frame in (X_tr, X_val, X_test):
    frame[col] = frame[col].astype("object").fillna("Missing")

# Remaining NaN counts for train / validation / test.
print(col, *[frame[col].isna().sum() for frame in (X_tr, X_val, X_test)])

sector1d 0 0 0


In [None]:
col = "dweltyp"

# Missing values become their own explicit "Missing" category in every split.
for frame in (X_tr, X_val, X_test):
    frame[col] = frame[col].astype("object").fillna("Missing")

# Remaining NaN counts for train / validation / test.
print(col, *[frame[col].isna().sum() for frame in (X_tr, X_val, X_test)])

dweltyp 0 0 0


In [None]:
col = "employed"

# A missing employment status is treated as "Not employed" in every split.
for frame in (X_tr, X_val, X_test):
    frame[col] = frame[col].fillna("Not employed")

# Remaining NaN counts for train / validation / test.
print(col, *[frame[col].isna().sum() for frame in (X_tr, X_val, X_test)])

employed 0 0 0


In [None]:
col = "educ_max"

# Missing values become their own explicit "Missing" category in every split.
for frame in (X_tr, X_val, X_test):
    frame[col] = frame[col].astype("object").fillna("Missing")

# Remaining NaN counts for train / validation / test.
print(col, *[frame[col].isna().sum() for frame in (X_tr, X_val, X_test)])

educ_max 0 0 0


In [None]:
# Every column whose name starts with "consumed" is handled as a Yes/No dummy.
consumed_cols = [name for name in X_tr.columns if name.startswith("consumed")]

map_dict = {"Yes": 1, "No": 0}

for df_ in (X_tr, X_val, X_test):
    # Yes/No -> 1/0, missing -> 0.
    recoded = df_[consumed_cols].replace(map_dict).fillna(0)
    # Normalise the dtype so the columns stay plain 0/1 integer dummies.
    df_[consumed_cols] = recoded.astype("int64")

# Total remaining NaN counts for train / validation / test.
print(
    "consumed*: missing counts",
    *[part[consumed_cols].isna().sum().sum() for part in (X_tr, X_val, X_test)],
)

  df_[consumed_cols] = (df_[consumed_cols].replace(map_dict).fillna(0))
  df_[consumed_cols] = (df_[consumed_cols].replace(map_dict).fillna(0))
  df_[consumed_cols] = (df_[consumed_cols].replace(map_dict).fillna(0))


consumed*: missing counts 0 0 0


In [None]:
col = "share_secondary"

# Fit: the fill value is the median of the training split only.
share_median = X_tr[col].median()

# Transform: the same training median fills every split.
for frame in (X_tr, X_val, X_test):
    frame[col] = frame[col].fillna(share_median)

print(col, "median(train)=", share_median)
print(col, *[frame[col].isna().sum() for frame in (X_tr, X_val, X_test)])

share_secondary median(train)= 0.0
share_secondary 0 0 0


In [None]:
col = "utl_exp_ppp17"

# Fit: per-survey median of the column, computed on the training split only
# (X_tr and train_df share the same index, so the grouping is row-aligned).
utl_median_by_survey = X_tr[col].groupby(train_df["survey_id"]).median()

# Train-wide fallback, taken before any value is filled in.
global_median = X_tr[col].median()

# Transform: fill with the median of the row's own survey; a survey_id that
# is not present in train has no entry and falls back to the global median.
for features, id_source in ((X_tr, train_df), (X_val, val_df), (X_test, test)):
    survey_fill = id_source["survey_id"].map(utl_median_by_survey)
    features[col] = features[col].fillna(survey_fill).fillna(global_median)

print(col, "global_median(train)=", global_median)
print(col, *[features[col].isna().sum() for features in (X_tr, X_val, X_test)])

utl_exp_ppp17 global_median(train)= 405.79352
utl_exp_ppp17 0 0 0


## 파생변수 추가 - infra_score, infra_urban, infra_rural

In [None]:
infra_cols = ["elect", "water", "toilet", "sewer"]

access_map = {
    "Access": 1,
    "No access": 0,
    "Yes": 1,
    "No": 0,
    # Identity entries: once the columns are already 0/1, re-running this cell
    # maps them onto themselves instead of turning every value into NaN.
    1: 1,
    0: 0,
}

for df in [X_tr, X_val, X_test]:
    for col in infra_cols:
        encoded = df[col].map(access_map)

        # Series.map yields NaN for any label that is not in access_map; fail
        # with the offending labels rather than with an opaque int-cast error.
        unmapped = encoded.isna()
        if unmapped.any():
            bad_values = df.loc[unmapped, col].unique().tolist()
            raise ValueError(
                f"Column '{col}' has values not covered by access_map: {bad_values}"
            )

        df[col] = encoded.astype("int64")

    # Number of the four infrastructure services the household has (0-4).
    df["infra_score"] = df[infra_cols].sum(axis=1)

In [None]:
# Sanity check on the training split: mean target per infra_score level.
# infra_score is attached positionally (.values), not by index alignment.
tmp = train_df[["cons_ppp17"]].assign(infra_score=X_tr["infra_score"].values)

mean_cons_by_score = tmp.groupby("infra_score")["cons_ppp17"].mean()
print(mean_cons_by_score)

infra_score
0     5.869443
1     7.311200
2     7.379066
3     7.470517
4    14.744475
Name: cons_ppp17, dtype: float64


In [None]:
# Interaction features: infra_score kept only for Urban rows (infra_urban)
# or only for Rural rows (infra_rural); every other row gets 0.
for df in (X_tr, X_val, X_test):
    score = df["infra_score"]
    is_urban = df["urban"].eq("Urban").astype(int)
    is_rural = df["urban"].eq("Rural").astype(int)

    df["infra_urban"] = score * is_urban
    df["infra_rural"] = score * is_rural

# **2. 모델링(CatBoost)**

In [None]:
# 범주형 컬럼 추출
cat_cols = [c for c in X_tr.columns if X_tr[c].dtype == "object"]
cat_idx  = [X_tr.columns.get_loc(c) for c in cat_cols]

In [None]:
# The model is trained on log(1 + y); predictions are mapped back with expm1.
y_tr_log, y_val_log = (np.log1p(target) for target in (y_tr, y_val))

In [None]:
# CatBoost 모델링
!pip install catboost
from catboost import CatBoostRegressor

cb_model = CatBoostRegressor(
    loss_function="MAE",
    depth=6,
    learning_rate=0.05,
    iterations=2000,
    random_seed=42,
    early_stopping_rounds=100,
    verbose=200
)

cb_model.fit(
    X_tr, y_tr_log,
    eval_set=(X_val, y_val_log),
    cat_features=cat_idx
)

0:	learn: 0.4844458	test: 0.4868007	best: 0.4868007 (0)	total: 131ms	remaining: 4m 22s
200:	learn: 0.2405231	test: 0.2475856	best: 0.2475856 (200)	total: 33.2s	remaining: 4m 56s
400:	learn: 0.2317756	test: 0.2416693	best: 0.2416693 (400)	total: 1m 6s	remaining: 4m 23s
600:	learn: 0.2271712	test: 0.2395420	best: 0.2395420 (600)	total: 1m 41s	remaining: 3m 56s
800:	learn: 0.2239624	test: 0.2384997	best: 0.2384997 (800)	total: 2m 4s	remaining: 3m 6s
1000:	learn: 0.2215449	test: 0.2379370	best: 0.2379368 (999)	total: 2m 28s	remaining: 2m 28s
1200:	learn: 0.2196025	test: 0.2375787	best: 0.2375765 (1198)	total: 2m 53s	remaining: 1m 55s
1400:	learn: 0.2180380	test: 0.2373521	best: 0.2373519 (1399)	total: 3m 15s	remaining: 1m 23s
1600:	learn: 0.2166020	test: 0.2371816	best: 0.2371816 (1600)	total: 3m 38s	remaining: 54.4s
1800:	learn: 0.2153904	test: 0.2370283	best: 0.2370272 (1798)	total: 4m 1s	remaining: 26.7s
1999:	learn: 0.2143594	test: 0.2369201	best: 0.2369201 (1999)	total: 4m 23s	remaini

<catboost.core.CatBoostRegressor at 0x7e345a36d430>

In [None]:
# validation
val_pred_cb = np.expm1(cb_model.predict(X_val))
val_pred_cb = np.maximum(val_pred_cb, 0).astype(np.float32)

# test
test_pred_cb = np.expm1(cb_model.predict(X_test))
test_pred_cb = np.maximum(test_pred_cb, 0).astype(np.float32)

In [None]:
# survey-level bias 계산
bias_df = (
    pd.DataFrame({
        "survey_id": val_df["survey_id"].values,
        "y_true": y_val.values,
        "y_pred": val_pred_cb
    })
    .groupby("survey_id")
    .agg(
        true_med=("y_true", "median"),
        pred_med=("y_pred", "median")
    )
    .reset_index()
)

bias_df["bias"] = bias_df["pred_med"] - bias_df["true_med"]
bias_map = dict(zip(bias_df["survey_id"], bias_df["bias"]))

bias_df.head()

Unnamed: 0,survey_id,true_med,pred_med,bias
0,100000,8.572003,8.567756,-0.004248


# **3. 제출 파일 생성**

In [None]:
test_sid  = test_id["survey_id"].values
test_hhid = test_id["hhid"].values

# bias_map only contains the survey_ids of the validation split. A test row
# whose survey_id is not in it gets a correction of 0.0, i.e. no adjustment.
# Report the coverage so a correction that never applies does not go unnoticed.
known_sids = np.array(list(bias_map.keys()))
rows_without_bias = ~np.isin(test_sid, known_sids)
if rows_without_bias.any():
    print(
        f"[warn] bias correction skipped for {int(rows_without_bias.sum())} of "
        f"{len(test_sid)} test rows: their survey_id has no entry in bias_map "
        f"(known: {sorted(bias_map)}, test: {np.unique(test_sid).tolist()})"
    )

# Subtract the survey's validation bias from every prediction of that survey.
test_pred_adj = np.array([
    pred - bias_map.get(sid, 0.0)
    for sid, pred in zip(test_sid, test_pred_cb)
])

# The adjusted value must not become negative.
test_pred_adj = np.maximum(test_pred_adj, 0)

In [None]:
def _threshold_from_name(col_name):
    """Return the numeric threshold encoded after the last '_' of a rate column name."""
    return float(col_name.split("_")[-1])


# Rate columns of y2, ordered by the threshold embedded in their names.
rate_cols = [name for name in y2.columns if name.startswith("pct_hh_below_")]
rate_cols_sorted = sorted(rate_cols, key=_threshold_from_name)

# Thresholds in the same order as rate_cols_sorted.
thresholds = np.array(
    list(map(_threshold_from_name, rate_cols_sorted)),
    dtype=np.float32,
)

In [None]:
# Household-level submission: identifiers plus the adjusted prediction.
cons_columns = {
    "survey_id": test_sid,
    "hhid": test_hhid,
    "cons_ppp17": test_pred_adj,
}
submission_cons = pd.DataFrame(cons_columns)

# Written without the index column.
submission_cons.to_csv("predicted_household_consumption.csv", index=False)

submission_cons.head()

Unnamed: 0,survey_id,hhid,cons_ppp17
0,400000,400001,11.018563
1,400000,400002,6.630354
2,400000,400003,7.647429
3,400000,400004,11.229031
4,400000,400005,5.351196


In [None]:
# Survey-level submission: for every test survey, the share of households
# whose predicted value lies strictly below each threshold.
pov_rows = []

for sid in np.unique(test_sid):
    cons = test_pred_adj[test_sid == sid]

    # (households x thresholds) boolean matrix, averaged over households.
    rates = (cons[:, None] < thresholds[None, :]).mean(axis=0)

    pov_rows.append({
        "survey_id": sid,
        **{rate_col: float(rate) for rate_col, rate in zip(rate_cols_sorted, rates)},
    })

# Fix the column order explicitly: survey_id first, then rates by threshold.
submission_pov = pd.DataFrame(pov_rows, columns=["survey_id"] + rate_cols_sorted)

submission_pov.to_csv("predicted_poverty_distribution.csv", index=False)

submission_pov.head()

Unnamed: 0,survey_id,pct_hh_below_3.17,pct_hh_below_3.94,pct_hh_below_4.60,pct_hh_below_5.26,pct_hh_below_5.88,pct_hh_below_6.47,pct_hh_below_7.06,pct_hh_below_7.70,pct_hh_below_8.40,pct_hh_below_9.13,pct_hh_below_9.87,pct_hh_below_10.70,pct_hh_below_11.62,pct_hh_below_12.69,pct_hh_below_14.03,pct_hh_below_15.64,pct_hh_below_17.76,pct_hh_below_20.99,pct_hh_below_27.37
0,400000,0.020917,0.063388,0.119138,0.181455,0.241545,0.292897,0.343353,0.393924,0.448402,0.500333,0.548011,0.596847,0.646261,0.696514,0.751164,0.802662,0.852857,0.906032,0.957992
1,500000,0.012995,0.048883,0.096306,0.153541,0.208001,0.260213,0.308191,0.360753,0.41425,0.46839,0.518441,0.571237,0.623711,0.677267,0.732516,0.785721,0.839101,0.895109,0.954826
2,600000,0.017157,0.055447,0.103294,0.157893,0.213018,0.261713,0.310116,0.362289,0.415573,0.470289,0.520095,0.572151,0.62523,0.677842,0.73323,0.788151,0.841201,0.897232,0.954929


In [None]:
# Bundle the two CSV files written above into submission.zip (shell command).
!zip submission.zip predicted_household_consumption.csv predicted_poverty_distribution.csv

  adding: predicted_household_consumption.csv (deflated 70%)
  adding: predicted_poverty_distribution.csv (deflated 55%)
