# 📊 비만 예측용 데이터 전처리

In [1]:
# 1️⃣ 데이터 로드
import pandas as pd
from sklearn.impute import SimpleImputer
import numpy as np

df = pd.read_csv('원시데이터.csv')  # 파일 경로는 실행 환경에 맞게 조정하세요

In [2]:
# 2️⃣ 사용할 피처 및 타겟 정의
HE_obe_input_features = [
    'HE_sbp1', 'HE_dbp1', 'HE_wc', 'HE_glu',
    'HE_chol', 'HE_HDL_st2', 'HE_LDL_drct', 'HE_crea',
    'HE_ast', 'HE_alt', 'HE_wt', 'HE_ht', 'HE_BMI', "BD1_11", "BE5_1", "L_OUT_FQ", "LS_VEG1"
]
target = 'HE_obe'
data = df[[col for col in HE_obe_input_features + [target] if col in df.columns]].copy()

In [3]:
# 3️⃣ EDA 요약 (결측치, 통계값, 이상치)
eda = pd.DataFrame(index=data.columns)
eda['결측치 비율 (%)'] = data.isnull().mean() * 100
eda['고유값 개수'] = data.nunique()
eda['최소값'] = data.min(numeric_only=True)
eda['최대값'] = data.max(numeric_only=True)
eda['평균'] = data.mean(numeric_only=True)
eda['표준편차'] = data.std(numeric_only=True)

def detect_outliers(series):
    if series.dtype.kind in 'biufc':
        q1 = series.quantile(0.25)
        q3 = series.quantile(0.75)
        iqr = q3 - q1
        lower = q1 - 1.5 * iqr
        upper = q3 + 1.5 * iqr
        return ((series < lower) | (series > upper)).sum()
    else:
        return np.nan

eda['이상치 수'] = data.apply(detect_outliers)
eda

Unnamed: 0,결측치 비율 (%),고유값 개수,최소값,최대값,평균,표준편차,이상치 수
HE_sbp1,4.834752,115,79.0,243.0,120.435244,16.832707,134
HE_dbp1,4.834752,81,31.0,152.0,73.892175,10.45006,64
HE_wc,6.047049,645,43.4,132.0,81.922704,12.286913,51
HE_glu,10.751912,171,60.0,398.0,100.361255,23.161769,488
HE_chol,10.751912,242,70.0,456.0,182.477684,39.901996,49
HE_HDL_st2,10.751912,103,6.0,135.0,56.995634,15.33782,88
HE_LDL_drct,10.751912,207,10.0,337.0,110.370472,36.039976,33
HE_crea,10.751912,151,0.33,10.19,0.788436,0.266071,106
HE_ast,10.751912,99,9.0,722.0,22.869017,16.743954,340
HE_alt,10.925097,126,5.0,861.0,21.671257,22.445837,419


In [4]:
# 4️⃣ 결측치 평균값 대체
imputer = SimpleImputer(strategy='mean')
data_imputed = pd.DataFrame(imputer.fit_transform(data), columns=data.columns)

In [5]:
# 5️⃣ 이상치 IQR 기준 클리핑
for col in data_imputed.columns:
    if data_imputed[col].dtype.kind in 'biufc':
        q1 = data_imputed[col].quantile(0.25)
        q3 = data_imputed[col].quantile(0.75)
        iqr = q3 - q1
        lower = q1 - 1.5 * iqr
        upper = q3 + 1.5 * iqr
        data_imputed[col] = data_imputed[col].clip(lower, upper)

In [6]:
# 6️⃣ X / y 분리 및 저장
X = data_imputed.drop(columns=target)
y = data_imputed[target]
final_df = pd.concat([X, y], axis=1)
final_df.to_csv('비만_예측용_정제데이터.csv', index=False)
final_df.head()

Unnamed: 0,HE_sbp1,HE_dbp1,HE_wc,HE_glu,HE_chol,HE_HDL_st2,HE_LDL_drct,HE_crea,HE_ast,HE_alt,HE_wt,HE_ht,HE_BMI,BD1_11,BE5_1,L_OUT_FQ,LS_VEG1,HE_obe
0,144.0,86.0,90.2,120.0,166.0,38.0,101.0,0.78,34.5,29.0,58.9,159.8,23.06544,8.0,6.0,3.0,1.0,3.0
1,125.0,76.0,65.2,105.0,218.0,67.0,139.0,0.63,18.0,17.0,41.8,151.2,18.284063,8.0,1.0,5.0,2.0,1.0
2,102.0,57.0,71.8,102.0,196.0,87.0,105.0,0.81,19.0,11.0,53.5,156.7,21.787921,5.0,1.0,5.0,3.0,2.0
3,108.0,58.0,59.6,100.361255,182.477684,56.995634,110.370472,0.788436,22.869017,21.671257,35.8,142.0,17.754414,8.0,8.0,3.0,2.0,2.965225
4,131.0,93.0,86.9,101.0,176.0,40.0,116.0,1.19,28.0,38.0,79.8,185.0,23.316289,4.0,1.0,4.0,2.0,3.0
