In [1]:
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    roc_curve,
    roc_auc_score,
    precision_recall_curve,
    ConfusionMatrixDisplay,
    RocCurveDisplay,
    PrecisionRecallDisplay
)
from statsmodels.stats.outliers_influence import variance_inflation_factor
from itertools import combinations

In [2]:
# 1. 데이터 불러오기
file_path ="AmesHousing.csv"
df = pd.read_csv(file_path)

결측치 처리

In [3]:
missing = df.isnull().sum()
missing = missing[missing > 0].sort_values(ascending=False)

edited_df = df

#결측치 80%이상인경우 컬럼 삭제
missing_cols = missing.loc[(missing / len(df)) > 0.80].index
edited_df = edited_df.drop(columns=missing_cols)

missing_cols = missing.loc[(missing / len(df)) < 0.05].index
for col in missing_cols:
    if edited_df[col].dtype == 'object' or edited_df[col].dtype.name == 'category':
        #결측치 5%이하 범주형 변수 최빈값 대체
        edited_df[col] = edited_df[col].fillna(edited_df[col].mode().iloc[0])
    else:
        #결측치 5%이하 수치형 변수 중앙값 대체
        edited_df[col] = edited_df[col].fillna(edited_df[col].median())
        

#수치형은 MICE, 범주형은 'Missing'으로 결측치 채우기
missing_cols = missing.loc[(0.05 <= (missing / len(df))) & ((missing / len(df)) <= 0.80)].index

numeric_cols = edited_df.select_dtypes(include=['number']).columns

for col in missing_cols:
    if edited_df[col].dtype == 'object' or edited_df[col].dtype.name == 'category':
        edited_df[col] = edited_df[col].fillna('Missing')

#MICE Imputation(다중 대체)이용하여 결측치 제거
imputer = IterativeImputer(random_state=0)
edited_df[numeric_cols] = imputer.fit_transform(edited_df[numeric_cols])

타겟 변수 이진화

In [4]:
#상위 5% 기준 이진화 (premium house 예측)
threshold = edited_df['SalePrice'].quantile(0.95)
edited_df['target'] = (edited_df['SalePrice'] >= threshold).astype(int)

edited_df.drop('SalePrice', axis=1, inplace=True)

X = edited_df.drop('target', axis=1)
y = edited_df['target']

1번 방법 - 원 핫 인코딩 진행 후 모든 변수 다 때려넣기  

In [25]:
X_edited = pd.get_dummies(X, drop_first=True)

2번 방법 - 수치형 변수 중 타겟 변수와의 상관관계가 0.1이상인 변수를 선택하여 진행 

In [6]:
# 상관계수 기준 값 설정
CORR_THRESHOLD = 0.1  # 원하는 값으로 조정 가능

# 수치형 변수만 선택
numeric_df = edited_df.select_dtypes(include='number')

# 타겟 변수와 상관계수 계산
corr_with_target = numeric_df.corr()['target'].drop('target').abs()

# 기준 이상 변수만 선택
selected_features_corr = corr_with_target[corr_with_target >= CORR_THRESHOLD].index.tolist()

print(f"상관계수 {CORR_THRESHOLD} 이상 변수: ", selected_features_corr)

# 최종 변수 선택
X_edited = X[selected_features_corr]


상관계수 0.1 이상 변수:  ['PID', 'Lot Frontage', 'Lot Area', 'Overall Qual', 'Year Built', 'Year Remod/Add', 'Mas Vnr Area', 'BsmtFin SF 1', 'Total Bsmt SF', '1st Flr SF', '2nd Flr SF', 'Gr Liv Area', 'Bsmt Full Bath', 'Full Bath', 'Half Bath', 'TotRms AbvGrd', 'Fireplaces', 'Garage Yr Blt', 'Garage Cars', 'Garage Area', 'Wood Deck SF', 'Open Porch SF']


3번 방법 - Mutual Information 체크를 통해 MI가 0.05 이상인 변수를 선택하여 진행 (이 경우 범주형 변수 포함 가능)

In [7]:
# MI 기준 값 설정
MI_THRESHOLD = 0.01  # 원하는 값으로 조절 (보통 0.01 ~ 0.05 사이 추천)

df_mi = edited_df.copy()
cat_cols_all = df_mi.select_dtypes(include=['object']).columns

# 범주형 변수 Label Encoding
for col in cat_cols_all:
    df_mi[col] = LabelEncoder().fit_transform(df_mi[col])

# Mutual Information 계산
X_mi = df_mi.drop(columns=['target'])
y_mi = df_mi['target']

#수치형 / 범주형 구분
num_cols = X_mi.select_dtypes(exclude='object').columns
cat_cols = X_mi.select_dtypes(include='object').columns

# 각 feature가 discrete인지 여부 리스트 생성
discrete_flags = [True if col in cat_cols else False for col in X_mi.columns]

mi_scores = mutual_info_classif(X_mi, y_mi, discrete_features=discrete_flags, random_state=42)
mi_result = pd.Series(mi_scores, index=X_mi.columns).sort_values(ascending=False)

# MI 기준 이상 변수 선택
selected_features_MI = mi_result[mi_result >= MI_THRESHOLD].index.tolist()

print(f"MI {MI_THRESHOLD} 이상 변수: ", selected_features_MI)

# 수치형 / 범주형 분리
num_cols = df[selected_features_MI].select_dtypes(exclude='object').columns.tolist()
cat_cols = df[selected_features_MI].select_dtypes(include='object').columns.tolist()

# 범주형 변수 One-Hot Encoding
df_cat_encoded = pd.get_dummies(edited_df[cat_cols], drop_first=True)

# 최종 X 데이터 구성
X_edited = pd.concat([edited_df[num_cols], df_cat_encoded], axis=1)


MI 0.01 이상 변수:  ['PID', 'Overall Qual', 'Neighborhood', 'Bsmt Qual', 'Order', 'Total Bsmt SF', 'Kitchen Qual', 'Garage Area', '1st Flr SF', 'Garage Cars', 'Gr Liv Area', 'Exter Qual', 'BsmtFin SF 1', 'Fireplace Qu', 'TotRms AbvGrd', 'Year Built', 'Mas Vnr Area', 'Lot Area', 'Garage Yr Blt', 'Year Remod/Add', 'Lot Frontage', 'Foundation', 'Full Bath', 'Bsmt Exposure', 'Garage Finish', 'Mas Vnr Type', 'Heating QC', 'Sale Type', 'Roof Style', 'Open Porch SF', 'Fireplaces', 'Garage Type', 'Exterior 2nd', 'Sale Condition', 'Exterior 1st', 'Overall Cond', '2nd Flr SF', 'Wood Deck SF', 'BsmtFin Type 1', 'MS SubClass', 'Bsmt Unf SF', 'Bsmt Full Bath', 'Half Bath', 'MS Zoning', 'Condition 2']


Train Test Split 및 SMOTE로 클래스 불균형 완화 및 회귀 진행

In [31]:
#train test split, Train과 Test에서 class 비율을 유지하기 위하여 stratify 옵션을 True로 둔다
X_train, X_test, y_train, y_test = train_test_split(
    X_edited, y, test_size=0.2, random_state=42, stratify=y
)

#smote = SMOTE(random_state=42)
#X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
#model = LogisticRegression(max_iter=1000)
#model.fit(X_train_smote, y_train_smote)

model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

odds 비 해석

In [27]:
odds_ratios = pd.Series(np.exp(model.coef_[0]), index=X_edited.columns)
print(odds_ratios.sort_values())

Yr Sold           0.997710
Year Remod/Add    0.997939
Garage Yr Blt     0.997997
Year Built        0.998012
Misc Val          0.998691
                    ...   
1st Flr SF        1.001871
Garage Area       1.001956
BsmtFin SF 1      1.002107
Total Bsmt SF     1.002760
Gr Liv Area       1.003119
Length: 257, dtype: float64


Test 세트 예측 및 평가

In [32]:
y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:, 1]

report = classification_report(y_test, y_pred, output_dict=True)
report_df = pd.DataFrame(report).transpose()

report_df

Unnamed: 0,precision,recall,f1-score,support
0,0.985689,0.991007,0.988341,556.0
1,0.814815,0.733333,0.77193,30.0
accuracy,0.977816,0.977816,0.977816,0.977816
macro avg,0.900252,0.86217,0.880135,586.0
weighted avg,0.976941,0.977816,0.977262,586.0
