In [23]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

import joblib

from data_preprocessing import pipeline, filter_unnecessary_columns
from feature_preprocessing import pipeline2

import matplotlib.pyplot as plt
plt.rcParams['font.family'] ='Malgun Gothic'
plt.rcParams['axes.unicode_minus'] =False

### 데이터 전처리

In [24]:
# 로우 데이터 불러오기
file_path = "./storage/raw_data/병합_청약매물_목록_정보_픽스2.csv"
df = pd.read_csv(file_path)

# 데이터 전처리
preprocessing_pipeline = pipeline(type='train')
df = preprocessing_pipeline.transform(df)

# 학습할 모델별로 드랍할 칼럼 정의
# - 최고당첨가점: 공급지역코드, 거래금액(만원), 공급세대수
# 최저당첨가점: 공급지역코드, 거래금액(만원), 공급세대수
# 시세차익: 공급지역코드, 공급세대수

#  -----------------------------최고, 최저당점가점용-----------------------------------
df['시세차익'] = df['전용면적'] * df['전용면적당 시세차익']
df.drop(columns=['전용면적', '전용면적당 시세차익', '공급금액(최고가 기준)'], inplace=True)
# 최고, 최저당점가점용
df.drop(['공급지역코드', '거래금액(만원)', '공급세대수'], axis=1, inplace=True)


#  -----------------------------시세차익용-----------------------------------
# 시세차익 할 때 feature에서 거래금액(만원) 이부분 스케일링 해주기
# df['시세차익'] = df['전용면적'] * df['전용면적당 시세차익']
# df.drop(columns=['전용면적', '전용면적당 시세차익', '공급금액(최고가 기준)'], inplace=True)
# df.drop(['공급지역코드', '공급세대수'], axis=1, inplace=True)


# 파일 저장
file_version = "250320-당첨가점"
output_file = f"./storage/train_data/train-{file_version}.csv"
df.to_csv(output_file, index=False, encoding='cp949')

  df = pd.read_csv(file_path)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['최고당첨가점'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['최저당첨가점'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are sett

### 모델 학습 및 평가

In [25]:
# 전처리된 데이터 가져오기

file_path = f"./storage/train_data/train-{file_version}.csv"
df = pd.read_csv(file_path, encoding='cp949')

In [None]:
X = df.drop(columns=["최고당첨가점"])
Y = df["최고당첨가점"]

X_train, X_test, y_train, y_test = train_test_split(
    X,  # 타겟
    Y,  
    test_size=0.2,
    random_state=42
)

In [None]:
feature_pipeline = pipeline2()

# 학습 데이터(X_train)에 fit
feature_pipeline.fit(X_train)

# 학습 데이터(X_train)를 변환
X_train_transformed = feature_pipeline.transform(X_train)

# 테스트 데이터(X_test)를 변환
X_test_transformed = feature_pipeline.transform(X_test)

Index(['공급지역코드', '공급규모', '투기과열지구', '조정대상지역', '분양가상한제', '정비사업', '공공주택지구',
       '대규모택지개발지구', '수도권내민영공공주택지구', '공급세대수', '순위', '거주지역', '접수건수', '경쟁률',
       '토픽 1', '토픽 2', '토픽 3', '토픽 4', '토픽 5', '토픽 6', '토픽 7', '법정동코드'],
      dtype='object')
Index(['공급규모', '공급세대수', '접수건수', '경쟁률', '토픽 1', '토픽 2', '토픽 3', '토픽 4', '토픽 5',
       '토픽 6', '토픽 7', '법정동코드', '투기과열지구_N', '투기과열지구_Y', '조정대상지역_N', '조정대상지역_Y',
       '분양가상한제_N', '분양가상한제_Y', '정비사업_N', '정비사업_Y', '공공주택지구_N', '공공주택지구_Y',
       '대규모택지개발지구_N', '대규모택지개발지구_Y', '거주지역_기타경기', '거주지역_기타지역', '거주지역_해당지역',
       '공급지역코드_100', '공급지역코드_400', '공급지역코드_410', '수도권내민영공공주택지구_N',
       '수도권내민영공공주택지구_Y', '순위_1순위', '순위_2순위'],
      dtype='object')
Index(['공급지역코드', '공급규모', '투기과열지구', '조정대상지역', '분양가상한제', '정비사업', '공공주택지구',
       '대규모택지개발지구', '수도권내민영공공주택지구', '공급세대수', '순위', '거주지역', '접수건수', '경쟁률',
       '토픽 1', '토픽 2', '토픽 3', '토픽 4', '토픽 5', '토픽 6', '토픽 7', '법정동코드'],
      dtype='object')
Index(['공급규모', '공급세대수', '접수건수', '경쟁률', '토픽 1', '토픽 2', '토픽 3', '토픽 4', 



In [16]:
# shap test data 저장

X_test_transformed.to_csv('./storage/shap_test_data/X_test_transformed.csv', index=False)

In [7]:
# 피쳐 파이프라인 저장 

version = '0.0.1'
joblib.dump(feature_pipeline, f"./storage/trained_pipeline/pipeline_{version}.pkl")

['./storage/trained_pipeline/pipeline_0.0.1.pkl']

### 학습 - LightGBM

In [None]:
import lightgbm as lgb
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
import warnings

# 워닝 무시
warnings.filterwarnings("ignore", category=UserWarning)

lgb_model = lgb.LGBMRegressor(verbosity=-1)

lgb_param_grid = {
    'max_depth': np.arange(3, 9),  
    'num_leaves': np.arange(31, 127),  
    'min_data_in_leaf': np.arange(10, 50),  
    'subsample': np.linspace(0.5, 1.0, 6),  
    'colsample_bytree': np.linspace(0.5, 1.0, 6),  
    'learning_rate': np.logspace(-4, -1, 10),  
    'n_estimators': np.arange(50, 200, 50)  
}

# 랜덤 서치
lgb_random_search = RandomizedSearchCV(
    lgb_model, 
    param_distributions=lgb_param_grid, 
    cv=5, 
    n_iter=100,
)

lgb_random_search.fit(X_train_transformed, y_train)

print("베스트 파라미터:")
print(lgb_random_search.best_params_)

y_pred = lgb_random_search.best_estimator_.predict(X_test_transformed)

# RMSE , R² 
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"LightGBM - RMSE: {rmse:.4f}, R²: {r2:.4f}")


'\nRandomforest, XGB, LGB 비교결과\nXGB가 가장 높았지만, 성능평가에서 0.1정도의 아주 작은 차이라 더 가볍고 빠르게 돌아가는 모델인 LightGBM 이용 제안\n추후 결정되면 코드 적어놓겠음\n'

특정 파라미터 값 결정 시 아래 코드 이용

In [None]:
# import numpy as np
# import matplotlib.pyplot as plt
# import lightgbm as lgb
# from sklearn.metrics import mean_squared_error, r2_score

# lgb_model = lgb.LGBMRegressor(
#     max_depth=8, 
#     num_leaves=119, 
#     min_data_in_leaf=47, 
#     subsample=0.7, 
#     colsample_bytree=0.9, 
#     learning_rate=0.1, 
#     n_estimators=100
# )

# lgb_model.fit(X_train_transformed, y_train)

# y_pred_lgb = lgb_model.predict(X_test_transformed)

# # RMSE , R^2
# def evaluate_model(y_true, y_pred):
#     rmse = np.sqrt(mean_squared_error(y_true, y_pred))
#     r2 = r2_score(y_true, y_pred)
#     return rmse, r2

# rmse_lgb, r2_lgb = evaluate_model(y_test, y_pred_lgb)
# print(f"LightGBM - RMSE: {rmse_lgb}, R^2: {r2_lgb}")

# feature_importances = lgb_model.feature_importances_

# # Feature Importance
# sorted_idx = np.argsort(feature_importances)[::-1]

# plt.barh(X_train_transformed.columns[sorted_idx], feature_importances[sorted_idx])
# plt.xlabel("Feature Importance")
# plt.ylabel("Feature Name")
# plt.show()


### 모델 저장 

In [10]:
# 모델 저장 및 로드

version = 'rf_grid_0.0.1'

# 모델 저장
joblib.dump(rf_model, f"./storage/trained_model/model_{version}.pkl")

['./storage/trained_model/model_rf_grid_0.0.1.pkl']

In [8]:
# 모델 로드
loaded_model = joblib.load("./storage/trained_model/model_0.0.1.pkl")

# 예측
# X_test = np.array([[1, 2]])
# prediction = loaded_model.predict(X_test)
# print("Prediction:", prediction)