In [77]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
sns.set(style="darkgrid",font_scale=1.5)
pd.set_option("display.max.columns",None)

from scipy import stats
from scipy.stats import skew
from scipy.special import boxcox1p
from scipy.stats import boxcox_normmax

from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, AdaBoostRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from mlxtend.regressor import StackingCVRegressor

from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import RobustScaler, StandardScaler, LabelEncoder

pd.options.display.max_rows = 1000
pd.options.display.max_columns = 20

import warnings

# 경고 무시하기
warnings.filterwarnings("ignore")
# unicode minus를 사용하지 않기 위한 설정 (minus 깨짐현상 방지)
plt.rcParams['axes.unicode_minus'] = False
plt.rcParams['font.family'] = 'Malgun Gothic'# (위도우용)
from matplotlib import font_manager, rc

# AppleGothic 폰트 설정
rc('font', family='Malgun Gothic')

In [78]:
import pandas as pd

df = pd.read_csv('../통합데이터/강남구전세데이터_찐마지막.csv')

In [79]:
# Log취한 값

z = df["y"]

df["logy"] = np.log1p(df["y"])

In [80]:
con_cols = ["Area","Floor","InterestRate"]
df[con_cols].skew().sort_values().to_frame().rename(columns={0:"Skewness"}).T

Unnamed: 0,InterestRate,Area,Floor
Skewness,0.739445,1.093297,2.296495


In [81]:
from scipy.stats import boxcox
import pandas as pd

con_cols = ["Area", "Floor", "InterestRate"]

for col in con_cols:
    # 양수 데이터에만 적용하기 위해 최소값을 더해줌
    min_value = df[col].min()
    transformed_data, lambda_value = boxcox(df[col] - min_value + 1)  # 최소값 더한 후 변환
    df[col] = transformed_data


In [82]:
df[con_cols].skew().sort_values().to_frame().rename(columns={0:"Skewness"})

Unnamed: 0,Skewness
Area,-0.032438
Floor,-0.001868
InterestRate,0.068459


In [83]:
df_group = df.groupby('Address')['logy'].mean().reset_index(drop=False)
df_group = df_group.rename(columns={'logy':'Address_encoding'})
df_group
df = pd.merge(df, df_group, how='inner',on='Address')
df = df.drop('Address', axis=1).rename(columns={'Address_encoding':'Address'})
df

Unnamed: 0,AD_Code,Admin_District,Area,YearBuilt,Type,Floor,Cont_Date,Age,Latitude,Longitude,...,Michelin_Cnt,Mc_cnt,Bigstore_cnt,InterestRate,Market_cnt,Subway_cnt,Kinder_cnt,y,logy,Address
0,0,개포동,6.551771,1989,단독다가구,1.419861,2022-06-01,34,37.473722,127.052725,...,0,0,0,0.791735,0,0,2,7500,8.922792,9.438575
1,0,개포동,6.551771,1991,단독다가구,1.419861,2022-10-05,32,37.473722,127.052725,...,0,0,0,0.950193,0,0,2,14333,9.570390,9.438575
2,0,개포동,6.551771,1991,단독다가구,1.419861,2022-07-07,32,37.473722,127.052725,...,0,0,0,0.820237,0,0,2,17115,9.747769,9.438575
3,0,개포동,6.551771,1991,단독다가구,1.419861,2022-07-07,32,37.473722,127.052725,...,0,0,0,0.820237,0,0,2,17640,9.777981,9.438575
4,0,개포동,6.551771,1989,단독다가구,1.419861,2022-06-01,34,37.473722,127.052725,...,0,0,0,0.791735,0,0,2,7500,8.922792,9.438575
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87356,2,대치동,8.301657,2004,오피스텔,1.927861,2019-07-22,19,37.501439,127.054330,...,1,1,1,0.212487,0,0,1,45000,10.714440,10.714440
87357,13,청담동,6.521947,2005,오피스텔,1.673458,2019-06-21,18,37.525283,127.047239,...,17,0,2,0.281043,0,0,1,38000,10.545368,10.545368
87358,13,청담동,5.212609,2014,오피스텔,1.673458,2019-07-20,9,37.522821,127.039963,...,31,1,2,0.212487,0,0,0,23500,10.064798,10.064798
87359,0,개포동,4.942016,2020,오피스텔,1.770396,2020-09-08,3,37.478097,127.045854,...,0,0,0,0.048268,0,0,1,3000,8.006701,8.006701


In [84]:
# df_train과 df_test 합치기
# df = pd.concat([df_train, df_test], ignore_index=True)

# LabelEncoder 객체 생성
label_encoder = LabelEncoder()

# 각 열에 대해 라벨 인코딩 수행
for column in df.columns:
    if df[column].dtype == 'object':
        df[column] = label_encoder.fit_transform(df[column].astype(str))  # 형변환 추가

df.drop(columns='logy', inplace = True)
df["logy"] = np.log1p(df["y"])
df

Unnamed: 0,AD_Code,Admin_District,Area,YearBuilt,Type,Floor,Cont_Date,Age,Latitude,Longitude,...,Michelin_Cnt,Mc_cnt,Bigstore_cnt,InterestRate,Market_cnt,Subway_cnt,Kinder_cnt,y,Address,logy
0,0,0,6.551771,1989,0,1.419861,1607,34,37.473722,127.052725,...,0,0,0,0.791735,0,0,2,7500,9.438575,8.922792
1,0,0,6.551771,1991,0,1.419861,1733,32,37.473722,127.052725,...,0,0,0,0.950193,0,0,2,14333,9.438575,9.570390
2,0,0,6.551771,1991,0,1.419861,1643,32,37.473722,127.052725,...,0,0,0,0.820237,0,0,2,17115,9.438575,9.747769
3,0,0,6.551771,1991,0,1.419861,1643,32,37.473722,127.052725,...,0,0,0,0.820237,0,0,2,17640,9.438575,9.777981
4,0,0,6.551771,1989,0,1.419861,1607,34,37.473722,127.052725,...,0,0,0,0.791735,0,0,2,7500,9.438575,8.922792
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87356,2,2,8.301657,2004,3,1.927861,565,19,37.501439,127.054330,...,1,1,1,0.212487,0,0,1,45000,10.714440,10.714440
87357,13,13,6.521947,2005,3,1.673458,534,18,37.525283,127.047239,...,17,0,2,0.281043,0,0,1,38000,10.545368,10.545368
87358,13,13,5.212609,2014,3,1.673458,563,9,37.522821,127.039963,...,31,1,2,0.212487,0,0,0,23500,10.064798,10.064798
87359,0,0,4.942016,2020,3,1.770396,978,3,37.478097,127.045854,...,0,0,0,0.048268,0,0,1,3000,8.006701,8.006701


In [85]:
import pandas as pd
from sklearn.preprocessing import RobustScaler

scaler = RobustScaler()
# X = df.drop(columns=["y"])
X = df.drop(columns=["y",'logy'])
X_scaled = scaler.fit_transform(X)

df_scaled = pd.DataFrame(X_scaled, columns=X.columns)
# df_scaled['y'] = df['y']

# 추가
df = pd.concat([df_scaled, df['logy']], axis=1)

# df_scaled
# df.drop(columns='y', inplace = True)
df


Unnamed: 0,AD_Code,Admin_District,Area,YearBuilt,Type,Floor,Cont_Date,Age,Latitude,Longitude,Star_Cnt,Michelin_Cnt,Mc_cnt,Bigstore_cnt,InterestRate,Market_cnt,Subway_cnt,Kinder_cnt,Address,logy
0,-0.571429,-0.571429,-0.407127,-0.619048,-1.0,-0.400899,0.713383,0.619048,-1.124766,0.041923,-0.50,0.0,0.0,0.0,0.663267,0.0,-1.0,0.5,-1.162386,8.922792
1,-0.571429,-0.571429,-0.407127,-0.523810,-1.0,-0.400899,0.846154,0.523810,-1.124766,0.041923,-0.50,0.0,0.0,0.0,0.988211,0.0,-1.0,0.5,-1.162386,9.570390
2,-0.571429,-0.571429,-0.407127,-0.523810,-1.0,-0.400899,0.751317,0.523810,-1.124766,0.041923,-0.50,0.0,0.0,0.0,0.721714,0.0,-1.0,0.5,-1.162386,9.747769
3,-0.571429,-0.571429,-0.407127,-0.523810,-1.0,-0.400899,0.751317,0.523810,-1.124766,0.041923,-0.50,0.0,0.0,0.0,0.721714,0.0,-1.0,0.5,-1.162386,9.777981
4,-0.571429,-0.571429,-0.407127,-0.619048,-1.0,-0.400899,0.713383,0.619048,-1.124766,0.041923,-0.50,0.0,0.0,0.0,0.663267,0.0,-1.0,0.5,-1.162386,8.922792
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87356,-0.285714,-0.285714,0.145119,0.095238,2.0,0.402172,-0.384615,-0.095238,0.234059,0.099481,0.25,1.0,1.0,1.0,-0.524579,0.0,-1.0,0.0,-0.027873,10.714440
87357,1.285714,1.285714,-0.416539,0.142857,2.0,0.000000,-0.417281,-0.142857,1.403042,-0.154784,0.25,17.0,0.0,2.0,-0.383993,0.0,-1.0,0.0,-0.178214,10.545368
87358,1.285714,1.285714,-0.829752,0.571429,2.0,0.000000,-0.386723,-0.571429,1.282345,-0.415640,0.25,31.0,1.0,2.0,-0.524579,0.0,-1.0,-0.5,-0.605541,10.064798
87359,-0.571429,-0.571429,-0.915149,0.857143,2.0,0.153244,0.050580,-0.857143,-0.910278,-0.204434,-0.25,0.0,0.0,0.0,-0.861339,0.0,-1.0,0.0,-2.435624,8.006701


In [86]:
from sklearn.model_selection import cross_val_score
from xgboost import XGBRegressor

In [87]:
# 데이터프레임과 타겟 변수를 훈련 및 테스트 세트로 분할
from sklearn.model_selection import train_test_split

# 데이터프레임과 타겟 변수를 훈련 및 테스트 세트로 분할
x_train, x_test, y_train, y_test = train_test_split(df.drop(columns=["logy"]), df["logy"], test_size=0.2, random_state=2)



In [88]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error

def get_stacking_base_datasets(model, x_train, y_train, x_test, n_folds):
    kf = KFold(n_splits=n_folds, shuffle=False)
    
    # 넘파이 배열 초기화
    train_fold_pred = np.zeros((x_train.shape[0], 1))
    test_pred = np.zeros((x_test.shape[0], n_folds))
    print(model.__class__.__name__, 'model 시작')
    
    for folder_counter, (train_index, valid_index) in enumerate(kf.split(x_train)):
        x_tr = x_train.iloc[train_index]
        y_tr = y_train.iloc[train_index]
        x_te = x_train.iloc[valid_index]
        
        model.fit(x_tr, y_tr)
        train_fold_pred[valid_index, :] = model.predict(x_te).reshape(-1, 1)
        test_pred[:,folder_counter] = model.predict(x_test)
    
    test_pred_mean = np.mean(test_pred, axis=1).reshape(-1, 1)
    
    return train_fold_pred, test_pred_mean

In [95]:
from xgboost import XGBRegressor

best_params = {
    'n_estimators':1000,
    'learning_rate': 0.1,
    'min_child_weight': 0.575684239651117,
    'max_depth': 10,
    'subsample': 0.8920007427702377,
    'colsample_bytree': 0.8904323734148172,
    'gamma': 0.0005146347157239641
}

# Train the best model using the best parameters
best_model1 = XGBRegressor(**best_params)

In [96]:
best_params = {
    'n_estimators':1000,
    'colsample_bytree': 0.8501582134519151, 
    'gamma': 3.5447331670078747e-06,
    'learning_rate': 0.1,
    'max_depth': 9, 
    'min_child_weight': 0.008013170117259539, 
    'random_state': 2, 
    'subsample': 0.7526720683086026

}

best_model = XGBRegressor(**best_params)

In [97]:
from catboost import CatBoostRegressor
import numpy as np

catboost = CatBoostRegressor(
        loss_function='RMSE',
        iterations=1000,
        learning_rate=0.1,
        depth=8,
        l2_leaf_reg=1,
        bagging_temperature=0.7,
        random_strength=0.3,
        border_count=64,
        verbose=100,
        random_seed=np.random.randint(1, 1000)
    )

In [98]:
optn_train, optn_test = get_stacking_base_datasets(best_model1, x_train, y_train, x_test, 3)
hopt_train, hopt_test = get_stacking_base_datasets(best_model, x_train, y_train, x_test, 3)
cb_train, cb_test = get_stacking_base_datasets(catboost, x_train, y_train, x_test, 3)

XGBRegressor model 시작
XGBRegressor model 시작
CatBoostRegressor model 시작
0:	learn: 0.7019367	total: 11.2ms	remaining: 11.2s
100:	learn: 0.2353246	total: 1s	remaining: 8.9s
200:	learn: 0.2222684	total: 1.97s	remaining: 7.81s
300:	learn: 0.2134432	total: 2.92s	remaining: 6.78s
400:	learn: 0.2064397	total: 3.9s	remaining: 5.83s
500:	learn: 0.2000348	total: 4.86s	remaining: 4.84s
600:	learn: 0.1947980	total: 5.79s	remaining: 3.84s
700:	learn: 0.1903524	total: 6.76s	remaining: 2.88s
800:	learn: 0.1856928	total: 7.73s	remaining: 1.92s
900:	learn: 0.1818922	total: 8.65s	remaining: 951ms
999:	learn: 0.1781754	total: 9.58s	remaining: 0us
0:	learn: 0.7019413	total: 10.2ms	remaining: 10.2s
100:	learn: 0.2369460	total: 990ms	remaining: 8.81s
200:	learn: 0.2225281	total: 1.94s	remaining: 7.71s
300:	learn: 0.2137391	total: 2.91s	remaining: 6.75s
400:	learn: 0.2065982	total: 3.85s	remaining: 5.75s
500:	learn: 0.2009103	total: 4.79s	remaining: 4.77s
600:	learn: 0.1957924	total: 5.71s	remaining: 3.79s
70

In [99]:
stack_final_x_train = np.concatenate((optn_train, hopt_train, cb_train), axis=1)
stack_final_x_test = np.concatenate((optn_test, hopt_test, cb_test), axis=1)

In [100]:
from lightgbm import LGBMRegressor

lgbm = LGBMRegressor(n_estimators=1000, learning_rate=0.085, num_leaves=39,
                     subsample=0.62, colsample_bytree=0.98, reg_lambda=1.57,
                     reg_alpha=0.43, min_child_samples=24, max_depth=133,
                     n_jobs=-1)

lgbm.fit(stack_final_x_train, y_train)
stack_final = lgbm.predict(stack_final_x_test)

mean_squared_error(y_test, stack_final)**0.5

0.21798003931060042