In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Ground-surface temperature data: read the train and test CSVs (cp949-encoded),
# train first, then test.
data_train, data_test = (
    pd.read_csv(csv_path, encoding="cp949")
    for csv_path in ("surface_tp_train.csv", "surface_tp_test.csv")
)

# 사계절 나누기

In [None]:
# Seasons are defined on the integer 'mmddhh' key (month, day, hour packed as digits).
data_train['surface_tp_train.mmddhh'] = data_train['surface_tp_train.mmddhh'].astype(int)
mmddhh = data_train['surface_tp_train.mmddhh']

# One boolean mask per season; both bounds are inclusive.
#   spring: 02-01 00h .. 04-30 23h      summer: 05-01 00h .. 07-31 23h
#   autumn: 08-01 00h .. 10-31 23h      winter: 11-01 00h .. 01-31 23h (wraps the year end)
spring_mask = mmddhh.between(20100, 43023)
summer_mask = mmddhh.between(50100, 73123)
autumn_mask = mmddhh.between(80100, 103123)
winter_mask = (mmddhh >= 110100) | (mmddhh <= 13123)

# Take independent copies: these frames are written to later during preprocessing,
# and writing into a plain boolean slice of data_train raises SettingWithCopyWarning
# (and is ambiguous about whether data_train itself changes).
spring_df = data_train[spring_mask].copy()
summer_df = data_train[summer_mask].copy()
autumn_df = data_train[autumn_mask].copy()
winter_df = data_train[winter_mask].copy()

# 전처리 함수정의

In [None]:
def 전처리(df_season):
    """Preprocess one season's slice of the training data for modelling.

    The input frame is left untouched; a new frame is built and returned.

    Steps:
      1. Zero-pad 'surface_tp_train.mmddhh' to six characters and split it into
         month / day / hour string columns. Months '11', '12', '01' are recoded
         to the integers 0, 1, 2; any other month keeps its two-character string.
      2. Drop 'Unnamed: 0', 'surface_tp_train.stn', 'surface_tp_train.year' and
         'surface_tp_train.mmddhh'.
      3. In rn / si / ss / sn replace the -99.9 marker with 0.
      4. In re replace -99 and a fixed list of values with 0.
      5. In ta / td / hm / ws / ts turn -99.9 into NaN and fill interior gaps
         (up to 100 consecutive values) by cubic interpolation.
      6. Replace 'surface_tp_train.ww' with its integer category codes.

    Parameters
    ----------
    df_season : pandas.DataFrame
        Must contain the 'surface_tp_train.*' columns named above plus
        'Unnamed: 0'.

    Returns
    -------
    pandas.DataFrame
        New frame with the dropped columns removed and the three
        'surface_tp_train.mmddhh_categorical_*' columns appended.
    """
    prefix = 'surface_tp_train.'

    # Build the zero-padded key without writing it back to the caller's frame.
    mmddhh = df_season[prefix + 'mmddhh'].astype(str).str.zfill(6)

    # drop() returns a new frame, so every assignment below is on our own copy.
    df_set = df_season.drop(
        ['Unnamed: 0', prefix + 'stn', prefix + 'year', prefix + 'mmddhh'], axis=1
    )

    # Only the winter months are recoded; other months pass through unchanged.
    # Mapping (rather than .loc assignment) avoids writing ints into a string column.
    winter_month_codes = {'11': 0, '12': 1, '01': 2}
    df_set[prefix + 'mmddhh_categorical_month'] = (
        mmddhh.str[0:2].map(lambda m: winter_month_codes.get(m, m)).astype(object)
    )
    df_set[prefix + 'mmddhh_categorical_day'] = mmddhh.str[2:4]
    df_set[prefix + 'mmddhh_categorical_hour'] = mmddhh.str[-2:]

    # Columns where the -99.9 marker is replaced by 0.
    for name in ('rn', 'si', 'ss', 'sn'):
        col = prefix + name
        df_set[col] = df_set[col].replace(-99.9, 0)

    # The original author's note lists 1.8, 3.3, 5.3, 19.5, 30.3 and 623.5 as values
    # to treat like the -99 marker; all of them are zeroed in 're'.
    # NOTE(review): that note names "RN" but the code targets 're' - confirm the column.
    re_col = prefix + 're'
    df_set[re_col] = df_set[re_col].replace([-99, 1.8, 3.3, 5.3, 19.5, 30.3, 623.5], 0)

    # Columns where -99.9 becomes NaN and is then interpolated. limit_area='inside'
    # means leading/trailing gaps are left as NaN.
    for name in ('ta', 'td', 'hm', 'ws', 'ts'):
        col = prefix + name
        df_set[col] = (
            df_set[col]
            .replace(-99.9, np.nan)
            .interpolate(method='cubic', limit_area='inside', limit=100)
        )

    # Encode 'ww' as integer category codes (codes follow the sorted category order).
    ww_col = prefix + 'ww'
    df_set[ww_col] = pd.Categorical(df_set[ww_col]).codes

    return df_set

In [None]:
# Preprocess each season's frame, in the order spring, summer, autumn, winter.
df_전처리_spring, df_전처리_summer, df_전처리_autumn, df_전처리_winter = (
    전처리(season_df)
    for season_df in (spring_df, summer_df, autumn_df, winter_df)
)

# 분석

In [None]:
# Season under analysis: the winter frame. Bind another df_전처리_* frame here to model a different season.
df = df_전처리_winter

In [None]:
# Renumber rows from 0 in place and discard the old index.
# `df` is the same object as df_전처리_winter, so that name sees the new index too.
df.reset_index(drop = True, inplace = True)

In [None]:
# Features are every column except the target 'surface_tp_train.ts'; the target is kept as y.
X = df.drop(columns=['surface_tp_train.ts'])
y = df['surface_tp_train.ts']

### train_test_split

In [None]:
# Split into train / test without shuffling, so the test rows are the last rows of df.
# NOTE(review): the original comment said test_size=0.1, but the code holds out only 0.1 % (0.001) - confirm which is intended.
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

# Disabled alternative: shuffle rows before splitting.
# # X, y = shuffle(X, y, random_state=1)
X_train, X_test, y_train, y_test=train_test_split(X,y,test_size=0.001, shuffle = False)

# 함수 정의 및 Scaler

In [None]:
def adj_r2_score(y_true, y_pred, p):
    """Return the adjusted R^2 of a prediction.

    Computed as 1 - (1 - R^2) * (n - 1) / (n - p - 1), where n is len(y_true)
    and R^2 comes from the notebook-level `r2_score`.

    Parameters
    ----------
    y_true : sized sequence of observed values.
    y_pred : predicted values, passed to `r2_score` together with y_true.
    p : number of predictors used in the penalty term.
    """
    n_obs = len(y_true)
    unexplained = 1 - r2_score(y_true, y_pred)
    return 1 - unexplained * (n_obs - 1) / (n_obs - p - 1)

def mae_r2_plot(y_test, y_predict):
    """Print R2, adjusted R2 and MAE, then plot actual vs. predicted values.

    The adjusted R2 uses the column count of the notebook-level feature frame
    `X` as the number of predictors. Both series are drawn against their
    position (0 .. len(y_test) - 1) on one large figure.
    """
    from sklearn.metrics import mean_absolute_error

    mae = mean_absolute_error(y_test, y_predict)
    r2 = r2_score(y_test, y_predict)
    adj_r2 = adj_r2_score(y_test, y_predict, X.shape[1])
    print('R2 : ', round(r2, 5), ', adj_R2 : ', round(adj_r2, 5), ', mae : ', round(mae, 5))

    positions = range(len(y_test))

    plt.figure(figsize=(40, 16))
    # Actual first (opaque), prediction drawn on top slightly transparent.
    for series, label, opacity in ((y_test, 'Actual', 1), (y_predict, 'Predicted', 0.8)):
        plt.plot(positions, series, label=label, alpha=opacity)
    plt.legend()
    plt.show()

In [None]:
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

# Standardise the features: the scaler is fitted on the training rows only and
# then applied to both splits. (MinMaxScaler is imported as a drop-in alternative.)
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_test_scaled = scaler.transform(X_train), scaler.transform(X_test)

# CatBoostRegressor

## Optuna

In [None]:
# Optuna Libraries
from catboost import CatBoostRegressor
import optuna
from optuna import Trial
from optuna.samplers import TPESampler

# TPE sampler with a fixed seed.
sampler = TPESampler(seed=10)


def objective(trial):
    """Optuna objective: fit a CatBoostRegressor with sampled parameters and return its MSE.

    The model is trained on (X_train_scaled, y_train) and scored on
    (X_test_scaled, y_test); the study minimises the returned value.

    NOTE(review): the score is computed on the same test split that is later
    used to report the final metrics, so those metrics are tuned-on and will
    look optimistic. Consider a separate validation split.
    """
    # All float parameters use suggest_float (suggest_uniform / suggest_loguniform
    # are deprecated aliases of it); bagging_temperature is sampled on a log scale.
    cbrm_param = {
        'iterations': trial.suggest_int('iterations', 100, 10000),
        'od_wait': trial.suggest_int('od_wait', 100, 2300),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 1),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-5, 100),
        # NOTE(review): the lower bound 0 allows near-empty subsamples - confirm the intended range.
        'subsample': trial.suggest_float('subsample', 0, 1),
        'random_strength': trial.suggest_float('random_strength', 10, 50),
        'depth': trial.suggest_int('depth', 1, 15),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1, 30),
        'leaf_estimation_iterations': trial.suggest_int('leaf_estimation_iterations', 1, 15),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0.01, 100.00, log=True),
        'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.4, 1.0),
    }

    # Generate model
    model_cbrm = CatBoostRegressor(**cbrm_param)
    model_cbrm = model_cbrm.fit(X_train_scaled, y_train, verbose=0)

    # Evaluation metric; swap in a different metric here if desired.
    MSE = mean_squared_error(y_test, model_cbrm.predict(X_test_scaled))
    return MSE

optuna_cbrm = optuna.create_study(direction='minimize', sampler=sampler)
optuna_cbrm.optimize(objective, n_trials=100)

In [None]:
# Fetch the best trial of the study and print its objective value and sampled parameters.
cbrm_trial = optuna_cbrm.best_trial
cbrm_trial_params = cbrm_trial.params
print('Best Trial: score {},\nparams {}'.format(cbrm_trial.value, cbrm_trial_params))

In [None]:
# Show the best parameter dict as the cell output.
cbrm_trial_params

In [None]:
# Refit a CatBoost model on the training split using the best parameters from the study.
cbrm_optuna = CatBoostRegressor(**cbrm_trial_params)
cbrm_study = cbrm_optuna.fit(X_train_scaled, y_train, verbose=1)

# Predictions for the training rows and for the held-out rows.
y_train_pred, y_predict = (
    cbrm_study.predict(features) for features in (X_train_scaled, X_test_scaled)
)

## Model

In [None]:
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error

# Build the CatBoost regressor from the best Optuna parameters.
# (The original referenced `cbrm_param`, which exists only as a local variable
# inside objective() and therefore raised NameError on a fresh kernel.)
cbrm = CatBoostRegressor(**cbrm_trial_params)

cbrm_study = cbrm.fit(X_train_scaled, y_train, verbose=1)

# Predict the held-out rows and the training rows.
y_predict = cbrm_study.predict(X_test_scaled)
y_train_pred = cbrm_study.predict(X_train_scaled)

# Root mean squared error (RMSE) on the held-out rows.
mse = mean_squared_error(y_test, y_predict)
rmse = np.sqrt(mse)

print("Root Mean Squared Error (RMSE):", rmse)

In [None]:
from sklearn.metrics import mean_absolute_error
# Mean absolute error on the held-out rows (mae) and on the training rows (mae2).
mae = mean_absolute_error(y_test, y_predict)
mae2 = mean_absolute_error(y_train, y_train_pred)

print('test: ', mae, ' train: ', mae2)

In [None]:
# Print R2 / adjusted R2 / MAE and plot actual vs. predicted for the held-out rows.
mae_r2_plot(y_test, y_predict)

## 확인 및 시각화

In [None]:
from sklearn.metrics import mean_absolute_error
# Recompute MAE for the held-out rows (mae) and the training rows (mae2) and print both.
mae = mean_absolute_error(y_test, y_predict)
mae2 = mean_absolute_error(y_train, y_train_pred)

print(mae, mae2)

# Mean squared error and its square root (RMSE) on the held-out rows.
mse = mean_squared_error(y_test, y_predict)
rmse = np.sqrt(mse)
print('\n', mse, rmse)

In [None]:
# Print the summary metrics and draw the actual-vs-predicted plot for the current predictions.
mae_r2_plot(y_test, y_predict)

In [None]:
optuna.visualization.plot_param_importances(optuna_cbrm) # Chart of hyper-parameter importances for the study

In [None]:
optuna.visualization.plot_optimization_history(optuna_cbrm) # Visualise how the objective evolved over the trials

In [None]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
# Use the 'Malgun Gothic' font for figures (presumably so Korean text renders - confirm it is installed).
plt.rc('font', family='Malgun Gothic')

# Pair each feature name with its importance from the refitted CatBoost model.
feature_imp = pd.DataFrame(
    sorted(zip(cbrm_optuna.feature_importances_, X.columns)),
    columns=['Value', 'Feature'],
)

# Horizontal bar chart of the 15 most important features, largest first.
plt.figure(figsize=(20, 12))
sns.barplot(
    data=feature_imp.sort_values(by="Value", ascending=False).head(15),
    x="Value",
    y="Feature",
)
plt.tight_layout()
plt.show()