In [29]:
import bisect
import os
from urllib.parse import quote_plus

import lightgbm as lgb
import mariadb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
from lightgbm import LGBMRegressor
from sklearn.ensemble import *
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from sqlalchemy import create_engine
from tqdm import tqdm
from xgboost import XGBRegressor

# Database connection settings, defined once and shared by the raw MariaDB
# connection and the SQLAlchemy engine so the two can never drift apart.
# Every value can be overridden through an environment variable; the literals
# are only fallbacks that keep the notebook behaving exactly as before.
# NOTE(review): drop the fallback password and require TEAM3_DB_PASSWORD
# before sharing or committing this notebook.
DB_CONFIG = {
    'user': os.environ.get('TEAM3_DB_USER', 'team3'),
    'password': os.environ.get('TEAM3_DB_PASSWORD', 'dbdb'),
    'database': os.environ.get('TEAM3_DB_NAME', 'team3db'),
    'host': os.environ.get('TEAM3_DB_HOST', 'localhost'),
    'port': int(os.environ.get('TEAM3_DB_PORT', '3306')),
}

conn = mariadb.connect(**DB_CONFIG)

# Same connection details as above, expressed as a SQLAlchemy URL.
# User and password are URL-escaped so characters such as '@' or ':' in an
# overridden credential cannot corrupt the URL.
engine = create_engine(
    "mysql://{user}:{pw}@{host}:{port}/{db}".format(
        user=quote_plus(DB_CONFIG['user']),
        pw=quote_plus(DB_CONFIG['password']),
        host=DB_CONFIG['host'],
        port=DB_CONFIG['port'],
        db=DB_CONFIG['database'],
    )
)

In [30]:
# Load the training data from a CSV file; the path is relative to the
# directory the notebook kernel was started in.
train = pd.read_csv('./data/train_std.csv')

In [31]:
# Show the column labels of the loaded frame.
train.columns

Index(['Unnamed: 0', 'ARI_CO', 'ARI_PO', 'SHIP_TYPE_CATEGORY', 'DIST', 'ID',
       'BREADTH', 'BUILT', 'DEADWEIGHT', 'DEPTH', 'DRAUGHT', 'GT', 'LENGTH',
       'SHIPMANAGER', 'FLAG', 'U_WIND', 'V_WIND', 'AIR_TEMPERATURE', 'BN',
       'ATA_LT', 'PORT_SIZE', 'CI_HOUR', 'year', 'month', 'day', 'hour',
       'minute', 'weekday'],
      dtype='object')

In [32]:
# Remove the columns that are not used as model inputs in this notebook.
# Reassigning the result with errors='ignore' (instead of mutating with
# inplace=True) keeps this cell re-runnable: a second execution no longer
# raises KeyError because the columns are already gone.
train = train.drop(columns=['Unnamed: 0', 'ID', 'SHIPMANAGER'], errors='ignore')

In [35]:
# Explicit list of model input columns; the order given here is the column
# order of X.
feature_columns = [
    'ARI_CO', 'ARI_PO', 'SHIP_TYPE_CATEGORY', 'DIST',
    'BREADTH', 'BUILT', 'DEADWEIGHT', 'DEPTH', 'DRAUGHT', 'GT', 'LENGTH',
    'FLAG', 'U_WIND', 'V_WIND', 'AIR_TEMPERATURE', 'PORT_SIZE',
    'year', 'weekday', 'BN', 'ATA_LT', 'month', 'day', 'hour', 'minute',
]
# Column the models are trained to predict.
target_column = 'CI_HOUR'

X = train[feature_columns]
y = train[target_column]

In [36]:
# Standardize every feature column; X is replaced by the scaler's output.
# NOTE(review): the scaler is fitted on the full data set before the
# train/test split, so test rows influence the scaling statistics. Consider
# fitting on the training split only.
ss = StandardScaler()
ss.fit(X)
X = ss.transform(X)

In [37]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [38]:
def train_and_evalute_model(model, X_train, y_train, X_test, y_test):
    """Fit ``model`` on the training split and evaluate it on both splits.

    Args:
        model: Estimator exposing ``fit``, ``score`` and ``predict``.
        X_train: Training inputs.
        y_train: Training targets.
        X_test: Held-out inputs.
        y_test: Held-out targets.

    Returns:
        A 9-tuple ``(model, train_score, train_mae, train_mse, train_r2,
        test_score, test_mae, test_mse, test_r2)`` where ``*_score`` is the
        value of ``model.score`` (for scikit-learn style regressors this is
        typically R^2 -- confirm for other estimators) and the remaining
        entries are the mean absolute error, mean squared error and R^2 of
        ``model.predict`` on the respective split.
    """
    model.fit(X_train, y_train)

    splits = ((X_train, y_train), (X_test, y_test))

    # The estimator's own score() is evaluated for both splits before any
    # explicit predict() call.
    train_score, test_score = (
        model.score(inputs, target) for inputs, target in splits
    )

    # Error metrics computed from explicit predictions, train split first.
    per_split = []
    for inputs, target in splits:
        predicted = model.predict(inputs)
        per_split.append((
            mean_absolute_error(target, predicted),
            mean_squared_error(target, predicted),
            r2_score(target, predicted),
        ))
    (train_mae, train_mse, train_r2), (test_mae, test_mse, test_r2) = per_split

    return (
        model,
        train_score, train_mae, train_mse, train_r2,
        test_score, test_mae, test_mse, test_r2,
    )

In [39]:
# Candidate regressors, both created with their library-default settings.
# The classes are referenced by their imported names rather than through the
# `lgb` / `xgb` module aliases: the assignments below rebind those aliases to
# estimator instances, so `lgb.LGBMRegressor()` would raise AttributeError the
# second time this cell is executed.
# The variable names are kept because later cells refer to `lgb` and `xgb`.
lgb = LGBMRegressor()
xgb = XGBRegressor()

models = [lgb, xgb]

In [40]:
# Train and evaluate every candidate model, keyed by its class name.
results = {}

for estimator in models:
    # The estimator's class name serves as both the log header and the
    # dictionary key.
    model_name = type(estimator).__name__
    print(f"-----------------------------[{model_name}]-------------------------------")

    results[model_name] = train_and_evalute_model(
        estimator, X_train, y_train, X_test, y_test
    )
    print()

results

-----------------------------[LGBMRegressor]-------------------------------
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002931 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2375
[LightGBM] [Info] Number of data points in the train set: 313546, number of used features: 23
[LightGBM] [Info] Start training from score 31.607215

-----------------------------[XGBRegressor]-------------------------------



{'LGBMRegressor': (LGBMRegressor(),
  0.5742338157395308,
  18.01847149105936,
  809.2207008347247,
  0.5742338157395308,
  0.5676814983683551,
  18.13200586293516,
  817.1035755228852,
  0.5676814983683551),
 'XGBRegressor': (XGBRegressor(base_score=None, booster=None, callbacks=None,
               colsample_bylevel=None, colsample_bynode=None,
               colsample_bytree=None, device=None, early_stopping_rounds=None,
               enable_categorical=False, eval_metric=None, feature_types=None,
               gamma=None, grow_policy=None, importance_type=None,
               interaction_constraints=None, learning_rate=None, max_bin=None,
               max_cat_threshold=None, max_cat_to_onehot=None,
               max_delta_step=None, max_depth=None, max_leaves=None,
               min_child_weight=None, missing=nan, monotone_constraints=None,
               multi_strategy=None, n_estimators=None, n_jobs=None,
               num_parallel_tree=None, random_state=None, ...),
  0.6

In [41]:
# Print the scores and error metrics of every trained model, train vs. test.
for m_name, outcome in results.items():
    # The first tuple element is the fitted model itself; the report does not
    # need it.
    (_, train_score, train_mae, train_mse, train_r2,
     test_score, test_mae, test_mse, test_r2) = outcome

    report_lines = [
        f"model_name = {m_name}",
        f"Train Score : {train_score:.4f} / Test Score : {test_score:.4f} / Train - Test = {train_score - test_score}",
        f"Train MAE : {train_mae:.4f} / Test MAE : {test_mae:.4f}",
        f"Train MSE : {train_mse:.4f} / Test MSE : {test_mse:.4f}",
        f"Train R2 : {train_r2:.4f} / Test R2 : {test_r2:.4f}",
        "",  # blank separator line between models
    ]
    print("\n".join(report_lines))
    

model_name = LGBMRegressor
Train Score : 0.5742 / Test Score : 0.5677 / Train - Test = 0.0065523173711756755
Train MAE : 18.0185 / Test MAE : 18.1320
Train MSE : 809.2207 / Test MSE : 817.1036
Train R2 : 0.5742 / Test R2 : 0.5677

model_name = XGBRegressor
Train Score : 0.6128 / Test Score : 0.5825 / Train - Test = 0.030313987144563237
Train MAE : 17.2182 / Test MAE : 17.8619
Train MSE : 735.8671 / Test MSE : 789.0687
Train R2 : 0.6128 / Test R2 : 0.5825



In [45]:
# Ensemble combining the LightGBM and XGBoost regressors defined earlier.
# (The variable is named `vo_clf` although it holds a regressor.)
vo_clf = VotingRegressor(
    estimators=[
        ('lgb', lgb),
        ('xgb', xgb),
    ],
)

In [46]:
# Fit the voting ensemble on the training split.
vo_clf.fit(X_train,y_train)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002735 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2375
[LightGBM] [Info] Number of data points in the train set: 313546, number of used features: 23
[LightGBM] [Info] Start training from score 31.607215


In [49]:
# Predict the target for the held-out test split with the fitted ensemble.
pred=vo_clf.predict(X_test)

In [50]:
# R^2 of the voting ensemble on the test split.
# The result is stored under its own name: assigning it to `r2_score` would
# replace the imported metric function with a float, so every later call to
# r2_score() (including train_and_evalute_model and a re-run of this cell)
# would fail with "'float' object is not callable".
vo_r2 = r2_score(y_test, pred)
vo_r2

0.579797244264829