In [64]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import smogn

# Training table, read from a path relative to the current working directory.
raw_data = pd.read_csv('./baseball_train_final4.csv')

# Default-configured regressors, looked up by the short name passed to
# augmentation_cv(model=...).
model_dict = dict(
    rf=RandomForestRegressor(),
    xgb=XGBRegressor(),
    lgbm=LGBMRegressor(),
)

def augmentation_cv(raw_df, label, cv_iter=5, aug_num=5, model='rf', random_state=None):
    """Evaluate a regressor trained on SMOGN-augmented data over repeated random splits.

    Each round holds out 10% of ``raw_df`` as a validation set, augments the
    remaining 90% with ``smogn.smoter``, filters the augmented rows, fits the
    chosen model and scores it on the untouched validation rows. The mean MAE
    and RMSE over all rounds are printed at the end.

    Parameters
    ----------
    raw_df : pandas.DataFrame
        Source table. Must contain ``label`` and the columns '출루', '장타',
        'OPS' and '타수', which the row filter reads.
    label : str
        Name of the target column.
    cv_iter : int, default 5
        Number of independent split/augment/train/score rounds (>= 1).
    aug_num : int, default 5
        Number of ``smogn.smoter`` passes per round (>= 1). Pass ``j``
        (0-based) is called with ``k = j + 2``; the results are concatenated.
    model : str, default 'rf'
        Key into the module-level ``model_dict``.
    random_state : int or None, default None
        Seed for the train/validation split only; round ``r`` uses
        ``random_state + r`` so rounds still differ from one another.
        ``None`` keeps the original unseeded behaviour.
        NOTE(review): this does not seed smogn or the estimators, so
        end-to-end reproducibility is not guaranteed.

    Returns
    -------
    None
        Results are reported through ``print`` only.

    Raises
    ------
    ValueError
        If ``cv_iter`` or ``aug_num`` is smaller than 1.
    KeyError
        If ``model`` is not a key of ``model_dict``.
    """
    # Reject degenerate settings before any work is done: with zero rounds the
    # summary below has nothing to report, and with zero augmentation passes
    # there is nothing to concatenate.
    if cv_iter < 1:
        raise ValueError(f'cv_iter must be at least 1, got {cv_iter}')
    if aug_num < 1:
        raise ValueError(f'aug_num must be at least 1, got {aug_num}')

    # Resolve the model up front so an unknown name fails immediately instead
    # of after the (slow) augmentation step.
    ensemble_model = model_dict[model]

    # Feature columns do not change between rounds. The label is always
    # excluded so the target can never be fed to the model as a feature, even
    # when it is not one of the hard-coded names.
    exclude_cols = {'NAME', 'PCODE', 'Date', '장타', '출루', 'OPS', label}
    include_cols = [col for col in raw_df.columns if col not in exclude_cols]

    mae_scores = []
    rmse_scores = []

    for cv_round in range(cv_iter):
        print(f'CV_ITER: # {cv_round+1}')

        # Split into train/validation BEFORE augmenting, so validation rows
        # are never used to generate synthetic samples.
        print('------Starting Data Split------')
        split_seed = None if random_state is None else random_state + cv_round
        X_train, X_test, y_train, y_test = train_test_split(
            raw_df.drop(label, axis=1), raw_df[label],
            test_size=0.1, random_state=split_seed)
        # drop=True: otherwise the old row labels are inserted as an extra
        # 'index' column in the frame that is handed to smogn.smoter.
        train_df = pd.concat([X_train, y_train], axis=1).reset_index(drop=True)
        val_df = pd.concat([X_test, y_test], axis=1).reset_index(drop=True)

        # Augmentation: one smoter pass per neighbour count k = 2 .. aug_num + 1.
        print('------Starting Data Augmentation------')
        os_list = [smogn.smoter(train_df, y=label, k=aug_pass + 2)
                   for aug_pass in range(aug_num)]
        aug_train = pd.concat(os_list, ignore_index=True)

        # Basic cleaning of the augmented rows: drop rows where '출루', '장타'
        # and 'OPS' are all zero, rows with '타수' below 30, and exact duplicates.
        zero_find = (aug_train['출루'] == 0) & (aug_train['장타'] == 0) & (aug_train['OPS'] == 0)
        new_aug_train = aug_train[~zero_find]
        new_aug_train = new_aug_train[new_aug_train['타수'] >= 30]
        new_aug_train = new_aug_train.drop_duplicates()

        # Final matrices used for fitting and scoring.
        X_train, y_train = new_aug_train[include_cols], new_aug_train[label]
        X_test, y_test = val_df[include_cols], val_df[label]

        # Fit on the augmented rows, predict on the held-out real rows.
        print('------Starting Model Training------')
        ensemble_model.fit(X_train, y_train)
        prediction = ensemble_model.predict(X_test)

        # Scoring
        mae = mean_absolute_error(y_test, prediction)
        rmse = np.sqrt(mean_squared_error(y_test, prediction))

        # Accumulate per-round scores.
        mae_scores.append(mae)
        rmse_scores.append(rmse)

    print('------------')
    print('MODEL RESULT')
    # TRAIN_ROW_NUM reports the training-set size of the last round only.
    print(f'CV_ITER: {cv_iter}, AUG_NUM: {aug_num}, TRAIN_ROW_NUM: {len(X_train)}, MODEL: {model}')
    print(f'Mean MAE: {np.mean(mae_scores)}, Mean RMSE: {np.mean(rmse_scores)}')

In [65]:
# Two split/augment/train rounds, one augmentation pass each, random forest,
# predicting the '장타' column.
augmentation_cv(
    raw_data,
    '장타',
    cv_iter=2,
    aug_num=1,
    model='rf',
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)
dist_matrix:   0%|                                     | 0/356 [00:00<?, ?it/s]

CV_ITER: # 0
------Starting Data Split------
------Starting Data Augmentation------


dist_matrix: 100%|###########################| 356/356 [02:24<00:00,  2.46it/s]
synth_matrix: 100%|##########################| 356/356 [00:12<00:00, 28.44it/s]
r_index: 100%|################################| 62/62 [00:00<00:00, 112.73it/s]


------Starting Model Training------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)
dist_matrix:   0%|                                     | 0/349 [00:00<?, ?it/s]

CV_ITER: # 1
------Starting Data Split------
------Starting Data Augmentation------


dist_matrix: 100%|###########################| 349/349 [02:21<00:00,  2.47it/s]
synth_matrix: 100%|##########################| 349/349 [00:10<00:00, 32.04it/s]
r_index: 100%|################################| 97/97 [00:00<00:00, 135.05it/s]


------Starting Model Training------
------------
MODEL RESULT
CV_ITER: 2, AUG_NUM: 1, TRAIN_ROW_NUM: 1912, MODEL: rf
Mean MAE: 0.17944980245505052, Mean RMSE: 0.23846180589653904
