In [None]:
# Utility
import random
import os

# Data Wrangling
import pandas as pd
import numpy as np

# Preprocessing & Feature Engineering
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import f_regression

# Evaluation
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.model_selection import cross_val_score

# Optuna
import optuna
from optuna.samplers import TPESampler
from optuna import Trial

# Modeling
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor

# Visualization
from matplotlib import pyplot as plt
import seaborn as sns

%matplotlib inline
import warnings
# Silence every warning category for the rest of the session.
warnings.filterwarnings("ignore")

# Turn off pandas' chained-assignment warning.
pd.set_option('mode.chained_assignment',  None)

In [None]:
# Directory holding the competition CSV files. It defaults to the original
# author's local folder; set the CITRUS_DATA_DIR environment variable to run
# the notebook on another machine without editing this cell.
DATA_DIR = os.environ.get(
    'CITRUS_DATA_DIR',
    r'C:\Users\dlwks\OneDrive\바탕 화면\VSCode\DACON_CODE_REVIEW\감귤 착과량',
)
train = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
test = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'))

In [None]:
from collections import Counter

def print_mode(df, col):
    """Print the three most frequent values of ``df[col]`` with their counts.

    Args:
        df: table containing the column.
        col: name of the column to inspect.
    """
    top_three = Counter(df[col]).most_common(3)

    for rank, (mode_value, occurrences) in enumerate(top_three, start=1):
        print(f'{col}의 최빈값 {rank}순위 : {mode_value} & {occurrences}개')

In [None]:
def print_statistics(df, col):
    """Print the max, min, mean and median of ``df[col]``, then its top modes.

    The statistics are computed on the requested column (not on a fixed
    column), and each line reports its own statistic.

    Args:
        df: table containing the column.
        col: name of the column to summarise.
    """
    series = df[col]

    # Local names avoid shadowing the built-in max()/min().
    col_max = series.max()
    col_min = series.min()
    col_mean = series.mean()
    col_median = series.median()

    print(f'{col}의 최대값 : {col_max}')
    print(f'{col}의 최소값 : {col_min}')
    print(f'{col}의 평균값 : {col_mean}')
    print(f'{col}의 중앙값 : {col_median}')

    print_mode(df, col)

In [None]:
def identify_hist(df, col):
    """Draw a histogram with a KDE curve for ``df[col]`` and print its statistics.

    Args:
        df: table containing the column.
        col: name of the column to plot and summarise.
    """
    column_values = df[col]
    sns.histplot(data=column_values, kde=True)
    print_statistics(df, col)

In [None]:
# Inspect the distribution of the target column.
identify_hist(df=train, col='착과량(int)')

In [None]:
# Separate the target and remove the identifier column from both tables.
# X_train still contains the target column at this point.
X_drop_list = ['ID']
y_train = train['착과량(int)']
X_train = train.drop(columns=X_drop_list)
X_test = test.drop(columns=['ID'])

In [None]:
# Rank columns by absolute correlation with the target.
# Only numeric columns are passed to corr(): recent pandas versions raise on
# non-numeric columns (the ID column is presumably a string — confirm), and the
# target is selected by name instead of assuming it is the first column.
target_col = '착과량(int)'
high_corr = (
    train.select_dtypes(include='number')
    .corr()[[target_col]]
    .abs()
    .sort_values(by=target_col, ascending=False)
)

# Keep the features whose |correlation| exceeds 0.9, leaving out the target itself.
features_name = [
    name
    for name in high_corr.index[high_corr[target_col] > 0.9]
    if name != target_col
]
X, y = X_train.drop([target_col], axis=1), X_train[target_col]

X_1 = X[features_name]
X_test_1 = X_test[features_name]

In [None]:
def _clip_to_1_99_percentile(column):
    """Clip one column to its own 1st and 99th percentile values."""
    low, high = column.quantile(.01), column.quantile(.99)
    return column.clip(low, high)


# Applied column by column to the highly correlated training features.
X_1 = X_1.apply(_clip_to_1_99_percentile, axis=0)

In [None]:
def seed_everything(seed):
    """Seed Python's ``random``, NumPy's global RNG and ``PYTHONHASHSEED``.

    Args:
        seed: integer seed applied to all three.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


seed_everything(42)

In [None]:
# Remove the target so X_train holds features only.
X_train = X_train.drop(columns=['착과량(int)'])

In [None]:
# (name prefix, start, stop) positional column ranges, one per month and
# measurement; each range yields a row-wise mean, std and var feature.
monthly_blocks = [
    ('9월_새순', 4, 34),
    ('10월_새순', 34, 65),
    ('11월_새순', 65, 93),
    ('9월_엽록소', 93, 123),
    ('10월_엽록소', 123, 154),
    ('11월_엽록소', 154, 182),
]
for prefix, start, stop in monthly_blocks:
    # New columns are appended at the end, so these positions stay valid.
    block = X_train.iloc[:, start:stop]
    X_train[f'{prefix}_mean'] = block.mean(axis=1)
    X_train[f'{prefix}_std'] = block.std(axis=1)
    X_train[f'{prefix}_var'] = block.var(axis=1)

In [None]:
# Row-wise extremes over the whole new-shoot range (positions 4-92) and the
# whole chlorophyll range (positions 93-181), plus their spreads.
shoot_cols = X_train.iloc[:, 4:93]
chloro_cols = X_train.iloc[:, 93:182]

X_train['새순max'] = shoot_cols.max(axis=1)
X_train['새순min'] = shoot_cols.min(axis=1)
X_train['엽록소max'] = chloro_cols.max(axis=1)
X_train['엽록소min'] = chloro_cols.min(axis=1)
X_train['새순차이'] = X_train['새순max'] - X_train['새순min']
X_train['엽록소차이'] = X_train['엽록소max'] - X_train['엽록소min']

# Tree-size interactions: height times mean crown width, and crown width max minus min.
X_train['수고X수관폭'] = X_train['수고(m)'] * X_train['수관폭평균']
X_train['수관폭차이'] = X_train['수관폭2(max)'] - X_train['수관폭1(min)']

In [None]:
# Combine the i-th new-shoot column (position 4 + i) with the i-th chlorophyll
# column (position 93 + i) using each arithmetic operator in turn.
pair_ops = (
    ('+', lambda shoot, chloro: shoot + chloro),
    ('-', lambda shoot, chloro: shoot - chloro),
    ('*', lambda shoot, chloro: shoot * chloro),
    ('/', lambda shoot, chloro: shoot / chloro),
)
for symbol, combine in pair_ops:
    for i in range(89):
        X_train[f'새순 {symbol} 엽록소_{i}'] = combine(
            X_train.iloc[:, 4 + i], X_train.iloc[:, 93 + i]
        )

In [None]:
# (name prefix, start, stop) positional column ranges, one per month and
# measurement; each range yields a row-wise mean, std and var feature.
monthly_blocks = [
    ('9월_새순', 4, 34),
    ('10월_새순', 34, 65),
    ('11월_새순', 65, 93),
    ('9월_엽록소', 93, 123),
    ('10월_엽록소', 123, 154),
    ('11월_엽록소', 154, 182),
]
for prefix, start, stop in monthly_blocks:
    # New columns are appended at the end, so these positions stay valid.
    block = X_test.iloc[:, start:stop]
    X_test[f'{prefix}_mean'] = block.mean(axis=1)
    X_test[f'{prefix}_std'] = block.std(axis=1)
    X_test[f'{prefix}_var'] = block.var(axis=1)

In [None]:
# Row-wise extremes over the whole new-shoot range (positions 4-92) and the
# whole chlorophyll range (positions 93-181), plus their spreads.
shoot_cols = X_test.iloc[:, 4:93]
chloro_cols = X_test.iloc[:, 93:182]

X_test['새순max'] = shoot_cols.max(axis=1)
X_test['새순min'] = shoot_cols.min(axis=1)
X_test['엽록소max'] = chloro_cols.max(axis=1)
X_test['엽록소min'] = chloro_cols.min(axis=1)
X_test['새순차이'] = X_test['새순max'] - X_test['새순min']
X_test['엽록소차이'] = X_test['엽록소max'] - X_test['엽록소min']

# Tree-size interactions: height times mean crown width, and crown width max minus min.
X_test['수고X수관폭'] = X_test['수고(m)'] * X_test['수관폭평균']
X_test['수관폭차이'] = X_test['수관폭2(max)'] - X_test['수관폭1(min)']

In [None]:
# Combine the i-th new-shoot column (position 4 + i) with the i-th chlorophyll
# column (position 93 + i) using each arithmetic operator in turn.
# The names are spelled exactly like the training-side columns
# ('새순 + 엽록소_{i}', with spaces around the operator): the preprocessing
# ColumnTransformer is configured with X_train's column names, so differently
# spelled test columns cannot be found when the test table is transformed.
pair_ops = (
    ('+', lambda shoot, chloro: shoot + chloro),
    ('-', lambda shoot, chloro: shoot - chloro),
    ('*', lambda shoot, chloro: shoot * chloro),
    ('/', lambda shoot, chloro: shoot / chloro),
)
for symbol, combine in pair_ops:
    for i in range(89):
        X_test[f'새순 {symbol} 엽록소_{i}'] = combine(
            X_test.iloc[:, 4 + i], X_test.iloc[:, 93 + i]
        )

In [None]:
# Display the shapes of the engineered train and test feature tables side by side.
X_train.shape, X_test.shape

In [None]:
# Base version
def NMAE(true, pred):
    """Return the mean absolute error divided by the mean absolute true value.

    Args:
        true: observed target values.
        pred: predicted values, aligned with ``true``.
    """
    abs_error = np.abs(true - pred)
    abs_scale = np.abs(true)
    return np.mean(abs_error) / np.mean(abs_scale)

# Custom scorer version with the (estimator, X, y) call signature
def NMAE_CV(clf, x, y):
    """Predict ``x`` with ``clf`` and return the normalised MAE against ``y``.

    Args:
        clf: fitted estimator exposing ``predict``.
        x: feature matrix to predict on.
        y: observed target values.
    """
    abs_error = np.abs(y - clf.predict(x))
    return np.mean(abs_error) / np.mean(np.abs(y))

In [None]:
# Fold count constant; the KFold loops further down use their own fold counts.
NFOLDS = 10
# Random state given to the LGBMRegressor inside the preprocessing pipeline.
SEED = 22

In [None]:
# Every engineered column is handled by the numeric preprocessing branch.
numeric_features = X_train.columns.tolist()

In [None]:
# Preprocessing built as a scikit-learn pipeline
def remove_outlier(X, q=0.02):
    """Clip every column of ``X`` to its own ``q`` and ``1 - q`` quantiles.

    Args:
        X: 2-D array-like; it is wrapped in a DataFrame before clipping.
        q: lower quantile; the upper bound is the ``1 - q`` quantile.

    Returns:
        The clipped data as a NumPy array.
    """
    frame = pd.DataFrame(X)

    def _clip_column(column):
        lower = column.quantile(q)
        upper = column.quantile(1 - q)
        return column.clip(lower, upper)

    clipped = frame.apply(_clip_column, axis=0)
    return clipped.values

# Numeric branch: quantile clipping followed by min-max scaling.
numeric_steps = [
    ("outlier", FunctionTransformer(remove_outlier, kw_args={'q': 0.02})),
    ("scaler", MinMaxScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# All columns listed in numeric_features go through the numeric branch.
column_transformer = ColumnTransformer(
    transformers=[("num", numeric_transformer, numeric_features)]
)

preprocessor = Pipeline(steps=[("column", column_transformer)])

# Full model: preprocessing followed by a LightGBM regressor evaluated with MAE.
regressor = LGBMRegressor(objective="regression", metric="mae", random_state=SEED)
model = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("Regressor", regressor),
    ]
)

In [None]:
# Reset the pipeline to the best preprocessing settings (tuned with Optuna).
tuned_preprocessing = {
    'preprocessor__column__num__outlier__kw_args': {'q': 0.02},
    'preprocessor__column__num__scaler': MinMaxScaler(),
}
model.set_params(**tuned_preprocessing)

# Run only the preprocessing part: fit on the train table, then apply it to the test table.
X_train = preprocessor.fit_transform(X_train, y_train)
X_test = preprocessor.transform(X_test)

In [None]:
# Wrap the preprocessed matrices back into DataFrames.
X_train, X_test = pd.DataFrame(X_train), pd.DataFrame(X_test)

In [None]:
# Keep the top 13% of features according to a univariate score against the target.
# f_regression is passed explicitly: SelectPercentile defaults to f_classif,
# a classification scorer, while the models trained on these features are regressors.
# NOTE(review): percentile=13 was presumably tuned with the default scorer — re-check it.
fs = SelectPercentile(score_func=f_regression, percentile=13).fit(X_train, y_train)
X_train = fs.transform(X_train)
X_test = fs.transform(X_test)

In [None]:
# Wrap the selected feature matrices back into DataFrames.
X_train, X_test = pd.DataFrame(X_train), pd.DataFrame(X_test)

In [None]:
# Persist the selected feature tables without the index column.
for frame, out_path in ((X_train, 'X_train_fine.csv'), (X_test, 'X_test_fine.csv')):
    frame.to_csv(out_path, index=False)

In [None]:
# Reload the saved feature tables as the second feature set.
fine_paths = ('X_train_fine.csv', 'X_test_fine.csv')
X_2, X_test_2 = (pd.read_csv(path) for path in fine_paths)

In [None]:
# Fix the random seeds
# NOTE(review): this repeats the seed_everything definition from an earlier cell.
def seed_everything(seed):
    """Apply ``seed`` to ``random``, ``PYTHONHASHSEED`` and NumPy's global RNG."""
    seed_text = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = seed_text


seed_everything(42)  # fix the seed

In [None]:
# Hyperparameter sets obtained from Optuna tuning.
# Entries that are identical in both sets are kept in one shared mapping.
_xgb_shared_params = {
    'objective': 'reg:squarederror',
    'tree_method': 'gpu_hist',
    'predictor': 'gpu_predictor',
}

xgb_best_params_1 = {
    'lambda': 0.002645916029508221,
    'alpha': 0.06770804282734474,
    'colsample_bytree': 0.42500508042724955,
    'subsample': 0.7135736798352763,
    'learning_rate': 0.0034491759962488127,
    'n_estimators': 2538,
    'max_depth': 4,
    'min_child_weight': 2,
    **_xgb_shared_params,
}

xgb_best_params_2 = {
    'lambda': 0.059360963228304024,
    'alpha': 0.9856292525135064,
    'colsample_bytree': 0.4569397260113678,
    'subsample': 0.4754658082470086,
    'learning_rate': 0.0029407888288556297,
    'n_estimators': 2020,
    'max_depth': 11,
    'min_child_weight': 49,
    **_xgb_shared_params,
}

In [None]:
# Multi-KFold run 1 (several fold counts are used to guard against overfitting)
xgb_pred_1 = []

kfold_list = [4, 5, 6]
for kfold in kfold_list:
    print(f"{kfold} Fold start")
    xgb_nmae = []
    kf = KFold(n_splits=kfold, shuffle=True, random_state=42)
    for fold_no, (tr_idx, val_idx) in enumerate(kf.split(X_1), start=1):
        tr_x, val_x = X_1.iloc[tr_idx], X_1.iloc[val_idx]
        tr_y, val_y = y.iloc[tr_idx], y.iloc[val_idx]

        # Train XGBoost through its scikit-learn wrapper, early-stopping on the held-out fold.
        # NOTE(review): fit-time early_stopping_rounds/eval_metric depend on the
        # installed XGBoost release — confirm they are still accepted.
        xgb = XGBRegressor(**xgb_best_params_1)
        xgb.fit(
            tr_x,
            tr_y,
            eval_set=[(val_x, val_y)],
            early_stopping_rounds=100,
            verbose=50,
            eval_metric='mae',
        )

        # Validation predictions are cast to int before scoring.
        fold_nmae = NMAE(val_y, xgb.predict(val_x).astype(int))
        xgb_nmae.append(fold_nmae)
        print(f"{fold_no}/{kfold} Fold NMAE = {fold_nmae}")

        # Keep each fold model's test predictions for averaging afterwards.
        xgb_pred_1.append(xgb.predict(X_test_1))

    print(f"\nAVG of NMAE = {np.mean(xgb_nmae)}")

In [None]:
# KFold ensemble 1: element-wise mean of all fold predictions
xgb_pred_sum_1 = sum(xgb_pred_1) / len(xgb_pred_1)
xgb_pred_sum_1

In [None]:
# Multi-KFold run 2 (second feature set)
xgb_pred_2 = []

kfold_list = [4, 5, 6]
for kfold in kfold_list:
    print(f"{kfold} Fold start")
    xgb_nmae = []
    kf = KFold(n_splits=kfold, shuffle=True, random_state=42)
    for fold_no, (tr_idx, val_idx) in enumerate(kf.split(X_2), start=1):
        tr_x, val_x = X_2.iloc[tr_idx], X_2.iloc[val_idx]
        tr_y, val_y = y.iloc[tr_idx], y.iloc[val_idx]

        # Train XGBoost through its scikit-learn wrapper, early-stopping on the held-out fold.
        # NOTE(review): fit-time early_stopping_rounds/eval_metric depend on the
        # installed XGBoost release — confirm they are still accepted.
        xgb = XGBRegressor(**xgb_best_params_2)
        xgb.fit(
            tr_x,
            tr_y,
            eval_set=[(val_x, val_y)],
            early_stopping_rounds=100,
            verbose=50,
            eval_metric='mae',
        )

        # Validation predictions are cast to int before scoring.
        fold_nmae = NMAE(val_y, xgb.predict(val_x).astype(int))
        xgb_nmae.append(fold_nmae)
        print(f"{fold_no}/{kfold} Fold NMAE = {fold_nmae}")

        # Keep each fold model's test predictions for averaging afterwards.
        xgb_pred_2.append(xgb.predict(X_test_2))

    print(f"\nAVG of NMAE = {np.mean(xgb_nmae)}")

In [None]:
# KFold ensemble 2: element-wise mean of all fold predictions
xgb_pred_sum_2 = sum(xgb_pred_2) / len(xgb_pred_2)
xgb_pred_sum_2

In [None]:
# LightGBM settings: regression objective, GPU device, MAE metric.
lgb_param = dict(objective='regression', device='gpu', metric='mae')

In [None]:
# Multi-KFold run 1 (first feature set)
lgb_pred_1 = []

kfold_list = [4, 5, 6]
for kfold in kfold_list:
    print(f"{kfold} Fold start")
    lgb_nmae = []
    kf = KFold(n_splits=kfold, shuffle=True, random_state=42)
    for fold_no, (tr_idx, val_idx) in enumerate(kf.split(X_1), start=1):
        tr_x, val_x = X_1.iloc[tr_idx], X_1.iloc[val_idx]
        tr_y, val_y = y.iloc[tr_idx], y.iloc[val_idx]

        # NOTE(review): fit-time early_stopping_rounds/verbose depend on the
        # installed LightGBM release — confirm they are still accepted.
        lgb = LGBMRegressor(**lgb_param)
        lgb.fit(
            tr_x,
            tr_y,
            eval_set=[(val_x, val_y)],
            early_stopping_rounds=100,
            verbose=50,
            eval_metric='mae',
        )

        # Validation predictions are cast to int before scoring.
        fold_nmae = NMAE(val_y, lgb.predict(val_x).astype(int))
        lgb_nmae.append(fold_nmae)
        print(f"{fold_no}/{kfold} Fold NMAE = {fold_nmae}")

        # Keep each fold model's test predictions for averaging afterwards.
        lgb_pred_1.append(lgb.predict(X_test_1))

    print(f"\nAVG of NMAE = {np.mean(lgb_nmae)}")

In [None]:
# KFold ensemble 1: element-wise mean of all fold predictions
lgb_pred_sum_1 = sum(lgb_pred_1) / len(lgb_pred_1)
lgb_pred_sum_1

In [None]:
# Multi-KFold run 2 (second feature set)
lgb_pred_2 = []

kfold_list = [4, 5, 6]
for kfold in kfold_list:
    print(f"{kfold} Fold start")
    lgb_nmae = []
    kf = KFold(n_splits=kfold, shuffle=True, random_state=42)
    for fold_no, (tr_idx, val_idx) in enumerate(kf.split(X_2), start=1):
        tr_x, val_x = X_2.iloc[tr_idx], X_2.iloc[val_idx]
        tr_y, val_y = y.iloc[tr_idx], y.iloc[val_idx]

        # NOTE(review): fit-time early_stopping_rounds/verbose depend on the
        # installed LightGBM release — confirm they are still accepted.
        lgb = LGBMRegressor(**lgb_param)
        lgb.fit(
            tr_x,
            tr_y,
            eval_set=[(val_x, val_y)],
            early_stopping_rounds=100,
            verbose=50,
            eval_metric='mae',
        )

        # Validation predictions are cast to int before scoring.
        fold_nmae = NMAE(val_y, lgb.predict(val_x).astype(int))
        lgb_nmae.append(fold_nmae)
        print(f"{fold_no}/{kfold} Fold NMAE = {fold_nmae}")

        # Keep each fold model's test predictions for averaging afterwards.
        lgb_pred_2.append(lgb.predict(X_test_2))

    print(f"\nAVG of NMAE = {np.mean(lgb_nmae)}")

In [None]:
# KFold ensemble 2: element-wise mean of all fold predictions
lgb_pred_sum_2 = sum(lgb_pred_2) / len(lgb_pred_2)
lgb_pred_sum_2

In [None]:
# Multi-KFold run 1 (first feature set)
cat_pred_1 = []

# Constructor settings shared by every fold model.
cat_settings = {
    'use_best_model': True,
    'task_type': 'GPU',
    'iterations': 10000,
    'eval_metric': 'MAE',
}

kfold_list = [4, 5, 6]
for kfold in kfold_list:
    print(f"{kfold} Fold start")
    cat_nmae = []
    kf = KFold(n_splits=kfold, shuffle=True, random_state=42)
    for fold_no, (tr_idx, val_idx) in enumerate(kf.split(X_1), start=1):
        tr_x, val_x = X_1.iloc[tr_idx], X_1.iloc[val_idx]
        tr_y, val_y = y.iloc[tr_idx], y.iloc[val_idx]

        cat = CatBoostRegressor(**cat_settings)
        cat.fit(
            tr_x,
            tr_y,
            eval_set=[(val_x, val_y)],
            early_stopping_rounds=100,
            verbose=50,
        )

        # Validation predictions are cast to int before scoring.
        fold_nmae = NMAE(val_y, cat.predict(val_x).astype(int))
        cat_nmae.append(fold_nmae)
        print(f"{fold_no}/{kfold} Fold NMAE = {fold_nmae}")

        # Keep each fold model's test predictions for averaging afterwards.
        cat_pred_1.append(cat.predict(X_test_1))

    print(f"\nAVG of NMAE = {np.mean(cat_nmae)}")

In [None]:
# KFold ensemble 1: element-wise mean of all fold predictions
cat_pred_sum_1 = sum(cat_pred_1) / len(cat_pred_1)
cat_pred_sum_1

In [None]:
# Multi-KFold run 2 (second feature set)
cat_pred_2 = []

# Constructor settings shared by every fold model.
cat_settings = {
    'use_best_model': True,
    'task_type': 'GPU',
    'iterations': 10000,
    'eval_metric': 'MAE',
}

kfold_list = [4, 5, 6]
for kfold in kfold_list:
    print(f"{kfold} Fold start")
    cat_nmae = []
    kf = KFold(n_splits=kfold, shuffle=True, random_state=42)
    for fold_no, (tr_idx, val_idx) in enumerate(kf.split(X_2), start=1):
        tr_x, val_x = X_2.iloc[tr_idx], X_2.iloc[val_idx]
        tr_y, val_y = y.iloc[tr_idx], y.iloc[val_idx]

        cat = CatBoostRegressor(**cat_settings)
        cat.fit(
            tr_x,
            tr_y,
            eval_set=[(val_x, val_y)],
            early_stopping_rounds=100,
            verbose=50,
        )

        # Validation predictions are cast to int before scoring.
        fold_nmae = NMAE(val_y, cat.predict(val_x).astype(int))
        cat_nmae.append(fold_nmae)
        print(f"{fold_no}/{kfold} Fold NMAE = {fold_nmae}")

        # Keep each fold model's test predictions for averaging afterwards.
        cat_pred_2.append(cat.predict(X_test_2))

    print(f"\nAVG of NMAE = {np.mean(cat_nmae)}")

In [None]:
# KFold ensemble 2: element-wise mean of all fold predictions
cat_pred_sum_2 = sum(cat_pred_2) / len(cat_pred_2)
cat_pred_sum_2

In [None]:
# Load the sample submission from the competition data directory. The directory
# defaults to the original author's local folder; set the CITRUS_DATA_DIR
# environment variable to run the notebook on another machine.
submission = pd.read_csv(
    os.path.join(
        os.environ.get(
            'CITRUS_DATA_DIR',
            r'C:\Users\dlwks\OneDrive\바탕 화면\VSCode\DACON_CODE_REVIEW\감귤 착과량',
        ),
        'sample_submission.csv',
    )
)

In [None]:
def _blend_models(xgb_pred, lgb_pred, cat_pred):
    """Weighted blend of the three model predictions: 0.4 / 0.4 / 0.2."""
    return xgb_pred*0.4 + lgb_pred*0.4 + cat_pred*0.2


# One blended submission per feature set.
submission1, submission2 = submission.copy(), submission.copy()

submission1['착과량(int)'] = _blend_models(xgb_pred_sum_1, lgb_pred_sum_1, cat_pred_sum_1)
submission2['착과량(int)'] = _blend_models(xgb_pred_sum_2, lgb_pred_sum_2, cat_pred_sum_2)

In [None]:
# Final prediction: 80% of the first feature set's blend plus 20% of the second's.
first_part = submission1['착과량(int)'] * 0.8
second_part = submission2['착과량(int)'] * 0.2
submission['착과량(int)'] = first_part + second_part