In [None]:
# Data Wrangling
import pandas as pd
import numpy as np

#Utility
import random
import os

# Preprocessing & Feature Engineering
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectPercentile

# Evaluation
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.model_selection import cross_val_score

# Optuna
import optuna
from optuna.samplers import TPESampler
from optuna import Trial

# Modeling
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor

# Visualization
from matplotlib import pyplot as plt
import seaborn as sns

%matplotlib inline
import warnings
# Silence every warning for the rest of the session. Note that this also hides
# deprecation notices emitted by the libraries imported above.
warnings.filterwarnings("ignore")

# Turn off pandas' chained-assignment (SettingWithCopy) warning.
pd.set_option('mode.chained_assignment',  None)

In [None]:
# Directory that holds train.csv / test.csv.
# The location can be overridden with the CITRUS_DATA_DIR environment variable so
# the notebook runs on machines other than the author's; when the variable is
# unset the original absolute path is used, so existing behaviour is unchanged.
DATA_DIR = os.environ.get(
    'CITRUS_DATA_DIR',
    r'C:\Users\dlwks\OneDrive\바탕 화면\VSCode\DACON_CODE_REVIEW\감귤 착과량',
)

# Raw competition tables; later cells derive X_train / X_test from them.
train = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
test = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'))

In [None]:
from collections import Counter

def print_mode(df, col):
    """Print the three most frequent values of ``df[col]`` with their counts.

    Parameters
    ----------
    df : mapping-like (e.g. pandas.DataFrame)
        Container indexed by column name; ``df[col]`` must be iterable.
    col : str
        Name of the column to summarise; also used as the label in the output.

    One line is printed per value, ranked 1 to 3 (fewer lines when the column
    has fewer than three distinct values).
    """
    top_three = Counter(df[col]).most_common(3)

    for rank, (observed, frequency) in enumerate(top_three, start=1):
        print(f'{col}의 최빈값 {rank}순위 : {observed} & {frequency}개')

In [None]:
def print_statistics(df, col):
    """Print max, min, mean, median and the top-3 modes of ``df[col]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing the column to summarise.
    col : str
        Name of the column; it is both read from ``df`` and used as the label
        in every printed line.

    Fixes over the previous version:
    * the statistics are computed on ``col`` instead of always on '착과량(int)';
    * each line prints its own statistic (previously all four printed the max);
    * the builtins ``max`` / ``min`` are no longer shadowed by local variables.
    """
    values = df[col]

    print(f'{col}의 최대값 : {values.max()}')
    print(f'{col}의 최소값 : {values.min()}')
    print(f'{col}의 평균값 : {values.mean()}')
    print(f'{col}의 중앙값 : {values.median()}')

    # The most frequent values are reported by the sibling helper.
    print_mode(df, col)

In [None]:
def identify_hist(df, col):
    """Plot a histogram with a KDE overlay for ``df[col]`` and print its statistics.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing the column to inspect.
    col : str
        Name of the column to plot and summarise.
    """
    column_values = df[col]
    sns.histplot(data = column_values, kde = True)

    # Text summary of the same column, printed alongside the figure.
    print_statistics(df, col)

In [None]:
# Inspect the target: draw its histogram (with KDE) and print its summary statistics.
identify_hist(train, '착과량(int)')

In [None]:
# Columns that carry no predictive information and are removed from the features.
X_drop_list = ['ID']

# Target vector.
y_train = train['착과량(int)']

# Feature tables. Only the identifier is removed here, so X_train still
# contains the target column '착과량(int)' at this point.
X_train = train.drop(columns = X_drop_list)
X_test = test.drop(columns = ['ID'])

In [None]:
# Absolute correlation of every numeric column with the target, strongest first.
# Non-numeric columns are excluded explicitly because DataFrame.corr() raises on
# them in pandas >= 2.0 (presumably 'ID' is a string column -- confirm against the CSV).
numeric_train = train.select_dtypes(include = ['number', 'bool'])
high_corr = (
    numeric_train.corr()
    .abs()
    .sort_values(by = '착과량(int)', ascending = False)
    # Select the target column by name: the previous positional `.iloc[:, :1]`
    # only worked while the target happened to be the first numeric column.
    .loc[:, ['착과량(int)']]
)

# Keep the features whose |correlation| with the target exceeds 0.9.
# The target correlates perfectly with itself, so it is filtered out by name.
features_name = [
    name
    for name in high_corr[high_corr['착과량(int)'] > 0.9].index
    if name != '착과량(int)'
]

# Full feature matrix / target pair (X_train still holds the target column here).
X, y = X_train.drop(['착과량(int)'], axis = 1), X_train['착과량(int)']

# Reduced train / test matrices restricted to the highly correlated features.
X_1 = X[features_name]
X_test_1 = X_test[features_name]

In [None]:
def _clip_to_inner_percentiles(column):
    """Clamp one column's values to its own 1st-99th percentile range."""
    lower_bound = column.quantile(.01)
    upper_bound = column.quantile(.99)
    return column.clip(lower_bound, upper_bound)


# Limit extreme values feature by feature: axis=0 hands each column to the helper.
X_1 = X_1.apply(_clip_to_inner_percentiles, axis = 0)

In [None]:
def seed_everything(seed):
    """Seed Python's ``random`` module and NumPy's global RNG, and export PYTHONHASHSEED.

    Parameters
    ----------
    seed : int
        Value passed to ``random.seed`` and ``np.random.seed`` and stored (as a
        string) in the ``PYTHONHASHSEED`` environment variable.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


# Fix the seed once for the whole notebook.
seed_everything(42)

In [None]:
# Remove the target from the training features so it cannot be used as a predictor.
X_train = X_train.drop(columns = ['착과량(int)'])

In [None]:
# Row-wise mean / std / var over fixed positional column windows of X_train.
# Each entry is (feature-name prefix, first column position, one-past-last position).
# NOTE(review): the windows assume columns 4..92 hold the daily 새순 readings and
# 93..181 the daily 엽록소 readings, grouped by month -- confirm against the CSV header.
train_monthly_windows = (
    ('9월_새순', 4, 34),
    ('10월_새순', 34, 65),
    ('11월_새순', 65, 93),
    ('9월_엽록소', 93, 123),
    ('10월_엽록소', 123, 154),
    ('11월_엽록소', 154, 182),
)

for prefix, first, past_last in train_monthly_windows:
    # New columns are appended at the end, so the positions used here never shift.
    window = X_train.iloc[:, first:past_last]
    X_train[f'{prefix}_mean'] = window.mean(axis = 1)
    X_train[f'{prefix}_std'] = window.std(axis = 1)
    X_train[f'{prefix}_var'] = window.var(axis = 1)

In [None]:
# Row-wise extremes over the two positional measurement windows of X_train.
# NOTE(review): assumes columns 4..92 are the 새순 series and 93..181 the 엽록소 series -- confirm.
shoot_days = X_train.iloc[:, 4:93]
chlorophyll_days = X_train.iloc[:, 93:182]

shoot_high, shoot_low = shoot_days.max(axis=1), shoot_days.min(axis=1)
chlorophyll_high, chlorophyll_low = chlorophyll_days.max(axis=1), chlorophyll_days.min(axis=1)

X_train['새순max'] = shoot_high
X_train['새순min'] = shoot_low
X_train['엽록소max'] = chlorophyll_high
X_train['엽록소min'] = chlorophyll_low

# Spread (max - min) of each series.
X_train['새순차이'] = shoot_high - shoot_low
X_train['엽록소차이'] = chlorophyll_high - chlorophyll_low

# Tree-size features: height times mean crown width, and crown-width spread.
tree_height, crown_mean = X_train['수고(m)'], X_train['수관폭평균']
crown_max, crown_min = X_train['수관폭2(max)'], X_train['수관폭1(min)']
X_train['수고X수관폭'] = tree_height * crown_mean
X_train['수관폭차이'] = crown_max - crown_min

In [None]:
# Pair the k-th column of the first window (positions 4..92) with the k-th column
# of the second window (positions 93..181) and add their sum, difference, product
# and ratio as new features. All 89 columns of one operation are written before
# the next operation starts, which fixes the resulting column order.
# NOTE(review): the ratio is inf/NaN wherever the second-window value is 0 -- confirm that cannot occur.
train_shoot_block = X_train.iloc[:, 4:93]
train_chlorophyll_block = X_train.iloc[:, 93:182]

train_pairwise_ops = (
    ('+', lambda left, right: left + right),
    ('-', lambda left, right: left - right),
    ('*', lambda left, right: left * right),
    ('/', lambda left, right: left / right),
)

for symbol, combine in train_pairwise_ops:
    for day in range(89):
        X_train[f'새순 {symbol} 엽록소_{day}'] = combine(
            train_shoot_block.iloc[:, day],
            train_chlorophyll_block.iloc[:, day],
        )

In [None]:
# Row-wise mean / std / var over fixed positional column windows of X_test.
# Keys are the feature-name prefixes; values are (first position, one-past-last position).
# NOTE(review): the windows assume columns 4..92 hold the daily 새순 readings and
# 93..181 the daily 엽록소 readings, grouped by month -- confirm against the CSV header.
test_monthly_windows = {
    '9월_새순': (4, 34),
    '10월_새순': (34, 65),
    '11월_새순': (65, 93),
    '9월_엽록소': (93, 123),
    '10월_엽록소': (123, 154),
    '11월_엽록소': (154, 182),
}

for prefix, (first, past_last) in test_monthly_windows.items():
    # New columns are appended at the end, so the positions used here never shift.
    block = X_test.iloc[:, first:past_last]
    for stat in ('mean', 'std', 'var'):
        X_test[f'{prefix}_{stat}'] = getattr(block, stat)(axis = 1)

In [None]:
# Row-wise extremes over the two positional measurement windows of X_test.
# NOTE(review): assumes columns 4..92 are the 새순 series and 93..181 the 엽록소 series -- confirm.
test_windows = {'새순': X_test.iloc[:, 4:93], '엽록소': X_test.iloc[:, 93:182]}

# Max first, then min, for each series (this fixes the column order).
for series_name, window in test_windows.items():
    X_test[f'{series_name}max'] = window.max(axis=1)
    X_test[f'{series_name}min'] = window.min(axis=1)

# Spread (max - min) of each series.
for series_name in test_windows:
    X_test[f'{series_name}차이'] = X_test[f'{series_name}max'] - X_test[f'{series_name}min']

# Tree-size features: height times mean crown width, and crown-width spread.
X_test['수고X수관폭'] = X_test['수고(m)'].mul(X_test['수관폭평균'])
X_test['수관폭차이'] = X_test['수관폭2(max)'].sub(X_test['수관폭1(min)'])

In [None]:
# Pair the k-th column of the first window (positions 4..92) with the k-th column
# of the second window (positions 93..181) and add their sum, difference, product
# and ratio as new features.
#
# Fix: the column names now use the spaced form ('새순 + 엽록소_0', ...), which is
# what the corresponding X_train cell produces. The previous unspaced names
# ('새순+엽록소_0', ...) left train and test with different feature names, so
# column-based selection or a fitted model could not line the two tables up.
# NOTE(review): the ratio is inf/NaN wherever the second-window value is 0 -- confirm that cannot occur.
test_shoot_block = X_test.iloc[:, 4:93]
test_chlorophyll_block = X_test.iloc[:, 93:182]

test_pairwise_ops = (
    ('+', lambda left, right: left + right),
    ('-', lambda left, right: left - right),
    ('*', lambda left, right: left * right),
    ('/', lambda left, right: left / right),
)

# All 89 columns of one operation are written before the next operation starts,
# giving the same column order as the training table.
for symbol, combine in test_pairwise_ops:
    for i in range(0, 89):
        X_test[f'새순 {symbol} 엽록소_{i}'] = combine(
            test_shoot_block.iloc[:, i],
            test_chlorophyll_block.iloc[:, i],
        )