In [1]:
#pip install pycaret
# Run the command above from an Anaconda Prompt opened with administrator privileges.

# import

In [2]:
import pandas as pd
import numpy as np
import random
import os

import warnings
warnings.filterwarnings('ignore') 

import matplotlib
from matplotlib import font_manager, rc
import matplotlib.pyplot as plt

from pycaret.regression import *
from sklearn.model_selection import StratifiedKFold

# 랜덤 시드와 폰트 설정, 평가산식 설정

In [3]:
def seed_everything(seed):
    """Seed the random number generators used by this notebook.

    Stores ``str(seed)`` in the ``PYTHONHASHSEED`` environment variable and
    seeds both the standard-library ``random`` module and NumPy's global
    generator with the same value.

    Args:
        seed: Value passed to ``random.seed`` and ``np.random.seed``.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


seed_everything(42)  # fix the seed

In [4]:
# Use the Windows "Malgun Gothic" font so that Korean text renders in plots.
# The font file only exists on Windows installations, so the lookup is guarded:
# on other machines the cell keeps matplotlib's default font instead of raising.
malgun_path = "c:/Windows/Fonts/malgun.ttf"
if os.path.exists(malgun_path):
    font_name = font_manager.FontProperties(fname=malgun_path).get_name()
    rc('font', family=font_name)
else:
    print(f"Font file not found: {malgun_path}; keeping matplotlib's default font.")

# Draw minus signs with the ASCII hyphen rather than the Unicode minus glyph.
matplotlib.rcParams['axes.unicode_minus'] = False

In [5]:
def NMAE(true, pred):
    """Return the normalized mean absolute error of ``pred`` against ``true``.

    The mean absolute difference between ``true`` and ``pred`` is divided by
    the mean absolute value of ``true``.

    Args:
        true: Observed values (must support ``true - pred`` and ``np.abs``).
        pred: Predicted values, same shape as ``true``.

    Returns:
        The ratio described above.
    """
    abs_error = np.abs(true - pred)
    scale = np.mean(np.abs(true))
    return np.mean(abs_error) / scale

# 데이터 로드 및 파생변수 생성

In [6]:
# Load the train / test CSV files from the current working directory.
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() only appended a stray "None" line.
df_train.info()
print(df_train.isnull().sum())

df_test.info()
print(df_test.isnull().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2207 entries, 0 to 2206
Columns: 184 entries, ID to 2022-11-28 엽록소
dtypes: float64(182), int64(1), object(1)
memory usage: 3.1+ MB
None
ID                0
착과량(int)          0
수고(m)             0
수관폭1(min)         0
수관폭2(max)         0
수관폭평균             0
2022-09-01 새순     0
2022-09-02 새순     0
2022-09-03 새순     0
2022-09-04 새순     0
2022-09-05 새순     0
2022-09-06 새순     0
2022-09-07 새순     0
2022-09-08 새순     0
2022-09-09 새순     0
2022-09-10 새순     0
2022-09-11 새순     0
2022-09-12 새순     0
2022-09-13 새순     0
2022-09-14 새순     0
2022-09-15 새순     0
2022-09-16 새순     0
2022-09-17 새순     0
2022-09-18 새순     0
2022-09-19 새순     0
2022-09-20 새순     0
2022-09-21 새순     0
2022-09-22 새순     0
2022-09-23 새순     0
2022-09-24 새순     0
2022-09-25 새순     0
2022-09-26 새순     0
2022-09-27 새순     0
2022-09-28 새순     0
2022-09-29 새순     0
2022-09-30 새순     0
2022-10-01 새순     0
2022-10-02 새순     0
2022-10-03 새순     0
2022-10-04 새순     0
2022-10-05 새순 

In [7]:
# Feature builder: returns nine column subsets (df_v1 ... df_v9) so the caller can pick the one it needs.
def fn_get_feature_v3(df, idx=2) :
    """Add summary features for the two daily series and return nine column subsets.

    Every column is addressed by position. With ``idx=2`` the positions match
    the train frame printed in this notebook: column 1 is the leading (target)
    column, columns 2-5 are the tree-size columns, columns 6-94 are the 89
    daily '새순' columns and columns 95-183 are the 89 daily '엽록소' columns.
    Every position is moved left by ``2 - idx``; if ``2 - idx`` is greater
    than 7 the shift is set to 6 instead.

    The input frame is copied, so the caller's frame is not modified.

    Args:
        df: Source frame laid out as described above.
        idx: Position offset selector (``2`` means no shift, ``1`` shifts
            every position one column to the left).

    Returns:
        Tuple ``(df_v1, ..., df_v9)``:
            v1: leading column + tree-size columns.
            v2: v1 + both daily series (no derived columns).
            v3: v1 + all ten derived columns.
            v4: everything from the leading column onwards.
            v5: v1 + the five '새순' derived columns.
            v6: v2 + the five '새순' derived columns.
            v7: v1 + daily '새순' series + the five '새순' derived columns.
            v8: v1 + daily '새순' series + all ten derived columns.
            v9: leading column + daily '새순' series + the five '새순' derived columns.
    """
    shift = 2 - idx
    if shift > 7 :
        shift = 6

    # End-exclusive positional boundaries after applying the shift.
    lead = 1 - shift              # first column that is returned
    shoot_lo = 6 - shift          # first daily '새순' column
    shoot_hi = 95 - shift         # end of '새순' block == start of '엽록소' block
    chloro_hi = 184 - shift       # end of '엽록소' block == start of derived columns
    shoot_feat_hi = 189 - shift   # end of the five '새순' derived columns

    df = df.copy()

    # Row-wise summaries of the daily '새순' block; gap = last day - first day.
    df['새순mean'] = df.iloc[:, shoot_lo:shoot_hi].mean(axis=1)
    df['새순std'] = df.iloc[:, shoot_lo:shoot_hi].std(axis=1)
    df['새순min'] = df.iloc[:, shoot_lo:shoot_hi].min(axis=1)
    df['새순max'] = df.iloc[:, shoot_lo:shoot_hi].max(axis=1)
    df['새순gap'] = df.iloc[:, shoot_hi - 1] - df.iloc[:, shoot_lo]

    # Same summaries for the daily '엽록소' block.
    df['엽록소mean'] = df.iloc[:, shoot_hi:chloro_hi].mean(axis=1)
    df['엽록소std'] = df.iloc[:, shoot_hi:chloro_hi].std(axis=1)
    df['엽록소min'] = df.iloc[:, shoot_hi:chloro_hi].min(axis=1)
    df['엽록소max'] = df.iloc[:, shoot_hi:chloro_hi].max(axis=1)
    df['엽록소gap'] = df.iloc[:, chloro_hi - 1] - df.iloc[:, shoot_hi]

    # Reusable column groups.
    base_cols = df.iloc[:, lead:shoot_lo]
    base_and_shoots = df.iloc[:, lead:shoot_hi]
    shoot_feats = df.iloc[:, chloro_hi:shoot_feat_hi]
    all_feats = df.iloc[:, chloro_hi:]

    df_v1 = base_cols
    df_v2 = df.iloc[:, lead:chloro_hi]
    df_v3 = pd.concat([base_cols, all_feats], axis=1)
    df_v4 = df.iloc[:, lead:]
    df_v5 = pd.concat([base_cols, shoot_feats], axis=1)
    df_v6 = df.iloc[:, lead:shoot_feat_hi]
    df_v7 = pd.concat([base_and_shoots, shoot_feats], axis=1)
    df_v8 = pd.concat([base_and_shoots, all_feats], axis=1)
    df_v9 = pd.concat(
        [df.iloc[:, lead], df.iloc[:, shoot_lo:shoot_hi], shoot_feats],
        axis=1,
    )

    return df_v1, df_v2, df_v3, df_v4, df_v5, df_v6, df_v7, df_v8, df_v9

In [8]:
# Create the derived features for the training data with fn_get_feature_v3 (idx=2).
df_train_v1, df_train_v2, df_train_v3, df_train_v4, df_train_v5, df_train_v6, df_train_v7, df_train_v8, df_train_v9 = fn_get_feature_v3(df_train, idx=2)
df_train_v2.head(2) # original data (no derived columns)

Unnamed: 0,착과량(int),수고(m),수관폭1(min),수관폭2(max),수관폭평균,2022-09-01 새순,2022-09-02 새순,2022-09-03 새순,2022-09-04 새순,2022-09-05 새순,2022-09-06 새순,2022-09-07 새순,2022-09-08 새순,2022-09-09 새순,2022-09-10 새순,2022-09-11 새순,2022-09-12 새순,2022-09-13 새순,2022-09-14 새순,2022-09-15 새순,2022-09-16 새순,2022-09-17 새순,2022-09-18 새순,2022-09-19 새순,2022-09-20 새순,2022-09-21 새순,2022-09-22 새순,2022-09-23 새순,2022-09-24 새순,2022-09-25 새순,2022-09-26 새순,2022-09-27 새순,2022-09-28 새순,2022-09-29 새순,2022-09-30 새순,2022-10-01 새순,2022-10-02 새순,2022-10-03 새순,2022-10-04 새순,2022-10-05 새순,2022-10-06 새순,2022-10-07 새순,2022-10-08 새순,2022-10-09 새순,2022-10-10 새순,2022-10-11 새순,2022-10-12 새순,2022-10-13 새순,2022-10-14 새순,2022-10-15 새순,2022-10-16 새순,2022-10-17 새순,2022-10-18 새순,2022-10-19 새순,2022-10-20 새순,2022-10-21 새순,2022-10-22 새순,2022-10-23 새순,2022-10-24 새순,2022-10-25 새순,2022-10-26 새순,2022-10-27 새순,2022-10-28 새순,2022-10-29 새순,2022-10-30 새순,2022-10-31 새순,2022-11-01 새순,2022-11-02 새순,2022-11-03 새순,2022-11-04 새순,2022-11-05 새순,2022-11-06 새순,2022-11-07 새순,2022-11-08 새순,2022-11-09 새순,2022-11-10 새순,2022-11-11 새순,2022-11-12 새순,2022-11-13 새순,2022-11-14 새순,2022-11-15 새순,2022-11-16 새순,2022-11-17 새순,2022-11-18 새순,2022-11-19 새순,2022-11-20 새순,2022-11-21 새순,2022-11-22 새순,2022-11-23 새순,2022-11-24 새순,2022-11-25 새순,2022-11-26 새순,2022-11-27 새순,2022-11-28 새순,2022-09-01 엽록소,2022-09-02 엽록소,2022-09-03 엽록소,2022-09-04 엽록소,2022-09-05 엽록소,2022-09-06 엽록소,2022-09-07 엽록소,2022-09-08 엽록소,2022-09-09 엽록소,2022-09-10 엽록소,2022-09-11 엽록소,2022-09-12 엽록소,2022-09-13 엽록소,2022-09-14 엽록소,2022-09-15 엽록소,2022-09-16 엽록소,2022-09-17 엽록소,2022-09-18 엽록소,2022-09-19 엽록소,2022-09-20 엽록소,2022-09-21 엽록소,2022-09-22 엽록소,2022-09-23 엽록소,2022-09-24 엽록소,2022-09-25 엽록소,2022-09-26 엽록소,2022-09-27 엽록소,2022-09-28 엽록소,2022-09-29 엽록소,2022-09-30 엽록소,2022-10-01 엽록소,2022-10-02 엽록소,2022-10-03 엽록소,2022-10-04 엽록소,2022-10-05 엽록소,2022-10-06 엽록소,2022-10-07 엽록소,2022-10-08 엽록소,2022-10-09 엽록소,2022-10-10 엽록소,2022-10-11 엽록소,2022-10-12 엽록소,2022-10-13 엽록소,2022-10-14 엽록소,2022-10-15 엽록소,2022-10-16 엽록소,2022-10-17 
엽록소,2022-10-18 엽록소,2022-10-19 엽록소,2022-10-20 엽록소,2022-10-21 엽록소,2022-10-22 엽록소,2022-10-23 엽록소,2022-10-24 엽록소,2022-10-25 엽록소,2022-10-26 엽록소,2022-10-27 엽록소,2022-10-28 엽록소,2022-10-29 엽록소,2022-10-30 엽록소,2022-10-31 엽록소,2022-11-01 엽록소,2022-11-02 엽록소,2022-11-03 엽록소,2022-11-04 엽록소,2022-11-05 엽록소,2022-11-06 엽록소,2022-11-07 엽록소,2022-11-08 엽록소,2022-11-09 엽록소,2022-11-10 엽록소,2022-11-11 엽록소,2022-11-12 엽록소,2022-11-13 엽록소,2022-11-14 엽록소,2022-11-15 엽록소,2022-11-16 엽록소,2022-11-17 엽록소,2022-11-18 엽록소,2022-11-19 엽록소,2022-11-20 엽록소,2022-11-21 엽록소,2022-11-22 엽록소,2022-11-23 엽록소,2022-11-24 엽록소,2022-11-25 엽록소,2022-11-26 엽록소,2022-11-27 엽록소,2022-11-28 엽록소
0,692,275.0,287.0,292.0,289.5,2.8,2.8,2.7,2.7,2.7,2.7,2.6,2.6,2.6,2.6,2.5,2.5,2.5,2.5,2.4,2.4,2.4,2.4,2.3,2.3,2.3,2.3,2.2,2.2,2.2,2.2,2.1,2.1,2.1,2.1,2.0,2.0,2.0,2.0,1.9,1.9,1.9,1.9,1.8,1.8,1.8,1.8,1.7,1.7,1.7,1.7,1.6,1.6,1.6,1.6,1.5,1.5,1.5,1.5,1.4,1.4,1.4,1.4,1.3,1.3,1.3,1.2,1.2,1.2,1.2,1.2,1.1,1.1,1.1,1.1,1.0,1.0,1.0,1.0,1.0,0.9,0.9,0.9,0.9,0.9,0.8,0.8,0.8,0.7,0.7,0.7,0.7,0.7,0.6,78.336504,78.243462,78.166501,78.127526,78.058021,77.915689,77.86166,77.746645,77.592138,77.572477,77.382045,77.211789,77.151051,77.063069,76.905981,76.868475,76.732897,76.66928,76.662585,76.519409,76.349661,76.186091,76.040207,75.999415,75.990592,75.953731,75.829564,75.798673,75.621243,75.427005,75.266187,75.226094,75.162319,75.079444,75.013855,74.935697,74.842749,74.836657,74.816928,74.788574,74.690358,74.504942,74.394908,74.351704,74.166346,74.020254,73.835142,73.766164,73.755269,73.627395,73.465457,73.395076,73.311405,73.243028,73.087409,73.081651,73.023755,72.987263,72.977606,72.838462,72.645386,72.512869,72.445908,72.370097,72.30118,72.228374,72.207746,72.068354,71.987663,71.869466,71.852923,71.690854,71.576402,71.384018,71.301893,71.234817,71.076903,70.996113,70.985843,70.978249,70.876794,70.705253,70.559603,70.427356,70.340491,70.29383,70.262422,70.169841,70.043251
1,534,293.0,284.0,336.0,310.0,3.3,3.3,3.3,3.2,3.2,3.1,3.1,3.1,3.0,3.0,3.0,2.9,2.9,2.9,2.9,2.8,2.8,2.8,2.7,2.7,2.7,2.7,2.6,2.6,2.6,2.5,2.5,2.5,2.4,2.4,2.4,2.3,2.3,2.3,2.2,2.2,2.2,2.1,2.1,2.1,2.0,2.0,2.0,1.9,1.9,1.9,1.8,1.8,1.8,1.7,1.7,1.7,1.6,1.6,1.6,1.5,1.5,1.5,1.4,1.4,1.4,1.3,1.3,1.3,1.2,1.2,1.2,1.1,1.1,1.0,1.0,1.0,1.0,0.9,0.9,0.8,0.8,0.8,0.7,0.7,0.7,0.6,0.6,0.6,0.5,0.0,0.0,0.0,0.0,80.116691,79.974397,79.890399,79.709278,79.531061,79.348139,79.151023,79.128739,79.084197,78.924476,78.893248,78.830942,78.654775,78.609415,78.512242,78.382914,78.188714,77.999407,77.804256,77.640743,77.559244,77.403016,77.307459,77.107877,77.106228,77.005851,76.830049,76.810264,76.669418,76.572528,76.478269,76.328153,76.276879,76.092349,75.915247,75.76083,75.591943,75.431225,75.244795,75.11925,74.997296,74.880196,74.750138,74.603761,74.524808,74.351383,74.272749,74.13801,74.05279,74.008452,73.826668,73.685576,73.668765,73.542308,73.530723,73.52357,73.507814,73.470401,73.373457,73.248743,73.226885,73.183666,73.156484,73.072504,72.909826,72.880429,72.775387,72.624061,72.561918,72.437466,72.274675,72.145225,71.970863,71.955864,71.866504,71.678098,71.653838,71.593234,71.55146,71.535483,71.382303,71.253604,71.092665,70.955608,70.79663,70.59755,70.565088,70.560502,70.4276


In [9]:
df_train_v3.head() # training data with the derived features (daily 새순 / 엽록소 series excluded)

Unnamed: 0,착과량(int),수고(m),수관폭1(min),수관폭2(max),수관폭평균,새순mean,새순std,새순min,새순max,새순gap,엽록소mean,엽록소std,엽록소min,엽록소max,엽록소gap
0,692,275.0,287.0,292.0,289.5,1.693258,0.638143,0.6,2.8,-2.2,74.155836,2.427798,70.043251,78.336504,-8.293252
1,534,293.0,284.0,336.0,310.0,1.861798,0.894491,0.0,3.3,-3.3,74.962123,2.813831,70.4276,80.116691,-9.68909
2,634,300.0,392.0,450.0,421.0,1.762921,0.712611,0.5,3.0,-2.5,74.727999,2.476758,70.399578,79.118529,-8.718951
3,639,289.0,368.0,379.0,373.5,1.857303,0.692802,0.7,3.1,-2.4,73.54621,2.641605,69.13397,77.936262,-8.802293
4,496,306.0,353.0,358.0,355.5,2.125843,0.898171,0.6,3.7,-3.1,71.841067,2.642378,67.410093,76.233231,-8.823138


In [10]:
df_train_v9.head()
# Per Yerin's EDA code, only the 새순 columns showed a clear relationship, so this dataset is the one used.

Unnamed: 0,착과량(int),2022-09-01 새순,2022-09-02 새순,2022-09-03 새순,2022-09-04 새순,2022-09-05 새순,2022-09-06 새순,2022-09-07 새순,2022-09-08 새순,2022-09-09 새순,2022-09-10 새순,2022-09-11 새순,2022-09-12 새순,2022-09-13 새순,2022-09-14 새순,2022-09-15 새순,2022-09-16 새순,2022-09-17 새순,2022-09-18 새순,2022-09-19 새순,2022-09-20 새순,2022-09-21 새순,2022-09-22 새순,2022-09-23 새순,2022-09-24 새순,2022-09-25 새순,2022-09-26 새순,2022-09-27 새순,2022-09-28 새순,2022-09-29 새순,2022-09-30 새순,2022-10-01 새순,2022-10-02 새순,2022-10-03 새순,2022-10-04 새순,2022-10-05 새순,2022-10-06 새순,2022-10-07 새순,2022-10-08 새순,2022-10-09 새순,2022-10-10 새순,2022-10-11 새순,2022-10-12 새순,2022-10-13 새순,2022-10-14 새순,2022-10-15 새순,2022-10-16 새순,2022-10-17 새순,2022-10-18 새순,2022-10-19 새순,2022-10-20 새순,2022-10-21 새순,2022-10-22 새순,2022-10-23 새순,2022-10-24 새순,2022-10-25 새순,2022-10-26 새순,2022-10-27 새순,2022-10-28 새순,2022-10-29 새순,2022-10-30 새순,2022-10-31 새순,2022-11-01 새순,2022-11-02 새순,2022-11-03 새순,2022-11-04 새순,2022-11-05 새순,2022-11-06 새순,2022-11-07 새순,2022-11-08 새순,2022-11-09 새순,2022-11-10 새순,2022-11-11 새순,2022-11-12 새순,2022-11-13 새순,2022-11-14 새순,2022-11-15 새순,2022-11-16 새순,2022-11-17 새순,2022-11-18 새순,2022-11-19 새순,2022-11-20 새순,2022-11-21 새순,2022-11-22 새순,2022-11-23 새순,2022-11-24 새순,2022-11-25 새순,2022-11-26 새순,2022-11-27 새순,2022-11-28 새순,새순mean,새순std,새순min,새순max,새순gap
0,692,2.8,2.8,2.7,2.7,2.7,2.7,2.6,2.6,2.6,2.6,2.5,2.5,2.5,2.5,2.4,2.4,2.4,2.4,2.3,2.3,2.3,2.3,2.2,2.2,2.2,2.2,2.1,2.1,2.1,2.1,2.0,2.0,2.0,2.0,1.9,1.9,1.9,1.9,1.8,1.8,1.8,1.8,1.7,1.7,1.7,1.7,1.6,1.6,1.6,1.6,1.5,1.5,1.5,1.5,1.4,1.4,1.4,1.4,1.3,1.3,1.3,1.2,1.2,1.2,1.2,1.2,1.1,1.1,1.1,1.1,1.0,1.0,1.0,1.0,1.0,0.9,0.9,0.9,0.9,0.9,0.8,0.8,0.8,0.7,0.7,0.7,0.7,0.7,0.6,1.693258,0.638143,0.6,2.8,-2.2
1,534,3.3,3.3,3.3,3.2,3.2,3.1,3.1,3.1,3.0,3.0,3.0,2.9,2.9,2.9,2.9,2.8,2.8,2.8,2.7,2.7,2.7,2.7,2.6,2.6,2.6,2.5,2.5,2.5,2.4,2.4,2.4,2.3,2.3,2.3,2.2,2.2,2.2,2.1,2.1,2.1,2.0,2.0,2.0,1.9,1.9,1.9,1.8,1.8,1.8,1.7,1.7,1.7,1.6,1.6,1.6,1.5,1.5,1.5,1.4,1.4,1.4,1.3,1.3,1.3,1.2,1.2,1.2,1.1,1.1,1.0,1.0,1.0,1.0,0.9,0.9,0.8,0.8,0.8,0.7,0.7,0.7,0.6,0.6,0.6,0.5,0.0,0.0,0.0,0.0,1.861798,0.894491,0.0,3.3,-3.3
2,634,3.0,2.9,2.9,2.9,2.9,2.8,2.8,2.8,2.8,2.7,2.7,2.7,2.6,2.6,2.6,2.6,2.5,2.5,2.5,2.5,2.4,2.4,2.4,2.3,2.3,2.3,2.3,2.2,2.2,2.2,2.2,2.1,2.1,2.1,2.0,2.0,2.0,2.0,1.9,1.9,1.9,1.8,1.8,1.8,1.8,1.7,1.7,1.7,1.6,1.6,1.6,1.6,1.5,1.5,1.5,1.4,1.4,1.4,1.4,1.3,1.3,1.3,1.3,1.3,1.2,1.2,1.2,1.1,1.1,1.1,1.1,1.0,1.0,1.0,0.9,0.9,0.9,0.9,0.8,0.8,0.8,0.7,0.7,0.7,0.7,0.6,0.6,0.6,0.5,1.762921,0.712611,0.5,3.0,-2.5
3,639,3.1,3.0,3.0,3.0,3.0,2.9,2.9,2.9,2.8,2.8,2.8,2.7,2.7,2.7,2.7,2.6,2.6,2.6,2.5,2.5,2.5,2.5,2.4,2.4,2.4,2.3,2.3,2.3,2.3,2.2,2.2,2.2,2.2,2.1,2.1,2.1,2.1,2.0,2.0,2.0,2.0,1.9,1.9,1.9,1.9,1.8,1.8,1.8,1.8,1.7,1.7,1.7,1.6,1.6,1.6,1.6,1.5,1.5,1.5,1.4,1.4,1.4,1.4,1.3,1.3,1.3,1.3,1.3,1.2,1.2,1.2,1.2,1.1,1.1,1.1,1.0,1.0,1.0,0.9,0.9,0.9,0.9,0.8,0.8,0.8,0.8,0.7,0.7,0.7,1.857303,0.692802,0.7,3.1,-2.4
4,496,3.7,3.6,3.6,3.6,3.5,3.5,3.5,3.4,3.4,3.3,3.3,3.3,3.2,3.2,3.2,3.1,3.1,3.1,3.0,3.0,2.9,2.9,2.9,2.8,2.8,2.8,2.7,2.7,2.7,2.6,2.6,2.6,2.5,2.5,2.5,2.4,2.4,2.4,2.3,2.3,2.3,2.2,2.2,2.2,2.1,2.1,2.1,2.0,2.0,2.0,1.9,1.9,1.9,1.8,1.8,1.8,1.7,1.7,1.6,1.6,1.6,1.5,1.5,1.5,1.4,1.4,1.4,1.3,1.3,1.3,1.2,1.2,1.2,1.1,1.1,1.0,1.0,1.0,0.9,0.9,0.9,0.8,0.8,0.8,0.7,0.7,0.7,0.6,0.6,2.125843,0.898171,0.6,3.7,-3.1


# 모델링

In [11]:
# Initialise the PyCaret regression experiment on df_train_v9 with '착과량(int)' as the target:
# 80% of the rows go to the training split, the split is not shuffled, features are
# normalized, and session_id fixes the experiment's random seed.
reg= setup(data = df_train_v9, target = '착과량(int)', train_size = 0.8, data_split_shuffle=False, 
                normalize = True, session_id = 42)

Unnamed: 0,Description,Value
0,session_id,42
1,Target,착과량(int)
2,Original Data,"(2207, 95)"
3,Missing Values,False
4,Numeric Features,94
5,Categorical Features,0
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(1765, 47)"


In [12]:
models() # list the names of the models that are available

Unnamed: 0_level_0,Name,Reference,Turbo
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
lr,Linear Regression,sklearn.linear_model._base.LinearRegression,True
lasso,Lasso Regression,sklearn.linear_model._coordinate_descent.Lasso,True
ridge,Ridge Regression,sklearn.linear_model._ridge.Ridge,True
en,Elastic Net,sklearn.linear_model._coordinate_descent.Elast...,True
lar,Least Angle Regression,sklearn.linear_model._least_angle.Lars,True
llar,Lasso Least Angle Regression,sklearn.linear_model._least_angle.LassoLars,True
omp,Orthogonal Matching Pursuit,sklearn.linear_model._omp.OrthogonalMatchingPu...,True
br,Bayesian Ridge,sklearn.linear_model._bayes.BayesianRidge,True
ard,Automatic Relevance Determination,sklearn.linear_model._bayes.ARDRegression,False
par,Passive Aggressive Regressor,sklearn.linear_model._passive_aggressive.Passi...,True


In [13]:
# The custom NMAE metric could not be plugged into PyCaret's AutoML here.
# After switching to the regression module, mae, mse, rmse, r2, rmsle and mape can be used.
#best_5 = compare_models(sort = partial(NMAE, y_true = df_train_v9['착과량(int)']), n_select = 5)
# Reference: https://pycaret.gitbook.io/docs/get-started/functions/train
best_5 = compare_models(n_select = 5, sort = 'mae')
top5 = list(best_5)
# NOTE(review): tune_model is called without `optimize`, so it uses PyCaret's default
# metric rather than the MAE used for ranking above — confirm this is intended.
tuned_top5 = [tune_model(candidate) for candidate in top5]

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,33.4818,1744.1753,41.7633,0.9598,0.2745,0.1912
1,31.9454,1606.5601,40.0819,0.9639,0.2303,0.1451
2,36.1906,2172.8776,46.6141,0.9523,0.4069,0.5827
3,31.9017,1647.8546,40.5938,0.9683,0.4053,0.3297
4,30.9038,1451.196,38.0946,0.9666,0.3359,0.2927
5,35.659,1906.5591,43.6642,0.959,0.316,0.2712
6,31.8568,1605.8154,40.0726,0.9642,0.3136,0.2472
7,35.0713,1870.5125,43.2494,0.9642,0.3848,0.3724
8,32.7885,1641.7455,40.5185,0.9639,0.3776,0.3519
9,30.7706,1433.9774,37.8679,0.9715,0.2792,0.2006


In [14]:
# Create a list of (string, estimator) tuples from the best_5 and tuned_top5 lists
#estimator_list = [('best_' + str(i), best_5[i]) for i in range(len(best_5))] + \
                 #[('tuned_' + str(i), tuned_top5[i]) for i in range(len(tuned_top5))]
#print(estimator_list)

[('best_0', GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.1, loss='ls', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=100,
                          n_iter_no_change=None, presort='deprecated',
                          random_state=42, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)), ('best_1', <catboost.core.CatBoostRegressor object at 0x0000016D349C1640>), ('best_2', LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              importance_type='split', learning_rate=0.1, max_depth=-1,
              min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
      

- 1위 : GradientBoostingRegressor
- 2위 : CatBoostRegressor
- 3위 : LGBMRegressor
- 4위 : RandomForestRegressor
- 5위 : XGBRegressor

# 앙상블

In [19]:
# Blend the five models selected by compare_models (best_5, not tuned_top5), optimizing MAE.
blend_model = blend_models(estimator_list = best_5, optimize = 'MAE')
# predict_model is called without `data`; presumably this scores the hold-out split — verify.
pred = predict_model(blend_model)

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,34.3703,1847.4479,42.9819,0.9574,0.2923,0.2041
1,32.2252,1714.805,41.4102,0.9615,0.2191,0.1408
2,34.977,1980.9186,44.5075,0.9565,0.3805,0.499
3,31.9738,1594.994,39.9374,0.9693,0.3683,0.3364
4,30.8715,1425.4932,37.7557,0.9672,0.3108,0.3019
5,35.6242,1898.6047,43.573,0.9591,0.2967,0.2401
6,31.017,1606.9251,40.0865,0.9641,0.3284,0.2757
7,33.9379,1748.2457,41.812,0.9666,0.3566,0.3888
8,30.7427,1489.7261,38.597,0.9673,0.3896,0.3692
9,30.4859,1418.9782,37.6693,0.9718,0.3092,0.2375


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Voting Regressor,32.8271,1643.9434,40.5456,0.9679,0.4103,0.485


# test 데이터 변환 및 예측

In [23]:
# Build the same feature views for the test data. idx=1 shifts every positional
# slice one column to the left (presumably because test.csv has no target column — verify).
# NOTE(review): with idx=1 the first column of df_test_v9 is column 0 of df_test
# (likely the ID column) — confirm predict_model tolerates it.
df_test_v1, df_test_v2, df_test_v3, df_test_v4, df_test_v5, df_test_v6, df_test_v7, df_test_v8, df_test_v9 = fn_get_feature_v3(df_test, idx=1)

In [30]:
# Finalize the blended model (presumably refitting it on all of the setup data — verify)
# and predict on the test feature view.
final_model = finalize_model(blend_model)
predictions = predict_model(final_model, data = df_test_v9)

In [41]:
# The target '착과량(int)' is an integer count, so round each prediction to the
# nearest integer before casting; a bare integer cast drops the fractional part
# and biases every prediction downwards.
predictions['Label'] = predictions['Label'].round().astype(int)
predictions['Label']

0       246
1       740
2       148
3       441
4       713
       ... 
2203    754
2204    343
2205    396
2206    232
2207     51
Name: Label, Length: 2208, dtype: int32

In [42]:
# Load the sample submission and overwrite its second column with the predictions.
submission = pd.read_csv("sample_submission.csv")
# NOTE(review): assigning a Series aligns on the index — confirm both frames share the same index.
submission.iloc[:, 1] = predictions['Label']
submission

Unnamed: 0,ID,착과량(int)
0,TEST_0000,246
1,TEST_0001,740
2,TEST_0002,148
3,TEST_0003,441
4,TEST_0004,713
...,...,...
2203,TEST_2203,754
2204,TEST_2204,343
2205,TEST_2205,396
2206,TEST_2206,232


In [43]:
# Write the submission file without the DataFrame index column.
submission.to_csv("pycaret_automl.csv", index=False)

# dacon 점수(public/private)
개선 전 : 0.0725775983 / 0.0727350146	
개선 후 : 0.079667045 / 0.0798747513

# 추가 참고 문헌
- https://blog.naver.com/didrh31/222555683014
- https://blog.naver.com/j7youngh/222802558038
- https://machineindeep.tistory.com/31
- https://blog.naver.com/charzim0611/222456178188
- https://velog.io/@gyounghwan1002/python-AutoML%EB%9D%BC%EC%9D%B4%EB%B8%8C%EB%9F%AC%EB%A6%AC-pycaret-%EC%82%AC%EC%9A%A9%EB%B2%95