In [1]:
import pandas as pd
import numpy as np
from sklearn import metrics
from itertools import combinations
from scipy.stats import kurtosis, skew
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
# Load the raw hackathon dataset; the path is relative to the notebook's working directory.
input_data = pd.read_csv("2023_smartFarm_AI_hackathon_dataset.csv")

In [3]:
# Parse the 'date' column (YYYYMMDD) into datetimes and expand it into
# separate year / month / day feature columns.
input_data['date'] = pd.to_datetime(input_data['date'], format='%Y%m%d')
input_data["year"] = input_data["date"].dt.year
input_data["month"] = input_data["date"].dt.month
input_data["day"] = input_data["date"].dt.day

# The raw datetime column is no longer needed once it has been expanded.
input_data = input_data.drop(columns=['date'])

# Treat zeros as missing values.
# `np.nan` is used instead of the `np.NaN` alias, which was removed in NumPy 2.0.
input_data = input_data.replace(0, np.nan)

# Keep only the columns whose missing-value ratio is at most 90%.
# For every kept column, print its name, number of distinct values and missing ratio.
n_rows = input_data.shape[0]
features_cols = []
for f in input_data.columns:
    null_ratio = input_data[f].isnull().sum() / n_rows
    if null_ratio <= 0.9:
        n_unique = input_data[f].nunique()
        print(f, '\t', n_unique, '\t', null_ratio)
        features_cols.append(f)

# Base columns whose pairwise products are added as interaction features.
base_cols = ['frmAr', 'frmDov', 'year', 'month', 'day']

# Take an explicit copy: the chained selection would otherwise be a derived
# frame, and adding columns to it can raise SettingWithCopyWarning.
df = input_data[features_cols][base_cols].copy()

# Every unordered pair of base columns (materialised before new columns are added).
combinations_list = list(combinations(df.columns, 2))

# One new column per pair, named '<col1>_<col2>', holding the element-wise product.
for col1, col2 in combinations_list:
    df[f'{col1}_{col2}'] = df[col1] * df[col2]

# Final frame: all selected feature columns followed by the interaction columns only
# (the base columns are skipped because they are already part of features_cols).
df = pd.concat([input_data[features_cols], df.iloc[:, len(base_cols):]], axis=1)

# Columns for which rolling / smoothed / differenced / lagged features are derived.
p_columns = ['frtstCo', 'tcdmt', 'grwtLt', 'fcluHg', 'lefLt', 'lefCunt', 'lefBt']
window_size = 7
alpha = 0.2


def _add_series_features(suffix, transform):
    """Add '<column>_<suffix>' to df for every column in p_columns.

    `transform` receives the source Series from input_data and returns the
    derived Series that is stored in df.
    """
    for source_col in p_columns:
        df[f'{source_col}_{suffix}'] = transform(input_data[source_col])


# NOTE(review): all transforms below run over the frame's row order as-is;
# this presumably assumes rows are chronologically ordered per farm - confirm.

# 1. Moving average over `window_size` rows
_add_series_features('ma', lambda series: series.rolling(window=window_size).mean())

# 2. Exponential moving average with smoothing factor `alpha`
_add_series_features('ema', lambda series: series.ewm(alpha=alpha).mean())

# 3. First-order differencing
_add_series_features('d', lambda series: series.diff())

# 4. Statistical summary feature: rolling standard deviation over a wider window
window_size = 72  # window size used for the statistical summary
_add_series_features('std', lambda series: series.rolling(window=window_size).std())

# 5. Lag features (values from the previous 1..lag_steps rows)
lag_steps = 3  # number of lag steps
for source_col in p_columns:
    for lag in range(1, lag_steps + 1):
        df[f'{source_col}_lag_{lag}'] = input_data[source_col].shift(lag)
        
# Composite descriptors: 'plant_info' and 'stem_info' are products of raw
# measurements; 'Average_Single_Dose_Amount' is otmsuplyqy divided by cunt.
df['plant_info'] = (
    input_data['grwtLt']
    .mul(input_data['lefCunt'])
    .mul(input_data['lefLt'])
    .mul(input_data['lefBt'])
)
df['stem_info'] = (
    input_data['fcluHg']
    .mul(input_data['stemThck'])
    .mul(input_data['frmhsFclu'])
)
df['Average_Single_Dose_Amount'] = input_data['otmsuplyqy'].div(input_data['cunt'])

# Remove the raw measurement columns (plus two group columns) from input_data.
columns_to_remove = ['grwtLt', 'lefCunt', 'lefLt', 'lefBt', 'fcluHg', 'stemThck', 'frmhsFclu', 'flanGrupp', 'frtstGrupp']
input_data = input_data.drop(columns=columns_to_remove)

# Cost features: each new column is the product of a usage column and its cost column.
usage_cost_pairs = {
    'water_usage_cost': ('WaterUsage', 'WaterCost'),
    'fertilizer_usage_cost': ('FertilizerUsage', 'FertilizerCost'),
    'co2_usage_cost': ('CO2Usage', 'CO2Cost'),
    'mist_usage_cost': ('MistUsageTime', 'Mist Cost'),
}
for cost_name, (usage_col, price_col) in usage_cost_pairs.items():
    df[cost_name] = input_data[usage_col] * input_data[price_col]

# Remove the usage / cost columns that were consumed above.
columns_to_remove = ['WaterUsage', 'WaterCost', 'FertilizerUsage', 'FertilizerCost', 'CO2Usage', 'CO2Cost', 'MistUsageTime', 'Mist Cost']
input_data = input_data.drop(columns=columns_to_remove)

# The two regression targets.
target_cols = ['outtrn_cumsum', 'HeatingEnergyUsage_cumsum']

# Targets on their raw scale (missing values filled with 0), used for the
# distribution statistics below.
y = input_data.fillna(0)[target_cols]

# Kurtosis and skewness of 'outtrn_cumsum'
kurtosis_outtrn = kurtosis(y['outtrn_cumsum'])
skewness_outtrn = skew(y['outtrn_cumsum'])

# Kurtosis and skewness of 'HeatingEnergyUsage_cumsum'
kurtosis_energy = kurtosis(y['HeatingEnergyUsage_cumsum'])
skewness_energy = skew(y['HeatingEnergyUsage_cumsum'])

# Log-transform the heating-energy target; the model is trained on log1p values,
# so its predictions for this target are on the log1p scale as well.
input_data['HeatingEnergyUsage_cumsum'] = np.log1p(input_data['HeatingEnergyUsage_cumsum'])

# Final targets: raw 'outtrn_cumsum' and log1p 'HeatingEnergyUsage_cumsum'.
y = input_data.fillna(0)[target_cols]

# Feature matrix: fill missing values with 0 and drop the unused raw columns.
X = df.fillna(0).drop(['frmDist','frmYear', 'frmWeek', 'outtrn_cumsum', 'lefCunt', 'hvstGrupp', 'outTp', 
                       'acSlrdQy', 'cunt', 'outWs', 'otmsuplyqy','inCo2', 'frtstGrupp', 'inHd', 'frmhsFclu', 'inTp', 
                       'flanGrupp', 'stemThck'], axis = 1)

# Guard against target leakage: df is built from every column that passed the
# missing-ratio filter, so any target column still present must be removed.
# errors='ignore' keeps this a no-op for targets that are already absent.
X = X.drop(columns=target_cols, errors='ignore')
assert not set(target_cols) & set(X.columns), "target column leaked into the feature matrix"

# Hold out 20% of the rows for evaluation (fixed seed for reproducibility).
# NOTE(review): this is a random row-wise split; with the lag / rolling features
# above, neighbouring rows can land on both sides of the split - confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4)

# Min-max scale the features; the scaler is fitted on the training split only
# and then applied to both splits.
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Random forest regressor predicting both targets at once.
model = RandomForestRegressor(n_estimators=100, min_samples_split=2, random_state=0)
model.fit(X_train, y_train)

# Predict on the held-out test split.
y_pred_rf = model.predict(X_test)

# Evaluation metrics on the test split, computed over both target columns together.
mse = metrics.mean_squared_error(y_test, y_pred_rf)
rmse = np.sqrt(mse)
r2score = metrics.r2_score(y_test, y_pred_rf)

frmDist 	 360 	 0.0
inTp 	 72630 	 0.14391796322489392
inHd 	 72630 	 0.14391796322489392
otmsuplyqy 	 63090 	 0.25636492220650636
acSlrdQy 	 53070 	 0.3744695898161245
cunt 	 64560 	 0.23903818953323905
ph 	 66480 	 0.2164073550212164
outTp 	 72630 	 0.14391796322489392
outWs 	 38670 	 0.5442008486562943
inCo2 	 72630 	 0.14391796322489392
ec 	 66480 	 0.2164073550212164
frmYear 	 3 	 0.1364922206506365
frmWeek 	 52 	 0.1364922206506365
frtstGrupp 	 47460 	 0.4405940594059406
flanGrupp 	 46620 	 0.4504950495049505
frtstCo 	 51840 	 0.38896746817538896
frmhsFclu 	 47880 	 0.43564356435643564
hvstGrupp 	 43050 	 0.49257425742574257
grwtLt 	 43860 	 0.48302687411598305
fcluHg 	 42060 	 0.5042432814710043
lefLt 	 43110 	 0.4918670438472419
lefCunt 	 51510 	 0.39285714285714285
lefBt 	 43110 	 0.4918670438472419
stemThck 	 41790 	 0.5074257425742574
frmAr 	 7 	 0.0
frmDov 	 9 	 0.0
outtrn_cumsum 	 42390 	 0.5003536067892503
year 	 4 	 0.0
month 	 12 	 0.0
day 	 31 	 0.0
outtrn_cumsum 첨도: 7

In [4]:
### OUTPUT ### vanilla
# Report the test-split metrics computed in the previous cell.
for label, value in (("RMSE:", rmse), ("R2_score:", r2score)):
    print(label, value)

RMSE: 1540.2758926230595
R2_score: 0.9972924279918429
