In [44]:
import numpy as np
import pandas as pd
import statsmodels.api as sm

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', None)
pd.options.display.float_format = '{:.3f}'.format

In [45]:
train = pd.read_csv("./train_2016.csv")
test = pd.read_csv('../data/test.csv')
test['date'] = pd.to_datetime(test['date'])
test.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28512 entries, 0 to 28511
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   id           28512 non-null  int64         
 1   date         28512 non-null  datetime64[ns]
 2   store_nbr    28512 non-null  int64         
 3   family       28512 non-null  object        
 4   onpromotion  28512 non-null  int64         
dtypes: datetime64[ns](1), int64(3), object(1)
memory usage: 1.1+ MB


In [46]:
test.columns

Index(['id', 'date', 'store_nbr', 'family', 'onpromotion'], dtype='object')

In [47]:
stores = pd.read_csv("../data//stores.csv")
test_stores = pd.merge(test, stores, on='store_nbr', how='left')

In [48]:
test_stores['weekday'] = test_stores['date'].dt.dayofweek + 1 # 1: 월요일, 7:일요일
test_stores['month'] = test_stores['date'].dt.month
test_stores['day'] = test_stores['date'].dt.day

test_stores.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28512 entries, 0 to 28511
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   id           28512 non-null  int64         
 1   date         28512 non-null  datetime64[ns]
 2   store_nbr    28512 non-null  int64         
 3   family       28512 non-null  object        
 4   onpromotion  28512 non-null  int64         
 5   city         28512 non-null  object        
 6   state        28512 non-null  object        
 7   type         28512 non-null  object        
 8   cluster      28512 non-null  int64         
 9   weekday      28512 non-null  int32         
 10  month        28512 non-null  int32         
 11  day          28512 non-null  int32         
dtypes: datetime64[ns](1), int32(3), int64(4), object(4)
memory usage: 2.3+ MB


In [49]:
test_stores.drop(columns=['city', 'state'], inplace=True)

In [50]:
test_stores.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28512 entries, 0 to 28511
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   id           28512 non-null  int64         
 1   date         28512 non-null  datetime64[ns]
 2   store_nbr    28512 non-null  int64         
 3   family       28512 non-null  object        
 4   onpromotion  28512 non-null  int64         
 5   type         28512 non-null  object        
 6   cluster      28512 non-null  int64         
 7   weekday      28512 non-null  int32         
 8   month        28512 non-null  int32         
 9   day          28512 non-null  int32         
dtypes: datetime64[ns](1), int32(3), int64(4), object(2)
memory usage: 1.8+ MB


In [51]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 530508 entries, 0 to 530507
Data columns (total 19 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   id                       530508 non-null  int64  
 1   date                     530508 non-null  object 
 2   store_nbr                530508 non-null  int64  
 3   family                   530508 non-null  object 
 4   sales                    530508 non-null  float64
 5   onpromotion              530508 non-null  int64  
 6   weekday                  530508 non-null  int64  
 7   year                     530508 non-null  int64  
 8   month                    530508 non-null  int64  
 9   day                      530508 non-null  int64  
 10  transactions             530508 non-null  float64
 11  is_holiday               530508 non-null  int64  
 12  type                     530508 non-null  object 
 13  cluster                  530508 non-null  int64  
 14  dcoi

In [52]:
# ▶ 학습에 필요없는 Column 제거
X_train= train.drop(['id', 'date', 'year'], axis=1)
X_test = test_stores.drop(['id', 'date'], axis=1)

In [53]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 530508 entries, 0 to 530507
Data columns (total 16 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   store_nbr                530508 non-null  int64  
 1   family                   530508 non-null  object 
 2   sales                    530508 non-null  float64
 3   onpromotion              530508 non-null  int64  
 4   weekday                  530508 non-null  int64  
 5   month                    530508 non-null  int64  
 6   day                      530508 non-null  int64  
 7   transactions             530508 non-null  float64
 8   is_holiday               530508 non-null  int64  
 9   type                     530508 non-null  object 
 10  cluster                  530508 non-null  int64  
 11  dcoilwtico_interpolated  530508 non-null  float64
 12  Rolling_Mean_Sales_14d   530508 non-null  float64
 13  Rolling_Std_Sales_14d    530508 non-null  float64
 14  Roll

In [54]:
# 계산을 원할하게 하기 위해 Dtype 을 줄임
# store_nbr, weekday, month, day 열의 데이터 타입을 int8로 변환
X_train['store_nbr'] = X_train['store_nbr'].astype('int8')
X_train['weekday'] = X_train['weekday'].astype('int8')
X_train['month'] = X_train['month'].astype('int8')
X_train['day'] = X_train['day'].astype('int8')
X_train['cluster'] = X_train['cluster'].astype('int8')


X_test['store_nbr'] = X_test['store_nbr'].astype('int8')
X_test['weekday'] = X_test['weekday'].astype('int8')
X_test['month'] = X_test['month'].astype('int8')
X_test['day'] = X_test['day'].astype('int8')
X_test['cluster'] = X_test['cluster'].astype('int8')

In [55]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28512 entries, 0 to 28511
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   store_nbr    28512 non-null  int8  
 1   family       28512 non-null  object
 2   onpromotion  28512 non-null  int64 
 3   type         28512 non-null  object
 4   cluster      28512 non-null  int8  
 5   weekday      28512 non-null  int8  
 6   month        28512 non-null  int8  
 7   day          28512 non-null  int8  
dtypes: int64(1), int8(5), object(2)
memory usage: 807.6+ KB


In [56]:
# 피처 선택을 위한 데이터 전처리
train_encoded = pd.get_dummies(X_train, columns=['store_nbr', 'family', 'weekday', 'type', 'cluster'], drop_first=True)
test_encoded = pd.get_dummies(X_test, columns=['store_nbr', 'family', 'weekday', 'type', 'cluster'], drop_first=True)

In [57]:
missing_cols = set(train_encoded.columns) - set(test_encoded.columns)
for c in missing_cols:
    test_encoded[c] = 0

test_encoded = test_encoded[train_encoded.columns]

In [58]:
train_encoded.head()

Unnamed: 0,sales,onpromotion,month,day,transactions,is_holiday,dcoilwtico_interpolated,Rolling_Mean_Sales_14d,Rolling_Std_Sales_14d,Rolling_Mean_Sales_28d,Rolling_Std_Sales_28d,store_nbr_2,store_nbr_3,store_nbr_4,store_nbr_5,store_nbr_6,store_nbr_7,store_nbr_8,store_nbr_9,store_nbr_10,store_nbr_11,store_nbr_12,store_nbr_13,store_nbr_14,store_nbr_15,store_nbr_16,store_nbr_17,store_nbr_18,store_nbr_19,store_nbr_20,store_nbr_21,store_nbr_22,store_nbr_23,store_nbr_24,store_nbr_25,store_nbr_26,store_nbr_27,store_nbr_28,store_nbr_29,store_nbr_30,store_nbr_31,store_nbr_32,store_nbr_33,store_nbr_34,store_nbr_35,store_nbr_36,store_nbr_37,store_nbr_38,store_nbr_39,store_nbr_40,store_nbr_41,store_nbr_42,store_nbr_43,store_nbr_44,store_nbr_45,store_nbr_46,store_nbr_47,store_nbr_48,store_nbr_49,store_nbr_50,store_nbr_51,store_nbr_53,store_nbr_54,family_BABY CARE,family_BEAUTY,family_BEVERAGES,family_BOOKS,family_BREAD/BAKERY,family_CELEBRATION,family_CLEANING,family_DAIRY,family_DELI,family_EGGS,family_FROZEN FOODS,family_GROCERY I,family_GROCERY II,family_HARDWARE,family_HOME AND KITCHEN I,family_HOME AND KITCHEN II,family_HOME APPLIANCES,family_HOME CARE,family_LADIESWEAR,family_LAWN AND GARDEN,family_LINGERIE,"family_LIQUOR,WINE,BEER",family_MAGAZINES,family_MEATS,family_PERSONAL CARE,family_PET SUPPLIES,family_PLAYERS AND ELECTRONICS,family_POULTRY,family_PREPARED FOODS,family_PRODUCE,family_SCHOOL AND OFFICE SUPPLIES,family_SEAFOOD,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,weekday_7,type_B,type_C,type_D,type_E,cluster_2,cluster_3,cluster_4,cluster_5,cluster_6,cluster_7,cluster_8,cluster_9,cluster_10,cluster_11,cluster_12,cluster_13,cluster_14,cluster_15,cluster_16,cluster_17
0,4.0,0,1,1,1558.562,0,37.05,2871.04,377.757,2921.52,267.115,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,13.0,0,1,1,1558.562,0,37.05,2770.08,513.271,2871.04,370.696,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,5104.0,1,1,1,1558.562,0,37.05,2669.121,601.863,2820.56,445.191,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
3,680.952,0,1,1,1558.562,0,37.05,2568.161,662.63,2770.08,503.676,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,13.0,0,1,1,1558.562,0,37.05,2467.201,702.825,2719.6,551.27,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [59]:
train_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 530508 entries, 0 to 530507
Columns: 121 entries, sales to cluster_17
dtypes: bool(110), float64(7), int64(2), int8(2)
memory usage: 93.1 MB


In [68]:
test_encoded.head()

Unnamed: 0,sales,onpromotion,month,day,transactions,is_holiday,dcoilwtico_interpolated,Rolling_Mean_Sales_14d,Rolling_Std_Sales_14d,Rolling_Mean_Sales_28d,Rolling_Std_Sales_28d,store_nbr_2,store_nbr_3,store_nbr_4,store_nbr_5,store_nbr_6,store_nbr_7,store_nbr_8,store_nbr_9,store_nbr_10,store_nbr_11,store_nbr_12,store_nbr_13,store_nbr_14,store_nbr_15,store_nbr_16,store_nbr_17,store_nbr_18,store_nbr_19,store_nbr_20,store_nbr_21,store_nbr_22,store_nbr_23,store_nbr_24,store_nbr_25,store_nbr_26,store_nbr_27,store_nbr_28,store_nbr_29,store_nbr_30,store_nbr_31,store_nbr_32,store_nbr_33,store_nbr_34,store_nbr_35,store_nbr_36,store_nbr_37,store_nbr_38,store_nbr_39,store_nbr_40,store_nbr_41,store_nbr_42,store_nbr_43,store_nbr_44,store_nbr_45,store_nbr_46,store_nbr_47,store_nbr_48,store_nbr_49,store_nbr_50,store_nbr_51,store_nbr_53,store_nbr_54,family_BABY CARE,family_BEAUTY,family_BEVERAGES,family_BOOKS,family_BREAD/BAKERY,family_CELEBRATION,family_CLEANING,family_DAIRY,family_DELI,family_EGGS,family_FROZEN FOODS,family_GROCERY I,family_GROCERY II,family_HARDWARE,family_HOME AND KITCHEN I,family_HOME AND KITCHEN II,family_HOME APPLIANCES,family_HOME CARE,family_LADIESWEAR,family_LAWN AND GARDEN,family_LINGERIE,"family_LIQUOR,WINE,BEER",family_MAGAZINES,family_MEATS,family_PERSONAL CARE,family_PET SUPPLIES,family_PLAYERS AND ELECTRONICS,family_POULTRY,family_PREPARED FOODS,family_PRODUCE,family_SCHOOL AND OFFICE SUPPLIES,family_SEAFOOD,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,weekday_7,type_B,type_C,type_D,type_E,cluster_2,cluster_3,cluster_4,cluster_5,cluster_6,cluster_7,cluster_8,cluster_9,cluster_10,cluster_11,cluster_12,cluster_13,cluster_14,cluster_15,cluster_16,cluster_17
0,0,0,8,16,0,0,0,0,0,0,0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False
1,0,0,8,16,0,0,0,0,0,0,0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False
2,0,2,8,16,0,0,0,0,0,0,0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False
3,0,20,8,16,0,0,0,0,0,0,0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False
4,0,0,8,16,0,0,0,0,0,0,0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False


In [98]:
# 일별 transactions 평균 계산
daily_transactions_avg = train.groupby(train_encoded['day'])['transactions'].mean().reset_index(name='avg_transactions')

# 결과 확인
print(daily_transactions_avg)

    day  avg_transactions
0     1          1757.525
1     2          1805.570
2     3          1839.146
3     4          1742.056
4     5          1715.585
5     6          1729.235
6     7          1656.744
7     8          1634.930
8     9          1684.604
9    10          1678.069
10   11          1636.536
11   12          1636.305
12   13          1634.168
13   14          1594.849
14   15          1644.656
15   16          1715.221
16   17          1706.245
17   18          1715.051
18   19          1691.059
19   20          1745.767
20   21          1703.033
21   22          1706.954
22   23          1790.126
23   24          1766.591
24   25          1567.064
25   26          1632.808
26   27          1634.732
27   28          1582.304
28   29          1603.349
29   30          1751.655
30   31          1814.310


In [100]:
test_encoded = test_encoded.merge(daily_transactions_avg, on='day', how='left')

# transactions 열이 0인 경우 avg_transactions 값으로 대체
test_encoded.loc[test_encoded['transactions'] == 0, 'transactions'] = test_encoded['avg_transactions']

# 이제 avg_transactions 열은 필요 없으므로 삭제
test_encoded.drop(columns=['avg_transactions'], inplace=True)

test_encoded.head()

Unnamed: 0,sales,onpromotion,month,day,transactions,is_holiday,dcoilwtico_interpolated,Rolling_Mean_Sales_14d,Rolling_Std_Sales_14d,Rolling_Mean_Sales_28d,Rolling_Std_Sales_28d,store_nbr_2,store_nbr_3,store_nbr_4,store_nbr_5,store_nbr_6,store_nbr_7,store_nbr_8,store_nbr_9,store_nbr_10,store_nbr_11,store_nbr_12,store_nbr_13,store_nbr_14,store_nbr_15,store_nbr_16,store_nbr_17,store_nbr_18,store_nbr_19,store_nbr_20,store_nbr_21,store_nbr_22,store_nbr_23,store_nbr_24,store_nbr_25,store_nbr_26,store_nbr_27,store_nbr_28,store_nbr_29,store_nbr_30,store_nbr_31,store_nbr_32,store_nbr_33,store_nbr_34,store_nbr_35,store_nbr_36,store_nbr_37,store_nbr_38,store_nbr_39,store_nbr_40,store_nbr_41,store_nbr_42,store_nbr_43,store_nbr_44,store_nbr_45,store_nbr_46,store_nbr_47,store_nbr_48,store_nbr_49,store_nbr_50,store_nbr_51,store_nbr_53,store_nbr_54,family_BABY CARE,family_BEAUTY,family_BEVERAGES,family_BOOKS,family_BREAD/BAKERY,family_CELEBRATION,family_CLEANING,family_DAIRY,family_DELI,family_EGGS,family_FROZEN FOODS,family_GROCERY I,family_GROCERY II,family_HARDWARE,family_HOME AND KITCHEN I,family_HOME AND KITCHEN II,family_HOME APPLIANCES,family_HOME CARE,family_LADIESWEAR,family_LAWN AND GARDEN,family_LINGERIE,"family_LIQUOR,WINE,BEER",family_MAGAZINES,family_MEATS,family_PERSONAL CARE,family_PET SUPPLIES,family_PLAYERS AND ELECTRONICS,family_POULTRY,family_PREPARED FOODS,family_PRODUCE,family_SCHOOL AND OFFICE SUPPLIES,family_SEAFOOD,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,weekday_7,type_B,type_C,type_D,type_E,cluster_2,cluster_3,cluster_4,cluster_5,cluster_6,cluster_7,cluster_8,cluster_9,cluster_10,cluster_11,cluster_12,cluster_13,cluster_14,cluster_15,cluster_16,cluster_17
0,0,0,8,16,1715.221,0,0,0,0,0,0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False
1,0,0,8,16,1715.221,0,0,0,0,0,0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False
2,0,2,8,16,1715.221,0,0,0,0,0,0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False
3,0,20,8,16,1715.221,0,0,0,0,0,0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False
4,0,0,8,16,1715.221,0,0,0,0,0,0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False


In [None]:
# 이동 평균 및 이동 표준 편차 계산
test_encoded['Rolling_Mean_Sales_14d'] = test_encoded['transactions'].rolling(window=14, min_periods=1).mean()
test_encoded['Rolling_Std_Sales_14d'] = test_encoded['transactions'].rolling(window=14, min_periods=1).std()

test_encoded['Rolling_Mean_Sales_28d'] = test_encoded['transactions'].rolling(window=28, min_periods=1).mean()
test_encoded['Rolling_Std_Sales_28d'] = test_encoded['transactions'].rolling(window=28, min_periods=1).std()


# 결측치를 다음 값으로 채우기
train_all['Rolling_Mean_sales_14d'].fillna(method='bfill', inplace=True)
train_all['Rolling_Std_sales_14d'].fillna(method='bfill', inplace=True)

train_all['Rolling_Mean_sales_28d'].fillna(method='bfill', inplace=True)
train_all['Rolling_Std_sales_28d'].fillna(method='bfill', inplace=True)

In [101]:
test_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28512 entries, 0 to 28511
Columns: 121 entries, sales to cluster_17
dtypes: bool(110), float64(1), int64(8), int8(2)
memory usage: 5.0 MB


In [70]:
y_train = train_encoded['sales']
X_train = train_encoded.drop(['sales'], axis=1)

X_test = test_encoded.drop(['sales'], axis=1)

In [108]:
X_train.shape

(530508, 120)

In [109]:
from sklearn.preprocessing import MinMaxScaler

# MinMaxScaler 객체 생성
scaler = MinMaxScaler()

# 훈련 데이터에 스케일러 적용
X_train_scaled = scaler.fit_transform(X_train)

# 테스트 데이터에 스케일러 적용
X_test_scaled = scaler.transform(X_test)

In [110]:
X_test_scaled.shape

(28512, 120)

##### 하이퍼 파라미터 engineering

In [85]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_log_error
import numpy as np

# RMSLE 계산 함수
def rmsle(y_true, y_pred):
    y_true = np.maximum(y_true, 0)  # 음수 값을 0으로 변환
    y_pred = np.maximum(y_pred, 0)  # 음수 값을 0으로 변환
    return np.sqrt(mean_squared_log_error(y_true, y_pred))

# 하이퍼파라미터 그리드 설정
param_grid = {
    'n_estimators': [80, 100, 120]
    #,
    #'max_features': ['auto', 'sqrt', 'log2'],
    #'max_depth': [None, 5, 10, 20],
    #'min_samples_split': [2, 5, 10],
    #'min_samples_leaf': [1, 2, 4],
    #'bootstrap': [True, False]
}

# RandomForestRegressor 모델 생성
rf = RandomForestRegressor(random_state=42)

# GridSearchCV 설정
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, 
                           cv=3, n_jobs=-1, verbose=2, scoring='neg_mean_squared_log_error')

# 모델 학습
grid_search.fit(X_train_scaled, y_train)

# 최적의 하이퍼파라미터 출력
print(f'Best parameters found: {grid_search.best_params_}')

# 최적 모델로 예측
best_rf = grid_search.best_estimator_
y_pred_best_rf = best_rf.predict(X_train_scaled)

# RMSLE 평가
rmsle_best_rf = rmsle(y_train, y_pred_best_rf)
print(f'Best Random Forest RMSLE: {rmsle_best_rf}')

Fitting 3 folds for each of 3 candidates, totalling 9 fits
[CV] END ....................................n_estimators=80; total time=21.7min
[CV] END ....................................n_estimators=80; total time=22.4min
[CV] END ....................................n_estimators=80; total time=23.1min
[CV] END ...................................n_estimators=100; total time=27.2min
[CV] END ...................................n_estimators=100; total time=28.0min
[CV] END ...................................n_estimators=100; total time=28.7min
[CV] END ...................................n_estimators=120; total time=31.5min
[CV] END ...................................n_estimators=120; total time=31.9min
[CV] END ...................................n_estimators=120; total time=17.2min
Best parameters found: {'n_estimators': 120}
Best Random Forest RMSLE: 0.17493804729534518


### Modeling 적용

In [111]:
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_log_error
from sklearn.ensemble import RandomForestRegressor 

rf = RandomForestRegressor(n_estimators=120, random_state=42)
rf.fit(X_train_scaled, y_train)

# 예측
y_pred_rf = rf.predict(X_test_scaled)

# RMSLE 계산 함수
# def rmsle(y_true, y_pred):
#     y_true = np.maximum(y_true, 0)  # 음수 값을 0으로 변환
#     y_pred = np.maximum(y_pred, 0)  # 음수 값을 0으로 변환
#     return np.sqrt(mean_squared_log_error(y_true, y_pred))

# 평가
# rmsle_rf = rmsle(y_test, y_pred_rf)
# print(f'Random Forest RMSLE: {rmsle_rf}')

### submission 

In [103]:
submission = pd.read_csv("../data/sample_submission.csv")
predict_sales = pd.read_csv('../data/test.csv')

In [104]:
submission.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28512 entries, 0 to 28511
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   id      28512 non-null  int64  
 1   sales   28512 non-null  float64
dtypes: float64(1), int64(1)
memory usage: 445.6 KB


In [105]:
submission['sales'] = y_pred_rf
predict_sales['sales'] = y_pred_rf

In [106]:
submission.describe()

Unnamed: 0,id,sales
count,28512.0,28512.0
mean,3015143.5,161.57
std,8230.85,362.562
min,3000888.0,1.517
25%,3008015.75,4.1
50%,3015143.5,21.587
75%,3022271.25,137.161
max,3029399.0,2805.938


In [107]:
submission.to_csv("submission.csv")
predict_sales.to_csv("test_2017.csv")
