# 데이터 변경

In [1]:
# 라이브러리 호출
import numpy as np
import pandas as pd
import time
from itertools import product 
import lightgbm

In [2]:
# 데이터 불러오기
data = pd.read_pickle('./data/data_before_meancnt.pkl')
data.head()

Unnamed: 0,month_id,shop_id,item_id,cnt_month,price_max,price_mean,price_min,city_id,cat_id,main_cat,sub_cat,year,month,first_sell,sales_month
0,0,59,22154,1.0,999.0,999.0,999.0,30,37,11,1,2013,1,0,0
1,0,59,2552,0.0,0.0,0.0,0.0,30,58,13,29,2013,1,0,0
2,0,59,2554,0.0,0.0,0.0,0.0,30,58,13,29,2013,1,0,0
3,0,59,2555,0.0,0.0,0.0,0.0,30,56,13,5,2013,1,0,0
4,0,59,2564,0.0,0.0,0.0,0.0,30,59,13,40,2013,1,0,0


In [3]:
# 사용하지 않을 컬럼 drop
data.drop(['year','first_sell','price_max','price_min'], axis=1, inplace=True)
data.head()

Unnamed: 0,month_id,shop_id,item_id,cnt_month,price_mean,city_id,cat_id,main_cat,sub_cat,month,sales_month
0,0,59,22154,1.0,999.0,30,37,11,1,1,0
1,0,59,2552,0.0,0.0,30,58,13,29,1,0
2,0,59,2554,0.0,0.0,30,58,13,29,1,0
3,0,59,2555,0.0,0.0,30,56,13,5,1,0
4,0,59,2564,0.0,0.0,30,59,13,40,1,0


In [4]:
# 기준별 평균 월간 판매량을 계산해 새로운 컬럼으로 추가하는 함수 정의
def mean_cnt_month(data, features):
    
    # 새로 생성될 피처명 생성
    new_feature_name = '/'.join(features[1:])+'_mean_cnt'
    
    # 기준에 따라 월간 평균 판매량 구하기
    pivot = data.pivot_table(index=features, values='cnt_month', aggfunc='mean')
    pivot = pivot.reset_index().rename(columns={'cnt_month': new_feature_name})
    
    # data와 pivot 병합 
    data = data.merge(pivot, on=features, how='left')
    
    del pivot

    return data

In [5]:
# 각 매장/상품 별로 임의로 정한 개월 수 이전의 값으로 컬럼을 추가하는 함수 정의
# num_lags=2라면 1개월전의 값 컬럼과 2개월전의 값 컬럼이 생성됨
# rolling()과 같은 과정이지만 현 데이터에 rolling을 적용하기 복잡하기 때문에 함수로 정의해서 사용

def make_lags(data, feature, num_lags):
    temp = data[['month_id','shop_id','item_id',feature]].copy() # 원본은 건드리지 않도록 copy()

    # lag 피쳐 생성 및 기존 데이터와 병합
    for i in range(num_lags):
        column_name = feature+'_'+str(i+1) # 새로 생성될 컬럼명 지정 
        temp.columns = ['month_id','shop_id','item_id',column_name] # temp 컬럼명 새 이름으로 재지정
        temp['month_id'] = temp['month_id']+(i+1) # month_id에 해당 lag 숫자(몇 개월 전인지)를 더하기 > 해당 숫자만큼 월이 밀린 데이터가 됨 
        data = data.merge(temp, on=['month_id','shop_id','item_id'], how='left')
        data[column_name] = data[column_name].fillna(0) # 판매이력이 없는 경우(신상품) NaN이 존재할 수 있으므로 결측치는 0으로 대체
    del temp
    return data

In [6]:
data = mean_cnt_month(data, ['month_id','item_id'])
data = mean_cnt_month(data, ['month_id','item_id','city_id'])
data = mean_cnt_month(data, ['month_id','price_mean'])
data = mean_cnt_month(data, ['month_id','shop_id','price_mean'])
data = mean_cnt_month(data, ['month_id','sub_cat','sales_month'])

data = make_lags(data, 'cnt_month', 3)
data = make_lags(data, 'item_id_mean_cnt', 3)
data = make_lags(data, 'item_id/city_id_mean_cnt', 3)
data = make_lags(data, 'price_mean_mean_cnt', 3)
data = make_lags(data, 'shop_id/price_mean_mean_cnt', 3)
data = make_lags(data, 'sub_cat/sales_month_mean_cnt', 3)
data.head()

Unnamed: 0,month_id,shop_id,item_id,cnt_month,price_mean,city_id,cat_id,main_cat,sub_cat,month,...,item_id/city_id_mean_cnt_3,price_mean_mean_cnt_1,price_mean_mean_cnt_2,price_mean_mean_cnt_3,shop_id/price_mean_mean_cnt_1,shop_id/price_mean_mean_cnt_2,shop_id/price_mean_mean_cnt_3,sub_cat/sales_month_mean_cnt_1,sub_cat/sales_month_mean_cnt_2,sub_cat/sales_month_mean_cnt_3
0,0,59,22154,1.0,999.0,30,37,11,1,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,59,2552,0.0,0.0,30,58,13,29,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,59,2554,0.0,0.0,30,58,13,29,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,59,2555,0.0,0.0,30,56,13,5,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,59,2564,0.0,0.0,30,59,13,40,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# month_id 0,1,2 데이터 제거 > 3개월 만큼 lags 생성했기때문에 해당값이 없음
data = data.drop(data[data['month_id'] < 3].index)

In [8]:
data.isnull().sum()

month_id                               0
shop_id                                0
item_id                                0
cnt_month                         214200
price_mean                        214200
city_id                                0
cat_id                                 0
main_cat                               0
sub_cat                                0
month                                  0
sales_month                            0
item_id_mean_cnt                  214200
item_id/city_id_mean_cnt          214200
price_mean_mean_cnt               214200
shop_id/price_mean_mean_cnt       214200
sub_cat/sales_month_mean_cnt      214200
cnt_month_1                            0
cnt_month_2                            0
cnt_month_3                            0
item_id_mean_cnt_1                     0
item_id_mean_cnt_2                     0
item_id_mean_cnt_3                     0
item_id/city_id_mean_cnt_1             0
item_id/city_id_mean_cnt_2             0
item_id/city_id_

In [10]:
# 테스트 데이터에 존재하지 않을 데이터 컬럼 제거
data = data.drop(['price_mean','item_id_mean_cnt',
'item_id/city_id_mean_cnt', 
'price_mean_mean_cnt',
'shop_id/price_mean_mean_cnt','sub_cat/sales_month_mean_cnt'],axis=1)

In [11]:
data.to_pickle('./data_1_3.pkl')

In [12]:
# train/validation/test data split
X_train = data[data['month_id'] < 33].drop(['cnt_month'], axis=1)
y_train = data[data['month_id'] < 33]['cnt_month'].clip(0,20)

X_valid = data[data['month_id'] == 33].drop(['cnt_month'], axis=1)
y_valid = data[data['month_id'] == 33]['cnt_month'].clip(0,20)

X_test = data[data['month_id'] == 34].drop(['cnt_month'], axis=1)

In [13]:
# hyper parameter 지정
params = {'metric': 'rmse', # 평가지표 = rmse
          'num_leaves': 255,
          'learning_rate': 0.005,
          'feature_fraction': 0.75,
          'bagging_fraction': 0.75,
          'bagging_freq': 5,
          'force_col_wise': True,
          'random_state': 42}

cat = ['shop_id', 'city_id', 'cat_id', 'main_cat', 'sub_cat']

# 데이터 셋 지정
train = lightgbm.Dataset(X_train, y_train)
valid = lightgbm.Dataset(X_valid, y_valid)
 
# 모델 훈련
start = time.time()
lgb_model = lightgbm.train(params=params,
                      train_set=train,
                      num_boost_round=1500,
                      valid_sets=(train, valid),
                      early_stopping_rounds=150,
                      categorical_feature=cat,
                      verbose_eval=100)
end = time.time()
print(f'소요시간(s) : {end-start}')

New categorical_feature is ['cat_id', 'city_id', 'main_cat', 'shop_id', 'sub_cat']


[LightGBM] [Info] Total Bins 4419
[LightGBM] [Info] Number of data points in the train set: 7596885, number of used features: 27




[LightGBM] [Info] Start training from score 0.306474
Training until validation scores don't improve for 150 rounds
[100]	training's rmse: 1.02271	valid_1's rmse: 0.916924
[200]	training's rmse: 0.921393	valid_1's rmse: 0.853193
[300]	training's rmse: 0.870009	valid_1's rmse: 0.832109
[400]	training's rmse: 0.841346	valid_1's rmse: 0.826508
[500]	training's rmse: 0.823134	valid_1's rmse: 0.827785
Early stopping, best iteration is:
[437]	training's rmse: 0.833823	valid_1's rmse: 0.826229
소요시간(s) : 368.7993977069855


In [19]:
# 예측 수행
y_pred = lgb_model.predict(X_test).clip(0, 20) # 실제 타겟값이 클립되어있으므로 동일하게 클립

In [20]:
# 제출 파일 생성
df_test = pd.read_csv('e:/Git_public_dodo_Riley/kaggle/Future_Sales/data/test.csv')
submission = pd.DataFrame({"ID": df_test.index, "item_cnt_month": y_pred})
submission.to_csv('lgbm_submission_4.csv', index=False)

In [None]:
# Score: 0.89852