## 1. 학습 데이터의 월 판매량, 예측결과 (0,20)범위로 clip

## 2. hyper parameter 지정
- 'metric': 'rmse', # 평가지표 = rmse
- 'num_leaves': 255, # reaf node 수
- 'learning_rate': 0.005, # 디폴트0.1, 작을수록 예측성능이 높아질수있지만 과적합가능성도 올라감
- 'feature_fraction': 0.7, # 디폴트 1,개별 트리를 학습할 때 무작위로 선택되는 피쳐 비율
- 'bagging_fraction': 0.7, # 디폴트 1, 학습데이터 샘플링 비율
- 'bagging_freq': 3, # 디폴트 0, 몇번 iteration할때마다 다시 bagging을 수행할 지
- 'force_col_wise': True, # 디폴트 false, 컬럼수가 많을떄 메모리 비용을 줄일 수 있음
- 'random_state': 42 # seed 고정
- 'num_boost_round' : 1500 # 디폴트 100, 반복수행하는 횟수 
- 'early_stopping_rounds' : 150 # 디폴트 0, 검증결과가 150번동안 나아지지 않으면 멈춤
- 'categorical_feature' : ['shop_id', 'city_id', 'cat_id', 'main_cat', 'sub_cat'] # 범주형 피쳐 지정
- 'verbose_eval'=100 # 몇번째마다 평가지표 출력할지

## 3. 사용 데이터
- data_2 : data_1에서 피쳐 변경, 월별 일수 추가(보고서 내 전처리 항목 참고)

## 4. submission 여부 : YES
- Score: 1.0657, nan이 포함되어있었음

In [10]:
# 라이브러리 호출
import numpy as np
import pandas as pd
import time
import itertools
import lightgbm

In [23]:
# 데이터 불러오기
data = pd.read_pickle('./data/data_before_meancnt.pkl')
data.head()

Unnamed: 0,month_id,shop_id,item_id,cnt_month,price_max,price_mean,price_min,city_id,cat_id,main_cat,sub_cat,year,month,first_sell,sales_month
0,0,59,22154,1.0,999.0,999.0,999.0,30,37,11,1,2013,1,0,0
1,0,59,2552,0.0,0.0,0.0,0.0,30,58,13,29,2013,1,0,0
2,0,59,2554,0.0,0.0,0.0,0.0,30,58,13,29,2013,1,0,0
3,0,59,2555,0.0,0.0,0.0,0.0,30,56,13,5,2013,1,0,0
4,0,59,2564,0.0,0.0,0.0,0.0,30,59,13,40,2013,1,0,0


In [24]:
# 사용하지 않을 컬럼 drop
data.drop(['first_sell','price_max','price_min'], axis=1, inplace=True)
data.head()

Unnamed: 0,month_id,shop_id,item_id,cnt_month,price_mean,city_id,cat_id,main_cat,sub_cat,year,month,sales_month
0,0,59,22154,1.0,999.0,30,37,11,1,2013,1,0
1,0,59,2552,0.0,0.0,30,58,13,29,2013,1,0
2,0,59,2554,0.0,0.0,30,58,13,29,2013,1,0
3,0,59,2555,0.0,0.0,30,56,13,5,2013,1,0
4,0,59,2564,0.0,0.0,30,59,13,40,2013,1,0


In [4]:
# 기준별 평균 월간 판매량을 계산해 새로운 컬럼으로 추가하는 함수 정의
def mean_cnt_month(data, features):
    
    # 새로 생성될 피처명 생성
    new_feature_name = '/'.join(features[1:])+'_mean_cnt'
    
    # 기준에 따라 월간 평균 판매량 구하기
    pivot = data.pivot_table(index=features, values='cnt_month', aggfunc='mean')
    pivot = pivot.reset_index().rename(columns={'cnt_month': new_feature_name})
    
    # data와 pivot 병합 
    data = data.merge(pivot, on=features, how='left')
    
    del pivot

    return data

In [5]:
# 각 매장/상품 별로 임의로 정한 개월 수 이전의 값으로 컬럼을 추가하는 함수 정의
# num_lags=2라면 1개월전의 값 컬럼과 2개월전의 값 컬럼이 생성됨
# rolling()과 같은 과정이지만 현 데이터에 rolling을 적용하기 복잡하기 때문에 함수로 정의해서 사용

def make_lags(data, feature, num_lags):
    temp = data[['month_id','shop_id','item_id',feature]].copy() # 원본은 건드리지 않도록 copy()

    # lag 피쳐 생성 및 기존 데이터와 병합
    for i in range(num_lags):
        column_name = feature+'_'+str(i+1) # 새로 생성될 컬럼명 지정 
        temp.columns = ['month_id','shop_id','item_id',column_name] # temp 컬럼명 새 이름으로 재지정
        temp['month_id'] = temp['month_id']+(i+1) # month_id에 해당 lag 숫자(몇 개월 전인지)를 더하기 > 해당 숫자만큼 월이 밀린 데이터가 됨 
        data = data.merge(temp, on=['month_id','shop_id','item_id'], how='left')
        data[column_name] = data[column_name].fillna(0) # 판매이력이 없는 경우(신상품) NaN이 존재할 수 있으므로 결측치는 0으로 대체
    del temp
    return data

In [6]:
# 주어진 피쳐를 가지고 가능합 모든 조합별로 mean_cnt_month 컬럼을 생성하고, cnt_month와의 상관계수를 보여주는 함수 정의

def all_corr(data,features,num_features): # 데이터프레임, 사용할피쳐, 조합을 만들 피쳐 갯수
    result = []
    combi = list(itertools.combinations(features,num_features))
    
    for i in range(len(combi)):
        
        feature_name = '/'.join(combi[i])+'_mean_cnt'
        
        temp = list(combi[i])
        temp.insert(0,'month_id')
        
        df = mean_cnt_month(data,temp)
        
        cor = df[[feature_name,'cnt_month']].corr().iloc[0,1]
        
        result.append([feature_name,cor])
        
    del combi, feature_name, temp, df, cor
    
    return result

In [12]:
features = ['shop_id','item_id','price_mean','city_id','cat_id','main_cat','sub_cat','sales_month']
all_corr(data, features, 2)

[['shop_id/item_id_mean_cnt', 1.0],
 ['shop_id/price_mean_mean_cnt', 0.9425583120331822],
 ['shop_id/city_id_mean_cnt', 0.08318385303791125],
 ['shop_id/cat_id_mean_cnt', 0.7857828107979159],
 ['shop_id/main_cat_mean_cnt', 0.30203683083331295],
 ['shop_id/sub_cat_mean_cnt', 0.7518928432042333],
 ['shop_id/sales_month_mean_cnt', 0.13711157462048765],
 ['item_id/price_mean_mean_cnt', 0.8481379474233779],
 ['item_id/city_id_mean_cnt', 0.8962189829845507],
 ['item_id/cat_id_mean_cnt', 0.621257524824796],
 ['item_id/main_cat_mean_cnt', 0.621257524824796],
 ['item_id/sub_cat_mean_cnt', 0.621257524824796],
 ['item_id/sales_month_mean_cnt', 0.621257524824796],
 ['price_mean/city_id_mean_cnt', 0.8872056522088636],
 ['price_mean/cat_id_mean_cnt', 0.8207810432008026],
 ['price_mean/main_cat_mean_cnt', 0.81458457657009],
 ['price_mean/sub_cat_mean_cnt', 0.82058339671588],
 ['price_mean/sales_month_mean_cnt', 0.8194800969583019],
 ['city_id/cat_id_mean_cnt', 0.6907711889951696],
 ['city_id/main_cat

In [25]:
data = mean_cnt_month(data, ['month_id','price_mean'])
data = mean_cnt_month(data, ['month_id','price_mean','shop_id'])
data = mean_cnt_month(data, ['month_id','shop_id','cat_id'])
data = mean_cnt_month(data, ['month_id','item_id','city_id'])
data = mean_cnt_month(data, ['month_id','sub_cat'])
data = make_lags(data, 'cnt_month', 3)
data = make_lags(data, 'price_mean_mean_cnt', 3)
data = make_lags(data, 'price_mean/shop_id_mean_cnt', 3)
data = make_lags(data, 'shop_id/cat_id_mean_cnt', 3)
data = make_lags(data, 'item_id/city_id_mean_cnt', 3)
data = make_lags(data, 'sub_cat', 3)
data.head()

Unnamed: 0,month_id,shop_id,item_id,cnt_month,price_mean,city_id,cat_id,main_cat,sub_cat,year,...,price_mean/shop_id_mean_cnt_3,shop_id/cat_id_mean_cnt_1,shop_id/cat_id_mean_cnt_2,shop_id/cat_id_mean_cnt_3,item_id/city_id_mean_cnt_1,item_id/city_id_mean_cnt_2,item_id/city_id_mean_cnt_3,sub_cat_1,sub_cat_2,sub_cat_3
0,0,59,22154,1.0,999.0,30,37,11,1,2013,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,59,2552,0.0,0.0,30,58,13,29,2013,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,59,2554,0.0,0.0,30,58,13,29,2013,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,59,2555,0.0,0.0,30,56,13,5,2013,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,59,2564,0.0,0.0,30,59,13,40,2013,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [27]:
data.month.unique()

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12], dtype=int8)

In [28]:
# 일 수 추가
days = pd.Series([np.nan,31,28,31,30,31,30,31,31,30,31,30,31])
data['day'] = data['month'].map(days).astype(np.int8)

In [30]:
# month_id 0,1,2 데이터 제거 > 3개월 만큼 lags 생성했기때문에 해당값이 없음
data = data.drop(data[data['month_id'] < 3].index)

In [31]:
data.isnull().sum()

month_id                              0
shop_id                               0
item_id                               0
cnt_month                        214200
price_mean                       214200
city_id                               0
cat_id                                0
main_cat                              0
sub_cat                               0
year                                  0
month                                 0
sales_month                           0
price_mean_mean_cnt              214200
price_mean/shop_id_mean_cnt      214200
shop_id/cat_id_mean_cnt          214200
item_id/city_id_mean_cnt         214200
sub_cat_mean_cnt                 214200
cnt_month_1                           0
cnt_month_2                           0
cnt_month_3                           0
price_mean_mean_cnt_1                 0
price_mean_mean_cnt_2                 0
price_mean_mean_cnt_3                 0
price_mean/shop_id_mean_cnt_1         0
price_mean/shop_id_mean_cnt_2         0


In [32]:
# 테스트 데이터에 존재하지 않을 데이터 컬럼 제거
data = data.drop(['price_mean','price_mean_mean_cnt',
'price_mean/shop_id_mean_cnt', 
'item_id/city_id_mean_cnt',
'sub_cat_mean_cnt'],axis=1)

In [33]:
data.to_pickle('./data_2.pkl')

In [34]:
# train/validation/test data split
X_train = data[data['month_id'] < 33].drop(['cnt_month'], axis=1)
y_train = data[data['month_id'] < 33]['cnt_month'].clip(0,20)

X_valid = data[data['month_id'] == 33].drop(['cnt_month'], axis=1)
y_valid = data[data['month_id'] == 33]['cnt_month'].clip(0,20)

X_test = data[data['month_id'] == 34].drop(['cnt_month'], axis=1)

In [35]:
# hyper parameter 지정
params = {'metric': 'rmse', # 평가지표 = rmse
          'num_leaves': 255,
          'learning_rate': 0.005,
          'feature_fraction': 0.7,
          'bagging_fraction': 0.7,
          'bagging_freq': 3,
          'force_col_wise': True,
          'random_state': 42}

cat = ['shop_id', 'city_id', 'cat_id', 'main_cat', 'sub_cat']

# 데이터 셋 지정
train = lightgbm.Dataset(X_train, y_train)
valid = lightgbm.Dataset(X_valid, y_valid)
 
# 모델 훈련
start = time.time()
lgb_model = lightgbm.train(params=params,
                      train_set=train,
                      num_boost_round=1500,
                      valid_sets=(train, valid),
                      early_stopping_rounds=150,
                      categorical_feature=cat,
                      verbose_eval=100)
end = time.time()
print(f'소요시간(s) : {end-start}')

New categorical_feature is ['cat_id', 'city_id', 'main_cat', 'shop_id', 'sub_cat']


[LightGBM] [Info] Total Bins 4070
[LightGBM] [Info] Number of data points in the train set: 7596885, number of used features: 30




[LightGBM] [Info] Start training from score 0.306474
Training until validation scores don't improve for 150 rounds
[100]	training's rmse: 1.02216	valid_1's rmse: 0.909779
[200]	training's rmse: 0.920041	valid_1's rmse: 0.836191
[300]	training's rmse: 0.868745	valid_1's rmse: 0.807312
[400]	training's rmse: 0.84083	valid_1's rmse: 0.79701
[500]	training's rmse: 0.824038	valid_1's rmse: 0.793567
[600]	training's rmse: 0.812191	valid_1's rmse: 0.79212
[700]	training's rmse: 0.803092	valid_1's rmse: 0.792041
Early stopping, best iteration is:
[624]	training's rmse: 0.809815	valid_1's rmse: 0.791956
소요시간(s) : 491.6592950820923


In [36]:
# 예측 수행
y_pred = lgb_model.predict(X_test).clip(0, 20) # 실제 타겟값이 클립되어있으므로 동일하게 클립

In [37]:
# 제출 파일 생성
df_test = pd.read_csv('e:/Git_public_dodo_Riley/kaggle/Future_Sales/data/test.csv')
submission = pd.DataFrame({"ID": df_test.index, "item_cnt_month": y_pred})
submission.to_csv('lgbm_submission_4.csv', index=False)