In [917]:
import numpy as np
import pandas as pd
import warnings
import googletrans
from googletrans import Translator

warnings.filterwarnings(action='ignore') # 경고 메시지 생략

# 데이터 경로
data_path = './data/'

sales_train = pd.read_csv(data_path + 'sales_train.csv')
shops = pd.read_csv(data_path + 'shops.csv')
items = pd.read_csv(data_path + 'items.csv')
item_categories = pd.read_csv(data_path + 'item_categories.csv')
test = pd.read_csv(data_path + 'test.csv')
submission = pd.read_csv(data_path + 'sample_submission.csv')

## 피처 엔지니어링 I : 피처명 한글화와 데이터 다운캐스팅

In [918]:
sales_train = sales_train.rename(columns={'date': 'date', 
                                          'date_block_num': 'month_id',
                                          'shop_id': 'shop_id',
                                          'item_id': 'item_id',
                                          'item_price': 'item_price',
                                          'item_cnt_day': 'item_cnt'})

shops = shops.rename(columns={'shop_name': 'shop_name',
                              'shop_id': 'shop_id'})

items = items.rename(columns={'item_name': 'item_name',
                              'item_id': 'item_id',
                              'item_category_id': 'item_category_id'})

item_categories = item_categories.rename(columns=
                                         {'item_category_name': 'item_category_name',
                                          'item_category_id': 'item_category_id'})

test = test.rename(columns={'shop_id': 'shop_id',
                            'item_id': 'item_id'})

In [919]:
def downcast(df, verbose=True):
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        dtype_name = df[col].dtype.name
        if dtype_name == 'object':
            pass
        elif dtype_name == 'bool':
            df[col] = df[col].astype('int8')
        elif dtype_name.startswith('int') or (df[col].round() == df[col]).all():
            df[col] = pd.to_numeric(df[col], downcast='integer')
        else:
            df[col] = pd.to_numeric(df[col], downcast='float')
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        print('{:.1f}% compressed'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

all_df = [sales_train, shops, items, item_categories, test]
for df in all_df:
    df = downcast(df)

62.5% compressed
38.6% compressed
54.2% compressed
39.9% compressed
70.8% compressed


## 피처 엔지니어링 II : 개별 데이터 피처 엔지니어링

In [920]:
# 판매가가 0보다 큰 데이터 추출
sales_train = sales_train[sales_train['item_price'] > 0]
# 판매가가 50,000보다 작은 데이터 추출
sales_train = sales_train[sales_train['item_price'] < 50000]

# 판매량이 0보다 큰 데이터 추출
sales_train = sales_train[sales_train['item_cnt'] > 0]
# 판매량이 1,000보다 작은 데이터 추출
sales_train = sales_train[sales_train['item_cnt'] < 1000]

In [921]:
# sales_train 데이터에서 상점ID 수정
sales_train.loc[sales_train['shop_id'] == 0, 'shop_id'] = 57
sales_train.loc[sales_train['shop_id'] == 1, 'shop_id'] = 58
sales_train.loc[sales_train['shop_id'] == 10, 'shop_id'] = 11
sales_train.loc[sales_train['shop_id'] == 39, 'shop_id'] = 40

# test 데이터에서 상점ID 수정
test.loc[test['shop_id'] == 0, 'shop_id'] = 57
test.loc[test['shop_id'] == 1, 'shop_id'] = 58
test.loc[test['shop_id'] == 10, 'shop_id'] = 11
test.loc[test['shop_id'] == 39, 'shop_id'] = 40

In [922]:
shops['city'] = shops['shop_name'].apply(lambda x: x.split()[0])

In [923]:
shops.loc[shops['city'] =='!Якутск', 'city'] = 'Якутск'

In [924]:
from sklearn.preprocessing import LabelEncoder

# 레이블 인코더 생성
label_encoder = LabelEncoder()
# 도시 피처 레이블 인코딩
shops['city'] = label_encoder.fit_transform(shops['city'])

In [925]:
# 상점명 피처 제거
shops = shops.drop('shop_name', axis=1)

shops.head()

Unnamed: 0,shop_id,city
0,0,29
1,1,29
2,2,0
3,3,1
4,4,2


In [926]:
# 상품명 피처 제거
items = items.drop(['item_name'], axis=1)

In [927]:
# 상품이 맨 처음 팔린 날을 피처로 추가
items['first_sell_month'] = sales_train.groupby('item_id').agg({'month_id': 'min'})['month_id']

items.head()

Unnamed: 0,item_id,item_category_id,first_sell_month
0,0,40,20.0
1,1,76,15.0
2,2,40,19.0
3,3,40,18.0
4,4,40,20.0


In [928]:
items[items['first_sell_month'].isna()]

Unnamed: 0,item_id,item_category_id,first_sell_month
83,83,40,
140,140,45,
168,168,44,
173,173,45,
204,204,44,
...,...,...,...
21974,21974,61,
21975,21975,61,
22022,22022,40,
22035,22035,40,


In [929]:
# 첫 판매월 피처의 결측값을 34로 대체
items['first_sell_month'] = items['first_sell_month'].fillna(34)

In [930]:
# 상품분류명의 첫 단어를 대분류로 추출
item_categories['major_category'] = item_categories['item_category_name'].apply(lambda x: x.split()[0])  

In [931]:
item_categories['major_category'].value_counts()

Игры          14
Книги         13
Подарки       12
Игровые        8
Аксессуары     7
Музыка         6
Программы      6
Карты          5
Кино           5
Служебные      2
Чистые         2
PC             1
Билеты         1
Доставка       1
Элементы       1
Name: major_category, dtype: int64

In [932]:
def make_etc(x):
    if len(item_categories[item_categories['major_category']==x]) >= 5:
        return x
    else:
        return 'etc'

# 대분류의 고윳값 개수가 5개 미만이면 'etc'로 바꾸기
item_categories['major_category'] = item_categories['major_category'].apply(make_etc)

In [933]:
item_categories.head()

Unnamed: 0,item_category_name,item_category_id,major_category
0,PC - Гарнитуры/Наушники,0,etc
1,Аксессуары - PS2,1,Аксессуары
2,Аксессуары - PS3,2,Аксессуары
3,Аксессуары - PS4,3,Аксессуары
4,Аксессуары - PSP,4,Аксессуары


In [934]:
# 레이블 인코더 생성
label_encoder = LabelEncoder()

# 대분류 피처 레이블 인코딩
item_categories['major_category'] = label_encoder.fit_transform(item_categories['major_category'])

# 상품분류명 피처 제거
item_categories = item_categories.drop('item_category_name', axis=1)

## 피처 엔지니어링 III : 데이터 조합 및 파생 피처 생성

In [935]:
from itertools import product

train = []
# 월ID, 상점ID, 상품ID 조합 생성
for i in sales_train['month_id'].unique():
    all_shop = sales_train.loc[sales_train['month_id']==i, 'shop_id'].unique()
    all_item = sales_train.loc[sales_train['month_id']==i, 'item_id'].unique()
    train.append(np.array(list(product([i], all_shop, all_item))))

idx_features = ['month_id', 'shop_id', 'item_id'] # 기준 피처
train = pd.DataFrame(np.vstack(train), columns=idx_features)

In [936]:
group = sales_train.groupby(idx_features).agg({'item_cnt': 'sum',
                                               'item_price': 'mean'})
group = group.reset_index()
group = group.rename(columns={'item_cnt': 'item_cnt_month', 'item_price': 'avg_item_price'})

train = train.merge(group, on=idx_features, how='left')

train.head()

Unnamed: 0,month_id,shop_id,item_id,item_cnt_month,avg_item_price
0,0,59,22154,1.0,999.0
1,0,59,2552,,
2,0,59,2554,,
3,0,59,2555,,
4,0,59,2564,,


In [937]:
import gc

# group 변수 가비지 컬렉션
del group
gc.collect();

In [938]:
# 상품 판매건수 피처 추가
group = sales_train.groupby(idx_features).agg({'item_cnt': 'count'})
group = group.reset_index()

train = train.merge(group, on=idx_features, how='left')

# 가비지 컬렉션
del group, sales_train
gc.collect()

train.head()

Unnamed: 0,month_id,shop_id,item_id,item_cnt_month,avg_item_price,item_cnt
0,0,59,22154,1.0,999.0,1.0
1,0,59,2552,,,
2,0,59,2554,,,
3,0,59,2555,,,
4,0,59,2564,,,


## 피처 엔지니어링 IV : 데이터 합치기

In [939]:
# 테스트 데이터 월ID를 34로 설정
test['month_id'] = 34

# train과 test 이어붙이기
all_data = pd.concat([train, test.drop('ID', axis=1)],
                     ignore_index=True,
                     keys=idx_features)
# 결측값을 0으로 대체
all_data = all_data.fillna(0)

all_data.head()

Unnamed: 0,month_id,shop_id,item_id,item_cnt_month,avg_item_price,item_cnt
0,0,59,22154,1.0,999.0,1.0
1,0,59,2552,0.0,0.0,0.0
2,0,59,2554,0.0,0.0,0.0
3,0,59,2555,0.0,0.0,0.0
4,0,59,2564,0.0,0.0,0.0


In [940]:
# 나머지 데이터 병합
all_data = all_data.merge(shops, on='shop_id', how='left')
all_data = all_data.merge(items, on='item_id', how='left')
all_data = all_data.merge(item_categories, on='item_category_id', how='left')

# 데이터 다운캐스팅
all_data = downcast(all_data)

64.6% compressed


In [941]:
# 가비지 컬렉션
del shops, items, item_categories
gc.collect();

## 피처 엔지니어링 V : 시차 피처 생성

In [942]:
def add_mean_features(df, mean_features, idx_features):
    # 기준 피처 확인 
    assert (idx_features[0] == 'month_id') and \
           len(idx_features) in [2, 3]
    
    # 파생 피처명 설정 
    if len(idx_features) == 2:
        feature_name = idx_features[1] + 'by_avg_sold_count'
    else:
        feature_name = idx_features[1] + '_' + idx_features[2] + 'by_avg_sold_count'
    
    # 기준 피처를 토대로 그룹화해 월간 평균 판매량 구하기 
    print(df.columns)
    group = df.groupby(idx_features).agg({'item_cnt_month': 'mean'})
    group = group.reset_index()
    group = group.rename(columns={'item_cnt_month': feature_name})
    
    # df와 group 병합 
    df = df.merge(group, on=idx_features, how='left')
    # 데이터 다운캐스팅 
    df = downcast(df, verbose=False)
    # 새로 만든 feature_name 피처명을 mean_features 리스트에 추가 
    mean_features.append(feature_name)
    
    # 가비지 컬렉션
    del group
    gc.collect()
    
    return df, mean_features

In [943]:
# 그룹화 기준 피처 중 '상품ID'가 포함된 파생 피처명을 담을 리스트
item_mean_features = []

# ['월ID', '상품ID']로 그룹화한 월간 평균 판매량 파생 피처 생성
all_data, item_mean_features = add_mean_features(df=all_data,
                                                 mean_features=item_mean_features,
                                                 idx_features=['month_id', 'item_id'])

# ['월ID', '상품ID', '도시']로 그룹화한 월간 평균 판매량 파생 피처 생성
all_data, item_mean_features = add_mean_features(df=all_data,
                                                 mean_features=item_mean_features,
                                                 idx_features=['month_id', 'item_id', 'city'])

Index(['month_id', 'shop_id', 'item_id', 'item_cnt_month', 'avg_item_price',
       'item_cnt', 'city', 'item_category_id', 'first_sell_month',
       'major_category'],
      dtype='object')
Index(['month_id', 'shop_id', 'item_id', 'item_cnt_month', 'avg_item_price',
       'item_cnt', 'city', 'item_category_id', 'first_sell_month',
       'major_category', 'item_idby_avg_sold_count'],
      dtype='object')


In [944]:
item_mean_features

['item_idby_avg_sold_count', 'item_id_cityby_avg_sold_count']

In [945]:
# 그룹화 기준 피처 중 '상점ID'가 포함된 파생 피처명을 담을 리스트
shop_mean_features = []

# ['월ID', '상점ID', '상품분류ID']로 그룹화한 월간 평균 판매량 파생 피처 생성
all_data, shop_mean_features = add_mean_features(df=all_data, 
                                                 mean_features=shop_mean_features,
                                                 idx_features=['month_id', 'shop_id', 'item_category_id'])

Index(['month_id', 'shop_id', 'item_id', 'item_cnt_month', 'avg_item_price',
       'item_cnt', 'city', 'item_category_id', 'first_sell_month',
       'major_category', 'item_idby_avg_sold_count',
       'item_id_cityby_avg_sold_count'],
      dtype='object')


In [946]:
shop_mean_features

['shop_id_item_category_idby_avg_sold_count']

In [947]:
def add_lag_features(df, lag_features_to_clip, idx_features, 
                     lag_feature, nlags=3, clip=False):
    # 시차 피처 생성에 필요한 DataFrame 부분만 복사 
    df_temp = df[idx_features + [lag_feature]].copy() 

    # 시차 피처 생성 
    for i in range(1, nlags+1):
        # 시차 피처명 
        lag_feature_name = lag_feature +'_time_gap' + str(i)
        # df_temp 열 이름 설정 
        df_temp.columns = idx_features + [lag_feature_name]
        # df_temp의 date_block_num 피처에 i 더하기 
        df_temp['month_id'] += i
        # idx_feature를 기준으로 df와 df_temp 병합하기 
        df = df.merge(df_temp.drop_duplicates(), 
                      on=idx_features, 
                      how='left')
        # 결측값 0으로 대체 
        df[lag_feature_name] = df[lag_feature_name].fillna(0)
        # 0 ~ 20 사이로 제한할 시차 피처명을 lag_features_to_clip에 추가 
        if clip: 
            lag_features_to_clip.append(lag_feature_name)
    
    # 데이터 다운캐스팅
    df = downcast(df, False)
    # 가비지 컬렉션
    del df_temp
    gc.collect()
    
    return df, lag_features_to_clip

In [948]:
all_data.columns

Index(['month_id', 'shop_id', 'item_id', 'item_cnt_month', 'avg_item_price',
       'item_cnt', 'city', 'item_category_id', 'first_sell_month',
       'major_category', 'item_idby_avg_sold_count',
       'item_id_cityby_avg_sold_count',
       'shop_id_item_category_idby_avg_sold_count'],
      dtype='object')

In [949]:
lag_features_to_clip = [] # 0 ~ 20 사이로 제한할 시차 피처명을 담을 리스트
idx_features = ['month_id', 'shop_id', 'item_id'] # 기준 피처

# idx_features를 기준으로 월간 판매량의 세 달치 시차 피처 생성
all_data, lag_features_to_clip = add_lag_features(df=all_data, 
                                                  lag_features_to_clip=lag_features_to_clip,
                                                  idx_features=idx_features,
                                                  lag_feature='item_cnt_month', 
                                                  nlags=3,
                                                  clip=True) # 값을 0 ~ 20 사이로 제한

In [950]:
all_data.head().T

Unnamed: 0,0,1,2,3,4
month_id,0.0,0.0,0.0,0.0,0.0
shop_id,59.0,59.0,59.0,59.0,59.0
item_id,22154.0,2552.0,2554.0,2555.0,2564.0
item_cnt_month,1.0,0.0,0.0,0.0,0.0
avg_item_price,999.0,0.0,0.0,0.0,0.0
item_cnt,1.0,0.0,0.0,0.0,0.0
city,30.0,30.0,30.0,30.0,30.0
item_category_id,37.0,58.0,58.0,56.0,59.0
first_sell_month,0.0,0.0,0.0,0.0,0.0
major_category,5.0,7.0,7.0,7.0,7.0


In [951]:
lag_features_to_clip

['item_cnt_month_time_gap1',
 'item_cnt_month_time_gap2',
 'item_cnt_month_time_gap3']

In [952]:
all_data.columns

Index(['month_id', 'shop_id', 'item_id', 'item_cnt_month', 'avg_item_price',
       'item_cnt', 'city', 'item_category_id', 'first_sell_month',
       'major_category', 'item_idby_avg_sold_count',
       'item_id_cityby_avg_sold_count',
       'shop_id_item_category_idby_avg_sold_count', 'item_cnt_month_time_gap1',
       'item_cnt_month_time_gap2', 'item_cnt_month_time_gap3'],
      dtype='object')

In [953]:
# idx_features를 기준으로 판매건수 피처의 세 달치 시차 피처 생성
all_data, lag_features_to_clip = add_lag_features(df=all_data, 
                                                  lag_features_to_clip=lag_features_to_clip,
                                                  idx_features=idx_features,
                                                  lag_feature='item_cnt', 
                                                  nlags=3)

# idx_features를 기준으로 평균 판매가 피처의 세 달치 시차 피처 생성
all_data, lag_features_to_clip = add_lag_features(df=all_data, 
                                                  lag_features_to_clip=lag_features_to_clip,
                                                  idx_features=idx_features,
                                                  lag_feature='avg_item_price', 
                                                  nlags=3)

In [954]:
# idx_features를 기준으로 item_mean_features 요소별 시차 피처 생성
for item_mean_feature in item_mean_features:
    all_data, lag_features_to_clip = add_lag_features(df=all_data, 
                                                      lag_features_to_clip=lag_features_to_clip, 
                                                      idx_features=idx_features, 
                                                      lag_feature=item_mean_feature, 
                                                      nlags=3,
                                                      clip=True)
# item_mean_features 피처 제거
all_data = all_data.drop(item_mean_features, axis=1)

In [955]:
# ['월ID', '상점ID', '상품분류ID']를 기준으로 shop_mean_features 요소별 시차 피처 생성
for shop_mean_feature in shop_mean_features:
    all_data, lag_features_to_clip = add_lag_features(df=all_data,
                                                      lag_features_to_clip=lag_features_to_clip, 
                                                      idx_features=['month_id', 'shop_id', 'item_category_id'], 
                                                      lag_feature=shop_mean_feature, 
                                                      nlags=3,
                                                      clip=True)
# shop_mean_features 피처 제거
all_data = all_data.drop(shop_mean_features, axis=1)

In [956]:
# 월ID 3미만인 데이터 제거
all_data = all_data.drop(all_data[all_data['month_id'] < 3].index)

## 피처 엔지니어링 VI : 기타 피처 엔지니어링

In [957]:
all_data.columns

Index(['month_id', 'shop_id', 'item_id', 'item_cnt_month', 'avg_item_price',
       'item_cnt', 'city', 'item_category_id', 'first_sell_month',
       'major_category', 'item_cnt_month_time_gap1',
       'item_cnt_month_time_gap2', 'item_cnt_month_time_gap3',
       'item_cnt_time_gap1', 'item_cnt_time_gap2', 'item_cnt_time_gap3',
       'avg_item_price_time_gap1', 'avg_item_price_time_gap2',
       'avg_item_price_time_gap3', 'item_idby_avg_sold_count_time_gap1',
       'item_idby_avg_sold_count_time_gap2',
       'item_idby_avg_sold_count_time_gap3',
       'item_id_cityby_avg_sold_count_time_gap1',
       'item_id_cityby_avg_sold_count_time_gap2',
       'item_id_cityby_avg_sold_count_time_gap3',
       'shop_id_item_category_idby_avg_sold_count_time_gap1',
       'shop_id_item_category_idby_avg_sold_count_time_gap2',
       'shop_id_item_category_idby_avg_sold_count_time_gap3'],
      dtype='object')

In [958]:
all_data['item_idby_avg_sold_count_time_gap_avg'] = all_data[['item_idby_avg_sold_count_time_gap1',
                                          'item_idby_avg_sold_count_time_gap2', 
                                          'item_idby_avg_sold_count_time_gap3']].mean(axis=1)

In [959]:
# 0 ~ 20 사이로 값 제한
all_data[lag_features_to_clip + ['item_cnt_month', 'item_idby_avg_sold_count_time_gap_avg']] = all_data[lag_features_to_clip + ['item_cnt_month', 'item_idby_avg_sold_count_time_gap_avg']].clip(0, 20)

In [960]:
all_data['time_gap_difference1'] = all_data['item_cnt_month_time_gap1']/all_data['item_cnt_month_time_gap2']
all_data['time_gap_difference1'] = all_data['time_gap_difference1'].replace([np.inf, -np.inf], 
                                                        np.nan).fillna(0)

all_data['time_gap_difference2'] = all_data['item_cnt_month_time_gap2']/all_data['item_cnt_month_time_gap3']
all_data['time_gap_difference2'] = all_data['time_gap_difference2'].replace([np.inf, -np.inf], 
                                                        np.nan).fillna(0)

In [961]:
all_data['is_new_item'] = all_data['first_sell_month'] == all_data['month_id']

In [962]:
all_data['passed_month'] = all_data['month_id'] - all_data['first_sell_month']

In [963]:
all_data['month'] = all_data['month_id'] % 12

In [964]:
all_data.columns

Index(['month_id', 'shop_id', 'item_id', 'item_cnt_month', 'avg_item_price',
       'item_cnt', 'city', 'item_category_id', 'first_sell_month',
       'major_category', 'item_cnt_month_time_gap1',
       'item_cnt_month_time_gap2', 'item_cnt_month_time_gap3',
       'item_cnt_time_gap1', 'item_cnt_time_gap2', 'item_cnt_time_gap3',
       'avg_item_price_time_gap1', 'avg_item_price_time_gap2',
       'avg_item_price_time_gap3', 'item_idby_avg_sold_count_time_gap1',
       'item_idby_avg_sold_count_time_gap2',
       'item_idby_avg_sold_count_time_gap3',
       'item_id_cityby_avg_sold_count_time_gap1',
       'item_id_cityby_avg_sold_count_time_gap2',
       'item_id_cityby_avg_sold_count_time_gap3',
       'shop_id_item_category_idby_avg_sold_count_time_gap1',
       'shop_id_item_category_idby_avg_sold_count_time_gap2',
       'shop_id_item_category_idby_avg_sold_count_time_gap3',
       'item_idby_avg_sold_count_time_gap_avg', 'time_gap_difference1',
       'time_gap_difference2', 'i

In [965]:
# 첫 판매월, 평균 판매가, 판매건수 피처 제거
all_data = all_data.drop(['first_sell_month', 'avg_item_price', 'item_cnt'], axis=1)

In [966]:
all_data = downcast(all_data, False) # 데이터 다운캐스팅

## 피처 엔지니어링 VII : 마무리

In [967]:
all_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9904582 entries, 1122386 to 11026967
Data columns (total 31 columns):
 #   Column                                               Dtype  
---  ------                                               -----  
 0   month_id                                             int8   
 1   shop_id                                              int8   
 2   item_id                                              int16  
 3   item_cnt_month                                       int8   
 4   city                                                 int8   
 5   item_category_id                                     int8   
 6   major_category                                       int8   
 7   item_cnt_month_time_gap1                             int8   
 8   item_cnt_month_time_gap2                             int8   
 9   item_cnt_month_time_gap3                             int8   
 10  item_cnt_time_gap1                                   int8   
 11  item_cnt_time_gap

In [968]:
all_data.columns

Index(['month_id', 'shop_id', 'item_id', 'item_cnt_month', 'city',
       'item_category_id', 'major_category', 'item_cnt_month_time_gap1',
       'item_cnt_month_time_gap2', 'item_cnt_month_time_gap3',
       'item_cnt_time_gap1', 'item_cnt_time_gap2', 'item_cnt_time_gap3',
       'avg_item_price_time_gap1', 'avg_item_price_time_gap2',
       'avg_item_price_time_gap3', 'item_idby_avg_sold_count_time_gap1',
       'item_idby_avg_sold_count_time_gap2',
       'item_idby_avg_sold_count_time_gap3',
       'item_id_cityby_avg_sold_count_time_gap1',
       'item_id_cityby_avg_sold_count_time_gap2',
       'item_id_cityby_avg_sold_count_time_gap3',
       'shop_id_item_category_idby_avg_sold_count_time_gap1',
       'shop_id_item_category_idby_avg_sold_count_time_gap2',
       'shop_id_item_category_idby_avg_sold_count_time_gap3',
       'item_idby_avg_sold_count_time_gap_avg', 'time_gap_difference1',
       'time_gap_difference2', 'is_new_item', 'passed_month', 'month'],
      dtype='objec

In [969]:
# 훈련 데이터 (피처)
X_train = all_data[all_data['month_id'] < 33]
X_train = X_train.drop(['item_cnt_month'], axis=1)
# 검증 데이터 (피처)
X_valid = all_data[all_data['month_id'] == 33]
X_valid = X_valid.drop(['item_cnt_month'], axis=1)
# 테스트 데이터 (피처)
X_test = all_data[all_data['month_id'] == 34]
X_test = X_test.drop(['item_cnt_month'], axis=1)

# 훈련 데이터 (타깃값)
y_train = all_data[all_data['month_id'] < 33]['item_cnt_month']
# 검증 데이터 (타깃값)
y_valid = all_data[all_data['month_id'] == 33]['item_cnt_month']

# 가비지 컬렉션
del all_data
gc.collect();

## 모델 훈련 및 성능 검증

In [970]:
import lightgbm as lgb

# LightGBM 하이퍼파라미터
params = {'metric': 'rmse', 
          'num_leaves': 255,
          'learning_rate': 0.005,
          'feature_fraction': 0.75,
          'bagging_fraction': 0.75,
          'bagging_freq': 5,
          'force_col_wise': True,
          'random_state': 42}

cat_features = ['shop_id', 'city', 'item_category_id', 'major_category', 'month']

# LightGBM 훈련 및 검증 데이터셋
dtrain = lgb.Dataset(X_train, y_train)
dvalid = lgb.Dataset(X_valid, y_valid)
 
# LightGBM 모델 훈련
lgb_model = lgb.train(params=params,
                      train_set=dtrain,
                      num_boost_round=1500,
                      valid_sets=(dtrain, dvalid),
                      early_stopping_rounds=150,
                      categorical_feature=cat_features,
                      verbose_eval=100)      

[LightGBM] [Info] Total Bins 4059
[LightGBM] [Info] Number of data points in the train set: 9452298, number of used features: 30
[LightGBM] [Info] Start training from score 0.297707
Training until validation scores don't improve for 150 rounds
[100]	training's rmse: 1.0129	valid_1's rmse: 0.988205
[200]	training's rmse: 0.911914	valid_1's rmse: 0.925325
[300]	training's rmse: 0.860741	valid_1's rmse: 0.899723
[400]	training's rmse: 0.832167	valid_1's rmse: 0.889894
[500]	training's rmse: 0.814592	valid_1's rmse: 0.885997
[600]	training's rmse: 0.801885	valid_1's rmse: 0.884589
[700]	training's rmse: 0.79185	valid_1's rmse: 0.884236
[800]	training's rmse: 0.783391	valid_1's rmse: 0.884301
[900]	training's rmse: 0.776562	valid_1's rmse: 0.884725
Early stopping, best iteration is:
[764]	training's rmse: 0.786351	valid_1's rmse: 0.883998


In [971]:
# 예측
preds_val = lgb_model.predict(X_valid).clip(0, 20)

In [972]:
X_valid.head()

Unnamed: 0,month_id,shop_id,item_id,city,item_category_id,major_category,item_cnt_month_time_gap1,item_cnt_month_time_gap2,item_cnt_month_time_gap3,item_cnt_time_gap1,...,item_id_cityby_avg_sold_count_time_gap3,shop_id_item_category_idby_avg_sold_count_time_gap1,shop_id_item_category_idby_avg_sold_count_time_gap2,shop_id_item_category_idby_avg_sold_count_time_gap3,item_idby_avg_sold_count_time_gap_avg,time_gap_difference1,time_gap_difference2,is_new_item,passed_month,month
10574684,33,45,13315,20,47,6,0,0,0,0,...,0.0,0.057143,0.145833,0.0,0.007937,0.0,0.0,0,2,9
10574685,33,45,13880,20,55,7,0,4,1,0,...,1.5,0.074534,0.129496,0.113445,1.982097,0.0,4.0,0,12,9
10574686,33,45,13881,20,55,7,1,1,0,1,...,1.0,0.074534,0.129496,0.113445,0.82333,1.0,0.0,0,33,9
10574687,33,45,13923,20,40,5,0,0,0,0,...,0.0,0.074074,0.092379,0.11191,0.442045,0.0,0.0,0,33,9
10574688,33,45,14227,20,30,3,0,0,0,0,...,0.5,0.509202,0.327485,1.166667,2.411259,0.0,0.0,0,23,9


In [974]:
# 예측
preds = lgb_model.predict(X_valid).clip(0, 20)