In [58]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import japanize_matplotlib
%matplotlib inline
import sweetviz as sv
sv.config_parser.read('../sweetviz_settings/sweetviz_settings.ini')
# sv.config_parser.read('sweetviz_setting.ini')
import category_encoders as ce
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error

import lightgbm as gbm


pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 100)

# データの読み込み

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
print('train shape', train.shape)
print('test shape' , test.shape)


train shape (207, 12)
test shape (40, 11)


# 前処理
1. remarks : trainにしかない項目はnanに変更
1. weather : 雷電、雪 は雨に丸め込み
1. 祝日フラグ : 前祝日、後祝日のフラグ

In [3]:
### 不要な項目を消す系

# remarksで１個しかないカテゴリは　カテゴリなしにする。
def replace_remarks_cat(df):
    _df = df['remarks'].value_counts().copy()
    _df = _df[_df < 2]
    
    
    for i in _df.index:
        df['remarks'].loc[df['remarks'] == i] = np.nan

# weather 雪と雷電を　雨に置き換え
def replace_weather_cat(df):
    _df = df['weather'].value_counts()
    _df = _df[_df < 2]
    
    for i in _df.index:
        df['weather'].loc[df['weather'] == i] = '雨'

# category_encoding関数
def encoding_func(df, encoding_col_name_list):
    def which_col_is_object_func(df):
        # 列がobjectを抽出してリストに
        keys = list(df.dtypes.loc[df.dtypes == 'object'].index)
        # 辞書化
        values = list(range(len(object_list)))
        object_dic = dict(zip(keys, values))
        print(object_dic)
    
    
    # カテゴリエンコーダーする
    ce_oe = ce.OrdinalEncoder(cols = encoding_col_name_list, handle_unknown = 'impute')
    df = ce_oe.fit_transform(df)

    # カテゴリ番号の確認
    
    for i in range(len(encoding_col_name_list)):
        print(pd.DataFrame(ce_oe.category_mapping[i]))
        print()
    return df


# 休日前flg (category encodingのあとにやる)
def before_holiday_func(df):
    '''
    祝日の前の日にフラグを立てる。
    category encodingのあとに使うこと。
    '''
    df['before_holiday'] = 0
    for i in range(len(df)):
        
        if df['week'].diff(-1).iloc[i] == -1.0:
            df['before_holiday'].iloc[i] = 0        
        
        elif i == len(df):
            df['before_holiday'].iloc[i] = 0
        
        else:
            df['before_holiday'].iloc[i] = 1

            
# 休日後flg (category encodingのあとにやる)
def after_holiday_func(df):
    '''
    祝日の次の日にフラグを立てる。
    category encodingのあとに使うこと。
    '''
    
    df['after_holiday'] = 0

    for i in range(len(df)):
        if i == 1:
            df['week'].iloc[i] == 1
            
        elif df['week'].diff(1).iloc[i] == 1.0:
            df['after_holiday'].iloc[i] = 0
            
        else:
            df['after_holiday'].iloc[i] = 1

## trainとtestの結合

In [4]:
# trainとtestの識別列を追加
train['train_flg'] = True
test['train_flg'] = False

# データの結合
all_data = pd.concat([train, test], axis = 0)
proc_data = all_data.copy()
proc_data['datetime'] = pd.to_datetime(proc_data['datetime'])

## カテゴリデータの処理

In [5]:
# カテゴリデータの処理
replace_remarks_cat(all_data)
replace_weather_cat(all_data)

# nan値の埋め
proc_data['payday'] = proc_data['payday'].fillna(0)
proc_data['remarks'] = proc_data['remarks'].fillna(0)
proc_data['event'] = proc_data['event'].fillna(0)

# 降水量の'--'の処理
proc_data['precipitation'] = proc_data['precipitation'].apply(lambda x : -1 if x == '--' else float(x))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


## 数値データの処理

In [6]:
# kcalのnanをmedianで埋める
proc_data['kcal'] = proc_data['kcal'].fillna(proc_data['kcal'].median())

## エンコーディング 

In [7]:
# encoding
encoding_col_name_list = ['week',
                         'remarks',
                         'event',
                         'weather']

proc_data = encoding_func(proc_data, encoding_col_name_list)

      col  mapping data_type
月    week        1    object
火    week        2    object
水    week        3    object
木    week        4    object
金    week        5    object
NaN  week       -2    object

                              col  mapping data_type
0                         remarks        1    object
鶏のレモンペッパー焼（50食）、カレー（42食）  remarks        2    object
酢豚（28食）、カレー（85食）          remarks        3    object
お楽しみメニュー                  remarks        4    object
料理長のこだわりメニュー              remarks        5    object
手作りの味                     remarks        6    object
スペシャルメニュー（800円）           remarks        7    object
近隣に飲食店複合ビルオープン            remarks        8    object
NaN                       remarks       -2    object

                 col  mapping data_type
0              event        1    object
ママの会           event        2    object
キャリアアップ支援セミナー  event        3    object
NaN            event       -2    object

         col  mapping data_type
快晴   weather        1    object


  elif pd.api.types.is_categorical(cols):


## 特徴量作成

In [8]:
# メニューに対する処理
proc_data['curry'] = proc_data['name'].apply(lambda x : 1 if 'カレー' in x else 0)
# all_data['menchikatsu'] = all_data['name'].apply(lambda x : 1 if 'メンチカツ' in x else 0)

In [9]:
# 祝日前後の平日にflg
after_holiday_func(proc_data)
before_holiday_func(proc_data)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


### 販売量に対する処理 (時系列の特徴量)

#### 変動率

In [10]:
proc_data['y'].shift(len(test)).fillna(proc_data['y'].median()).head(100)

0      78.0
1      78.0
2      78.0
3      78.0
4      78.0
5      78.0
6      78.0
7      78.0
8      78.0
9      78.0
10     78.0
11     78.0
12     78.0
13     78.0
14     78.0
15     78.0
16     78.0
17     78.0
18     78.0
19     78.0
20     78.0
21     78.0
22     78.0
23     78.0
24     78.0
25     78.0
26     78.0
27     78.0
28     78.0
29     78.0
30     78.0
31     78.0
32     78.0
33     78.0
34     78.0
35     78.0
36     78.0
37     78.0
38     78.0
39     78.0
40     90.0
41    101.0
42    118.0
43    120.0
44    130.0
45    135.0
46    145.0
47    140.0
48    151.0
49    116.0
50    151.0
51    153.0
52    151.0
53    171.0
54    134.0
55    165.0
56    155.0
57    157.0
58    109.0
59    111.0
60    160.0
61    145.0
62    145.0
63    151.0
64    134.0
65    122.0
66    121.0
67     80.0
68    131.0
69    128.0
70    129.0
71     87.0
72    129.0
73    134.0
74    107.0
75     85.0
76    126.0
77    129.0
78    126.0
79    107.0
80     92.0
81    126.0
82    120.0
83  

In [11]:
'''
販売量が現象傾向 という情報を追加したい。
1. trainをshiftしてtestの販売量を埋める
2. trainのnan値になった箇所をmeanで埋める
3. 各yに対して、pcf_changeで特徴量を作る
4. モデル作成時は、最初のmeanを埋めた行を学習させない（使わない)
'''

proc_data['shift_y'] = proc_data['y'].shift(len(test)).fillna(proc_data['y'].median())
proc_data['pcf_change1'] = proc_data['shift_y'].pct_change(1)
proc_data['pcf_change3'] = proc_data['shift_y'].pct_change(3)
proc_data['pcf_change5'] = proc_data['shift_y'].pct_change(5)

#### 移動平均

In [12]:
proc_data['rolling_mean3'] = proc_data['shift_y'].pct_change(3)
proc_data['rolling_mean5'] = proc_data['shift_y'].pct_change(5)

In [13]:
# shift_y は不要なので削除
proc_data.drop(columns = 'shift_y', inplace = True)

# 学習

## Xとyに分割

In [19]:
# shiftで前半30個くらいのyのデータが全部同じ値なので、学習からはずす
proc_data.drop(proc_data.index[:len(test)], inplace = True)

In [25]:
proc_data.columns

Index(['datetime', 'y', 'week', 'soldout', 'name', 'kcal', 'remarks', 'event',
       'payday', 'weather', 'precipitation', 'temperature', 'train_flg',
       'curry', 'after_holiday', 'before_holiday', 'pcf_change1',
       'pcf_change3', 'pcf_change5', 'rolling_mean3', 'rolling_mean5'],
      dtype='object')

In [30]:
# 前処理したデータをtrain と testに分割
proc_data_train = proc_data.loc[proc_data['train_flg'] == True]
proc_data_test = proc_data.loc[proc_data['train_flg'] == False]

# 不要な列の削除 datetime , name , 
proc_data_train = proc_data_train[['y', 'week', 'soldout', 'kcal', 'remarks', 'event',
       'payday', 'weather', 'precipitation', 'temperature',
       'curry', 'after_holiday', 'before_holiday', 'pcf_change1',
       'pcf_change3', 'pcf_change5', 'rolling_mean3', 'rolling_mean5']]
proc_data_test = proc_data_test[['y', 'week', 'soldout', 'kcal', 'remarks', 'event',
       'payday', 'weather', 'precipitation', 'temperature',
       'curry', 'after_holiday', 'before_holiday', 'pcf_change1',
       'pcf_change3', 'pcf_change5', 'rolling_mean3', 'rolling_mean5']]



## cv

In [38]:
X = proc_data_train.drop(columns = 'y')
y = proc_data_train['y']
X.shape, y.shape

((167, 17), (167,))

In [48]:
X

Unnamed: 0,week,soldout,kcal,remarks,event,payday,weather,precipitation,temperature,curry,after_holiday,before_holiday,pcf_change1,pcf_change3,pcf_change5,rolling_mean3,rolling_mean5
40,5,1,418.0,2,1,0.0,1,-1.0,10.1,1,0,1,0.153846,0.153846,0.153846,0.153846,0.153846
41,1,1,415.0,1,1,0.0,1,-1.0,5.1,0,1,0,0.122222,0.294872,0.294872,0.294872,0.294872
42,2,0,445.0,1,1,0.0,1,-1.0,12.6,0,0,0,0.168317,0.512821,0.512821,0.512821,0.512821
43,3,1,450.0,1,1,0.0,1,-1.0,10.3,0,0,0,0.016949,0.333333,0.538462,0.333333,0.538462
44,4,1,448.0,1,1,0.0,2,-1.0,15.3,0,0,0,0.083333,0.287129,0.666667,0.287129,0.666667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
202,3,1,408.0,1,1,0.0,2,0.0,24.8,0,1,0,0.056604,0.056604,0.098039,0.056604,0.098039
203,4,0,394.0,1,1,0.0,2,0.0,25.4,0,0,0,0.107143,0.127273,0.169811,0.127273,0.169811
204,5,0,412.0,7,3,0.0,3,-1.0,27.1,0,0,1,0.338710,0.566038,0.566038,0.566038,0.566038
205,1,1,404.0,1,1,0.0,1,-1.0,26.6,0,1,0,-0.216867,0.160714,0.181818,0.160714,0.181818


In [55]:
y

40      92.0
41     126.0
42     120.0
43     121.0
44     105.0
       ...  
202     59.0
203     50.0
204     45.0
205     56.0
206     40.0
Name: y, Length: 167, dtype: float64

In [61]:
folds = TimeSeriesSplit(n_splits = 4, test_size = 20)

params = {
    'boosting_type' : 'gbdt',
    'objective' : 'regression',
    'metric' : 'rmse',
    'max_depth' : 5,
        }


train_score = []
valid_score = []
models = []


for i, (train_index, valid_index) in enumerate(folds.split(proc_data_train)):
    
    X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]
    
    train_data = gbm.Dataset(X_train, y_train)
    valid_data = gbm.Dataset(X_valid, y_valid, reference = train_data)
    
    print('-------------------------------')
    print('model No: ', i)
    
    model = gbm.train(params,
                    train_data,
                    valid_sets = [train_data,valid_data],
                    num_boost_round = 10000,
                    early_stopping_rounds = 10,
                    verbose_eval = 50
                    )
    
    # モデル評価
    # trainのモデル評価
    train_pred = model.predict(X_train)# arrayで返ってくる
    train_pred_vs_data_df = pd.concat([y_train, pd.Series(train_pred,index = train_index)],
                                      axis = 1)# 真値と予測値のｄｆ作成
    train_pred_vs_data_df.columns = ['true','pred']
    
    # validのモデル評価
    valid_pred = model.predict(X_valid)
    valid_pred_vs_data_df = pd.concat([y_valid, pd.Series(valid_pred, index = valid_index)],
                                      axis = 1)
    valid_pred_vs_data_df.columns = ['true','pred']
    
    train_score.append(np.sqrt(mean_squared_error(train_pred_vs_data_df['true'],train_pred_vs_data_df['pred'])))
    valid_score.append(np.sqrt(mean_squared_error(valid_pred_vs_data_df['true'],valid_pred_vs_data_df['pred'])))
    
    models.append(model)
    
   

-------------------------------
model No:  0
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 221
[LightGBM] [Info] Number of data points in the train set: 87, number of used features: 12
[LightGBM] [Info] Start training from score 90.241379
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[20]	training's rmse: 16.1925	valid_1's rmse: 19.4681


ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [75]:
train_pred = model.predict(X_train)# arrayで返ってくる
train_pred_vs_data_df = pd.concat([y_train, pd.Series(train_pred,index = train_index)],
                                  axis = 1)# 真値と予測値のｄｆ作成
train_pred_vs_data_df.columns = ['true','pred']

In [77]:
train_pred_vs_data_df

Unnamed: 0,true,pred
0,,99.197904
1,,112.941636
2,,111.458367
3,,106.058905
4,,97.969407
...,...,...
122,79.0,
123,74.0,
124,64.0,
125,119.0,
