In [1]:
# Package imports
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pandas_profiling as pdp
import seaborn as sns

from sklearn.preprocessing import (
    MinMaxScaler,
    Normalizer,
    OneHotEncoder,
    StandardScaler,
)

In [2]:
# Display settings: show up to 100 columns/rows and print floats with two decimals
for option_name in ('display.max_columns', 'display.max_rows'):
    pd.set_option(option_name, 100)
pd.set_option('display.float_format', '{:.2f}'.format)

In [3]:
# Load the input CSV files (paths are relative to the notebook's directory)
input_paths = (
    '../200_input/train.csv',
    '../200_input/food_uriage.csv',
    '../200_input/uriage.csv',
    '../200_input/tenki.csv',
    '../200_input/kyaku.csv',
    '../200_input/test.csv',
)
# The unpacking order below must match the order of `input_paths`
(df_train, df_food, df_uriage,
 df_tenki, df_kyaku, df_test) = (pd.read_csv(path) for path in input_paths)

In [4]:
# Parse the `nichi` column (YYYYMMDD) into datetimes in every loaded frame
for frame in (df_train, df_uriage, df_food, df_kyaku, df_tenki, df_test):
    frame['nichi'] = pd.to_datetime(frame['nichi'], format='%Y%m%d')

In [17]:
# Preview the training data
df_train

Unnamed: 0,nichi,group_mise,group_item,target
0,2018-03-01,X,A,183
1,2018-03-01,X,B,22
2,2018-03-01,X,C,8
3,2018-03-01,X,D,13
4,2018-03-01,X,E,162
...,...,...,...,...
7984,2020-05-29,Z,A,40
7985,2020-05-29,Z,B,17
7986,2020-05-29,Z,C,18
7987,2020-05-29,Z,D,15


In [5]:
# Split each dataset by store (group_mise)
def _rows_for_store(frame, store):
    # reset_index() without drop=True keeps the previous index as an extra `index` column
    return frame[frame['group_mise'] == store].reset_index()


df_train_x, df_train_y, df_train_z = (
    _rows_for_store(df_train, store) for store in ('X', 'Y', 'Z')
)
df_test_x, df_test_y, df_test_z = (
    _rows_for_store(df_test, store) for store in ('X', 'Y', 'Z')
)
df_uriage_x, df_uriage_y, df_uriage_z = (
    _rows_for_store(df_uriage, store) for store in ('X', 'Y', 'Z')
)

In [6]:
# Split store X's training rows by item (group_item).
# reset_index() is applied again here, so the previous index is kept as one more column.
(df_train_xa, df_train_xb, df_train_xc,
 df_train_xd, df_train_xe) = (
    df_train_x[df_train_x['group_item'] == item].reset_index()
    for item in ('A', 'B', 'C', 'D', 'E')
)

In [7]:
# Check the size of the store X / item A training subset
df_train_xa.shape

(542, 6)

In [8]:
# Split the weather data at the cutoff date: rows up to and including it vs. rows after it
train_end = '2020-05-29'
is_on_or_before_end = df_tenki['nichi'] <= train_end
is_after_end = df_tenki['nichi'] > train_end
df_tenki_train = df_tenki[is_on_or_before_end].reset_index()
# NOTE(review): `df_tenki_tenki` looks like a typo for a test-period name — confirm before renaming
df_tenki_tenki = df_tenki[is_after_end].reset_index()

In [9]:
# Build the feature table: weather + store X / item A target + store X sales parameter, joined by date
weather_with_target = df_tenki_train.merge(df_train_xa, on='nichi', how='inner')
weather_with_target = weather_with_target[
    ['nichi', 'kion_ave', 'kousuiryou', 'tenki_id2_hiru', 'target']
]
with_sales_param = weather_with_target.merge(df_uriage_x, on='nichi', how='inner')
# Drop the date key once the joins are done; only model inputs and the target remain
df_ft = with_sales_param[
    ['kion_ave', 'kousuiryou', 'tenki_id2_hiru', 'uriage_param', 'target']
]

In [10]:
# Inspect column dtypes before encoding
df_ft.dtypes

kion_ave            int64
kousuiryou          int64
tenki_id2_hiru      int64
uriage_param      float64
target              int64
dtype: object

In [11]:
# Cast the weather id to string so it is treated as a category rather than a number
# (one_hot_encoding only encodes object-dtype columns)
df_ft['tenki_id2_hiru'] = df_ft['tenki_id2_hiru'].astype('str')

In [12]:
def one_hot_encoding(df_category):
    '''
    One-hot encode the object-dtype columns of a DataFrame with OneHotEncoder.

    Every category gets its own dummy column (none is dropped), so the dummy
    columns of one feature are perfectly collinear. Watch out for
    multicollinearity and consider dropping one column per categorical
    feature before fitting a linear model.

    Parameters
    ----------
    df_category : pandas.DataFrame
        Input frame. Columns with dtype ``object`` are encoded; all other
        columns are passed through unchanged.

    Returns
    -------
    pandas.DataFrame
        Dummy columns (named ``<column>_<category>``) followed by the
        non-object columns, indexed like ``df_category``.
    '''

    # Split into categorical (object dtype) and remaining columns
    category_data = df_category.select_dtypes(include='object')
    numeric_data = df_category.select_dtypes(exclude='object')

    # Nothing to encode: the encoder cannot be fit on a frame with zero columns
    if category_data.shape[1] == 0:
        return numeric_data.copy()

    # Fit the encoder and convert the sparse result to a dense NumPy array
    ohe = OneHotEncoder(handle_unknown='ignore')
    dummy_data = ohe.fit_transform(category_data).toarray()

    # Build the dummy column names. Newer scikit-learn releases replaced
    # get_feature_names with get_feature_names_out; support both.
    if hasattr(ohe, 'get_feature_names_out'):
        dummies_name = ohe.get_feature_names_out(category_data.columns)
    else:
        dummies_name = ohe.get_feature_names(category_data.columns)

    # Reuse the input index: with a fresh RangeIndex the concat below would
    # misalign rows whenever the input index is not 0..n-1.
    df_dummies = pd.DataFrame(
        dummy_data, columns=dummies_name, index=df_category.index
    )

    # Join the dummy columns with the untouched columns
    df_merge = pd.concat([df_dummies, numeric_data], axis=1)

    return df_merge

In [13]:
# Replace the object-dtype columns of the feature table with one-hot dummy columns
df_ft = one_hot_encoding(df_ft)

In [14]:
# Preview the encoded feature table
df_ft

Unnamed: 0,tenki_id2_hiru_10,tenki_id2_hiru_20,tenki_id2_hiru_30,tenki_id2_hiru_40,kion_ave,kousuiryou,uriage_param,target
0,1.00,0.00,0.00,0.00,13,62,0.59,183
1,1.00,0.00,0.00,0.00,10,0,0.59,177
2,0.00,0.00,1.00,0.00,16,38,0.62,203
3,1.00,0.00,0.00,0.00,9,1,0.59,186
4,0.00,1.00,0.00,0.00,6,0,0.61,166
...,...,...,...,...,...,...,...,...
537,1.00,0.00,0.00,0.00,21,0,0.23,50
538,0.00,1.00,0.00,0.00,21,0,0.21,42
539,1.00,0.00,0.00,0.00,22,0,0.20,50
540,1.00,0.00,0.00,0.00,21,0,0.19,40


In [15]:
def stds_norms_mms(df, scaler):
    '''
    Scale every column of a DataFrame with the selected scikit-learn scaler.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to scale. It is not modified.
    scaler : str
        ``'mms'`` for MinMaxScaler, ``'stds'`` for StandardScaler,
        ``'norms'`` for Normalizer.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the scaled values, with the index and columns of
        ``df``. If ``scaler`` is not one of the names above, ``df`` itself is
        returned unchanged.
    '''
    scaler_classes = {
        'mms': MinMaxScaler,
        'stds': StandardScaler,
        'norms': Normalizer,
    }
    if scaler not in scaler_classes:
        return df

    # fit_transform returns a new array and leaves `df` untouched, so the
    # result has to be captured and wrapped back into a DataFrame.
    scaled_values = scaler_classes[scaler]().fit_transform(df)
    return pd.DataFrame(scaled_values, index=df.index, columns=df.columns)

In [25]:
# Add the English weekday name of each date. day_name() defaults to English,
# whereas strftime('%A') follows the process locale and can differ between machines.
df_train['week'] = df_train['nichi'].dt.day_name()

In [27]:
# Reorder the columns so the weekday sits right after the date
column_order = ['nichi', 'week', 'group_mise', 'group_item', 'target']
df_train = df_train[column_order]

In [28]:
# Preview the training data with the weekday column
df_train

Unnamed: 0,nichi,week,group_mise,group_item,target
0,2018-03-01,Thursday,X,A,183
1,2018-03-01,Thursday,X,B,22
2,2018-03-01,Thursday,X,C,8
3,2018-03-01,Thursday,X,D,13
4,2018-03-01,Thursday,X,E,162
...,...,...,...,...,...
7984,2020-05-29,Friday,Z,A,40
7985,2020-05-29,Friday,Z,B,17
7986,2020-05-29,Friday,Z,C,18
7987,2020-05-29,Friday,Z,D,15


In [30]:
# Check the name of the first column
df_train.columns[0]

'nichi'

In [32]:
# List the distinct item groups in the training data
df_train['group_item'].unique()

array(['A', 'B', 'C', 'D', 'E'], dtype=object)

In [41]:
# Training rows for store X and item A (the original row index is kept)
is_store_x = df_train['group_mise'] == 'X'
is_item_a = df_train['group_item'] == 'A'
df_xa1 = df_train[is_store_x & is_item_a]

In [40]:
# Preview the per-store daily sales parameter table
df_uriage

Unnamed: 0,nichi,group_mise,uriage_param
0,2018-03-01,X,0.59
1,2018-03-01,Y,0.66
2,2018-03-01,Z,0.44
3,2018-03-02,X,0.59
4,2018-03-02,Y,0.64
...,...,...,...
1687,2020-06-29,Y,0.48
1688,2020-06-29,Z,0.30
1689,2020-06-30,X,0.32
1690,2020-06-30,Y,0.49


In [43]:
# Attach the sales parameter for the same date and store; the left join keeps every training row
df_xa1 = df_xa1.merge(df_uriage, on=['nichi', 'group_mise'], how='left')

In [44]:
# Preview the store X / item A rows with the sales parameter attached
df_xa1

Unnamed: 0,nichi,week,group_mise,group_item,target,uriage_param
0,2018-03-01,Thursday,X,A,183,0.59
1,2018-03-02,Friday,X,A,177,0.59
2,2018-03-05,Monday,X,A,203,0.62
3,2018-03-06,Tuesday,X,A,186,0.59
4,2018-03-07,Wednesday,X,A,166,0.61
...,...,...,...,...,...,...
537,2020-05-25,Monday,X,A,50,0.23
538,2020-05-26,Tuesday,X,A,42,0.21
539,2020-05-27,Wednesday,X,A,50,0.20
540,2020-05-28,Thursday,X,A,40,0.19


In [49]:
# Inspect column dtypes after the merge
df_xa1.dtypes

nichi           datetime64[ns]
week                    object
group_mise              object
group_item              object
target                   int64
uriage_param           float64
dtype: object

In [52]:
# Show the rows whose sales parameter is below 0.2
df_xa1.loc[df_xa1['uriage_param'].lt(0.2)]

Unnamed: 0,nichi,week,group_mise,group_item,target,uriage_param
540,2020-05-28,Thursday,X,A,40,0.19


In [54]:
# List the distinct stores in the customer data
df_kyaku['group_mise'].unique()

array(['X', 'Y', 'Z'], dtype=object)

In [58]:
# Preview the training data again before splitting it per store
df_train

Unnamed: 0,nichi,week,group_mise,group_item,target
0,2018-03-01,Thursday,X,A,183
1,2018-03-01,Thursday,X,B,22
2,2018-03-01,Thursday,X,C,8
3,2018-03-01,Thursday,X,D,13
4,2018-03-01,Thursday,X,E,162
...,...,...,...,...,...
7984,2020-05-29,Friday,Z,A,40
7985,2020-05-29,Friday,Z,B,17
7986,2020-05-29,Friday,Z,C,18
7987,2020-05-29,Friday,Z,D,15


In [62]:
# Split the training data per store without exec(): building source code from
# values read out of a CSV would run whatever text those values contain.
# NOTE(review): the store list comes from df_kyaku while the rows come from df_train — confirm this is intended.
df_train_by_mise = {
    mise: df_train[df_train['group_mise'] == mise]
    for mise in df_kyaku['group_mise'].unique()
}
# Keep binding the `df__<store>` variables (e.g. df__Y) that later cells refer to
for mise, frame in df_train_by_mise.items():
    globals()[f'df__{mise}'] = frame

In [63]:
# Preview the training rows of store Y
df__Y

Unnamed: 0,nichi,week,group_mise,group_item,target
5,2018-03-01,Thursday,Y,A,134
6,2018-03-01,Thursday,Y,B,16
7,2018-03-01,Thursday,Y,C,17
8,2018-03-01,Thursday,Y,D,20
9,2018-03-01,Thursday,Y,E,138
...,...,...,...,...,...
7979,2020-05-29,Friday,Y,A,76
7980,2020-05-29,Friday,Y,B,17
7981,2020-05-29,Friday,Y,C,29
7982,2020-05-29,Friday,Y,D,16
