In [1]:
# パッケージのインポート
import pandas as pd
import pandas_profiling as pdp
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import (
    MinMaxScaler,
    Normalizer,
    OneHotEncoder,
    StandardScaler,
)

In [2]:
# Display settings: show up to 100 columns and 100 rows, and print floats
# with two decimal places.
pd.options.display.max_columns = 100
pd.options.display.max_rows = 100
pd.set_option('display.float_format', '{:.2f}'.format)

In [3]:
# Load the input CSV files (paths are relative to the notebook's directory).
df_train, df_food, df_uriage, df_tenki, df_kyaku, df_test = map(
    pd.read_csv,
    (
        '../200_input/train.csv',
        '../200_input/food_uriage.csv',
        '../200_input/uriage.csv',
        '../200_input/tenki.csv',
        '../200_input/kyaku.csv',
        '../200_input/test.csv',
    ),
)

In [4]:
# Parse the 'nichi' column of every frame from YYYYMMDD into datetime64.
for _frame in (df_train, df_uriage, df_food, df_kyaku, df_tenki, df_test):
    _frame['nichi'] = pd.to_datetime(_frame['nichi'], format='%Y%m%d')

In [5]:
# Split train / test / uriage by store ('group_mise' is X, Y or Z).
# reset_index() is called without drop=True, so each subset keeps its
# previous row labels in an extra 'index' column.
df_train_x, df_train_y, df_train_z = (
    df_train[df_train['group_mise'] == store].reset_index()
    for store in ('X', 'Y', 'Z')
)
df_test_x, df_test_y, df_test_z = (
    df_test[df_test['group_mise'] == store].reset_index()
    for store in ('X', 'Y', 'Z')
)
df_uriage_x, df_uriage_y, df_uriage_z = (
    df_uriage[df_uriage['group_mise'] == store].reset_index()
    for store in ('X', 'Y', 'Z')
)

In [6]:
# Split the store-X training rows by item group ('group_item' is A..E).
# reset_index() again keeps the previous row labels as an extra column.
df_train_xa, df_train_xb, df_train_xc, df_train_xd, df_train_xe = (
    df_train_x[df_train_x['group_item'] == item].reset_index()
    for item in ('A', 'B', 'C', 'D', 'E')
)

In [7]:
# Row / column count of the store-X, item-A training subset
df_train_xa.shape

(542, 6)

In [8]:
# Split the weather data at 2020-05-29: rows up to and including that date
# go to df_tenki_train, later rows to df_tenki_tenki.
# NOTE(review): df_tenki_tenki looks like a typo for df_tenki_test - confirm
# before renaming, since later cells may use the current name.
df_tenki_train = df_tenki.loc[lambda d: d['nichi'] <= '2020-05-29'].reset_index()
df_tenki_tenki = df_tenki.loc[lambda d: d['nichi'] > '2020-05-29'].reset_index()

In [9]:
# Build the feature frame for store X / item A: join weather to the training
# rows by date, then join the store-X 'uriage' rows by date, keeping only the
# feature columns and the target. Inner joins drop dates missing on either side.
df_ft = (
    df_tenki_train
    .merge(df_train_xa, on='nichi', how='inner')
    [['nichi', 'kion_ave', 'kousuiryou', 'tenki_id2_hiru', 'target']]
    .merge(df_uriage_x, on='nichi', how='inner')
    [['kion_ave', 'kousuiryou', 'tenki_id2_hiru', 'uriage_param', 'target']]
)

In [10]:
# Inspect the column dtypes of the merged feature frame
df_ft.dtypes

kion_ave            int64
kousuiryou          int64
tenki_id2_hiru      int64
uriage_param      float64
target              int64
dtype: object

In [11]:
# Store the weather code as strings so that one_hot_encoding(), which selects
# object-dtype columns, treats it as categorical.
df_ft = df_ft.astype({'tenki_id2_hiru': 'str'})

In [12]:
def one_hot_encoding(df_category):
    '''
    One-hot encode the object-dtype columns of a DataFrame.

    Columns are split with ``select_dtypes``: object-dtype columns are
    treated as categorical and expanded into 0/1 indicator columns with
    scikit-learn's OneHotEncoder; every other column is passed through
    unchanged.

    Note: each category gets its own indicator column, so the indicators
    derived from one source column are perfectly collinear. Drop one
    indicator per source column before fitting a model that is sensitive
    to multicollinearity.

    Parameters
    ----------
    df_category : pandas.DataFrame
        Input frame; it is not modified.

    Returns
    -------
    pandas.DataFrame
        Indicator columns first, followed by the non-object columns, with
        the same index as ``df_category``. If there are no object columns,
        a copy of the non-object columns is returned.
    '''
    # Split into qualitative (object) and quantitative (all other) columns
    category_data = df_category.select_dtypes(include='object')
    numeric_data = df_category.select_dtypes(exclude='object')

    # Nothing to encode: skip the encoder instead of fitting it on a
    # zero-column frame. This also makes re-running the calling cell safe.
    if category_data.shape[1] == 0:
        return numeric_data.copy()

    # Fit the encoder and convert the sparse result to a dense NumPy array
    ohe = OneHotEncoder(handle_unknown='ignore')
    dummy_data = ohe.fit_transform(category_data).toarray()

    # Build the indicator column names. Newer scikit-learn releases only
    # provide get_feature_names_out(); fall back to get_feature_names()
    # on older releases.
    if hasattr(ohe, 'get_feature_names_out'):
        dummies_name = ohe.get_feature_names_out(category_data.columns)
    else:
        dummies_name = ohe.get_feature_names(category_data.columns)

    # Reuse the input index: concat(axis=1) aligns on index labels, so a
    # fresh 0..n-1 index would misalign rows (and introduce NaNs) whenever
    # the input is not indexed 0..n-1.
    df_dummies = pd.DataFrame(
        dummy_data, columns=dummies_name, index=df_category.index
    )

    # Indicator columns first, then the untouched columns
    df_merge = pd.concat([df_dummies, numeric_data], axis=1)

    return df_merge

In [13]:
# Replace the object-dtype columns of df_ft with one-hot indicator columns
df_ft = one_hot_encoding(df_ft)

In [14]:
# Display the encoded feature frame
df_ft

Unnamed: 0,tenki_id2_hiru_10,tenki_id2_hiru_20,tenki_id2_hiru_30,tenki_id2_hiru_40,kion_ave,kousuiryou,uriage_param,target
0,1.00,0.00,0.00,0.00,13,62,0.59,183
1,1.00,0.00,0.00,0.00,10,0,0.59,177
2,0.00,0.00,1.00,0.00,16,38,0.62,203
3,1.00,0.00,0.00,0.00,9,1,0.59,186
4,0.00,1.00,0.00,0.00,6,0,0.61,166
...,...,...,...,...,...,...,...,...
537,1.00,0.00,0.00,0.00,21,0,0.23,50
538,0.00,1.00,0.00,0.00,21,0,0.21,42
539,1.00,0.00,0.00,0.00,22,0,0.20,50
540,1.00,0.00,0.00,0.00,21,0,0.19,40


In [22]:
def stds_norms_mms(df, scaler):
    '''
    Apply the selected scikit-learn scaler to ``df`` and return the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to scale; it is not modified. Assumes every column is
        numeric - TODO confirm against callers.
    scaler : str
        'mms' for MinMaxScaler, 'stds' for StandardScaler, or 'norms'
        for Normalizer (which rescales each row, not each column).

    Returns
    -------
    pandas.DataFrame
        The scaled values, carrying the index and columns of ``df``.

    Raises
    ------
    ValueError
        If ``scaler`` is not one of 'mms', 'stds' or 'norms'.
    '''
    if scaler == 'mms':
        transformer = MinMaxScaler()
    elif scaler == 'stds':
        transformer = StandardScaler()
    elif scaler == 'norms':
        transformer = Normalizer()
    else:
        # A mistyped key used to fall through and return unscaled data.
        raise ValueError(
            f"unknown scaler {scaler!r}; expected 'mms', 'stds' or 'norms'"
        )

    # fit_transform() returns a new array and leaves its input untouched,
    # so the result has to be captured and wrapped back into a DataFrame.
    scaled = transformer.fit_transform(df)
    return pd.DataFrame(scaled, index=df.index, columns=df.columns)

In [None]:
df_ft = 