In [1]:
# パッケージのインポート
import pandas as pd
import pandas_profiling as pdp
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler, Normalizer, StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression as LR
from sklearn.metrics import mean_squared_error

In [2]:
# 設定
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)
pd.options.display.float_format = '{:.2f}'.format

In [3]:
# データの読み込み
df_train = pd.read_csv('../500_output/train_xe.csv', index_col=0)
df_tenki = pd.read_csv('../200_input/tenki.csv')

In [4]:
# 日付をデータタイム型に変更
df_train['nichi'] = pd.to_datetime(df_train['nichi'], format='%Y-%m-%d')
df_tenki['nichi'] = pd.to_datetime(df_tenki['nichi'], format='%Y%m%d')

In [5]:
# 天気データをマージする
df = pd.merge(df_train, df_tenki[['nichi', 'kion_ave']], on='nichi', how='left')
df = df[['nichi', 'week', 'kyaku_param', 'uriage_param', 'food_uriage_wariai', 'kion_ave', 'target']]
# df = df.iloc[:, [0, 1, 2, 3, 4, 5, 6, 8, 7]]

In [6]:
df_train

Unnamed: 0,nichi,week,group_mise,group_item,kyaku_param,uriage_param,food_uriage_wariai,target
0,2018-03-01,Thursday,X,E,0.85,0.59,0.89,162
1,2018-03-02,Friday,X,E,0.83,0.59,0.86,183
2,2018-03-05,Monday,X,E,0.86,0.62,0.85,142
3,2018-03-06,Tuesday,X,E,0.83,0.59,0.87,187
4,2018-03-07,Wednesday,X,E,0.86,0.61,0.89,208
...,...,...,...,...,...,...,...,...
537,2020-05-25,Monday,X,E,0.33,0.23,0.87,22
538,2020-05-26,Tuesday,X,E,0.30,0.21,0.89,22
539,2020-05-27,Wednesday,X,E,0.29,0.20,0.87,25
540,2020-05-28,Thursday,X,E,0.28,0.19,0.89,21


In [7]:
df.dtypes

nichi                 datetime64[ns]
week                          object
kyaku_param                  float64
uriage_param                 float64
food_uriage_wariai           float64
kion_ave                       int64
target                         int64
dtype: object

In [8]:
def one_hot_encoding(df_category):
    '''
    OneHotEncoderを使った場合
    多重共線性に注意する必要あり
    カテゴリーごとに列の削減を行うべき
    '''

    # 定性データと定量データに分割
    category_data = df_category.select_dtypes(include='object')
    numeric_data = df_category.select_dtypes(exclude='object')

    # OneHotEncoderのインスタンス作成
    ohe = OneHotEncoder(handle_unknown='ignore')
    # ダミー変数化したいデータをfit
    ohe.fit(category_data)
    # ダミー変数に変換して、Numpy配列の形へ
    dummy_data = ohe.transform(category_data).toarray()
    # ダミー変数の列名を作成
    dummies_name = ohe.get_feature_names(category_data.columns)
    # データフレームに変換
    df_dummies = pd.DataFrame(dummy_data, columns=dummies_name)
    # 元データに結合
    df_merge = pd.concat([df_dummies, numeric_data], axis=1)

    return df_merge

In [9]:
df = one_hot_encoding(df)
df = df[['week_Monday', 'week_Tuesday', 'week_Wednesday', 'week_Thursday', 'week_Friday',
         'kyaku_param', 'uriage_param', 'food_uriage_wariai', 'kion_ave', 'target']]

In [10]:
df

Unnamed: 0,week_Monday,week_Tuesday,week_Wednesday,week_Thursday,week_Friday,kyaku_param,uriage_param,food_uriage_wariai,kion_ave,target
0,0.00,0.00,0.00,1.00,0.00,0.85,0.59,0.89,13,162
1,0.00,0.00,0.00,0.00,1.00,0.83,0.59,0.86,10,183
2,1.00,0.00,0.00,0.00,0.00,0.86,0.62,0.85,16,142
3,0.00,1.00,0.00,0.00,0.00,0.83,0.59,0.87,9,187
4,0.00,0.00,1.00,0.00,0.00,0.86,0.61,0.89,6,208
...,...,...,...,...,...,...,...,...,...,...
537,1.00,0.00,0.00,0.00,0.00,0.33,0.23,0.87,21,22
538,0.00,1.00,0.00,0.00,0.00,0.30,0.21,0.89,21,22
539,0.00,0.00,1.00,0.00,0.00,0.29,0.20,0.87,22,25
540,0.00,0.00,0.00,1.00,0.00,0.28,0.19,0.89,21,21


In [11]:
# データを説明変数と目的変数に分割
X = df.iloc[:,:-1]
y = df.iloc[:,-1:]

In [12]:
# 特徴量の標準化
stsc = StandardScaler()
X_stsc = pd.DataFrame(stsc.fit_transform(X), index=X.index, columns=X.columns)
X_stsc.head()

Unnamed: 0,week_Monday,week_Tuesday,week_Wednesday,week_Thursday,week_Friday,kyaku_param,uriage_param,food_uriage_wariai,kion_ave
0,-0.46,-0.51,-0.51,1.95,-0.51,0.22,0.07,1.0,-0.59
1,-0.46,-0.51,-0.51,-0.51,1.97,0.08,0.11,-0.04,-1.02
2,2.16,-0.51,-0.51,-0.51,-0.51,0.33,0.39,-0.64,-0.17
3,-0.46,1.97,-0.51,-0.51,-0.51,0.1,0.07,0.33,-1.16
4,-0.46,-0.51,1.97,-0.51,-0.51,0.37,0.29,1.19,-1.58


In [13]:
# 訓練データとテストデータに分割
X_train, X_test, y_train, y_test = train_test_split(X_stsc, y, test_size=0.2, shuffle=True)

In [14]:
# モデル作成
model = LR()
model.fit(X_train, y_train)

LinearRegression()

In [15]:
model.coef_

array([[-1.91652684e+15, -2.02587192e+15, -2.02587192e+15,
        -2.03929347e+15, -2.02587192e+15,  2.80156250e+01,
        -1.20937500e+01, -1.25000000e+00, -8.80000000e+01]])

In [16]:
model.intercept_

array([127.43682363])

In [17]:
# モデルを使った予測
pred = model.predict(X_test)
pred = np.where(pred < 0, 0, pred)
pred = np.round(pred)

In [18]:
# Root Mean Squared Error (RMSE)
rmse = np.sqrt(mean_squared_error(y_test, pred))

In [19]:
rmse

31.01582904544302

In [20]:
y_test

Unnamed: 0,target
265,216
495,157
57,29
500,164
201,255
...,...
36,66
521,83
71,32
276,136


In [22]:
pred

array([[241.],
       [146.],
       [ 40.],
       [128.],
       [224.],
       [178.],
       [  0.],
       [241.],
       [274.],
       [ 13.],
       [207.],
       [126.],
       [271.],
       [ 10.],
       [215.],
       [178.],
       [102.],
       [185.],
       [ 27.],
       [142.],
       [179.],
       [ 75.],
       [178.],
       [117.],
       [215.],
       [143.],
       [ 55.],
       [239.],
       [ 51.],
       [  0.],
       [216.],
       [  0.],
       [ 27.],
       [172.],
       [244.],
       [169.],
       [115.],
       [227.],
       [188.],
       [ 54.],
       [285.],
       [ 97.],
       [173.],
       [143.],
       [181.],
       [153.],
       [245.],
       [118.],
       [  0.],
       [  2.],
       [  1.],
       [  0.],
       [ 39.],
       [  0.],
       [100.],
       [ 88.],
       [251.],
       [ 41.],
       [112.],
       [161.],
       [145.],
       [  0.],
       [151.],
       [274.],
       [218.],
       [185.],
       [16