## 民泊サービスの宿泊料金予測
- 一泊の適正価格はいくら？
- Lassoで特徴量を選択
- P値での絞り込み
- LightGBM

### Google Driveのマウント

In [None]:
# Mount Google Drive into the Colab runtime so project files are reachable.
from google.colab import drive
drive.mount('/content/drive')
# IPython magic (not plain Python): move the working directory to the Drive root.
%cd /content/drive/'My Drive'

Mounted at /content/drive
/content/drive/My Drive


### ライブラリのインストール、インポート

In [None]:
# Change the current working directory to the project folder on Drive
import os
os.chdir('/content/drive/My Drive/Probdata/airbnb/')
print(os.getcwd())  # echo the directory to confirm the change took effect

/content/drive/My Drive/Probdata/airbnb


In [None]:
class Config:
    """Notebook-wide settings: data directories, random seed and debug flag."""

    # Every directory below is relative to the current working directory.
    root_path = './'
    # Folder the submission template is read from.
    input_path = os.path.join(root_path, 'input')
    # Folder the finished submission CSV is written to.
    output_path = os.path.join(root_path, 'output')
    # Folder holding the pre-processed train/test tables.
    intermediate_path = os.path.join(root_path, 'intermediate')
    # Seed passed to the train/test split and to LightGBM.
    seed = 42
    # Debug switch; not read anywhere in the visible part of the notebook.
    debug = False

In [None]:
import pandas as pd
import numpy as np
import warnings
import datetime

from geopy.distance import geodesic

import sklearn
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

import lightgbm as lgb

### データの読み込み

In [None]:
# Load the pre-processed train/test tables from the intermediate folder and
# the submission template from the input folder.
train_df = pd.read_csv(f'{Config.intermediate_path}/train_tmp.csv')
test_df = pd.read_csv(f'{Config.intermediate_path}/test_tmp.csv')
submission_df = pd.read_csv(f'{Config.input_path}/submission.csv')

In [None]:
# Sanity check: (rows, columns) of the loaded train and test tables.
print(train_df.shape)
print(test_df.shape)

(9990, 147)
(4996, 145)


## 特徴量整理

In [None]:
# Split the columns into the target, the categorical features, the features
# to standardize, and the remaining pass-through features.
target_column = "y_log"

category_column = ['neighbourhood', 'room_type', 'sta_nm_1', 'sta_nm_2', 'sta_nm_3']
standard_column = ['minimum_nights', 'number_of_reviews', 'reviews_per_month', 'availability_365',
                   'dis_200', 'dis_500', 'dis_1000', 'date_cnt', 'dis', 'word_count', 'name_len', 'centroid_dist', 'gmm']
# Columns that must not be used as pass-through features: identifiers, raw
# text/dates, the targets, and the columns handled separately above.
del_columns = ['name', 'y', 'y_log', 'last_review', 'host_id', 'id', 'last_date'] + category_column + standard_column

# Keep the original column order. Building the list from a set difference
# gives an arbitrary order that can differ between sessions, which makes the
# later position-based column selection and the model inputs non-reproducible.
excluded_columns = set(del_columns)
features = [col for col in train_df.columns if col not in excluded_columns]

y = train_df[target_column]
# Work on copies: the categorical columns are overwritten during label
# encoding, and that must not write into a slice of the source frames.
X = train_df[category_column].copy()
test_X = test_df[category_column].copy()

# Stack train and test so the encoders can see every category value.
traintest = pd.concat([X, test_X], ignore_index=True)

In [None]:
# Label-encode the categorical explanatory variables.
# Each encoder is fitted on train and test together so that a category that
# appears in only one of them still receives a code.
warnings.simplefilter('ignore')  # NOTE: this silences every warning from here on
for column in traintest.columns:
    le = preprocessing.LabelEncoder()
    le.fit(traintest[column])
    # No temporary named `target_column` is used here, so the module-level
    # target name ("y_log") keeps its value after the loop.
    # The explicit index keeps the assignment row-aligned even if the frame
    # does not carry a default 0..n-1 index.
    X[column] = pd.Series(le.transform(X[column]), index=X.index).astype("category")
    test_X[column] = pd.Series(le.transform(test_X[column]), index=test_X.index).astype("category")

In [None]:
# Append the numeric columns that will be standardized next to the encoded
# categorical columns (column-wise concatenation).
X = pd.concat([X, train_df[standard_column]], axis=1)
test_X = pd.concat([test_X, test_df[standard_column]], axis=1)

In [None]:
# Standardization
# The scaler statistics come from the training frame only and are then
# re-used for the test frame. Wrapping the returned arrays in new DataFrames
# replaces the column names with integer positions.
ss = preprocessing.StandardScaler()
X = pd.DataFrame(ss.fit_transform(X))
test_X = pd.DataFrame(ss.transform(test_X))

In [None]:
# Append the remaining (unscaled) feature columns to the standardized block.
X = pd.concat([X, train_df[features]], axis=1)
test_X = pd.concat([test_X, test_df[features]], axis=1)

In [None]:
# Sanity check: train and test matrices must have the same number of columns.
print(X.shape)
print(y.shape)
print(test_X.shape)

(9990, 140)
(9990,)
(4996, 140)


In [None]:
# Shape of the source training table, for comparison with the matrix above.
print(train_df.shape)

(9990, 147)


In [None]:
# Split the training data into a training part and a hold-out part (80/20)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = Config.seed)

In [None]:
# Shapes of the full data followed by the training and hold-out parts.
print(X.shape)
print(y.shape)
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(9990, 140)
(9990,)
(7992, 140)
(7992,)
(1998, 140)
(1998,)


### Lasso 実施

In [None]:
# Lasso regression; its zeroed coefficients are used below to discard features.
from sklearn.linear_model import Lasso
model_lasso = Lasso(alpha=0.0001)

In [None]:
# Fit the Lasso model on the training part and score it on the hold-out part.
model_lasso.fit(X_train, y_train)

y_test_lasso = model_lasso.predict(X_test)

# Compute the MSE once and derive the RMSE from it.
lasso_mse = mean_squared_error(y_test, y_test_lasso)
print(lasso_mse)
print(np.sqrt(lasso_mse))

0.6231914290586705
0.7894247456589326


In [None]:
# Positions of the coefficients Lasso kept non-zero (np.where returns a tuple of arrays).
model_lasso_index = np.where(model_lasso.coef_ != 0)

In [None]:
# Column labels at the surviving positions, and how many there are.
X_col = X.columns[model_lasso_index]
len(X_col)

120

In [None]:
# Keep only the Lasso-selected columns (test_X is filtered later, before prediction).
X = X[X_col]

In [None]:
# Display the shape of the reduced feature matrix.
X.shape

(9990, 120)

## P値の判定

In [None]:
import statsmodels.api as sma

In [None]:
# Build float arrays for statsmodels: the design matrix with an added constant
# column, and the target as a column vector.
# NOTE(review): despite its name, y_train_np is built from the full `y`,
# not from the training part only.
X2 = sma.add_constant(np.array(X)).astype(float)
y_train_np = np.array(y).reshape(-1, 1).astype(float)

In [None]:
# One more column than X is expected: the added constant term.
print(X2.shape)

(9990, 121)


In [None]:
# Ordinary least squares fit; only its per-coefficient p-values are used below.
est = sma.OLS(y_train_np, X2)
est_trained = est.fit()
# print(est_trained.summary())

In [None]:
# Collect the p-values, drop the first entry (the constant term) and order the
# rest from largest to smallest; the length is printed before and after.
pvalues_list = list(est_trained.pvalues)
print(len(pvalues_list))
pvalues_list = sorted(pvalues_list[1:], reverse=True)
print(len(pvalues_list))

121
120


In [None]:

# The two largest p-values: the features that carry them are discarded.
model_p_max = pvalues_list[0:2]
print(model_p_max)

# Remove the constant term first so that positions map directly onto the
# columns of X. The earlier "subtract 1, then drop the first element" index
# arithmetic discarded a real feature whenever position 0 was not in the
# kept list.
feature_pvalues = np.asarray(est_trained.pvalues)[1:]
model_p_index = np.flatnonzero(~np.isin(feature_pvalues, model_p_max))
print(model_p_index.shape)

[0.9991858161334879, 0.9673334256954187]
(118,)


In [None]:
# Keep only the columns whose positions survived the p-value filter and
# display the resulting shape.
print(model_p_index.shape)
X_col = X.columns[model_p_index]
# X_col = X_col.reshape(X_col.shape[1],)
X = X[X_col]
X.shape

(118,)


(9990, 118)

## LGBMの実行

In [None]:
# LightGBM hyper-parameters shared by every model trained below.
lgb_params = {
    'n_estimators': 5000,  # upper bound; training is cut short by early stopping
    'boosting_type': 'gbdt',
    # 'regression' is the L2 objective and 'rmse' the evaluation metric. The
    # two values used to be swapped and only worked through LightGBM's aliases.
    'objective': 'regression',
    'metric': 'rmse',
    'n_jobs': -1,
    'seed': Config.seed,
    'learning_rate': 0.01,
    'num_leaves': 34,
    # 'max_depth': -1, 'min_data_in_leaf': 60,
    # 'max_bin': 300,
}

In [None]:
# Split the training data into a training part and a hold-out part again, now
# on the reduced feature set, and restrict the test matrix to the same columns.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = Config.seed)
test_X = test_X[X_col]

In [None]:
# Training / prediction
# 5-fold cross-validation on the training part. Every fold model produces
# out-of-fold predictions (on the scale of the target) and one set of
# predictions for the real test data.

y_oof = np.zeros(len(y_train))
y_preds = []
kf = KFold(n_splits=5)
for fold, (tr_idx, vl_idx) in enumerate(kf.split(X_train, y_train)):
    # The fold indices are positions inside X_train / y_train, so they must be
    # applied to those objects. Indexing the full X / y with them selected
    # unrelated rows, including rows of the hold-out part.
    x_tr_fold, y_tr_fold = X_train.iloc[tr_idx], y_train.iloc[tr_idx]
    x_vl_fold, y_vl_fold = X_train.iloc[vl_idx], y_train.iloc[vl_idx]

    # Fresh model per fold, early-stopped on the fold's validation rows.
    # NOTE(review): `verbose` / `early_stopping_rounds` are fit() arguments of
    # older LightGBM releases; newer releases expect callbacks instead.
    # Confirm against the installed version.
    model = lgb.LGBMRegressor(**lgb_params)
    model.fit(
        x_tr_fold, y_tr_fold,
        eval_set=(x_vl_fold, y_vl_fold),
        verbose=False,
        early_stopping_rounds=100,
    )
    y_oof[vl_idx] = model.predict(x_vl_fold)
    # Undo the log transform for the submission.
    # Assumes y_log = log(1 + y); TODO confirm against the preprocessing step.
    y_preds.append(np.exp(model.predict(test_X)) - 1)
    print(
        f'fold {fold} score:', np.sqrt(mean_squared_error(y_vl_fold, y_oof[vl_idx]))
    )

fold 0 score: 0.5565111876208347
fold 1 score: 0.5480106144251429
fold 2 score: 0.532057965020943
fold 3 score: 0.5818398391573686
fold 4 score: 0.5473182397243728


In [None]:
# Train one more model on the whole training part.
# NOTE(review): early stopping monitors the same hold-out data that is scored
# below, so that score is likely optimistic.
model = lgb.LGBMRegressor(**lgb_params)
model.fit(X_train, y_train,
        eval_set=(X_test, y_test),
        verbose=False,
        early_stopping_rounds=100
        )

LGBMRegressor(learning_rate=0.01, metric='regression', n_estimators=5000,
              num_leaves=34, objective='rmse', seed=42)

In [None]:
# Predictions for the training part and the hold-out part.
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

In [None]:
# RMSE on the training part, then on the hold-out part.
print(np.sqrt(mean_squared_error(y_train, y_train_pred)))
print(np.sqrt(mean_squared_error(y_test, y_test_pred)))

0.09864289171831148
0.5432714086968571


In [None]:
# Ratio of RMSE to MAE on the hold-out part.
print(np.sqrt(mean_squared_error(y_test, y_test_pred))/mean_absolute_error(y_test, y_test_pred))

1.46890126994959


### 提出データの作成

In [None]:
# Average the per-fold test predictions element-wise.
result = np.mean(y_preds, axis=0)

In [None]:
# Display the shape of the averaged predictions.
result.shape

(4996,)

In [None]:
# Display the averaged predictions.
result

array([13062.52290253, 13051.48196524, 20518.18414348, ...,
       10423.23441994,  6012.59147601,  9260.2613646 ])

In [None]:
# Write the averaged predictions into the submission template and show it.
submission_df['y'] = result
print(submission_df)

        id             y
0        1  13062.522903
1        2  13051.481965
2        3  20518.184143
3        4  14483.207342
4        5  15819.249982
...    ...           ...
4991  4992  16886.032227
4992  4993   5491.286463
4993  4994  10423.234420
4994  4995   6012.591476
4995  4996   9260.261365

[4996 rows x 2 columns]


In [None]:
# Capture the creation time used in the submission file name.
# The clock is read with a fixed UTC+01:00 offset.

submit_tz = datetime.timezone(datetime.timedelta(hours=1))
now = datetime.datetime.now(tz=submit_tz)
date_time = now.strftime("%Y%m%d_%H%M")

In [None]:
# Write the submission file (saved on Google Drive)

# Create the output folder first, so a missing directory does not abort the
# save after all the training has already been done.
os.makedirs(Config.output_path, exist_ok=True)
submission_df.to_csv(os.path.join(Config.output_path, f"submit_{date_time}.csv"), index=False)

In [None]:
# Display the labels of the columns that were finally used for training.
list(X_col)

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 14,
 15,
 16,
 17,
 'name_tfidf_svd_14',
 'USE_34',
 'name_tfidf_svd_9',
 'name_tfidf_svd_31',
 'name_tfidf_svd_16',
 'name_tfidf_svd_11',
 'name_tfidf_svd_13',
 'USE_35',
 'USE_42',
 'USE_18',
 'is_whitestar',
 'BERT_16',
 'BERT_12',
 'BERT_4',
 'USE_30',
 'BERT_6',
 'BERT_1',
 'name_tfidf_svd_27',
 'BERT_15',
 'USE_26',
 'USE_20',
 'BERT_8',
 'BERT_9',
 'USE_39',
 'name_tfidf_svd_28',
 'USE_11',
 'USE_29',
 'USE_2',
 'name_tfidf_svd_23',
 'name_tfidf_svd_0',
 'USE_16',
 'USE_40',
 'USE_14',
 'USE_5',
 'USE_15',
 'name_tfidf_svd_22',
 'USE_33',
 'BERT_19',
 'BERT_13',
 'name_tfidf_svd_7',
 'latitude',
 'name_tfidf_svd_10',
 'name_tfidf_svd_17',
 'name_tfidf_svd_1',
 'USE_3',
 'USE_24',
 'name_tfidf_svd_3',
 'is_wifi',
 'USE_28',
 'USE_17',
 'BERT_11',
 'name_tfidf_svd_4',
 'name_tfidf_svd_5',
 'name_tfidf_svd_24',
 'USE_22',
 'name_tfidf_nmf_6',
 'name_tfidf_svd_18',
 'USE_45',
 'BERT_5',
 'is_mark',
 'USE_6',
 'BERT_18',
 'BERT_