<a href="https://colab.research.google.com/github/fregean/Aidemy_handson/blob/master/day4_am_pm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [67]:
import numpy as np
import pandas as pd
import sklearn
import matplotlib
from scipy.stats import norm, skew #for some statistics
from scipy.special import boxcox1p

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold,cross_val_score,train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVR, LinearSVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDRegressor
from sklearn.tree import DecisionTreeRegressor
import xgboost as xgb
import lightgbm as lgb

import warnings
warnings.filterwarnings('ignore')

train_df = pd.read_csv('/content/drive/MyDrive/data/HousePrise/train.csv')
test_df = pd.read_csv('/content/drive/MyDrive/data/HousePrise/test.csv')

# Idを削除
train_Id = train_df['Id']
test_Id = test_df['Id']
train_df.drop(columns=['Id'], inplace=True)
test_df.drop(columns=['Id'], inplace=True)

# 正解ラベル
train_df["SalePrice"] = np.log1p(train_df["SalePrice"])
y_train = train_df['SalePrice']
train_df.drop(['SalePrice'], axis=1, inplace=True)

# 学習データと評価データの結合
combined_df = pd.concat([train_df, test_df], axis=0)


In [68]:
# 欠損値処理
for col in ['PoolQC','MiscFeature','Alley', 'FireplaceQu', 'GarageType','GarageFinish','GarageQual', 'GarageCond', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'MasVnrType', 'MSSubClass']:
    combined_df[col] = combined_df[col].fillna('None')
for col in ['BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF','TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath', 'MasVnrArea']:
    combined_df[col] = combined_df[col].fillna(0)
for col in ['MSZoning', 'Electrical', 'KitchenQual', 'Exterior1st', 'Exterior2nd', 'SaleType']:
    combined_df[col] = combined_df[col].fillna(combined_df[col].mode()[0])
combined_df['Functional'] = combined_df['Functional'].fillna('Typ')
combined_df['LotFrontage'] = combined_df.groupby('Neighborhood')['LotFrontage'].transform(lambda x: x.fillna(x.median()))
combined_df['GarageYrBlt'] = combined_df['GarageYrBlt'].fillna(combined_df['GarageYrBlt'].median())
for col in ['MSSubClass', 'OverallCond', 'YrSold', 'MoSold', 'MSSubClass', 'OverallCond', 'YrSold', 'MoSold']:
    combined_df[col] = combined_df[col].astype(str)

# 列削除
combined_df = combined_df.drop(columns='Utilities')

# ラベルのエンコーディング
lbe = LabelEncoder()
cols = ['FireplaceQu', 'BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond',
        'ExterQual', 'ExterCond','HeatingQC', 'PoolQC', 'KitchenQual', 'BsmtFinType1', 
        'BsmtFinType2', 'Functional', 'Fence', 'BsmtExposure', 'GarageFinish', 'LandSlope',
        'LotShape', 'PavedDrive', 'Street', 'Alley', 'CentralAir', 'MSSubClass', 'OverallCond', 
        'YrSold', 'MoSold']

for col in cols:
    combined_df[col] = lbe.fit_transform(list(combined_df[col].values))
print(f'LabelEncoder:{combined_df.shape}')

# Onehotエンコーディング    
combined_df = pd.get_dummies(data=combined_df, drop_first=True)
print(f'get_dummies:{combined_df.shape}')    

# 特徴量エンジニアリング
combined_df['TotalSF'] = combined_df['TotalBsmtSF'] + combined_df['1stFlrSF'] + combined_df['2ndFlrSF']

# 歪度修正
numeric_feats = combined_df.dtypes[combined_df.dtypes != "object"].index

skewed_feats = combined_df[numeric_feats].apply(lambda x: skew(x.dropna())).sort_values(ascending=False)
print("\nSkew in numerical features: \n")

skewness = pd.DataFrame({'Skew' :skewed_feats})
skewness = skewness[abs(skewness) > 0.75]
print("There are {} skewed numerical features to Box Cox transform".format(skewness.shape[0]))

skewed_features = skewness.index
lam = 0.15
for feat in skewed_features:
    #combined_df[feat] += 1
    combined_df[feat] = boxcox1p(combined_df[feat], lam)

# データ分割
X_train = combined_df[:len(train_df)]
X_test = combined_df[len(train_df):]

# ホールドアウト法
print('--'*10 + 'Hold-Out' + '--'*10)
X_trn, X_val, y_trn, y_val = train_test_split(X_train, y_train, random_state=42, test_size=0.33)

LabelEncoder:(2919, 78)
get_dummies:(2919, 200)

Skew in numerical features: 

There are 201 skewed numerical features to Box Cox transform
--------------------Hold-Out--------------------


In [70]:
# 決定木
print('--'*10 + 'KFold' + '--'*10)
cv = KFold(n_splits=3, shuffle=True, random_state=42)
rmse_results = []
for trn_index, val_index in cv.split(X_train):
    X_trn, X_val = X_train.iloc[trn_index, :], X_train.iloc[val_index, :]
    y_trn, y_val = y_train[trn_index], y_train[val_index]
    
    model = DecisionTreeRegressor()
    model.fit(X_trn, y_trn)
    pred = model.predict(X_val)
    rsme = np.sqrt(mean_squared_error(pred, y_val))
    rmse_results.append(rsme)

print(f'RMSE : {rmse_results}')
print(f'Ave  : {np.mean(rmse_results)}')

--------------------KFold--------------------
RMSE : [0.19638646043655195, 0.24713585917019557, 0.21367577019277256]
Ave  : 0.21906602993317334


In [61]:
# LightGBM
print('--'*10 + 'KFold' + '--'*10)
cv = KFold(n_splits=3, shuffle=True, random_state=42)
rmse_results = []
for trn_index, val_index in cv.split(X_train):
    X_trn, X_val = X_train.iloc[trn_index, :], X_train.iloc[val_index, :]
    y_trn, y_val = y_train[trn_index], y_train[val_index]
    
    model_lgb = lgb.LGBMRegressor(objective='regression',num_leaves=5,
                              learning_rate=0.05, n_estimators=720,
                              max_bin = 55, bagging_fraction = 0.8,
                              bagging_freq = 5, feature_fraction = 0.2319,
                              feature_fraction_seed=9, bagging_seed=9,
                              min_data_in_leaf =6, min_sum_hessian_in_leaf = 11)
    model.fit(X_trn, y_trn)
    pred = model.predict(X_val)
    rsme = np.sqrt(mean_squared_error(pred, y_val))
    rmse_results.append(rsme)

print(f'RMSE : {rmse_results}')
print(f'Ave  : {np.mean(rmse_results)}')

--------------------KFold--------------------
RMSE : [0.1979355577911816, 0.24081939677879133, 0.21290264103441403]
Ave  : 0.21721919853479565


In [62]:
# 評価指標
def rmsle(y, y_pred):
    return np.sqrt(mean_squared_error(y, y_pred))

model_lgb = lgb.LGBMRegressor(objective='regression',num_leaves=5,
                              learning_rate=0.05, n_estimators=720,
                              max_bin = 55, bagging_fraction = 0.8,
                              bagging_freq = 5, feature_fraction = 0.2319,
                              feature_fraction_seed=9, bagging_seed=9,
                              min_data_in_leaf =6, min_sum_hessian_in_leaf = 11)

model_lgb.fit(X_train, y_train)
lgb_train_pred = model_lgb.predict(X_train)
lgb_pred = np.expm1(model_lgb.predict(X_test.values))
print(rmsle(y_train, lgb_train_pred))


0.07484429088328226


In [63]:
# 提出用csv作成
sub = pd.DataFrame()
sub['Id'] = test_Id
sub['SalePrice'] = lgb_pred
sub.to_csv('submission.csv',index=False)