In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

# Widen pandas' display limits so wide/long frames are not truncated when shown.
for option_name, option_value in (
    ('display.max_rows', 100),
    ('display.max_columns', 200),
):
    pd.set_option(option_name, option_value)

In [3]:
# Load the two CSV files (paths are relative to the current working directory).
train_data, test_data = (
    pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv')
)

In [4]:
# Stack both frames so encoding and feature engineering are applied to them
# identically; they are separated again later by whether SalePrice is missing.
# ignore_index=True relabels the rows 0..n-1: both frames come from read_csv with
# the default index, so without it every label of the shorter frame would repeat
# and any label-based lookup or alignment on the result would be ambiguous.
train_test_data = pd.concat([train_data, test_data], ignore_index=True)

In [7]:
# Names of every column whose dtype is `object`; these are integer-encoded next.
object_columns = [
    col_name
    for col_name, col_dtype in train_test_data.dtypes.items()
    if col_dtype == object
]

In [8]:
# Replace each object column with integer codes. Codes follow order of first
# appearance and missing values become -1 (pandas factorize defaults).
for col_name in object_columns:
    codes, _uniques = train_test_data[col_name].factorize()
    train_test_data[col_name] = codes

In [9]:
def _add_row_stats(frame, prefix, columns):
    """Add ``<prefix>_Sum``, ``<prefix>_Mean`` and ``<prefix>_Std`` computed row-wise over ``columns``."""
    subset = frame[columns]
    frame[prefix + '_Sum'] = subset.sum(axis=1)
    frame[prefix + '_Mean'] = subset.mean(axis=1)
    row_std = subset.std(axis=1)
    # Rows whose std is undefined (NaN) receive the average std of the other rows.
    frame[prefix + '_Std'] = row_std.fillna(row_std.mean())


def _safe_ratio(numerator, denominator):
    """Divide two Series, giving NaN instead of +/-inf where the denominator is 0."""
    return numerator / denominator.replace(0, np.nan)


def feature_processed(train_test_data):
    """Return a copy of the frame with engineered area, year, porch, quality and room features.

    The input frame is not modified; all new columns are written to a copy,
    so calling this repeatedly on the same raw frame always gives the same result.

    Parameters
    ----------
    train_test_data : pandas.DataFrame
        Must contain the raw columns read below (``LotArea``, ``GrLivArea``,
        ``TotalBsmtSF``, ``1stFlrSF``, ``2ndFlrSF``, ``YearBuilt``,
        ``YearRemodAdd``, ``BsmtFinSF1``, ``BsmtFinSF2``, ``BsmtUnfSF``, the
        four porch columns, the ``*Qual`` / ``*Cond`` columns and the room
        counts). The ``*Qual`` / ``*Cond`` columns are summed and averaged, so
        they have to be numeric by the time this is called.

    Returns
    -------
    pandas.DataFrame
        Copy of the input with 20 feature columns appended, in a fixed order.

    Raises
    ------
    KeyError
        If any of the required raw columns is missing.
    """
    features = train_test_data.copy()

    # Lot vs. above-ground living area.
    features['Lot_GrLiv_Area_Sum'] = features[['LotArea', 'GrLivArea']].sum(axis=1)
    features['Lot_GrLiv_Area_Ratio'] = _safe_ratio(features['GrLivArea'], features['LotArea'])

    # Living area vs. basement area.
    features['GrLiv_Bsmt_Area_Sum'] = features[['GrLivArea', 'TotalBsmtSF']].sum(axis=1)
    features['GrLiv_Bsmt_Area_Ratio'] = _safe_ratio(features['TotalBsmtSF'], features['GrLivArea'])

    _add_row_stats(features, '1st_2nd_Area', ['1stFlrSF', '2ndFlrSF'])

    features['Built_Remod_Year_Mean'] = features[['YearBuilt', 'YearRemodAdd']].mean(axis=1)

    features['Total_Bsmtfin_Mean'] = features[['BsmtFinSF1', 'BsmtFinSF2']].mean(axis=1)

    # TotalBsmtSF can be 0, which previously produced inf (or NaN for 0/0) here;
    # it is now NaN in both cases.
    features['BsmtUnf_Ratio'] = _safe_ratio(features['BsmtUnfSF'], features['TotalBsmtSF'])

    _add_row_stats(features, 'Porch_All', ['OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch'])
    _add_row_stats(features, 'Qual_All', ['ExterQual', 'BsmtQual', 'KitchenQual', 'GarageQual'])
    _add_row_stats(features, 'Cond_All', ['ExterCond', 'BsmtCond', 'GarageCond'])

    features['Room_Kitchen_Sum'] = features[['BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd']].sum(axis=1)

    return features

In [10]:
# Add the engineered feature columns to the combined train+test frame.
train_test_data = feature_processed(train_test_data)

In [11]:
# Rows that carry a SalePrice become the training set; rows without one become
# the test set, from which the (all-missing) SalePrice column is removed.
has_sale_price = train_test_data['SalePrice'].notnull()
train_data = train_test_data[has_sale_price]
test_data = train_test_data[~has_sale_price].drop(columns='SalePrice')

In [16]:
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score
from lightgbm import LGBMRegressor

def train_with_oof(train_data, test_data, nfolds=5):
    """Fit a LightGBM regressor with shuffled K-fold CV and average its test predictions.

    For every fold the model is fitted on the training part with early stopping
    monitored on the held-out part, then used to predict (a) the held-out rows,
    which fills the out-of-fold vector, and (b) the test rows, whose predictions
    are averaged over all folds. The out-of-fold R2 is printed at the end.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Training rows; must contain ``Id`` and ``SalePrice``. Every other
        column is used as a feature.
    test_data : pandas.DataFrame
        Rows to predict; must contain every feature column of ``train_data``.
        Columns that are not training features (``Id``, or a ``SalePrice``
        column added by a previous run) are ignored.
    nfolds : int, default 5
        Number of CV folds.

    Returns
    -------
    tuple
        ``(lgbr, test_preds)``: the estimator as left by the last fold's fit,
        and a 1-D numpy array with the fold-averaged test predictions.
    """
    # The callback API replaces fit()'s `verbose=` / `early_stopping_rounds=`
    # keywords, which LightGBM 4.0 removed.
    from lightgbm import early_stopping, log_evaluation

    ftr_train = train_data.drop(['Id', 'SalePrice'], axis=1)
    saleprice_train = train_data['SalePrice']
    # Select the test matrix once, by the training schema: this is loop-invariant
    # and stays correct even if test_data carries extra columns.
    ftr_test = test_data[ftr_train.columns]

    folds = KFold(n_splits=nfolds, shuffle=True, random_state=2022)
    oof_preds = np.zeros(ftr_train.shape[0])
    test_preds = np.zeros(ftr_test.shape[0])

    lgbr = LGBMRegressor(
        n_jobs=-1,  # was misspelled `njobs`, which is not a LightGBM parameter
        n_estimators=2000,
        learning_rate=0.01,
        max_depth=6,
        num_leaves=12,
        colsample_bytree=0.5,
        # NOTE(review): per the LightGBM docs, row subsampling only takes effect
        # when subsample_freq > 0 - confirm whether bagging was intended here.
        subsample=0.5,
        max_bin=89,
        reg_alpha=32.453,
        reg_lambda=0.001,
        min_child_weight=1,
        min_child_samples=10,
        verbose=-1,
    )

    for fold_idx, (train_idx, valid_idx) in enumerate(folds.split(ftr_train)):
        train_x = ftr_train.iloc[train_idx, :]
        train_y = saleprice_train.iloc[train_idx]
        valid_x = ftr_train.iloc[valid_idx, :]
        valid_y = saleprice_train.iloc[valid_idx]

        lgbr.fit(
            train_x,
            train_y,
            eval_set=[(train_x, train_y), (valid_x, valid_y)],
            # 'r2' is not a LightGBM metric name; l2 is what was actually
            # evaluated and used for early stopping, so request it explicitly.
            eval_metric='l2',
            callbacks=[
                early_stopping(stopping_rounds=100),
                log_evaluation(period=100),
            ],
        )

        oof_preds[valid_idx] = lgbr.predict(valid_x, num_iteration=lgbr.best_iteration_)
        test_preds += lgbr.predict(ftr_test, num_iteration=lgbr.best_iteration_) / folds.n_splits

    # Every training row was predicted exactly once by a model that never saw it.
    print(f'OOF R2: {r2_score(saleprice_train, oof_preds):.5f}')

    return lgbr, test_preds

In [17]:
# Run 5-fold training; keep the returned model and the fold-averaged test predictions.
lgbr, test_preds = train_with_oof(train_data, test_data, nfolds=5)

[100]	training's l2: 1.67323e+09	valid_1's l2: 1.70529e+09
[200]	training's l2: 6.99836e+08	valid_1's l2: 9.66669e+08
[300]	training's l2: 4.19649e+08	valid_1's l2: 7.82065e+08
[400]	training's l2: 3.01624e+08	valid_1's l2: 7.00479e+08
[500]	training's l2: 2.40294e+08	valid_1's l2: 6.58613e+08
[600]	training's l2: 2.04452e+08	valid_1's l2: 6.34361e+08
[700]	training's l2: 1.79578e+08	valid_1's l2: 6.30362e+08
[800]	training's l2: 1.58597e+08	valid_1's l2: 6.28593e+08
[900]	training's l2: 1.40685e+08	valid_1's l2: 6.24541e+08
[1000]	training's l2: 1.25812e+08	valid_1's l2: 6.22426e+08
[1100]	training's l2: 1.12722e+08	valid_1's l2: 6.18304e+08
[1200]	training's l2: 1.01928e+08	valid_1's l2: 6.15848e+08
[1300]	training's l2: 9.28114e+07	valid_1's l2: 6.11279e+08
[1400]	training's l2: 8.48904e+07	valid_1's l2: 6.08423e+08
[1500]	training's l2: 7.79952e+07	valid_1's l2: 6.06686e+08
[1600]	training's l2: 7.18786e+07	valid_1's l2: 6.04221e+08
[1700]	training's l2: 6.62828e+07	valid_1's l2: 6

In [20]:
# Attach the predictions to the test rows and write the Id/SalePrice pair to CSV.
test_data = test_data.assign(SalePrice=test_preds)
test_data.loc[:, ['Id', 'SalePrice']].to_csv('House_prices_oof_01.csv', index=False)