In [23]:
from sklearn.ensemble import RandomForestRegressor as rf

In [24]:
# import library
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import random

plt.style.use(style='ggplot')
np.random.seed(1234)
random.seed(1234)

In [25]:
train_df = pd.read_csv("./data/train.csv")
test_df = pd.read_csv("./data/test.csv")
submission = pd.read_csv("./data/sample_submission.csv")

In [26]:
all_df = pd.concat([train_df, test_df], axis=0, sort=False).reset_index(drop=True)

In [27]:
from sklearn.preprocessing import LabelEncoder

categories = all_df.columns[all_df.dtypes == "object"]
print(categories)
print(len(categories))

Index(['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities',
       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
       'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
       'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
       'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
       'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual',
       'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature',
       'SaleType', 'SaleCondition'],
      dtype='object')
43


In [28]:
for cat in categories:
    le = LabelEncoder()
    all_df[cat].fillna("missing", inplace=True)
    le = le.fit(all_df[cat])
    all_df[cat] = le.transform(all_df[cat])
    all_df[cat] = all_df[cat].astype("category")

In [29]:
all_df["TotalSF"] = all_df["TotalBsmtSF"] + all_df["1stFlrSF"] + all_df["2ndFlrSF"]
all_df["Total_bathrooms"] = (
    all_df["FullBath"]
    + all_df["HalfBath"]
    + all_df["BsmtFullBath"]
    + all_df["BsmtHalfBath"]
)

In [30]:
hasnan_cat = []
for col in all_df.columns:
    tmp_null_count = all_df[col].isnull().sum()
    if (tmp_null_count > 0) & (col != "SalePrice"):
        hasnan_cat.append(col)
        print(col, tmp_null_count)

LotFrontage 486
MasVnrArea 23
BsmtFinSF1 1
BsmtFinSF2 1
BsmtUnfSF 1
TotalBsmtSF 1
BsmtFullBath 2
BsmtHalfBath 2
GarageYrBlt 159
GarageCars 1
GarageArea 1
TotalSF 1
Total_bathrooms 2


In [31]:
all_df[hasnan_cat].describe()

Unnamed: 0,LotFrontage,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,BsmtFullBath,BsmtHalfBath,GarageYrBlt,GarageCars,GarageArea,TotalSF,Total_bathrooms
count,2433.0,2896.0,2918.0,2918.0,2918.0,2918.0,2917.0,2917.0,2760.0,2918.0,2918.0,2918.0,2917.0
mean,69.305795,102.201312,441.423235,49.582248,560.772104,1051.777587,0.429894,0.061364,1978.113406,1.766621,472.874572,2548.048663,2.43915
std,23.344905,179.334253,455.610826,169.205611,439.543659,440.766258,0.524736,0.245687,25.574285,0.761624,215.394815,804.677866,0.941799
min,21.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1895.0,0.0,0.0,334.0,1.0
25%,59.0,0.0,0.0,0.0,220.0,793.0,0.0,0.0,1960.0,1.0,320.0,2000.5,2.0
50%,68.0,0.0,368.5,0.0,467.0,989.5,0.0,0.0,1979.0,2.0,480.0,2450.0,2.0
75%,80.0,164.0,733.0,0.0,805.5,1302.0,1.0,0.0,2002.0,2.0,576.0,2991.75,3.0
max,313.0,1600.0,5644.0,1526.0,2336.0,6110.0,3.0,2.0,2207.0,5.0,1488.0,11752.0,8.0


In [32]:
for col in all_df.columns:
    tmp_null_count = all_df[col].isnull().sum()
    if (tmp_null_count > 0) & (col != "SalePrice"):
        print(col, tmp_null_count)
        all_df[col].fillna(all_df[col].median(), inplace=True)

LotFrontage 486
MasVnrArea 23
BsmtFinSF1 1
BsmtFinSF2 1
BsmtUnfSF 1
TotalBsmtSF 1
BsmtFullBath 2
BsmtHalfBath 2
GarageYrBlt 159
GarageCars 1
GarageArea 1
TotalSF 1
Total_bathrooms 2


In [34]:
train_df_le = all_df[~all_df["SalePrice"].isnull()]
test_df_le = all_df[all_df["SalePrice"].isnull()]

train_df_le["SalePrice_log"] = np.log(train_df_le["SalePrice"])
train_X = train_df_le.drop(["SalePrice", "SalePrice_log", "Id"], axis=1)
train_Y = train_df_le["SalePrice_log"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [35]:
from sklearn.model_selection import KFold

folds = 3
kf = KFold(n_splits=folds)

In [36]:
# import mean_squared_error
from pyexpat import model
from sklearn.metrics import mean_squared_error

models_rf = []
rmses_rf = []
oof_rf = np.zeros(len(train_X))

for train_index, val_index in kf.split(train_X):
    X_train = train_X.iloc[train_index]
    X_vaild = train_X.iloc[val_index]
    y_train = train_Y.iloc[train_index]
    y_valid = train_Y.iloc[val_index]
    model_rf = rf(n_estimators=50, random_state=1234)
    model_rf.fit(X_train, y_train)
    y_pred = model_rf.predict(X_vaild)
    tmp_rmse = np.sqrt(mean_squared_error(y_valid, y_pred))
    print(tmp_rmse)
    models_rf.append(model_rf)
    rmses_rf.append(tmp_rmse)
    oof_rf[val_index] = y_pred

0.14041732688138514
0.15352088210494158
0.14194449912040752


In [37]:
sum(rmses_rf) / len(rmses_rf)

0.1452942360355781

In [39]:
test_X = test_df_le.drop(["SalePrice", "Id"], axis=1)

In [40]:
preds_rf = []
for model in models_rf:
    pred = model.predict(test_X)
    preds_rf.append(pred)

In [41]:
preds_array_rf = np.array(preds_rf)
preds_mean_rf = np.mean(preds_array_rf, axis=0)
preds_exp_rf = np.exp(preds_mean_rf)
submission["SalePrice"] = preds_exp_rf

In [43]:
submission.to_csv("./submit/submission_rf.csv", index=False)