In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from empiricaldist import Pmf, Cdf
from matplotlib.ticker import MaxNLocator
%matplotlib inline  
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
# for one hot encoding with feature-engine
from feature_engine.categorical_encoders import OneHotCategoricalEncoder
from sklearn.model_selection import cross_val_score
from sklearn.dummy import DummyRegressor
from sklearn.metrics import mean_squared_log_error
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor

OSError: dlopen(/Users/dina/house-prices-advanced/venv/lib/python3.8/site-packages/lightgbm/lib_lightgbm.so, 6): Library not loaded: /usr/local/opt/libomp/lib/libomp.dylib
  Referenced from: /Users/dina/house-prices-advanced/venv/lib/python3.8/site-packages/lightgbm/lib_lightgbm.so
  Reason: image not found

In [None]:
# Load the training set from train.csv (path is relative to the working directory).
train_data = pd.read_csv("train.csv")

In [None]:
# Print column dtypes and non-null counts of the freshly loaded frame.
train_data.info()

**Imputing missing values**
From EDA notebook, missing data is imputed as follows:
- For LotFrontage and MasVnrArea, impute by median of column
- For MasVnrType and Electrical, impute with mode of the column
- For all other categorical columns with missing data, impute with a "NA" label
- Drop the GarageYrBlt column

In [None]:
# Numeric columns: fill gaps with the column median.
for col in ("LotFrontage", "MasVnrArea"):
    train_data[col] = train_data[col].fillna(train_data[col].median())

# Categorical columns to be filled with their most frequent value.
# Series.mode() returns a Series (ties are possible), so take its first
# element: passing the Series itself to fillna aligns on the index and only
# fills rows whose index label matches (typically just label 0), leaving the
# other gaps to be labelled "NA" by the step below.
for col in ("MasVnrType", "Electrical"):
    train_data[col] = train_data[col].fillna(train_data[col].mode().iloc[0])

# Every remaining gap in an object-dtype column becomes an explicit "NA" label.
# Compare against the builtin `object`: the `np.object` alias was removed in
# NumPy 1.24.
cat_cols = [x for x in train_data.columns if train_data[x].dtype == object]
train_data[cat_cols] = train_data[cat_cols].fillna("NA")

# GarageYrBlt is dropped rather than imputed.
train_data = train_data.drop(columns="GarageYrBlt")

In [None]:
# Re-check dtypes and non-null counts after imputation.
train_data.info()

**Applying features' transformations**
- Transform the target to its natural log (not strictly needed)
- MoSold and MSSubClass turned from int to str

In [None]:
# Add the natural log of the sale price as a new column.
train_data = train_data.assign(LogSalePrice=np.log(train_data["SalePrice"]))

# Cast these two integer-coded columns to strings.
train_data = train_data.astype({"MoSold": str, "MSSubClass": str})

In [None]:
# Re-check dtypes after adding LogSalePrice and casting MoSold / MSSubClass to str.
train_data.info()

In [None]:
# Histogram of the log-transformed target; Series.hist returns the Axes,
# which is labelled directly instead of going through the pyplot state machine.
train_data["LogSalePrice"].hist(bins=30).set_xlabel("log Sale Price")
plt.show()

In [None]:
# Cardinality of each categorical feature.
# The columns are re-detected from the current dtypes instead of reusing the
# earlier `cat_cols` list: that list was built before MoSold and MSSubClass
# were cast to str, so it would leave those two columns out of the report.
# dropna=False counts a missing value as its own category, matching
# len(Series.unique()).
for column in train_data.select_dtypes(include="object").columns:
    print('Number of categories in the variable {}: {}'.format(column, train_data[column].nunique(dropna=False)))


**Encoding categorical variables**

For the baseline models, one-hot encode all categorical variables.

In [None]:
# One-hot encoder: top_categories=None with drop_last=False are passed so
# that no category limit is set and the last dummy column is not dropped.
# The encoder detects the categorical variables in the data on its own.
ohe_enc = OneHotCategoricalEncoder(top_categories=None, drop_last=False)

# Learn the categories and replace the frame with its encoded version in one step.
train_data = ohe_enc.fit_transform(train_data)

In [None]:
# Inspect the columns and dtypes produced by the one-hot encoding.
train_data.info()

In [None]:
# Preview the first rows of the encoded frame.
train_data.head()

In [None]:
# Target: the raw (untransformed) sale price.
y = train_data["SalePrice"]
# Features: every column except Id and the two price columns (raw and log).
X = train_data.drop(columns=["Id", "SalePrice", "LogSalePrice"])

**Dummy mean baseline**

In [None]:
# Baseline that always predicts the mean of the training target.
dummy_regr = DummyRegressor(strategy="mean")

# The scorer returns the negated mean squared log error for each of the 5
# folds; taking the absolute value and then the square root gives per-fold RMSLE.
scores = np.sqrt(
    np.abs(
        cross_val_score(dummy_regr, X, y, cv=5, scoring='neg_mean_squared_log_error')
    )
)

# Report the mean across folds and a +/- band of two standard deviations.
print("RMSLE: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

In [None]:
# Display the per-fold RMSLE values held in `scores`.
scores

In [None]:
# Fit the mean baseline on the full data and predict on the same rows
# (in-sample predictions); fit() returns the fitted estimator, so the two
# calls are chained.
predictions = dummy_regr.fit(X, y).predict(X)

In [None]:
# Display the baseline predictions.
predictions

In [None]:
# RMSLE of the baseline predictions against the target they were fitted on.
score = np.sqrt(mean_squared_log_error(y_true=y, y_pred=predictions))

score

**Random Forest Regressor Baseline**

In [None]:
# Random forest with default hyper-parameters. A fixed random_state makes the
# cross-validated score reproducible from one run of the notebook to the next.
rf_regr = RandomForestRegressor(random_state=42)

# The scorer returns the negated mean squared log error per fold; convert
# each fold's value to RMSLE.
scores = cross_val_score(rf_regr, X, y, cv=5, scoring='neg_mean_squared_log_error')
scores = np.sqrt(np.abs(scores))

# Mean RMSLE across the 5 folds with a +/- band of two standard deviations.
print("RMSLE: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

In [None]:
# LightGBM regressor with default hyper-parameters.
lightgbm_regr = LGBMRegressor()

# Cross-validate the LightGBM model itself. The estimator passed here must be
# `lightgbm_regr`; passing the random forest would simply repeat the previous
# experiment. The scorer returns the negated mean squared log error per fold,
# which is converted to RMSLE.
scores = cross_val_score(lightgbm_regr, X, y, cv=5, scoring='neg_mean_squared_log_error')
scores = np.sqrt(np.abs(scores))

# Mean RMSLE across the 5 folds with a +/- band of two standard deviations.
print("RMSLE: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))