<a href="https://colab.research.google.com/github/danielbauer1979/ML_656/blob/main/Module8_HousePriceExample_inClass.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install shap

In [None]:
import numpy as np 
import matplotlib.pyplot as plt  
import pandas as pd 
import seaborn as sns
import graphviz
import pydot
from io import StringIO  

from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split, cross_val_score
import sklearn.metrics as metrics

from plotly.subplots import make_subplots
import plotly.graph_objects as go

from keras.models import Sequential
from keras.layers import Dense

import shap

# HOUSE PRICE EXAMPLE

## Get Data

In [None]:
!git clone https://github.com/danielbauer1979/ML_656.git

In [None]:
# Read the house-price table from the cloned course repository and preview it.
csv_path = 'ML_656/HouseData.csv'
house = pd.read_csv(csv_path)
house.head()

## Prep Data

In [None]:
# Lift the row-display cap so the per-column missing-value counts are shown in full.
pd.set_option("display.max_rows", None)
house.isna().sum()

In [None]:
# Remove the Id column and a fixed list of other columns.
# NOTE(review): presumably these are the columns with many missing values in the
# null-count listing above - confirm against that output.
cols_to_drop = [
    'Id', 'LotFrontage', 'Alley',
    'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
    'FireplaceQu',
    'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageQual', 'GarageCond',
    'PoolQC', 'Fence', 'MiscFeature',
]
house = house.drop(columns=cols_to_drop)

In [None]:
# Keep only rows without any missing value, then print the resulting schema summary.
house = house.dropna(axis=0, how='any')
house.info()

In [None]:
# Column names grouped by dtype (kept for inspection).
col_types = house.columns.to_series().groupby(house.dtypes).groups

# Integer-typed columns are treated as numeric features, object-typed ones as factors.
# NOTE(review): columns of any other dtype (e.g. float64) end up in neither list and
# are therefore left out of the model inputs - confirm this is intended.
numerics = house.select_dtypes(include=['int64']).columns.tolist()
factors = house.select_dtypes(include=['object']).columns.tolist()

In [None]:
# Build the modelling frame `house_sc`:
#   standardized numeric features | one-hot encoded factors | target column "Y".
house_numcols = house[numerics].drop(columns=['SalePrice'])
house_faccols = house[factors]

# Request float dummies explicitly: recent pandas versions default to bool columns,
# and a frame mixing float and bool columns turns into an object array under
# `.values`, which Keras cannot convert to a tensor.
dummies = pd.get_dummies(house_faccols, drop_first=True, dtype=float)

# NOTE(review): the scaling statistics are computed on all rows, i.e. before the
# train/validation/test split further down - confirm this is acceptable here.
house_numcols_sc_0 = scale(house_numcols)
house_numcols_sc = pd.DataFrame(
    data=house_numcols_sc_0,
    columns=house_numcols.columns,
    index=house_numcols.index,
)

# The target is appended last and renamed to "Y".
house_sc = pd.concat(
    [house_numcols_sc, dummies, house['SalePrice'].rename('Y')],
    axis=1,
)
house_sc.head()

## Explore

In [None]:
#From https://towardsdatascience.com/machine-learning-with-python-regression-complete-tutorial-47268e546cea
# Two-panel overview of the target: trimmed distribution (left) and a box plot of
# the log-transformed values (right).
x = "Y"
fig, ax = plt.subplots(nrows=1, ncols=2, sharex=False, sharey=False)
fig.suptitle(x, fontsize=20)

### distribution
ax[0].set_title('distribution')
variable = house_sc[x].fillna(house_sc[x].mean())
# Decile break points; only the outermost two (overall min and max) are used, so the
# strict inequalities below remove just the extreme observations from the plot.
breaks = np.quantile(variable, q=np.linspace(0, 1, 11))
variable = variable[(variable > breaks[0]) & (variable < breaks[10])]
# histplot with a KDE overlay replaces the deprecated sns.distplot / `shade` option.
sns.histplot(variable, kde=True, stat="density", ax=ax[0])

# Reference lines and the summary box use the statistics of the untrimmed target.
des = house_sc[x].describe()
for stat in ("25%", "mean", "75%"):
    ax[0].axvline(des[stat], ls='--')
ax[0].grid(True)
des = round(des, 2).map(str)
box = '\n'.join(f"{stat}: {des[stat]}" for stat in ("min", "25%", "mean", "75%", "max"))
ax[0].text(0.95, 0.95, box, transform=ax[0].transAxes, fontsize=10, va='top', ha="right",
           bbox=dict(boxstyle='round', facecolor='white', alpha=1))

### boxplot
ax[1].set_title('outliers (log scale)')
# Log-transform a one-column copy so `house_sc` itself stays untouched.
tmp_dtf = np.log(house_sc[[x]])
tmp_dtf.boxplot(column=x, ax=ax[1])
plt.show()

In [None]:
# Seed NumPy's global generator, then split the rows 50% train / 25% validation / 25% test.
np.random.seed(42)
train, holdout = train_test_split(house_sc, test_size=0.5)
val, test = train_test_split(holdout, test_size=0.5)

# Separate the feature matrix and the target vector ("Y") of each partition as NumPy arrays.
X_train, y_train = train.drop(columns=['Y']).to_numpy(), train['Y'].to_numpy()
X_val, y_val = val.drop(columns=['Y']).to_numpy(), val['Y'].to_numpy()
X_test, y_test = test.drop(columns=['Y']).to_numpy(), test['Y'].to_numpy()
X_train.shape

## Deep Learning

In [None]:
# Fully connected regression network: two ReLU hidden layers (50 and 25 units)
# followed by a single linear output unit.
# The input width is read from the training matrix instead of being hard-coded, so
# the cell keeps working when the set of feature columns changes.
n_features = X_train.shape[1]

model = Sequential()
model.add(Dense(50, input_shape=(n_features,), activation='relu', name='dense_1'))
model.add(Dense(25, activation='relu', name='dense_2'))
model.add(Dense(1, activation='linear', name='dense_output'))
# Optimize mean squared error with Adam; mean absolute error is tracked as a metric.
model.compile(optimizer='adam', loss='mse', metrics=['mae'])
#https://keras.io/api/models/model_training_apis/
model.summary()

In [None]:
# Train for 500 epochs; 5% of the training rows are held out (validation_split)
# for the per-epoch validation metrics stored in `history`.
# NOTE(review): X_val / y_val from the split cell are not used here - confirm
# whether validation_data=(X_val, y_val) was intended instead.
history = model.fit(X_train, y_train, epochs=500, validation_split=0.05)

In [None]:
# Training vs. validation loss per epoch for the fit stored in `history`.
loss_traces = [
    go.Scattergl(y=history.history[key], name=label)
    for key, label in (('loss', 'Train'), ('val_loss', 'Valid'))
]
fig = go.Figure(data=loss_traces)
fig.update_layout(
    xaxis_title='Epoch',
    yaxis_title='Loss',
    width=700,
    height=500,
)
fig.show()

In [None]:
# Fit the same `model` object for another 200 epochs; `history` is overwritten, so
# the plots that follow reflect only this run.
# NOTE(review): the model is not re-created before this call, so training presumably
# continues from the weights of the previous fit - confirm that is intended.
history = model.fit(X_train, y_train, epochs=200, validation_split=0.05)

In [None]:
# Loss curves (training and validation) of the latest `history` object.
curve_specs = (('loss', 'Train'), ('val_loss', 'Valid'))
fig = go.Figure(
    data=[go.Scattergl(y=history.history[key], name=label) for key, label in curve_specs]
)
fig.update_layout(
    xaxis_title='Epoch',
    yaxis_title='Loss',
    width=700,
    height=500,
)
fig.show()

In [None]:
# Mean absolute error per epoch (training and validation) of the latest `history` object.
mae_traces = [
    go.Scattergl(y=history.history[key], name=label)
    for key, label in (('mae', 'Train'), ('val_mae', 'Valid'))
]
fig = go.Figure(data=mae_traces)
fig.update_layout(
    xaxis_title='Epoch',
    yaxis_title='Mean Absolute Error',
    width=700,
    height=500,
)
fig.show()

In [None]:
# Predict the target for the held-out test rows; the result feeds the KPI cell below.
# NOTE(review): with the single-unit output layer the result presumably has shape
# (n, 1) rather than (n,) - keep that in mind when combining it with y_test.
house_nn_test_pred = model.predict(X_test)

In [None]:
## Kpi
# Flatten the predictions to one dimension first: subtracting a column of shape
# (n, 1) from y_test of shape (n,) would broadcast to an (n, n) matrix and distort
# the percentage error.
y_pred = house_nn_test_pred.reshape(-1)
print("R2 (explained variance):", round(metrics.r2_score(y_test, y_pred), 2))
# The relative error is taken with respect to the observed value y, as the label states.
print("Mean Absolute Perc Error (Σ(|y-pred|/y)/n):", round(np.mean(np.abs((y_test - y_pred) / y_test)), 2))
print("Mean Absolute Error (Σ|y-pred|/n):", "{:,.0f}".format(metrics.mean_absolute_error(y_test, y_pred)))
print("Root Mean Squared Error (sqrt(Σ(y-pred)^2/n)):", "{:,.0f}".format(np.sqrt(metrics.mean_squared_error(y_test, y_pred))))

In [None]:
# SHAP kernel explainer wrapping model.predict, with the full training matrix passed
# as background data.
# NOTE(review): the cost of KernelExplainer presumably grows with the number of
# background rows; consider passing a subsample if this is too slow - confirm.
explainer = shap.KernelExplainer(model.predict,X_train)

In [None]:
# Estimate SHAP values for every test row with nsamples=100.
# NOTE(review): this runs over the whole test matrix and can take a long time.
shap_values = explainer.shap_values(X_test,nsamples=100)

In [None]:
# Bar chart of global feature importance.
# The SHAP values must be the first argument and the feature matrix the second;
# passing only X_test would plot feature magnitudes instead of attributions.
# Depending on the shap version, a model with one output column yields either a
# one-element list or an array with a trailing output axis - reduce both to a
# 2-D (samples x features) matrix.
shap_matrix = shap_values[0] if isinstance(shap_values, list) else shap_values
if shap_matrix.ndim == 3:
    shap_matrix = shap_matrix[:, :, 0]
shap.summary_plot(
    shap_matrix,
    X_test,
    feature_names=house_sc.drop(columns=['Y']).columns.tolist(),
    plot_type="bar",
    color='red',
)

In [None]:
# Feature names in the same order as the columns of X_train / X_val / X_test:
# every column of house_sc except the target "Y", which is the last column.
# Slicing with [1:] would drop the first feature and include "Y", shifting all labels.
features = house_sc.drop(columns=['Y']).columns.tolist()

In [None]:
# Per-sample SHAP summary plot for the test rows.
# Depending on the shap version, a model with one output column yields either a
# one-element list or an array with a trailing output axis - reduce both to a
# 2-D (samples x features) matrix so the plot is drawn for the single output.
shap_matrix = shap_values[0] if isinstance(shap_values, list) else shap_values
if shap_matrix.ndim == 3:
    shap_matrix = shap_matrix[:, :, 0]
shap.summary_plot(shap_matrix, X_test, feature_names=features)