# Discover ML with the Ames, Iowa house pricing dataset

## Import libraries and data

Importing base python libraries we will use throughout the workshop + base configuration for the plots.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy import stats
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

For this workshop we will only work on a subset of the available features.

In [None]:
# Columns kept for the workshop. The order below is the column order of the
# working DataFrame, so it must not be shuffled.
selected_features = [
    # Prediction target
    'SalePrice',
    # Lot, location and building type
    'LotArea', 'Neighborhood', 'HouseStyle',
    # Quality / condition ratings
    'OverallQual', 'KitchenQual', 'OverallCond',
    # Construction and equipment
    'YearBuilt', 'Foundation', 'Heating', 'CentralAir',
    # Size-related measurements
    'GrLivArea', 'GarageCars', 'PoolArea',
]

Importing the data from an online repository

In [None]:
# Download the dataset from the workshop repository; the 'Id' column becomes the row index.
full_df = pd.read_csv('https://raw.githubusercontent.com/cloderic/ml_intro/master/data/house_prices/data.csv', index_col='Id')
# Working frame: only the columns listed in `selected_features`, in that order.
df = full_df[selected_features]
df.head()

Counting the number of transactions

In [None]:
len(df)

Importing some feature metadata

In [None]:
# Download the feature descriptions published next to the dataset.
metadata_df = pd.read_json('https://raw.githubusercontent.com/cloderic/ml_intro/master/data/house_prices/data_description.json')
# Keep only the rows describing the selected features and index them by feature name,
# so that later lookups can be done with the feature name as key.
metadata_df = metadata_df.loc[metadata_df['feature'].isin(selected_features)].set_index('feature')
metadata_df

## Data Exploration

### Numerical & categorical features

In [None]:
# Split the columns by dtype through the public pandas API (numeric and boolean
# columns count as numerical). Both lists follow the column order of `df`, so
# the printed output is identical from one kernel session to the next.
numerical_features = list(df.select_dtypes(include=['number', 'bool']).columns)
categorical_features = [column for column in df.columns if column not in numerical_features]
print('categorical features', categorical_features)
print('numerical features', numerical_features)

In [None]:
def describe_feature(df, feature):
    """Print a plain-text summary of one column of `df`.

    The summary always contains the feature name, the number of records and
    the number of null records. When the feature is listed in the
    notebook-level `metadata_df`, its description is printed as well.

    - Categorical columns (those for which `Series.describe()` reports a
      'unique' entry) additionally list every distinct value with its
      description and its number of records.
    - Numerical columns additionally list mean, standard deviation, min,
      quartiles and max.

    Args:
        df: DataFrame holding the column to describe.
        feature: name of the column.

    Returns:
        None. Everything is written to standard output.
    """
    column = df[feature]
    description = column.describe()

    print('feature: \t\t', feature)
    if feature in metadata_df.index:
        print('description: \t\t', metadata_df.loc[feature, 'description'])
    print('# records: \t\t', description['count'])
    print('# null records: \t', column.isnull().sum())

    if 'unique' in description:
        # It's a categorical feature
        print('# values: \t\t', description['unique'])
        print('values:')
        for value, records in column.value_counts().items():
            # The metadata may not document this feature or this particular
            # value; report a placeholder rather than aborting the listing.
            try:
                value_description = metadata_df['values'][feature][value]
            except (KeyError, TypeError, IndexError):
                value_description = 'n/a'
            print('  - value: \t\t', value)
            print('    description: \t', value_description)
            print('    # records: \t\t', records)
    else:
        # It's a numerical feature
        numerical_lines = (
            ('average: \t\t', 'mean'),
            ('standard deviation: \t', 'std'),
            ('min: \t\t\t', 'min'),
            ('1st quartile: \t\t', '25%'),
            ('median: \t\t', '50%'),
            ('3rd quartile: \t\t', '75%'),
            ('max: \t\t\t', 'max'),
        )
        for label, statistic in numerical_lines:
            print(label, description[statistic])

Use the `describe_feature` function to explore the features, categorical or numerical.

In [None]:
describe_feature(df, 'Foundation')

In [None]:
describe_feature(df, 'OverallCond')

### Price distribution

In [None]:
# Setup Seaborn style
# (the figure size set here applies to every plot drawn afterwards)
sns.set(rc={'figure.figsize':(18,12)})

# Distribution of the sale price over the whole dataset.
sns.distplot(df['SalePrice']);
# Optional: the same plot restricted to the 'NridgHt' neighborhood.
#sns.distplot(df[df['Neighborhood']=='NridgHt']['SalePrice'])

In [None]:
# Summary statistics of the sale price per neighborhood,
# ordered from the highest to the lowest mean price.
df_g_neighborhood = df.groupby(by='Neighborhood')
df_g_neighborhood['SalePrice'].describe().sort_values('mean', ascending=False)

In [None]:
def plot_neighborhood_distributions(df, column):
    """Draw one distribution curve of `column` per neighborhood of `df`.

    Histogram and rug are disabled, so only the curve is drawn. Each curve is
    labelled with the neighborhood description read from the notebook-level
    `metadata_df`, followed by the neighborhood code in parentheses.
    """
    for code, houses in df.groupby(by='Neighborhood'):
        description = metadata_df['values']['Neighborhood'][code]
        curve_label = description + ' (' + code + ')'
        sns.distplot(houses[column], hist=False, rug=False, label=curve_label)

plot_neighborhood_distributions(df, 'SalePrice')

### Relationship with other numerical features

In [None]:
# Correlations are only defined for numeric columns. Selecting them explicitly
# keeps this cell working on pandas versions where `DataFrame.corr` no longer
# discards text columns on its own (it raises on them instead).
corrmat = df.select_dtypes(include=['number', 'bool']).corr()
sns.heatmap(corrmat, square=True);

In [None]:
# Correlation of every numerical feature with the sale price, annotated with
# the feature description and ranked by absolute strength.
correlations_df = (
    pd.merge(
        corrmat[['SalePrice']].rename(columns={'SalePrice': 'correlation'}),
        metadata_df[['description']],
        how='left',
        left_index=True,
        right_index=True,
    )
    .assign(abs_correlation=lambda frame: abs(frame['correlation']))
    .sort_values('abs_correlation', ascending=False)
)
correlations_df

#### Price vs Living Area

In [None]:
def plot_2d(x_feature, y_feature):
    """Scatter-plot `y_feature` against `x_feature`, both read from the notebook-level `df`."""
    plotted_columns = [x_feature, y_feature]
    df[plotted_columns].plot.scatter(x=x_feature, y=y_feature)

plot_2d('GrLivArea', 'SalePrice')

#### Price vs Pool area

In [None]:
plot_2d('PoolArea', 'SalePrice')

#### Price vs Overall Quality

![Box plot explanation](https://upload.wikimedia.org/wikipedia/commons/1/1a/Boxplot_vs_PDF.svg)

In [None]:
def plot_box2d(x_feature, y_feature):
    """Draw one box plot of `y_feature` per distinct value of `x_feature` (columns of the notebook-level `df`)."""
    plotted_columns = [x_feature, y_feature]
    sns.boxplot(x=x_feature, y=y_feature, data=df[plotted_columns])

plot_box2d('OverallQual', 'SalePrice')

#### Price vs Overall condition

In [None]:
plot_box2d('OverallCond', 'SalePrice')

#### Overall Quality vs Neighborhood

In [None]:
plot_box2d('Neighborhood', 'OverallQual')

### Encoding categorical features

In [None]:
categorical_features

In [None]:
describe_feature(df, 'CentralAir')
print('------------')
describe_feature(df, 'KitchenQual')

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

def target_encoder(df, encoded_feature, target_feature):
    """Append the per-category mean of `target_feature` as a new column.

    Every row receives the mean of `target_feature` computed over all rows of
    `df` sharing the same `encoded_feature` value. The new column is named
    'TargetEncoded<target_feature>Mean<encoded_feature>'.

    The means are computed on the frame that is passed in: if it contains rows
    later used for testing, those rows contribute to the encoding.

    Args:
        df: input DataFrame; it is not modified.
        encoded_feature: name of the categorical column to encode.
        target_feature: name of the numerical column to average.

    Returns:
        A new DataFrame with the same index and columns as `df` plus the
        encoded column.
    """
    encoded_column = 'TargetEncoded{}Mean{}'.format(target_feature, encoded_feature)
    # `transform` broadcasts each group's mean back onto the original rows, so
    # the result is already aligned with `df.index` (no merge / re-index needed).
    category_means = df.groupby(encoded_feature)[target_feature].transform('mean')
    return df.assign(**{encoded_column: category_means})

def encode_categorical_features(df, onehot_encoded_features=None, ordinal_encoded_features=None, target_encoded_features=None):
    """Replace categorical columns of `df` by numerical encodings.

    Args:
        df: input DataFrame; it is not modified.
        onehot_encoded_features: list of columns to one-hot encode. The first
            category of each column is dropped, so a column with k categories
            yields k-1 indicator columns.
        ordinal_encoded_features: dict mapping a column name to the ordered
            list of its categories (lowest rank first). Each column yields one
            'Encoded<column>' column holding the rank of the category.
        target_encoded_features: dict mapping a column name to the target
            column whose per-category mean is used as encoding (see
            `target_encoder`).

    Returns:
        A new DataFrame containing the untouched columns of `df` followed by
        the encoded columns; the source columns of every encoding are removed.
    """
    # `None` defaults avoid sharing mutable default objects between calls.
    onehot_encoded_features = list(onehot_encoded_features or [])
    ordinal_encoded_features = dict(ordinal_encoded_features or {})
    target_encoded_features = dict(target_encoded_features or {})
    ordinal_encoded_features_keys = list(ordinal_encoded_features.keys())
    target_encoded_features_keys = list(target_encoded_features.keys())

    frames = [df]

    # One hot encoding (skipped when nothing is requested: the encoder cannot
    # be fitted on a frame without columns).
    if onehot_encoded_features:
        onehot_encoder = OneHotEncoder(drop='first')
        onehot_values = onehot_encoder.fit_transform(df[onehot_encoded_features])
        # The encoder returns a sparse matrix by default; densifying it here
        # avoids the `sparse` / `sparse_output` argument, whose name depends
        # on the installed scikit-learn version.
        if hasattr(onehot_values, 'toarray'):
            onehot_values = onehot_values.toarray()
        # `get_feature_names` was superseded by `get_feature_names_out`.
        if hasattr(onehot_encoder, 'get_feature_names_out'):
            onehot_columns = onehot_encoder.get_feature_names_out(onehot_encoded_features)
        else:
            onehot_columns = onehot_encoder.get_feature_names(onehot_encoded_features)
        frames.append(pd.DataFrame(data=onehot_values, columns=onehot_columns, index=df.index))

    # Ordinal encoding, with the category order supplied by the caller.
    if ordinal_encoded_features_keys:
        ordinal_encoder = OrdinalEncoder(
            categories=[list(ordinal_encoded_features[key]) for key in ordinal_encoded_features_keys])
        frames.append(pd.DataFrame(
            data=ordinal_encoder.fit_transform(df[ordinal_encoded_features_keys]),
            columns=['Encoded{}'.format(key) for key in ordinal_encoded_features_keys],
            index=df.index))

    encoded_df = pd.concat(frames, axis=1)

    # Target encoding
    for encoded_feature in target_encoded_features_keys:
        encoded_df = target_encoder(df=encoded_df, encoded_feature=encoded_feature, target_feature=target_encoded_features[encoded_feature])

    # Drop the encoded features
    for encoded_columns in (onehot_encoded_features, ordinal_encoded_features_keys, target_encoded_features_keys):
        encoded_df = encoded_df.drop(encoded_columns, axis=1, errors='ignore')
    return encoded_df
    
encoded_df = encode_categorical_features(
    df,
    onehot_encoded_features=['CentralAir','HouseStyle','Foundation','Heating', 'Neighborhood'],
    # Categories are listed from worst to best so that the encoded rank grows
    # with quality: Poor < Fair < Typical/Average < Good < Excellent.
    ordinal_encoded_features={'KitchenQual':['Po', 'Fa', 'TA', 'Gd', 'Ex']},
    # Alternative encoding for the neighborhood (remove it from the one-hot list when enabling):
    #target_encoded_features={'Neighborhood': 'SalePrice'}
)
encoded_df.head()

### Update the correlation matrix with the encoded features

In [None]:
# Recompute the correlation matrix, this time on the encoded frame
# (this overwrites the `corrmat` computed on the raw features).
corrmat = encoded_df.corr()
sns.heatmap(corrmat, square=True);

In [None]:
# Correlation of every encoded feature with the sale price, ranked by absolute strength.
correlations_df = (
    corrmat[['SalePrice']]
    .rename(columns={'SalePrice': 'correlation'})
    .assign(abs_correlation=lambda frame: abs(frame['correlation']))
    .sort_values('abs_correlation', ascending=False)
)
correlations_df

## Linear regression

### Train / Test split

In [None]:
from sklearn.model_selection import train_test_split

# Column the models have to predict.
target = 'SalePrice'
# Columns deliberately left out of the model inputs.
ignored_features = ['OverallCond']
# Model inputs: every remaining encoded column, kept in the column order of
# `encoded_df` so that the feature order (and therefore every coefficient or
# importance table built from it) is the same on every run.
features = [column for column in encoded_df.columns
            if column != target and column not in ignored_features]

Input values can be normalized

In [None]:
from sklearn.preprocessing import StandardScaler

# Work on a copy so that `encoded_df` keeps the raw (non-normalized) values.
normalized_encoded_df = encoded_df.copy()
# One scaler for the input columns...
# NOTE(review): both scalers are fitted on every row of `encoded_df`, i.e. before
# the train/test split done in a later cell, so test rows contribute to the
# scaling statistics. Consider fitting on the training rows only.
features_normalizer = StandardScaler()
normalized_encoded_df[features] = features_normalizer.fit_transform(encoded_df[features])
# ...and a separate one for the target, kept around so that predictions can be
# converted back with `inverse_transform`.
target_normalizer = StandardScaler()
normalized_encoded_df[[target]] = target_normalizer.fit_transform(encoded_df[[target]])
normalized_encoded_df.head()

In [None]:
# 70% / 30% split of the raw and of the normalized frames. Both calls use the
# same test_size and random_state — presumably so that the two splits select
# the same rows (the frames have the same length); verify if this matters.
train_df, test_df = train_test_split(encoded_df, test_size=0.3, random_state=666)
train_normalized_df, test_normalized_df = train_test_split(normalized_encoded_df, test_size=0.3, random_state=666)
# Result frames start with the ground truth only; each model later adds one column of predictions.
train_results_df = train_df[[target]].rename(columns={target: 'Truth'})
test_results_df = test_df[[target]].rename(columns={target: 'Truth'})

In [None]:
# Overlay the sale price distribution of the train and test sets (curves only, no histogram).
sns.distplot(train_df['SalePrice'], hist=False, rug=False, label='train')
sns.distplot(test_df['SalePrice'], hist=False, rug=False, label='test')

### Training

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score

def plot_results(results):
    """Plot every column of `results` as a line, with rows ordered by increasing 'Truth'.

    Args:
        results: DataFrame with a 'Truth' column and one column per model.
    """
    # The original index is discarded so that the x axis is the rank of the row
    # once sorted. Dropping it directly works whatever the index is called.
    sns.lineplot(data=results.sort_values('Truth').reset_index(drop=True))
  
def compute_scores(method, results):
    """Score the predictions stored in column `method` against column 'Truth'.

    Returns:
        A Series named `method` holding the mean absolute error ('mae ($)')
        and the R² score ('r2').
    """
    truth = results['Truth']
    predicted = results[method]
    scores = {
        'mae ($)': mean_absolute_error(truth, predicted),
        'r2': r2_score(truth, predicted),
    }
    return pd.Series(scores, name=method)

def update_results(method, trained_regressor, df, result_df, trained_scaler = None):
    """Store the predictions of a model in `result_df`, plot them and score every model.

    Args:
        method: name of the column that receives the predictions.
        trained_regressor: fitted model exposing `predict`.
        df: DataFrame holding the input columns listed in `features`.
        result_df: DataFrame with a 'Truth' column; it is modified in place
            (column `method` is added or overwritten).
        trained_scaler: optional fitted scaler of the target. When given, the
            predictions are mapped back with its `inverse_transform`.

    Returns:
        A DataFrame with one row of scores per model present in `result_df`,
        in the order the models were added.
    """
    predictions = np.asarray(trained_regressor.predict(df[features]))
    if trained_scaler is not None:
        # Scalers expect a 2-D input, while some regressors return a flat
        # array: reshape to a single column before inverting the scaling.
        predictions = trained_scaler.inverse_transform(predictions.reshape(-1, 1))
    # Always store a flat array, whatever shape the model returned.
    result_df[method] = np.ravel(predictions)

    plot_results(result_df)

    scored_methods = [column for column in result_df.columns if column != 'Truth']
    return pd.DataFrame([compute_scores(scored_method, result_df) for scored_method in scored_methods])

# Fit a linear regression on the raw (non-normalized) training rows. The target is
# passed as a one-column frame, which is why the coefficients are read from `coef_[0]` later on.
# NOTE(review): fit_intercept=False removes the constant term — confirm this is intended for non-centred data.
lin_regressor = LinearRegression(fit_intercept=False).fit(train_df[features], train_df[[target]])
update_results('Simple Linear Regression', lin_regressor, train_df, train_results_df)

### Testing

In [None]:
update_results('Simple Linear Regression', lin_regressor, test_df, test_results_df)

Let's extract the largest errors

In [None]:
# The five test rows on which the simple linear regression is furthest from the truth.
worse_error_df = (
    test_results_df
    .assign(absolute_error=lambda results: abs(results['Truth'] - results['Simple Linear Regression']))
    .sort_values('absolute_error', ascending=False)
    .head()
)
worse_error_df

In [None]:
df[df.index.isin(worse_error_df.index)]

### Understanding the learned model

In [None]:
pd.DataFrame(data=lin_regressor.coef_[0], columns=['coef'], index=features).sort_values('coef', ascending=False)

## Normalized Linear regression

### Training

In [None]:
# Same model as before, fitted on the normalized training rows.
normalized_lin_regressor = LinearRegression(fit_intercept=False).fit(train_normalized_df[features], train_normalized_df[[target]])
 
# `target_normalizer` is passed so that the predictions are converted back before being scored.
update_results('Normalized Linear Regression', normalized_lin_regressor, train_normalized_df, train_results_df, target_normalizer)

### Testing

In [None]:
update_results('Normalized Linear Regression', normalized_lin_regressor, test_normalized_df, test_results_df, target_normalizer)

### Understanding the learned model

In [None]:
pd.DataFrame(data=normalized_lin_regressor.coef_[0], columns=['coef'], index=features).sort_values('coef', ascending=False)

## Decision Trees

### Learning

In [None]:
from sklearn.tree import DecisionTreeRegressor

# A fixed random_state makes the fitted tree, and every score derived from it,
# reproducible across runs; 666 is the seed already used for the train/test split.
dt_regressor = DecisionTreeRegressor(max_depth=10, random_state=666).fit(train_df[features], train_df[[target]])
update_results('Decision Tree', dt_regressor, train_df, train_results_df)

### Test

In [None]:
update_results('Decision Tree', dt_regressor, test_df, test_results_df)

### Understanding the learned model

In [None]:
from io import StringIO  
from IPython.display import Image  
from sklearn.tree import export_graphviz
import pydotplus

def plot_dt(dt, max_depth):
    """Render the top `max_depth` levels of a fitted tree as a PNG image.

    Nodes are labelled with the names listed in the notebook-level `features`.

    Returns:
        An `IPython.display.Image` that the notebook displays inline.
    """
    export_options = dict(
        filled=True,
        rounded=True,
        feature_names=features,
        max_depth=max_depth,
        special_characters=True,
    )
    dot_buffer = StringIO()
    export_graphviz(dt, out_file=dot_buffer, **export_options)
    tree_graph = pydotplus.graph_from_dot_data(dot_buffer.getvalue())
    png_bytes = tree_graph.create_png()
    return Image(png_bytes)

plot_dt(dt_regressor, max_depth=3)

## Neural Network

### Training

In [None]:
from keras.models import Sequential
from keras.layers import Dense

# Creating a validation set: 10% of the normalized training rows are held out
# and only used to monitor the loss during training.
train_nn_df, validate_nn_df = train_test_split(train_normalized_df, test_size=0.1, random_state=666)

# One hidden layer followed by a single output unit.
# The output activation is linear: the target is standardized, so it takes
# negative values and values above 1, which a sigmoid (bounded to (0, 1))
# could never produce.
nn_regressor = Sequential([
    Dense(8, name='hidden', activation='relu', input_shape=(len(features),)),
    Dense(1, name='output', activation='linear', use_bias=False),
])

nn_regressor.compile(optimizer='sgd',
                     loss='mean_squared_error',
                     metrics=['mae'])

nn_regressor.fit(train_nn_df[features], train_nn_df[[target]],
                 batch_size=32, epochs=100,
                 validation_data=(validate_nn_df[features], validate_nn_df[[target]]))

# Predictions are made in normalized units; `target_normalizer` converts them back before scoring.
update_results('Neural Network', nn_regressor, train_normalized_df, train_results_df, target_normalizer)

### Testing

In [None]:
update_results('Neural Network', nn_regressor, test_normalized_df, test_results_df, target_normalizer)

### Understanding the learned model

In [None]:
nn_regressor.summary()

In [None]:
nn_regressor.get_weights()

## Random Forest

### Training

In [None]:
from sklearn.ensemble import RandomForestRegressor

# A random forest draws random samples and features while growing its trees:
# fixing random_state makes the scores and feature importances reproducible
# across runs; 666 is the seed already used for the train/test split.
rf_regressor = RandomForestRegressor(random_state=666).fit(train_df[features], train_df[[target]])
update_results('Random Forest', rf_regressor, train_df, train_results_df)

### Test

In [None]:
update_results('Random Forest', rf_regressor, test_df, test_results_df)

### Understanding the model

In [None]:
pd.DataFrame(data=rf_regressor.feature_importances_, columns=['feature_importance'], index=features).sort_values('feature_importance', ascending=False)

In [None]:
plot_dt(rf_regressor[0], max_depth=3)