# How do the different models compare?

We will compare the evaluation results (mean absolute error on the test set) of the models we've trained in this module.

In [None]:
# Import modules
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV
from sklearn import linear_model
import pandas as pd
import numpy as np
import sklearn
import sklearn.ensemble
import sklearn.svm

In [None]:
# Use same evaluate_model function as before
def evaluate_model(
    model_fn,
    print_result=False,
    test_data_source="https://raw.githubusercontent.com/eliiza/ml-training-data/master/housing_price_data/test_data.csv",
):
    '''
    Evaluate the predictive accuracy of model_fn against the housing
    prices test set, measured as the mean absolute error (MAE) in dollars.

    Parameters
    ----------
    model_fn : callable
        Consumes a dataframe holding every test column except 'SalePrice'
        and returns the predicted sale prices, one per row.
    print_result : bool, default False
        Switch between a human readable printed message (True, returns None)
        and the uncurtailed floating point value of the average (False).
    test_data_source : str or file-like
        Anything pandas.read_csv accepts (URL, path or open buffer).
        Defaults to the hosted housing price test set. The data must
        contain a 'SalePrice' column.

    Returns
    -------
    float or None
        The MAE when print_result is False, otherwise None.
    '''
    test_data = pd.read_csv(test_data_source)
    # Work on plain arrays so predictions are compared row by row. Subtracting
    # pandas objects aligns on index labels instead, which silently produces
    # NaNs when a model returns a Series with a different index.
    actual_values = np.asarray(test_data['SalePrice'], dtype=float)
    # Pass in all columns except SalePrice
    test_input = test_data.drop(columns='SalePrice')
    # ravel() lets column-shaped (n, 1) predictions line up with the targets
    predicted_saleprice = np.asarray(model_fn(test_input), dtype=float).ravel()
    mae = np.mean(np.abs(predicted_saleprice - actual_values))
    if print_result:
        print("The model is inaccurate by $%.2f on average." % mae)
        return None
    return mae

In [None]:
# Functions for data encoding

# Step 1: Feature Engineering
def encode_data(data, scaler=None):
    """
    Encode a dataframe of house price data using the desired feature engineering process.
    The scaler argument allows you to either scale the data anew (scaler = None),
    or use previously derived scaling parameters,
    e.g. when you want to encode test data using the scaling parameters from the training dataset.
    Returns a tuple of (dataframe of engineered features, scaler object).

    The input must provide the columns 'OverallQual', 'GrLivArea', 'BedroomAbvGr',
    'FullBath', 'YearBuilt', 'KitchenQual', 'CentralAir', 'Electrical' and 'Heating'.
    The returned dataframe has a fresh default index and is not indexed like `data`.
    """

    # Numerical features
    # Take an explicit copy of the selected columns so that adding columns below
    # never writes into (or warns about writing into) a slice of the caller's data.
    features = data[['OverallQual','GrLivArea','BedroomAbvGr','FullBath','YearBuilt']].copy()
    features['QualAreaInteract'] = features['OverallQual'] * features['GrLivArea']

    # Ordinal feature - map to numerical as before
    # Values outside the map (including missing ones) are encoded as 0
    cond_map = {'Ex':5,'Gd':4,'TA':3,'Fa':2,'Po':1}
    features['KitchenQual'] = data['KitchenQual'].map(cond_map).fillna(0)

    # Categorical features - one-hot encode using pre-written helper functions (below)
    features['CentralAir'] = data['CentralAir'] == 'Y'
    electrical = encode_electrical(data['Electrical'])
    heating = encode_heating(data['Heating'])
    features = pd.concat([features, electrical, heating], axis=1)

    # Convert to float data type for scaling process
    features = features.astype(float)

    # Scale all the features
    # If no `scaler` object in the function arguments - carry out scaling anew
    # If `scaler` object in the function arguments - use those scaling parameters
    if scaler is None:
        scaler = MinMaxScaler()
        scaler.fit(features)
    # Reuse the engineered column names rather than repeating them by hand, so
    # the labels cannot drift out of step with the columns built above.
    features = pd.DataFrame(scaler.transform(features), columns=features.columns)

    # Return the desired data frame and the scaling parameters used
    return features, scaler


# Helper functions for one-hot encoding

def encode_electrical(electrical):
    """
    One-hot encode the 'electrical' column.

    Returns a data frame with one Boolean column for each of the categories
    'FuseA', 'FuseF', 'FuseP', 'Mix' and 'SBrkr'; a row is True in the column
    whose name equals the category string found in `electrical`.
    """
    categories = ('FuseA', 'FuseF', 'FuseP', 'Mix', 'SBrkr')
    encoded = pd.DataFrame()
    for category in categories:
        # Element-wise comparison gives the Boolean indicator for this category
        encoded[category] = electrical == category
    return encoded

def encode_heating(heating):
    """
    One-hot encode the 'heating' column.

    Returns a data frame with one Boolean column for each of the categories
    'GasA', 'GasW', 'Grav' and 'Wall'; a row is True in the column whose name
    equals the category string found in `heating`.
    """
    categories = ('GasA', 'GasW', 'Grav', 'Wall')
    encoded = pd.DataFrame()
    for category in categories:
        # Element-wise comparison gives the Boolean indicator for this category
        encoded[category] = heating == category
    return encoded

In [None]:
# Load the housing price training data from the hosted CSV file
TRAINING_DATA_URL = "https://raw.githubusercontent.com/eliiza/ml-training-data/master/housing_price_data/training_data.csv"
training_set = pd.read_csv(TRAINING_DATA_URL)

## Heuristic Brute Force

In [None]:
def heuristic(input_data, price_per_quality_point=29000):
    """
    Predict sale prices as a fixed dollar amount per 'OverallQual' point.

    Extracts a single vector called 'OverallQual' from input data and
    multiplies every value by price_per_quality_point (29,000 by default).
    Returns the resulting vector of predicted prices.
    """
    overall_quality = input_data['OverallQual']
    prediction = price_per_quality_point * overall_quality
    return prediction

# Score the heuristic on the test set; print_result=False returns the raw
# mean absolute error so it can be used in the comparison table later on
brute_force = evaluate_model(heuristic, print_result=False)

## Linear Regression

In [None]:
# Train a single-feature linear regression on 'OverallQual'
training_features = training_set[['OverallQual']]
# Use the fully qualified module path: the prediction function defined below is
# also called `linear_model`, so once this cell has run the bare name refers to
# that function and `linear_model.LinearRegression()` would fail on a re-run.
predictor = sklearn.linear_model.LinearRegression()
predictor.fit(training_features, training_set['SalePrice'])

# Define function with prediction
# `predictor` is bound as a default argument so this function keeps using the
# model fitted in this cell even after later cells rebind the global name.
def linear_model(input_data, predictor=predictor):
    """
    Predict sale prices from the 'OverallQual' column of input_data
    using the linear regression fitted above.
    """
    return predictor.predict(input_data[['OverallQual']])

In [None]:
# Score the single-feature linear regression on the test set and keep the
# raw mean absolute error (print_result=False returns the value)
linear_reg = evaluate_model(linear_model, print_result=False)

## Multiple Linear Regression

In [None]:
# Step 1: Feature engineering
training_features, scaler = encode_data(training_set)

# Step 2: Train the model
predictor = sklearn.linear_model.LinearRegression()
predictor.fit(training_features, training_set['SalePrice'])

# Step 3: Create a function that can make predictions using the model
# `scaler` and `predictor` are bound as default arguments so this function
# keeps using the objects fitted in this cell even after later cells rebind
# those global names.
def mlr_model(input_data, scaler=scaler, predictor=predictor):
    """
    Encode input_data with the training scaler and predict sale prices
    using the multiple linear regression fitted above.
    """
    input_features, _ = encode_data(input_data, scaler)
    predictions = predictor.predict(input_features)
    return predictions

In [None]:
# Step 4: Evaluate
# print_result=False returns the raw mean absolute error instead of printing it
mlr = evaluate_model(mlr_model, print_result=False)

## Random Forest

In [None]:
# Step 1: Feature engineering
training_features, scaler = encode_data(training_set)

# Step 2: Train the model
# NOTE: no random_state is passed, so the fitted forest (and therefore its
# score) can differ from run to run.
predictor = sklearn.ensemble.RandomForestRegressor(n_estimators=100)
predictor.fit(training_features, training_set['SalePrice'])

# Step 3: Create a function that can make predictions using the model
# `scaler` and `predictor` are bound as default arguments so this function
# keeps using the objects fitted in this cell even after later cells rebind
# those global names.
def rf_model(input_data, scaler=scaler, predictor=predictor):
    """
    Encode input_data with the training scaler and predict sale prices
    using the random forest fitted above.
    """
    input_features, _ = encode_data(input_data, scaler)
    predictions = predictor.predict(input_features)
    return predictions

# Step 4: Evaluate
# print_result=False returns the raw mean absolute error instead of printing it
rf = evaluate_model(rf_model, print_result=False)

## Gradient Boosting

In [None]:
# Step 1: Feature engineering
training_features, scaler = encode_data(training_set)

# Step 2: Train the model
predictor = sklearn.ensemble.GradientBoostingRegressor()
predictor.fit(training_features, training_set['SalePrice'])

# Step 3: Create a function that can make predictions using the model
# `scaler` and `predictor` are bound as default arguments so this function
# keeps using the objects fitted in this cell even if other cells rebind
# those global names afterwards.
def boosting_model(input_data, scaler=scaler, predictor=predictor):
    """
    Encode input_data with the training scaler and predict sale prices
    using the gradient boosting model fitted above.
    """
    input_features, _ = encode_data(input_data, scaler)
    predictions = predictor.predict(input_features)
    return predictions

# Step 4: Evaluate
# print_result=False returns the raw mean absolute error instead of printing it
gb = evaluate_model(boosting_model, print_result=False)

## Plot the results

In [None]:
# Gather each model's test-set error into one table, ordered from the
# smallest mean absolute error to the largest
results = pd.DataFrame(
    {
        'Mean Absolute Error $': [brute_force, linear_reg, mlr, rf, gb],
        'Model': [
            'Brute Force',
            'Linear Regression',
            'Multiple Linear Regression',
            'Random Forest',
            'Gradient Boosting',
        ],
    }
).sort_values(by='Mean Absolute Error $')
# Bare expression so the notebook displays the table
results

In [None]:
# Horizontal bar chart with one bar per model showing its mean absolute error;
# rot=0 requests unrotated tick labels. The returned axes are kept for saving.
ax = results.plot.barh(x='Model', y='Mean Absolute Error $', rot=0)

In [None]:
# Write the chart to model_comparison.png in the current working directory at
# 300 dpi; bbox_inches='tight' crops to the drawn content and pad_inches
# leaves a 0.1 inch margin around it
fig = ax.get_figure()
fig.savefig('./model_comparison.png', dpi = 300, bbox_inches= 'tight', pad_inches = 0.1)