In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

#import sklearn
import sklearn.model_selection as ms
from sklearn.model_selection import train_test_split
from sklearn import ensemble # for random forest


# Hyper-parameter tuning
from sklearn.model_selection import GridSearchCV

# Examining feature importance
from sklearn.inspection import permutation_importance
import shap


%matplotlib inline
# Use matplotlib's 'fivethirtyeight' style sheet for the figures drawn in this notebook.
plt.style.use('fivethirtyeight')

# Show every column when a DataFrame is displayed (None removes the column limit).
pd.set_option('display.max_columns', None)





ModuleNotFoundError: No module named 'shap'

# FUNCTION DEFINITION

In [None]:
# Define helper functions for model evaluation and feature importance

def evaluate(model, test_features, test_labels, train_features, train_labels):
    """Print and return hold-out performance metrics for a fitted model.

    Parameters
    ----------
    model : fitted estimator
        Must expose ``score(X, y)`` and ``predict(X)``.
    test_features, test_labels : array-like
        Hold-out inputs and targets; used for the score, the predictions and
        every error metric.
    train_features, train_labels : array-like
        Training inputs and targets; used only for the training score.

    Returns
    -------
    tuple
        ``(accuracy, r2_train, r2_test, error_percent_sale_price)`` where

        * ``accuracy`` is ``100 - MAPE`` (a percentage),
        * ``r2_train`` / ``r2_test`` are the raw values returned by
          ``model.score`` (for scikit-learn regressors this is R^2 as a
          fraction, not a percentage),
        * ``error_percent_sale_price`` is the signed relative error
          ``(predictions - test_labels) / test_labels`` per sample (a
          fraction, despite the name).

    Notes
    -----
    The relative-error metrics divide by ``test_labels``; a label equal to 0
    yields ``inf``/``nan`` in MAPE and in the returned relative errors.
    """
    r2_test = model.score(test_features, test_labels)
    r2_train = model.score(train_features, train_labels)

    predictions = model.predict(test_features)
    # Compute the signed residuals once and derive both error views from them.
    signed_errors = predictions - test_labels
    errors = np.abs(signed_errors)
    error_percent_sale_price = signed_errors / test_labels

    mape = 100 * np.mean(errors / test_labels)
    accuracy = 100 - mape

    print('Model Performance')
    print('Average Test Error: {:0.4f} $USD'.format(np.mean(errors)))
    print('Test Accuracy = {:0.2f}%'.format(accuracy))
    # model.score returns a fraction; scale by 100 so the value matches the
    # '%' suffix (previously 0.85 was printed as "0.85%").
    print('R-squared train = {:0.2f}%'.format(100 * r2_train))
    print('R-squared test = {:0.2f}%'.format(100 * r2_test))
    return (accuracy, r2_train, r2_test, error_percent_sale_price)

# Built-in Gini importance: the mean decrease in impurity (features are selected based on variance reduction).
# However, it tends to prefer numerical features and categorical features with high cardinality,
# and it is not robust to correlated features.

def get_importance(model, x_test, y_test, top_n=20, min_importance=0.001):
    """Print, plot and return the built-in feature importances of a fitted model.

    Parameters
    ----------
    model : fitted estimator
        Must expose ``feature_importances_``.
    x_test : DataFrame or array-like
        Only its ``columns`` attribute (when present) is read, to label the
        importances.
    y_test : any
        Unused; kept so existing calls keep working.
    top_n : int, default 20
        Number of highest-ranked features drawn in the bar chart (positive).
    min_importance : float, default 0.001
        Only features whose rounded importance is strictly greater than this
        value are printed. The returned list is not filtered.

    Returns
    -------
    list of tuple
        ``(feature_name, importance)`` pairs for every feature, importance
        rounded to 4 decimals, sorted from most to least important.

    Raises
    ------
    ValueError
        If no source of feature names has one name per importance value.
    """
    importances = list(model.feature_importances_)

    # Resolve the labels from the arguments first so the function no longer
    # depends on notebook state; the module-level ``features`` list is kept as
    # a last resort for callers that pass plain arrays.
    candidates = (
        getattr(x_test, 'columns', None),
        getattr(model, 'feature_names_in_', None),
        globals().get('features'),
    )
    feature_names = next(
        (list(names) for names in candidates
         if names is not None and len(names) == len(importances)),
        None,
    )
    if feature_names is None:
        raise ValueError(
            'Could not find {} feature names: pass x_test as a DataFrame, fit the '
            'model on a DataFrame, or define a global `features` list.'.format(len(importances))
        )

    # List of (name, importance) tuples
    feature_importances = [
        (feature, round(importance, 4))
        for feature, importance in zip(feature_names, importances)
    ]

    # Most important first: this ordering is printed and returned
    feature_importances_sort = sorted(feature_importances, key=lambda pair: pair[1], reverse=True)
    for pair in feature_importances_sort:
        if pair[1] > min_importance:
            print('Variable: {:20} Importance: {}'.format(*pair))

    # Plot Gini importance: ascending order so the largest bar ends up on top
    ascending = sorted(feature_importances, key=lambda pair: pair[1])
    featureNames, featureScores = zip(*ascending[-top_n:])
    plt.barh(range(len(featureScores)), featureScores, tick_label=featureNames)
    plt.xlabel("Gini Feature Importance")

    return feature_importances_sort

In [4]:
# Import Cleaned Data (no encoding or dummification)
# NOTE(review): the path is relative, so this cell depends on the directory the
# kernel was started from.
# NOTE(review): the preview of `data` shows an 'Unnamed: 0' column, which suggests
# the CSV was written with its index; consider index_col=0 if nothing downstream
# relies on that column - confirm before changing.

data = pd.read_csv('../../Austin/ames_house_price_no_dummies.csv')

In [5]:
# Preview the first rows to sanity-check the load.
data.head()

Unnamed: 0,pid,gr_liv_area,sale_price,ms_sub_class,ms_zoning,lot_frontage,lot_area,alley,lot_shape,land_contour,...,mo_sold,yr_sold,sale_type,sale_condition,bc_sale_price,near_rr,near_main_rd,near_pos,house_age_at_sale,construction_age_at_sale
0,909176150,856,126000,30,RL,80.0,7890,,4,Lvl,...,3,2010,WD,Normal,10.299527,0,0,0,71,60
1,905476230,1049,139500,120,RL,42.0,4235,,4,Lvl,...,2,2009,WD,Normal,10.377252,0,0,0,25,25
2,911128020,1001,124900,30,C (all),60.0,6060,,4,Lvl,...,11,2007,WD,Normal,10.292822,0,0,0,77,0
3,535377150,1039,114000,70,RL,80.0,8146,,4,Lvl,...,5,2009,WD,Normal,10.222923,0,0,0,109,6
4,534177230,1665,227000,60,RL,70.0,8400,,4,Lvl,...,11,2009,WD,Normal,10.746564,0,0,0,8,8
