In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

#import sklearn
import sklearn.model_selection as ms
from sklearn.model_selection import train_test_split
from sklearn import ensemble # for random forest


# Hyper-parameter tuning
from sklearn.model_selection import GridSearchCV

# Examining feature importance
from sklearn.inspection import permutation_importance
import shap


%matplotlib inline
# Notebook-wide display configuration (affects every later cell).
# Use matplotlib's built-in 'fivethirtyeight' style sheet for all figures.
plt.style.use('fivethirtyeight')

# None removes the column limit so wide DataFrames are displayed without truncation.
pd.set_option('display.max_columns', None)





# FUNCTION DEFINITION

In [1]:
# Define helper functions for model evaluation and feature importance

def evaluate(model, test_features, test_labels, train_features, train_labels):
    """Print and return hold-out performance metrics for a fitted estimator.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``score(X, y)`` and ``predict(X)``.
    test_features, test_labels
        Hold-out inputs and targets used for the error metrics and test score.
    train_features, train_labels
        Training inputs and targets; used only to report the training score.

    Returns
    -------
    tuple
        ``(accuracy, r2_train, r2_test, error_percent_sale_price)`` where

        * ``accuracy`` is ``100 - MAPE`` (in percent),
        * ``r2_train`` / ``r2_test`` are the raw, unscaled values returned by
          ``model.score`` (reported as R-squared),
        * ``error_percent_sale_price`` is the signed relative error
          ``(predictions - test_labels) / test_labels`` per sample.

    Notes
    -----
    Labels equal to zero make the relative errors infinite/NaN; filter such
    rows out before calling if they can occur.
    """
    r2_test = model.score(test_features, test_labels)
    r2_train = model.score(train_features, train_labels)

    predictions = model.predict(test_features)
    residuals = predictions - test_labels
    errors = np.abs(residuals)
    error_percent_sale_price = residuals / test_labels

    # Divide by the label magnitude so negative targets cannot cancel out or
    # flip the sign of the mean absolute percentage error.
    mape = 100 * np.mean(errors / np.abs(test_labels))
    accuracy = 100 - mape

    print('Model Performance')
    print('Average Test Error: {:0.4f} $USD'.format(np.mean(errors)))
    print('Test Accuracy = {:0.2f}%'.format(accuracy))
    # model.score returns a fraction; scale it so the value matches the '%'
    # suffix (display only - the returned scores stay unscaled).
    print('R-squared train = {:0.2f}%'.format(100 * r2_train))
    print('R-squared test = {:0.2f}%'.format(100 * r2_test))
    return (accuracy, r2_train, r2_test, error_percent_sale_price)

# Built-in Gini importance: the mean decrease in impurity (features are scored by the variance reduction of their splits).
# It tends to favor numerical features and categorical features with high cardinality,
# and it is not robust to correlated features.

def get_importance(model, x_test, y_test):
    """Print, plot and return the model's built-in feature importances.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``feature_importances_``.
    x_test
        Feature matrix. When it has a ``columns`` attribute whose length
        matches the importances, those are used as the feature names.
    y_test
        Unused; kept so existing call sites keep working.

    Returns
    -------
    list of tuple
        ``(feature_name, importance rounded to 4 decimals)`` pairs sorted from
        most to least important.

    Raises
    ------
    NameError
        If no feature names can be resolved from ``x_test``, the model, or the
        notebook-level ``features`` variable.

    Side effects
    ------------
    Prints every feature whose rounded importance exceeds 0.001 and draws a
    horizontal bar chart of the (up to) 20 largest importances on the current
    matplotlib axes.
    """
    importances = list(model.feature_importances_)

    # Resolve names from the arguments first so the function does not depend
    # on hidden notebook state; a candidate is only accepted when its length
    # matches, because zip() would otherwise silently truncate.
    feature_names = None
    for candidate in (getattr(x_test, "columns", None),
                      getattr(model, "feature_names_in_", None)):
        if candidate is not None and len(candidate) == len(importances):
            feature_names = list(candidate)
            break
    if feature_names is None:
        # Legacy behaviour: fall back to the notebook-level `features` list.
        try:
            feature_names = list(features)
        except NameError:
            raise NameError(
                "Could not determine feature names: pass x_test as a DataFrame "
                "or define a global `features` list."
            ) from None

    # List of (variable, importance) tuples
    feature_importances = [
        (feature, round(importance, 4))
        for feature, importance in zip(feature_names, importances)
    ]
    # Most important first
    feature_importances_sort = sorted(
        feature_importances, key=lambda pair: pair[1], reverse=True
    )

    # Report only features with a non-negligible importance
    for pair in feature_importances_sort:
        if pair[1] > 0.001:
            print('Variable: {:20} Importance: {}'.format(*pair))

    # Nothing to plot (and zip(*[]) cannot be unpacked) when there are no features.
    if not feature_importances:
        return feature_importances_sort

    # Plot Gini importance: ascending order puts the largest bar at the top of barh.
    ascending = sorted(feature_importances, key=lambda pair: pair[1])
    featureNames, featureScores = zip(*ascending)
    top_names, top_scores = featureNames[-20:], featureScores[-20:]
    plt.barh(range(len(top_scores)), top_scores, tick_label=top_names)
    plt.xlabel("Gini Feature Importance")

    return feature_importances_sort