# Feature Importance Comparison

Compare feature importance values from random forest prediction to the components in the backbone network

## Import modules and libraries

In [0]:
from pathlib import Path

import numpy as np
import pandas as pd
pd.options.display.max_rows = 50
from random import randint
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")
import networkx as nx
from scipy import stats
import sklearn as sk
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics

import setup
from data_processing import classify_features, load_data
from mutual_info import calc_mi
from network import output_graph_json, output_pairs_json, threshold_using_backbone_method, get_components
from visualization import draw_graph, viz, show_edge_thinning

In [0]:
import plotly.graph_objects as go
import plotly.express as px
import plotly.io as pio
import dash
import dash_core_components as dcc
import dash_html_components as html

## Select our target directory and set our parameters:
Optional: You can set up parameters from an outside file using `args = setup.arg_setup()`

In [0]:
# Data set folder to analyze.
# Other folders named alongside it: 'example_groceries', 'example_icu', 'example_data'
target_dir = 'example_housing'

# Run parameters, keyed by option name (same keys and order as before).
args = dict(
    charter='Plotly',
    input_file=f'../{target_dir}/data.csv',
    output_dir=f'../{target_dir}/output',
    # None, or e.g. 100 (recommended for testing example_icu or other large data sets)
    sample_n=None,
    output_json=True,
    output_charts=False,
    # Alternatives: 'Neighborhood', 'whole_milk', 'diabetes_mellitus', 'Continuous_Trinary_Normal'
    feature_of_interest='YearBuilt',
)

## Run Sirius algorithm to get sparsified mutual information graph

In [0]:
# Read the input table, forwarding the configured sample size to the loader
df = load_data(Path(args['input_file']), sample_n=args['sample_n'])
n_records, n_features = df.shape
print(f'There are {n_records} records and {n_features} features in {args["input_file"]}')

# Label every feature as discrete or continuous
feature_info = classify_features(df)

# Score every feature pair by mutual information, strongest pairs first
edges = calc_mi(df, feature_info)
edges = edges.sort_values(by='v', ascending=False)
edges = edges.reset_index(drop=True)

# Thin the mutual information graph with the backbone method
thresheld = threshold_using_backbone_method(edges, debug=True)

# Collect the components of the thinned network
components = get_components(thresheld)

# Draw the thinned network
graph_title = f'Filtered Feature Graph: Reduced to {thresheld.shape[0]} Connections'
draw_graph(thresheld, graph_title, display=True)

## Find importance values of other features for prediction of one feature

In [0]:
# Choose a discrete variable to predict (with the specific response being predicted)
# NOTE(review): the cells below read this variable rather than args['feature_of_interest'],
# and the two currently hold different values -- confirm that is intended.
feature_of_interest = 'Neighborhood'

In [0]:
# or choose a continuous feature to predict:
#feature_of_interest = 'YearBuilt'

In [0]:
# Decide which column the model will predict.
# A discrete feature is predicted one response at a time, using the one-hot
# style name "<feature>_<response>"; a continuous feature is predicted directly.
if feature_of_interest in list(feature_info[feature_info['type']=='d'].index):
    print(f"Because this is a discrete variable, you must set a response of interest from one of the following:{list(df[feature_of_interest].unique())}")
    response_of_interest = "OldTown"
    # Fail here, with the valid options listed, instead of later with an opaque
    # error when the chosen response does not exist for this feature.
    # Missing values count as 'None' because the encoding step fills them that way.
    observed = df[feature_of_interest]
    valid_responses = {str(value) for value in observed.dropna().unique()}
    if observed.isna().any():
        valid_responses.add('None')
    if response_of_interest not in valid_responses:
        raise ValueError(
            f"'{response_of_interest}' is not a response of '{feature_of_interest}'; "
            f"choose one of {sorted(valid_responses)}"
        )
    predict = f"{feature_of_interest}_{response_of_interest}"
else:
    predict = feature_of_interest

In [0]:
# Get a list of discrete features from the Sirius type classifier
d_features = list(feature_info[feature_info['type']=='d'].index)
# Create a subset dataframe of only discrete values
d_df = df.filter(d_features)
# Keep the feature being predicted out of the predictors. Checking for the
# column explicitly (instead of a bare except around an in-place drop) means
# unrelated errors are no longer swallowed, and the filtered frame is not
# mutated in place.
if feature_of_interest in d_df.columns:
    d_df = d_df.drop(columns=[feature_of_interest])
else:
    print(f'{feature_of_interest} not included in discrete subset dataframe')
print(d_features)

In [0]:
# One-hot encode the discrete features; missing values become their own
# 'None' category, and the result is converted to a dense array
enc = OneHotEncoder(handle_unknown='ignore')
encoded = enc.fit_transform(d_df.fillna('None').values).toarray()

In [0]:
# Convert the encoded array back into a dataframe.
# OneHotEncoder.get_feature_names was removed in scikit-learn 1.2, so use its
# replacement when the installed version provides it.
if hasattr(enc, 'get_feature_names_out'):
    encoded_columns = enc.get_feature_names_out(d_df.columns)
else:
    encoded_columns = enc.get_feature_names(d_df.columns)
# Reuse the source row labels: the encoded frame is later merged with the
# continuous columns on the index, which would misalign rows if this frame
# carried a fresh 0..n-1 index while the source data did not.
e_df = pd.DataFrame(encoded, columns=encoded_columns, index=d_df.index)

In [0]:
# Names of the features the Sirius type classifier marked as continuous ('c')
c_features = feature_info.index[feature_info['type'].eq('c').to_numpy()].tolist()
# Subset dataframe holding only those continuous columns
c_df = df.filter(items=c_features)

In [0]:
# Join the one-hot encoded discrete columns and the continuous columns row by row (matched on index)
merged_df = pd.merge(e_df, c_df, left_index=True, right_index=True)

In [0]:
# Pull array of values to predict and create matrix of known values
if predict in merged_df.columns:
    y = merged_df[predict].values
elif predict != feature_of_interest and predict.startswith(f'{feature_of_interest}_'):
    # Discrete target: the feature of interest is removed from the discrete
    # frame before one-hot encoding, so its "<feature>_<response>" column does
    # not exist in merged_df. Rebuild that 0/1 indicator from the raw column,
    # filling missing values with 'None' the same way the encoding step does.
    response = predict[len(feature_of_interest) + 1:]
    raw_target = df.loc[merged_df.index, feature_of_interest].astype(object).fillna('None').astype(str)
    y = (raw_target == response).astype(float).values
else:
    raise KeyError(f"'{predict}' is not available as a column to predict")
# The target must never be among the predictors; errors='ignore' covers the
# case where it was not a column to begin with (previously a bare except).
known_df = merged_df.drop(columns=[predict], errors='ignore')
X = known_df.values

In [0]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
    test_size=0.2,
)

In [0]:
# Build a random forest regressor with one tree per known feature and fit it
# to the training rows (fit returns the fitted estimator itself)
regressor = RandomForestRegressor(
    random_state=0,
    n_estimators=known_df.shape[1],
).fit(X_train, y_train)

# Predict the feature of interest for the held-out rows
y_pred = regressor.predict(X_test)

# Report the prediction error on the held-out rows
print('Mean absolute error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean squared error:', metrics.mean_squared_error(y_test, y_pred))
print('Root mean squared error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

In [0]:
# Generate a data frame of the feature importances for the prediction task,
# sorted from most to least important
importances = pd.DataFrame(
    sorted(
        zip((round(score, 4) for score in regressor.feature_importances_), known_df.columns),
        reverse=True,
    ),
    columns=['importance', 'feature_encoded'],
)


def _source_feature(encoded_name, source_columns):
    """Return the original feature an (optionally one-hot encoded) column came from.

    Encoded columns are named "<feature>_<category>". Splitting on the first
    underscore breaks for features whose own name contains one (for example
    "whole_milk"), so the name is matched against the real column names
    instead; the longest matching prefix wins. Falls back to the text before
    the first underscore when no column matches.
    """
    if encoded_name in source_columns:
        return encoded_name
    prefixes = [
        col for col in source_columns
        if isinstance(col, str) and encoded_name.startswith(f'{col}_')
    ]
    if prefixes:
        return max(prefixes, key=len)
    return encoded_name.split('_')[0]


# Several encoded rows can map to one feature; because the frame is sorted by
# importance, the first row per feature carries that feature's highest value
# (this is just to get a rough estimate)
_source_columns = set(df.columns)
importances['feature'] = [_source_feature(name, _source_columns) for name in importances['feature_encoded']]

In [0]:
# Create a rankings data frame which includes mutual information scores for all
# features related to the feature of interest
involves_target = (edges['x'] == feature_of_interest) | (edges['y'] == feature_of_interest)
rankings = edges[involves_target].rename(columns={'v': 'mi_score'}).reset_index(drop=True)
# Rankings start at 1 instead of 0; row order follows the order of `edges`
rankings['mi_ranking'] = list(range(1, len(rankings) + 1))
# Each edge names the feature of interest on one side; keep the other side
rankings['feature'] = [x if x != feature_of_interest else y for x, y in zip(rankings['x'], rankings['y'])]
rankings = rankings.drop(columns=['x', 'y'])

# One row per feature, keeping its highest importance (one-hot encoding yields
# several rows for a discrete feature). The stable sort keeps the existing
# order among ties.
best_per_feature = (
    importances.sort_values('importance', ascending=False, kind='stable')
    .drop_duplicates(subset='feature', keep='first')
    .reset_index(drop=True)
)
score_by_feature = dict(zip(best_per_feature['feature'], best_per_feature['importance']))
# 1-based so it is directly comparable with mi_ranking (it used to be a 0-based
# position among the duplicated encoded rows)
rank_by_feature = {name: position + 1 for position, name in enumerate(best_per_feature['feature'])}
# Features without an importance value are left as NaN
rankings['importance_score'] = rankings['feature'].map(score_by_feature)
rankings['importance_ranking'] = rankings['feature'].map(rank_by_feature)

In [0]:
# Generate a visualization of any feature vs. the feature for which a value is being predicted, and show the mutual information and importance scores and rankings
def makeviz(known,predicting):
    """Plot `known` against `predicting` and print their scores and ranks.

    Args:
        known: Name of the feature whose relationship to the target is shown.
        predicting: Name of the feature being predicted.

    The chart is drawn with `viz` using the module-level `df`, `feature_info`
    and `args['charter']`. Scores and ranks are read from the module-level
    `rankings` frame; when `known` has no row there, a notice is printed
    instead of raising IndexError.
    """
    print(f'Creating visualization of known value "{known}" vs. prediction feature "{predicting}"')
    viz(known, predicting, df, feature_info, charter=args['charter'],display=True, resolution=100)
    # Filter once and reuse the matching rows for all four printed values
    match = rankings[rankings.feature==known]
    if match.empty:
        print(f'No ranking information available for {known} and {predicting}')
        return
    print(f'Importance score for {known} and {predicting}: {match.importance_score.tolist()[0]}')
    print(f'Mutual information score for {known} and {predicting}: {match.mi_score.tolist()[0]}')
    print(f'Importance rank for {known} and {predicting}: {match.importance_ranking.tolist()[0]}')
    print(f'Mutual information rank for {known} and {predicting}: {match.mi_ranking.tolist()[0]}')

In [0]:
# Chart the five highest-importance rows against the feature of interest
# (a feature can appear more than once because of the one-hot encoding step)
for top_feature in importances['feature'].head(5):
    makeviz(top_feature, feature_of_interest)

In [0]:
# Bare expression: shows the rankings table as this cell's output
rankings

In [0]:
# Calculate the spearman rank correlation between mutual information and feature importance.
# Features without an importance score hold NaN, and spearmanr's default
# nan_policy ('propagate') would then return NaN for both the correlation and
# the p-value, so only rows with both scores present are compared.
scored = rankings[['mi_score', 'importance_score']].dropna()
spearman = stats.spearmanr(scored['mi_score'], scored['importance_score'])
correlation = spearman.correlation
p_val = spearman.pvalue
spearman

In [0]:
# Create a visualization of the relationship between mutual information and importance for features in the prediction of the feature of interest

# Human-readable target name for the title. Only the separator between a
# discrete feature and its response becomes ": "; underscores that belong to
# the feature name itself are left alone (a blanket replace rewrote them all).
if predict != feature_of_interest and predict.startswith(f'{feature_of_interest}_'):
    target_label = f"{feature_of_interest}: {predict[len(feature_of_interest) + 1:]}"
else:
    target_label = predict

# Both quote marks now sit inside the bold tag (the closing one used to fall outside it)
chart_title = (
    f"Mutual Information vs. Importance Ranking of Features for Prediction of <b>'{target_label}'</b>"
    f"<br>Spearman Correlation: {np.round(correlation,3)}, p-Value: {np.format_float_scientific(p_val,precision=1)}"
)
fig = px.scatter(
    rankings,
    title=chart_title,
    x='mi_ranking',
    y='importance_ranking',
    hover_name='feature',
    height=800,
    trendline="ols",
    template='plotly_white',
)
# Axes run from the largest rank down to the smallest, padded by 5 on each side
fig.update_xaxes(range=[rankings['mi_ranking'].max()+5, rankings['mi_ranking'].min()-5])
fig.update_yaxes(range=[rankings['importance_ranking'].max()+5, rankings['importance_ranking'].min()-5])
fig.update_traces(marker=dict(size=15))
fig.show()