In [None]:
# Import workspace and other libraries

import os
import numpy as np
import pandas as pd
from sklearn import preprocessing
# ``Imputer`` was deprecated in scikit-learn 0.20 and removed in 0.22 in
# favour of ``sklearn.impute.SimpleImputer``. Prefer the replacement when it
# is available and bind it to the name the rest of the script already uses;
# fall back to the legacy class on older installations.
try:
    from sklearn.impute import SimpleImputer as Imputer
except ImportError:
    from sklearn.preprocessing import Imputer
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report,confusion_matrix

# Print floats in fixed-point notation instead of scientific notation.
# This has to run after numpy is imported; placed above the imports it raises
# NameError when the script is started in a fresh interpreter.
np.set_printoptions(suppress=True)

# The dataset is expected in the current working directory
script_folder = os.getcwd()
filepath = '/'.join([script_folder, 'census-income-data.csv'])
print(filepath, end='\n\n')

# Convert csv to dataframe, then shuffle the rows (sample with frac=1) and
# renumber the index so it no longer reflects the original row order
df = pd.read_csv(filepath).sample(frac=1).reset_index(drop=True)
print("converted dataset to dataframe\n")

## Start preprocessing: Clean up data and normalize

# Replace missing data
# The '?' marker is matched with optional surrounding whitespace. The values
# have not been trimmed yet at this point, so an exact comparison against '?'
# would skip padded markers such as ' ?', and those would survive the trim
# below as a literal "?" category instead of becoming NaN.
df_nan = df.replace(r'^\s*\?\s*$', np.nan, regex=True)
print("missing data replaced with NAN\n")

# trim all columns (only object-dtype columns are touched; every other column
# is passed through unchanged)
df_clean = df_nan.apply(lambda x: x.str.strip() if x.dtype == "object" else x)
print("trimmed all leading and trailing spaces\n")

print("preprocessing complete\n")

# Split the target column off; pop() removes it from df_clean, so what is left
# there is the feature table
y_raw = df_clean.pop('Tax_Filer_Status')

# X is the same object as df_clean (not a copy)
X = df_clean
print("label separation complete\n")

# Remove the single column at position 20, counted after the label column was
# popped. The drop is in place, so df_clean loses the column as well.
# NOTE(review): the earlier comment described this as dropping migration-related
# columns with many missing values, while the log line below says
# "instance weight" -- confirm which column sits at index 20.
X.drop(labels=X.columns[[20]], axis=1, inplace=True)
print("dropped instance weight\n")

# Encode features
# One-hot encode the categorical columns; drop_first removes the first
# indicator of every encoded column.
# NOTE(review): dummy_na is left at its default, so a missing categorical value
# should end up as an all-zero indicator row rather than a NaN for the imputer
# to fill -- confirm this is intended.
x_1hot = pd.get_dummies(X, drop_first=True)
print("featrure encoding complete\n")

# Encode labels
label_enc = preprocessing.LabelEncoder()
Y = label_enc.fit_transform(y_raw)
print("label encoding complete\n")

feature_names = np.array(x_1hot.columns.values)
print("Defined feature names and classes\n")

# Display names for the encoded classes, passed to the explainer later on.
# NOTE(review): the hard-coded names are assumed to be listed in the same order
# as label_enc.classes_ -- confirm against the data.
labels = np.array(['tax filer', 'non-filer']).tolist()
if len(labels) != len(label_enc.classes_):
    # The hard-coded list cannot describe the encoded classes when the counts
    # differ, so fall back to the encoder's own class values, which are
    # guaranteed to line up with the integer codes in Y.
    labels = [str(class_value) for class_value in label_enc.classes_]
    print("class names taken from the label encoder\n")
print("classes have been defined\n")

# Hold out 20% of the rows as the test set
X_train, X_test, y_train, y_test = train_test_split(
    x_1hot,
    Y,
    test_size=0.2,
)
print("split dataset into training and test sets with an 80-20 partition\n")

# Flatten the label arrays to 1-D, which is what the classifier expects
# (otherwise you get warnings)
y_train_flat = np.ravel(y_train)
y_test_flat = np.ravel(y_test)
print("flattened class label array\n")

# Fill missing values with the univariate 'most_frequent' strategy
# (multivariate imputation is not supported in this version of Scikit-learn, 0.18.1).
# The imputer is fitted on the training split only and then applied to both
# splits, so nothing from the test split leaks into the fill values.
print("imputing missing values as nan\n")
imp = Imputer(missing_values=np.nan, strategy='most_frequent').fit(X_train)
X_train_clean, X_test_clean = imp.transform(X_train), imp.transform(X_test)

print("imputation complete\n")


## Normalize dataset (exclude labels)
#X_norm = normalize(X_clean)

## Standardize dataset
#scaler = StandardScaler()  

## Fit only on training data
#scaler.fit(X_train_clean)  
#x_train = scaler.transform(X_train_clean)

## apply same transformation to test data
#x_test = scaler.transform(X_test_clean)  

#print("standardized dataset\n")


## Train a neural network
# Three hidden layers of 15, 30 and 15 units, SGD solver, fixed random seed
clf = MLPClassifier(
    hidden_layer_sizes=(15, 30, 15),
    solver='sgd',
    random_state=1,
)

# Train on the imputed training split; the value returned by fit() is kept as
# ``model`` and used for everything that follows
model = clf.fit(X_train_clean, y_train_flat)

# Predict on the held-out split, then print the confusion matrix, a blank
# line, and the per-class report
predictions = model.predict(X_test_clean)
print(confusion_matrix(y_test_flat, predictions), end='\n\n')
print(classification_report(y_test_flat, predictions))

## De-standardize data
# x_train_dstd = scaler.inverse_transform(x_train)
# x_test_dstd = scaler.inverse_transform(x_test)


## Start explanation
from interpret.ext.blackbox import TabularExplainer

# Build the explainer around the trained model. The imputed test split is
# passed as its data argument, together with the one-hot feature names and the
# class display names.
explainer = TabularExplainer(
    model,
    X_test_clean,  # x_test
    classes=labels,
    features=feature_names,
)
print("initialized explainer\n")

# Global explanation
# you can use the training data or the test data here
# (only the first 100 rows of the imputed test split are explained)
global_explanation = explainer.explain_global(X_test_clean[0:100])

# if you used the PFIExplainer in the previous step, use the next line of code instead
# global_explanation = explainer.explain_global(x_train, true_labels=y_test)

# sorted feature importance values and feature names
sorted_global_importance_values = global_explanation.get_ranked_global_values()
sorted_global_importance_names = global_explanation.get_ranked_global_names()
# A bare expression is echoed only when it is the last statement of a notebook
# cell and never in a plain script, so the mapping is printed explicitly.
print(dict(zip(sorted_global_importance_names, sorted_global_importance_values)))

# alternatively, you can print out a dictionary that holds the top K feature names and values
print(global_explanation.get_feature_importance_dict())


# Local explanation
# explain the first 100 data points in the test set
local_explanation = explainer.explain_local(X_test_clean[:100])

# ranked feature importance values and the matching feature names
sorted_local_importance_values = local_explanation.get_ranked_local_values()
sorted_local_importance_names = local_explanation.get_ranked_local_names()


# Display visualization dashboard
from azureml.contrib.interpret.visualize import ExplanationDashboard

# Same 100-row slice of the test split that the explanations were computed on.
# Kept as the final bare expression so a notebook renders the returned object.
ExplanationDashboard(
    global_explanation,
    model,
    X_test_clean[:100],
)