# Feature Importance Extraction
This notebook lets you look at the feature importance for the best model in each model class for a given dataset. Make sure that the models stored at `modelDictPath` were trained on the dataset you want to look at. Feature names are not stored in the model, so you need the **original** dataset used to train the model in order to recover human-readable feature names.

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import numpy as np
import pandas as pd
import pickle
from collections import defaultdict
import csv


In [2]:
# Path to the pickled dictionary of trained models; it is loaded with
# pickle in the next cell and indexed by dataset key and then model name.
modelDictPath = "data/featureImportanceTestData/modelDictMultiDataRun_datasets_1_2_3_7_20190722.pkl"

In [3]:
# Load the dictionary of trained models from disk.
# NOTE(security): pickle.load can execute arbitrary code embedded in the
# file, so only point modelDictPath at pickle files you created or trust.
with open(modelDictPath, "rb") as pklFile:
    allDataModelDict = pickle.load(pklFile)

In [4]:
# Scratch cell: pull the best SVC pipeline for dataset 1 for ad-hoc inspection.
# The main feature-importance cell further down reassigns modelName, dataPath,
# modelDict and bestModel, so nothing here carries over once that cell runs.
# (defaultdict is already imported in the imports cell at the top, so the
# duplicate import that used to live here was dropped.)
modelName = "SVC"
modelDict = allDataModelDict["dataset_1_"][modelName]
dataPath = "data/featureImportanceTestData/dataset_1_full.csv"
# "gridcv" holds the grid-search object; best_estimator_ is its winning model.
bestModel = modelDict["gridcv"].best_estimator_

In [None]:
# def plot_coefficients(classifier, feature_names, top_features=10): 
#     coef = classifier.best_estimator_.coef_.ravel()
#     top_positive_coefficients = np.argsort(coef)[-top_features:]
#     top_negative_coefficients = np.argsort(coef)[:top_features]
#     top_coefficients = np.hstack([top_negative_coefficients, top_positive_coefficients])
#     # create plot
#     sb.set_context("poster")
#     plt.figure(figsize=(15, 5))
#     plt.title("Feature Importances (Support Vector Classifier)")
#     colors = ['crimson' if c < 0 else 'cornflowerblue' for c in coef[top_coefficients]]
#     plt.bar(np.arange(2 * top_features), coef[top_coefficients], color=colors)
#     feature_names = np.array(feature_names)
#     plt.xticks(np.arange(0, 1 + 2 * top_features), feature_names[top_coefficients], rotation=60, ha='right')
#     plt.show()
#     np.asarray(feature_names)[top_positive_coefficients]

#     plot_coefficients(svm_model, list(X.columns))

In [10]:
# bestModelParams.coef_

AttributeError: coef_ is only available when using a linear kernel

In [4]:
# ---------------------------------------------------------------------------
# Feature importance for the best model of one model class on one dataset.
#
# Inputs : allDataModelDict (loaded from the pickle in an earlier cell) and
#          the CSV at dataPath that was used to train those models.
# Outputs: featureImportanceDict / featureImportanceTupleList, plus a printed
#          human-readable report and a copy-pasteable CSV listing.
# ---------------------------------------------------------------------------
modelName = "logistic"
assert modelName in ["randomForest", "GBTC", "logistic", "SVC"], "invalid model name."
##################################################
#######################NOTE#######################
##################################################
# Be sure that the dataPath and dataSetOfInterest
# correspond to the same dataset used to train the
# models
dataSetOfInterest = "dataset_1_"
dataPath = "data/featureImportanceTestData/dataset_1_full.csv"
modelDict = allDataModelDict[dataSetOfInterest][modelName]
bestModel = modelDict["gridcv"].best_estimator_
# The 0th pipeline step is the feature extraction; each step is a
# (name, transformer) pair, so element [1] is the transformer object itself.
featureTransformer = bestModel.steps[0][1]


# ---- Load the training data ------------------------------------------------
# Read only the header first so we can pick a loader based on column count.
with open(dataPath) as fp:
    csvReader = csv.reader(fp)
    header = next(csvReader)

isolateList = []
if len(header) > 100000:
    print("Reading large data with more efficient code")
    # Very wide files: collect the isolate names (first column) with the csv
    # module and load the remaining numeric columns with np.loadtxt.
    with open(dataPath) as fp:
        csvReader = csv.reader(fp)
        next(csvReader)  # skip the header row (already read above)
        for line in csvReader:
            isolateList.append(line[0])
    numericValues = np.loadtxt(dataPath, delimiter = ",", skiprows = 1,
                               usecols = range(1, len(header)))
    df = pd.DataFrame(numericValues, columns = header[1:])
    df.insert(loc = 0, column = "isolate", value = isolateList)
else:
    df = pd.read_csv(dataPath)


# ---- Map selected columns back to feature names ---------------------------
df = df.set_index("isolate")
X_df = df.drop(labels = ["pbr_res"], axis = 1)  # "pbr_res" is excluded from the features
X = X_df.values
allFeatureNames = np.array(list(X_df))

# Trick: push the column indices 0..n-1 through the feature transformer as a
# single "sample"; whatever comes out are the indices of the columns it keeps.
columnIndices = np.arange(X.shape[1]).reshape(1, -1)
# Take row 0 *before* indexing. The previous form, allFeatureNames[[idx2d]],
# relied on NumPy treating a list-wrapped array as a tuple index; since
# NumPy 1.23 it is treated as an array index, which adds a dimension and left
# chosenFeatures 2-D (breaking the aggregation loop below).
chosenFeatureIdx = np.asarray(featureTransformer.transform(columnIndices))[0]
chosenFeatures = allFeatureNames[chosenFeatureIdx]


# ---- Pull the estimator out of the pipeline --------------------------------
# Random forest had a bit of a naming mishap: the model class is called
# "randomForest" here but its pipeline step is named "RFC".
estimatorStepName = "RFC" if modelName == "randomForest" else modelName
# Despite the name, bestModelParams is the estimator object for that step.
bestModelParams = bestModel.get_params()[estimatorStepName]


# ---- Extract per-feature importance / coefficients -------------------------
if modelName == "GBTC":
    print("feature importance for GBTC")
    featureImportance = bestModelParams.feature_importances_
elif modelName == "randomForest":
    print("feature importance for random forest")
    featureImportance = bestModelParams.feature_importances_
elif modelName == "logistic":
    print("Feature importance logistic")
    featureImportance = bestModelParams.coef_[0]
elif modelName == "SVC":
    if bestModelParams.kernel != "linear":
        print("no coefficients are available for SVC without a linear kernel\n",
             "This SVC's kernel is {}".format(bestModelParams.kernel))
        raise ValueError("Breaking the code because we can't get feature importance here")
    # Linear kernel: coefficients exist. Previously this case fell through
    # without assigning featureImportance at all.
    # NOTE(review): row 0 is the only row for a two-class problem -- confirm
    # the label is binary before trusting this for multi-class data.
    print("feature importance for linear SVC")
    featureImportance = bestModelParams.coef_[0]
else:
    raise NotImplementedError(
        "Model name: {} not implemented for feature importance".format(modelName))

# zip() below silently truncates on a length mismatch, which would pair
# importances with the wrong feature names -- fail loudly instead.
assert len(chosenFeatures) == len(featureImportance), (
    "{} selected feature names but {} importance values".format(
        len(chosenFeatures), len(featureImportance)))


# ---- Aggregate and rank ----------------------------------------------------
featureImportanceDict = defaultdict(int) # features and coefficients line up
# but because there can be repeated features for logistic regression this needs to be
# added together.
for feat, imp in zip(chosenFeatures, featureImportance):
    featureImportanceDict[feat] += imp

nNonZeroFeats = sum(1 for imp in featureImportanceDict.values() if imp != 0.0)
# Rank by magnitude so strongly negative coefficients sort next to strongly
# positive ones.
featureImportanceTupleList = sorted(featureImportanceDict.items(),
                                   key=lambda p: np.abs(p[1]), reverse = True)


# ---- Report ----------------------------------------------------------------
print("{} features before feature selection".format(len(allFeatureNames)))
print("{} features passed to this model".format(len(chosenFeatures))) # for logistic
print("{} unique features passed to this model".format(len(set(chosenFeatures)))) # for logistic
print("{} features given feature importance above 0\n".format(nNonZeroFeats))

print("SANITY CHECK. This is the best model being used. Please be sure that",
     "The parameters match up with the model you specified. Sometimes an old model",
     "Can be used by accident if the kernel is not restarted\n\n",
      bestModelParams,
     "\n\n\n\n")


print("human readable feature importance")
for feature, importanceMeasure in featureImportanceTupleList:
    if importanceMeasure != 0.0:
        print("Feature {} coefficient/importance {}\n".format(feature, importanceMeasure))


print("Printing feature importance so it can be copied into a csv file")
print("featureName,importanceMetric")
for feature, importanceMeasure in featureImportanceTupleList:
    if importanceMeasure != 0.0:
        print("{},{}".format(feature, importanceMeasure))


Feature importance logistic
2277 features before feature selection
661 features passed to this model
661 unique features passed to this model
203 features given feature importance above 0

SANITY CHECK. This is the best model being used. Please be sure that The parameters match up with the model you specified. Sometimes an old model Can be used by accident if the kernel is not restarted

 LogisticRegression(C=1000, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l1',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False) 




human readable feature importance
Feature mgrB coefficient/importance 20.096792636473804

Feature KP0228_02051 coefficient/importance 16.780148994355134

Feature KP0228_03719 coefficient/importance 15.044282982810538

Feature KP0228_00159 coefficient/importance -14.79493743214