# Feature Importance

In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use('seaborn')
import seaborn as sn
%matplotlib inline

In [8]:
from sklearn.model_selection import train_test_split

from sklearn.metrics import classification_report, confusion_matrix
import HelpfulFunctions as hp

## Loading Data Wisconsin Breast Cancer Dataset

In [95]:
from sklearn.datasets import load_breast_cancer
breastCancerData = load_breast_cancer()
X = pd.DataFrame(breastCancerData['data'],columns = breastCancerData['feature_names'])
y = breastCancerData['target']

In [96]:
feature_names = X.columns
n_feats = len(feature_names)

## Standardising the Data

In [81]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X)
X_standardised = scaler.transform(X)

In [82]:
X_train, X_test, y_train, y_test = train_test_split(X_standardised,y,test_size=0.3,random_state = 101)

## Fit Range of Models

In [83]:
from sklearn.linear_model import LogisticRegression
log_model = LogisticRegression(solver='lbfgs')
log_model.fit(X_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [84]:
from sklearn.neighbors import KNeighborsClassifier
knn_model = KNeighborsClassifier()
knn_model.fit(X_train,y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')

In [85]:
from sklearn.tree import DecisionTreeClassifier
tree_model = DecisionTreeClassifier()
tree_model.fit(X_train,y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [86]:
from sklearn.naive_bayes import GaussianNB
bayes_model = GaussianNB()
bayes_model.fit(X_train,y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [87]:
from sklearn.svm import SVC
svc_model = SVC(gamma = "auto")
svc_model.fit(X_train,y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [88]:
from sklearn.neural_network import MLPClassifier
mlp_model = MLPClassifier()
mlp_model.fit(X_train,y_train)



MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)

In [89]:
from sklearn.ensemble import RandomForestClassifier
rnd_model = RandomForestClassifier(n_estimators = 100)
rnd_model.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

## Prediction Accuracy

In [90]:
model_list = [log_model,knn_model,tree_model,bayes_model,svc_model,mlp_model,rnd_model]

In [91]:
for model in model_list:
    predictions = model.predict(X_test)
    cm = confusion_matrix(y_test,predictions)
    print(model.__class__)
    print('Test Accuracy: %.3f' %model.score(X_test, y_test))

<class 'sklearn.linear_model.logistic.LogisticRegression'>
Test Accuracy: 0.977
<class 'sklearn.neighbors.classification.KNeighborsClassifier'>
Test Accuracy: 0.953
<class 'sklearn.tree.tree.DecisionTreeClassifier'>
Test Accuracy: 0.912
<class 'sklearn.naive_bayes.GaussianNB'>
Test Accuracy: 0.918
<class 'sklearn.svm.classes.SVC'>
Test Accuracy: 0.977
<class 'sklearn.neural_network.multilayer_perceptron.MLPClassifier'>
Test Accuracy: 0.965
<class 'sklearn.ensemble.forest.RandomForestClassifier'>
Test Accuracy: 0.953


## Feature Importance using Individual Features

In [138]:
model_list = [log_model,knn_model,tree_model,bayes_model,svc_model,rnd_model]

In [139]:
from sklearn.model_selection import cross_val_score

In [140]:
import numpy

In [146]:
for model in model_list:

    print("----------------------------------------------------")    
    print(model.__class__)
    scores_list = []

    for i in range(n_feats):
        X_one_feature = X_train[:, i].reshape(-1, 1)
        scores = cross_val_score(model, X_one_feature, y_train, cv=5)
        scores_mean = scores.mean()
        scores_list.append(scores.mean())

    sorted_indices = numpy.argsort(np.array(scores_list) * -1) # negate to have descending

    for i in range(0,5): # top 5 features
        index = sorted_indices[i]
        print(i, ":", feature_names[index], scores_list[index])
    
print("----------------------------------------------------")

Feature  Accuracy
----------------------------------------------------
<class 'sklearn.linear_model.logistic.LogisticRegression'>
0 : worst concave points 0.9198663853727144
1 : worst area 0.9198023128613846
2 : mean concave points 0.9173656040006251
3 : worst perimeter 0.9173331770589155
4 : worst radius 0.9172390217221441
----------------------------------------------------
<class 'sklearn.tree.tree.DecisionTreeClassifier'>
0 : worst area 0.8849566338490389
1 : worst radius 0.882172605094546
2 : worst concave points 0.8798308329426474
3 : worst perimeter 0.8596393967807469
4 : mean perimeter 0.8545452414439755
----------------------------------------------------
<class 'sklearn.ensemble.forest.RandomForestClassifier'>
0 : worst area 0.8849566338490389
1 : worst concave points 0.8822999687451164
2 : worst perimeter 0.8797984060009376
3 : worst radius 0.8797042506641664
4 : mean perimeter 0.8545452414439755
----------------------------------------------------


## Feature Importance using Recursive Feature Elimination

In [142]:
from sklearn.feature_selection import RFE

In [143]:
# We cannot run this on all models
model_list = [log_model,tree_model,rnd_model]

In [144]:
for model in model_list:
    
    print("-------------------------------------------------")
    
    rfe = RFE(estimator=model, n_features_to_select=5)
    
    print(model.__class__)
    rfe.fit(X_train, y_train)

    for i in range(0,len(names)):
        if rfe.support_[i] == True:
            print(names[i])
            
print("-------------------------------------------------")

-------------------------------------------------
<class 'sklearn.linear_model.logistic.LogisticRegression'>
mean concave points
worst radius
worst texture
worst area
worst concave points
-------------------------------------------------
<class 'sklearn.tree.tree.DecisionTreeClassifier'>
worst radius
worst texture
worst area
worst concave points
worst fractal dimension
-------------------------------------------------
<class 'sklearn.ensemble.forest.RandomForestClassifier'>
mean concave points
worst radius
worst perimeter
worst area
worst concave points
-------------------------------------------------
