In [62]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt

In [63]:
# Load the breast-cancer dataset once as a Bunch, then take the feature matrix
# and target vector from it (the same arrays that return_X_y=True would give).
init_data = load_breast_cancer()
X, y = init_data.data, init_data.target

# Hold out 30% of the samples for testing; the seed is fixed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

# Reference forest trained on all features; its feature importances are read
# by the following cells.
forest = RandomForestClassifier(
    criterion='entropy',
    n_estimators=200,
    random_state=1,
    n_jobs=2,
)
forest.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=2,
            oob_score=False, random_state=1, verbose=0, warm_start=False)

In [64]:
# Importance of each feature according to the fitted forest.
importances = forest.feature_importances_

# Feature positions ordered from most to least important: argsort is
# ascending, so the result is reversed.
indices = np.flip(np.argsort(importances))

print(indices)
print(np.take(importances, indices))

[27  7 20 22  6 23 13  2  0 26  3 21  1 25 10 28 24  5 12 29 17 15  4 19
 18 16 11 14  8  9]
[0.14184901 0.11769747 0.11091947 0.10524291 0.07716301 0.07336447
 0.04176026 0.03694304 0.03562503 0.03520544 0.03288976 0.02273565
 0.01858393 0.01678607 0.01540763 0.01477012 0.01423471 0.0100853
 0.01003995 0.00901454 0.00808466 0.00742006 0.00655659 0.00636839
 0.006357   0.00613529 0.00585424 0.00509956 0.0045979  0.00320853]


In [65]:
# Rank nested feature subsets by "accuracy per feature".
#
# For every importance rank i, keep the features whose importance is at least
# that of the i-th most important feature, retrain a forest on the reduced
# data and score it on the held-out split. The `top_k` subsets with the highest
# accuracy-per-feature are stored, best first, in:
#   apf - accuracy divided by the number of selected features
#   num - number of selected features
#   acu - plain test accuracy
importances_sort = importances[indices]
top_k = 3
results = []  # one (accuracy per feature, number of features, accuracy) per subset

for i in range(1, len(indices) + 1):
    # The threshold is the i-th largest importance. SelectFromModel keeps every
    # feature at or above it, so tied importances can yield more than i columns.
    sfm = SelectFromModel(forest, threshold=importances_sort[i - 1], prefit=True)
    X2_train = sfm.transform(X_train)
    X2_test = sfm.transform(X_test)

    forest_forest = RandomForestClassifier(criterion='entropy', n_estimators=200, random_state=1, n_jobs=2)
    forest_forest.fit(X2_train, y_train)
    y_pred = forest_forest.predict(X2_test)

    # Score once and reuse the value instead of recomputing it for every use.
    accuracy = accuracy_score(y_test, y_pred)
    # Report the real column count so it always matches the divisor below.
    n_selected = X2_train.shape[1]
    accuracy_per_feature = accuracy / n_selected

    print('Accuracy: %.6f' % accuracy)
    print('Number of feature chosen:%d' % n_selected)
    print('Accuracy per feature: %.6f' % accuracy_per_feature)

    results.append((accuracy_per_feature, n_selected, accuracy))

# Pick the top_k subsets by accuracy per feature. sorted() is stable, also with
# reverse=True, so on ties the subset evaluated first keeps the better rank.
# Sorting all results avoids the earlier failure mode where a new best value
# overwrote a slot and the displaced entry was lost instead of shifted down.
best = sorted(results, key=lambda record: record[0], reverse=True)[:top_k]

# Slots stay zero if fewer than top_k subsets were evaluated.
apf = np.zeros(top_k)
num = np.zeros(top_k, dtype=int)
acu = np.zeros(top_k)
for rank, (per_feature, n_features, acc) in enumerate(best):
    apf[rank] = per_feature
    num[rank] = n_features
    acu[rank] = acc

Accuracy: 0.847953
Number of feature chosen:1
Accuracy per feature: 0.847953
Accuracy: 0.888889
Number of feature chosen:2
Accuracy per feature: 0.444444
Accuracy: 0.959064
Number of feature chosen:3
Accuracy per feature: 0.319688
Accuracy: 0.947368
Number of feature chosen:4
Accuracy per feature: 0.236842
Accuracy: 0.941520
Number of feature chosen:5
Accuracy per feature: 0.188304
Accuracy: 0.947368
Number of feature chosen:6
Accuracy per feature: 0.157895
Accuracy: 0.947368
Number of feature chosen:7
Accuracy per feature: 0.135338
Accuracy: 0.953216
Number of feature chosen:8
Accuracy per feature: 0.119152
Accuracy: 0.947368
Number of feature chosen:9
Accuracy per feature: 0.105263
Accuracy: 0.959064
Number of feature chosen:10
Accuracy per feature: 0.095906
Accuracy: 0.953216
Number of feature chosen:11
Accuracy per feature: 0.086656
Accuracy: 0.970760
Number of feature chosen:12
Accuracy per feature: 0.080897
Accuracy: 0.970760
Number of feature chosen:13
Accuracy per feature: 0.07

In [66]:
# Summary table of the three retained feature subsets; rows are labelled 1..3.
df1 = pd.DataFrame(
    data={
        'Number of feature chosen': num,
        'Accuracy': acu,
        'Accuracy per feature': apf,
    },
    index=[1, 2, 3],
)
df1.head()

Unnamed: 0,Number of feature chosen,Accuracy,Accuracy per feature
1,1,0.847953,0.847953
2,2,0.888889,0.444444
3,3,0.959064,0.319688
