In [105]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import RandomizedSearchCV

In [106]:
# load the breast_cancer dataset
bc = load_breast_cancer()
df = pd.DataFrame(bc.data, columns=bc.feature_names)
df['cancer'] = bc.target
df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,cancer
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


In [107]:
df['cancer']=pd.Categorical.from_codes(bc.target, bc.target_names)
df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,cancer
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,malignant
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,malignant
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,malignant
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,malignant
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,malignant


In [125]:
# Train test split
X = bc.data
y = bc.target
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [126]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 10000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(2, 50, num = 2)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf}
print(random_grid)

{'n_estimators': [100, 1200, 2300, 3400, 4500, 5600, 6700, 7800, 8900, 10000], 'max_features': ['auto', 'sqrt'], 'max_depth': [2, 50, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4]}


In [129]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestClassifier()
# Random search of parameters, using 5 fold cross validation, 
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 5, verbose=2, random_state=0, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


RandomizedSearchCV(cv=5, estimator=RandomForestClassifier(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'max_depth': [2, 50, None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [100, 1200, 2300, 3400,
                                                         4500, 5600, 6700, 7800,
                                                         8900, 10000]},
                   random_state=0, verbose=2)

In [130]:
# Best Hyper Params
rf_random.best_params_

{'n_estimators': 100,
 'min_samples_split': 5,
 'min_samples_leaf': 4,
 'max_features': 'sqrt',
 'max_depth': None}

In [132]:
feat_labels = bc.feature_names
# define best model
rf_best = RandomForestClassifier(n_estimators=100, min_samples_split=5, min_samples_leaf=4, max_features='sqrt', max_depth = None, random_state=10, n_jobs=-1)
# Fit the random search model with optimized hyperparameters
rf_best.fit(X_train, y_train)

# Print the name and gini importance of each feature
for feature in zip(feat_labels, rf_best.feature_importances_):
    print(feature)

('mean radius', 0.06939548215139639)
('mean texture', 0.017535207308069154)
('mean perimeter', 0.03405020540785518)
('mean area', 0.037103343432487564)
('mean smoothness', 0.007288534033834727)
('mean compactness', 0.012792553535783676)
('mean concavity', 0.04011221403825653)
('mean concave points', 0.1275084325077998)
('mean symmetry', 0.0021074259206811975)
('mean fractal dimension', 0.0021455217580090905)
('radius error', 0.008682555887812292)
('texture error', 0.002818959382159649)
('perimeter error', 0.027676424978709077)
('area error', 0.020765793089056326)
('smoothness error', 0.0011976394444388999)
('compactness error', 0.0038534192709864714)
('concavity error', 0.007214899703741869)
('concave points error', 0.0018425133858408595)
('symmetry error', 0.0017063117873089028)
('fractal dimension error', 0.002003787839376881)
('worst radius', 0.1019298141384568)
('worst texture', 0.019796177751943186)
('worst perimeter', 0.0891256936017882)
('worst area', 0.13613604890300246)
('wors

In [133]:
# Create a selector object that will use the random forest classifier to identify
# features that have an importance of more than 0.13
sfm = SelectFromModel(rf_best, threshold=0.13)

# Train the selector
sfm.fit(X_train, y_train)

SelectFromModel(estimator=RandomForestClassifier(max_features='sqrt',
                                                 min_samples_leaf=4,
                                                 min_samples_split=5, n_jobs=-1,
                                                 random_state=10),
                threshold=0.13)

In [134]:
# Print the names of the most important features
for feature_list_index in sfm.get_support(indices=True):
    print(feat_labels[feature_list_index])

worst area
worst concave points


In [135]:
# Transform the data to create a new dataset containing only the most important features
# Note: We have to apply the transform to both the training X and test X data.
X_important_train = sfm.transform(X_train)
X_important_test = sfm.transform(X_test)

In [136]:
# Create a new random forest classifier for the most important features
rf_important = rf_best

# Train the new classifier on the new dataset containing the most important features
rf_important.fit(X_important_train, y_train)

RandomForestClassifier(max_features='sqrt', min_samples_leaf=4,
                       min_samples_split=5, n_jobs=-1, random_state=10)

In [137]:
# Apply The Limited Featured Classifier To The Test Data
y_important_pred = rf_important.predict(X_important_test)

# View The Accuracy Of Our Limited Feature (2 Features) Model
ACLIM = accuracy_score(y_test, y_important_pred)
ACLIM

0.965034965034965

In [138]:
# Accuracy/2 Features
ACLIM/2

0.4825174825174825