<img src="http://imgur.com/1ZcRyrc.png" style="float: left; margin: 20px; height: 55px">

# Extracting Base Estimators from Bagged Models Codealong

_Authors: Joseph Nelson (SF)_

---

### 1. Load the breast cancer data.

In [1]:
# Checking out some 'books' from the libraries.
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer

data = load_breast_cancer()

# Converting data into a dataframe structure 
X = pd.DataFrame(data['data'], columns=data['feature_names'])
# Setting up our Y value as well
y = pd.Series(data['target'])

### 2. Load required sklearn packages.

In [3]:
# More Books!!
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

### 3. Make a train-test split.

In [4]:
# Train test split for fun
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state = 5)


### 4. Create and fit a `BaggingClassifier` with a `DecisionTreeClassifier` base estimator.

In [5]:
# Create our classifier and our bag
DT = DecisionTreeClassifier()
BC = BaggingClassifier(base_estimator = DT, n_estimators =5, max_features = 0.5, max_samples = 0.5)

# Fitting the Bag
BC.fit(X_train, y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'),
         bootstrap=True, bootstrap_features=False, max_features=0.5,
         max_samples=0.5, n_estimators=5, n_jobs=1, oob_score=False,
         random_state=None, verbose=0, warm_start=False)

### 5. Pull out a base estimator from the ensemble model.

In [6]:
#  Getting our bags base model 
# We can only have one base model so our estimator models can not have varying parameters
# The Random_state is more or less a reference seed.
BC.base_estimator_

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

### 6. Pull out *all* the base estimators.

In [7]:
# Gettin the rest of our bags models.
BC.estimators_

[DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
             max_features=None, max_leaf_nodes=None,
             min_impurity_split=1e-07, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             presort=False, random_state=378203321, splitter='best'),
 DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
             max_features=None, max_leaf_nodes=None,
             min_impurity_split=1e-07, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             presort=False, random_state=459756946, splitter='best'),
 DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
             max_features=None, max_leaf_nodes=None,
             min_impurity_split=1e-07, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             presort=False, random_state=568997751, splitter='best'),
 DecisionTreeClassifier(class_w

### 7. Get the features used in each of the bagged base estimators.

In [8]:
# Getting the features in each of our bagged models.
# Pretty much their index values of the list of feature names
BC.estimators_features_

[array([22,  8, 16, 13, 21,  1, 19,  0, 10,  7, 23,  2,  9, 20,  5]),
 array([ 2, 26,  9, 27,  1,  8,  0, 12, 23, 15, 16, 20, 29, 21, 10]),
 array([ 4, 14, 26,  6, 12, 19,  7,  8, 22, 13, 28, 25,  3, 18,  9]),
 array([20,  3, 12,  1, 21, 25,  5,  2, 17, 27, 18, 22, 13, 15, 14]),
 array([14,  0,  6, 16,  8, 27, 13, 11, 22, 19, 29, 20, 28, 12,  7])]

### 8. Create a list of the features used in the first base estimator.

In [9]:
# What are the parameters for the first decision tree in our bag?
BC.estimators_[0]

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=378203321, splitter='best')

In [10]:
# What are the features used in the first model
BC.estimators_features_[0]

array([22,  8, 16, 13, 21,  1, 19,  0, 10,  7, 23,  2,  9, 20,  5])

In [11]:
# Creating a list of the selected features.
sub_features = []
for feature in BC.estimators_features_[0]:
    sub_features.append(data['feature_names'][feature])


### 9. Get out the samples used in our first base estimator.

In [12]:
# Getting how many samples our bagging estimator used.
jab = len(BC.estimators_samples_[0])
jab

426

In [13]:
# What are the samples used in the first model?
samples = BC.estimators_samples_[0] 

# Creating a list to append the index of sample data that was use from x_train
true_samples = []
for bool_index in range(0,jab):

    if samples[bool_index] == True:
        true_samples.append(bool_index)

In [20]:
# Gotta set the X_train equal to a variable and reset the index
data0  = X_train.reset_index(drop = True)

#Using the True Samples from our DT to sub down x_train
data2 = data0.ix[true_samples]

In [21]:
data2.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,20.64,17.35,134.8,1335.0,0.09446,0.1076,0.1527,0.08941,0.1571,0.05478,...,25.37,23.17,166.8,1946.0,0.1562,0.3055,0.4159,0.2112,0.2689,0.07055
1,11.37,18.89,72.17,396.0,0.08713,0.05008,0.02399,0.02173,0.2013,0.05955,...,12.36,26.14,79.29,459.3,0.1118,0.09708,0.07529,0.06203,0.3267,0.06994
2,13.11,22.54,87.02,529.4,0.1002,0.1483,0.08705,0.05102,0.185,0.0731,...,14.55,29.16,99.48,639.3,0.1349,0.4402,0.3162,0.1126,0.4128,0.1076
3,9.268,12.87,61.49,248.7,0.1634,0.2239,0.0973,0.05252,0.2378,0.09502,...,10.28,16.38,69.05,300.2,0.1902,0.3441,0.2099,0.1025,0.3038,0.1252
4,20.59,21.24,137.8,1320.0,0.1085,0.1644,0.2188,0.1121,0.1848,0.06222,...,23.86,30.76,163.2,1760.0,0.1464,0.3597,0.5179,0.2113,0.248,0.08999


### 10. Get out the target subsample for the estimator.

In [22]:
# Getting the y_train sub sample used.
target = pd.DataFrame(y_train)
target.reset_index(inplace = True, drop =True)
target2 = target.ix[true_samples]

### 11. Fit a decision tree equivalent to our first base estimator.

In [23]:
# Setting the Decision Tree in our First base model of our bagged classifier.
DTC0 = DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=472506830, splitter='best')

In [24]:
# Setting the models X and Y values
X0 = data2[sub_features]
Y0 = target2[0]


In [25]:
# Fitting the model
DTC0.fit(X0, Y0)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=472506830, splitter='best')