In [13]:
%%time
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

# One seed shared by the split and the estimators so reruns give the same numbers
SEED = 1

# Load the data; the first CSV column is used as the row index
df = pd.read_csv('datasets/wbc.csv', index_col=0)

# Two predictor columns; the target is 1 where diagnosis == 'M' and 0 for
# every other value
X = df[['radius_mean', 'concave_points_mean']]
y = df['diagnosis'].eq('M').astype('int64')

# Hold out 20% of the rows as the test set
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED
)
print(X_train.shape)

# Decision tree capped at depth 6, fitted on the training split
dt = DecisionTreeClassifier(max_depth=6, random_state=SEED).fit(X_train, y_train)

# Predict the test rows and show the first five predicted labels
y_pred = dt.predict(X_test)
print(y_pred[:5])

# Fraction of test rows whose predicted label matches the true label
acc = accuracy_score(y_test, y_pred)
print(f"Test set accuracy: {acc:.2f}")

(455, 2)
[1 0 0 1 0]
Test set accuracy: 0.89
CPU times: total: 15.6 ms
Wall time: 13 ms


In [14]:
%%time
# Import LogisticRegression from sklearn.linear_model
from sklearn.linear_model import LogisticRegression

# Logistic regression seeded with SEED and trained on the same training split
logreg = LogisticRegression(random_state=SEED).fit(X_train, y_train)

# Collect logreg and dt in one list so they can be compared side by side
clfs = [logreg, dt]

# Decision-region plot left disabled: plot_labeled_decision_regions is not
# defined in the visible part of this notebook.
# plot_labeled_decision_regions(X_test, y_test, clfs)

CPU times: total: 0 ns
Wall time: 8 ms


In [31]:
%%time
# Import DecisionTreeClassifier from sklearn.tree
from sklearn.tree import DecisionTreeClassifier

# Two depth-8 trees that differ only in the split criterion, so their test
# accuracies can be compared directly. SEED (defined alongside the data split)
# replaces the literal 1 so every model in the notebook shares a single seed.
dt_entropy = DecisionTreeClassifier(max_depth=8, criterion='entropy', random_state=SEED)
dt_gini = DecisionTreeClassifier(max_depth=8, criterion='gini', random_state=SEED)

# Fit both trees to the training set
dt_entropy.fit(X_train, y_train)
dt_gini.fit(X_train, y_train)

# Predict test set labels with each tree
y_pred = dt_entropy.predict(X_test)
y_pred_gini = dt_gini.predict(X_test)

# Evaluate the test set accuracy of each tree
accuracy_entropy = accuracy_score(y_test, y_pred)
accuracy_gini = accuracy_score(y_test, y_pred_gini)

# Print acc, the accuracy stored by the earlier cell that scored the first tree
# (the label previously read "est set" -- typo fixed)
print("Original test set accuracy: {:.2f}".format(acc))

# Print accuracy_entropy
print(f'Accuracy achieved by using entropy: {accuracy_entropy:.3f}')

# Print accuracy_gini
print(f'Accuracy achieved by using the gini index: {accuracy_gini:.3f}')

Original est set accuracy: 0.89
Accuracy achieved by using entropy: 0.895
Accuracy achieved by using the gini index: 0.886
CPU times: total: 15.6 ms
Wall time: 7 ms


### Trying bagging from chapter 2 on the breast cancer dataset

In [30]:
%%time
from sklearn.ensemble import BaggingClassifier

# Ensemble of 50 estimators built from dt_entropy, seeded with SEED and
# fitted on the training split
bc = BaggingClassifier(
    estimator=dt_entropy,
    n_estimators=50,
    random_state=SEED,
).fit(X_train, y_train)

# Score the ensemble's predictions on the test rows
y_pred = bc.predict(X_test)
acc_test = accuracy_score(y_test, y_pred)

print(f'Test set accuracy of bc: {acc_test:.2f}')

Test set accuracy of bc: 0.90
CPU times: total: 93.8 ms
Wall time: 96 ms


### Trying the out-of-bag (OOB) score

In [35]:
%%time
# Base estimator: a depth-4 classification tree. A float min_samples_leaf is
# interpreted by scikit-learn as a fraction of the training samples.
dt = DecisionTreeClassifier(max_depth=4, min_samples_leaf=0.16, random_state=SEED)

# Bagging ensemble of 300 trees with out-of-bag scoring enabled.
# random_state=SEED pins the resampling, so the two accuracies printed below
# are the same on every run; without it the ensemble is reseeded each time
# and the numbers drift between runs.
bc = BaggingClassifier(estimator=dt, n_estimators=300, oob_score=True,
                       n_jobs=-1, random_state=SEED)

# Fit 'bc' to the training set
bc.fit(X_train, y_train)

# Predict the test set labels
y_pred = bc.predict(X_test)

# Accuracy on the held-out test rows
test_accuracy = accuracy_score(y_test, y_pred)

# OOB accuracy, computed by the ensemble during fit because oob_score=True
oob_accuracy = bc.oob_score_

# Report both estimates so they can be compared
print('Test set accuracy: {:.3f}'.format(test_accuracy))
print('OOB accuracy: {:.3f}'.format(oob_accuracy))

Test set accuracy: 0.912
OOB accuracy: 0.914
CPU times: total: 172 ms
Wall time: 250 ms


In [17]:
# Import DecisionTreeRegressor from sklearn.tree
from sklearn.tree import DecisionTreeRegressor

# Regression tree (depth <= 8, min_samples_leaf=0.13, seed 3).
# Note: this rebinds dt, replacing the classifier assigned to it earlier.
dt = DecisionTreeRegressor(max_depth=8, min_samples_leaf=0.13, random_state=3)

# Train on the training split; fit stays the cell's final expression so the
# value the cell displays is unchanged
dt.fit(X_train, y_train)

In [18]:
# Import mean_squared_error from sklearn.metrics as MSE
from sklearn.metrics import mean_squared_error as MSE

# Predictions of dt for the test rows
y_pred = dt.predict(X_test)

# Mean squared error on the test set, then its square root (RMSE)
mse_dt = MSE(y_test, y_pred)
rmse_dt = mse_dt ** 0.5

print(f"Test set RMSE of dt: {rmse_dt:.2f}")

Test set RMSE of dt: 0.27


In [19]:
from sklearn.linear_model import LinearRegression

# Linear regression trained on the same training split
lr = LinearRegression().fit(X_train, y_train)

# Predictions for the test rows
y_pred_lr = lr.predict(X_test)

# Test set MSE and its square root (RMSE); MSE is the mean_squared_error
# alias imported in the preceding cell
mse_lr = MSE(y_test, y_pred_lr)
rmse_lr = mse_lr ** 0.5

print(f"Test set RMSE of lr: {rmse_lr:.2f}")

Test set RMSE of lr: 0.31
