# Calculating the Error Rate on Different Sets of Data 

In [1]:
from sklearn.datasets import load_breast_cancer
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np

In [2]:
# Fetch the breast-cancer dataset that ships with scikit-learn.
data = load_breast_cancer()

# Wrap the feature matrix and the target vector in DataFrames so that the
# later cells can use positional (.iloc) indexing and pd.concat on them.
X = pd.DataFrame(data.data)
y = pd.DataFrame(data.target)

# Cell output: (rows, columns) of the features and of the target.
X.shape, y.shape

((569, 30), (569, 1))

In [3]:
from sklearn.model_selection import train_test_split

# First split: hold out 10% of the data as the test set, preserving the class
# balance of `y` in both parts.
X_orig, X_test, y_orig, y_test = train_test_split(
    X, y, test_size=.1, stratify=y, random_state=42,
)

# Second split: carve a dev (validation) set out of the remainder with exactly
# as many rows as the test set. The size is passed as an absolute row count
# rather than a float ratio: a ratio is converted back to a count with a
# ceiling, so floating-point rounding could make the dev set one row larger
# than the test set.
# NOTE(review): unlike the first split, this one is not stratified; adding
# `stratify=y_orig` would change the split and every score reported below.
X_train, X_dev, y_train, y_dev = train_test_split(
    X_orig, y_orig,
    test_size=X_test.shape[0],
    random_state=42,
)

# Cell output: shapes of the three feature/target pairs.
(
    X_train.shape, y_train.shape,
    X_dev.shape, y_dev.shape,
    X_test.shape, y_test.shape
)

((455, 30), (455, 1), (57, 30), (57, 1), (57, 30), (57, 1))

## Create a train/dev set that combines data from both the training and validation sets:

In [4]:
# Seed NumPy's global RNG so the sampled row positions are reproducible.
np.random.seed(42)

# Draw 25 row positions from the training set and 25 from the dev set.
# randint samples with replacement, so the same position may be drawn twice.
indices_train = np.random.randint(len(X_train), size=25)
indices_dev = np.random.randint(len(X_dev), size=25)

# Stack the sampled training rows on top of the sampled dev rows; features and
# targets use the same positions so they stay aligned.
X_train_dev = pd.concat([X_train.iloc[indices_train], X_dev.iloc[indices_dev]])
y_train_dev = pd.concat([y_train.iloc[indices_train], y_dev.iloc[indices_dev]])

print(X_train_dev.shape, y_train_dev.shape)

(50, 30) (50, 1)


## Train a decision tree on the train set

In [5]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import recall_score

# Fit a decision tree on the training split only; the fixed random_state keeps
# the fitted tree reproducible between runs.
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)

# Parallel lists: a display label plus the features and targets of each split
# that the fitted tree is evaluated on.
sets = ["Training", "Train/dev", "Validation", "Testing"]
X_sets = [X_train, X_train_dev, X_dev, X_test]
y_sets = [y_train, y_train_dev, y_dev, y_test]

# Recall of the tree on every split, keyed by the split's label.
scores = {}
for set_name, X_set, y_set in zip(sets, X_sets, y_sets):
    scores[set_name] = recall_score(y_set, clf.predict(X_set))
print(scores)

{'Training': 1.0, 'Train/dev': 0.9696969696969697, 'Validation': 0.9230769230769231, 'Testing': 0.9166666666666666}


In [20]:
# Baseline row: the Bayes error is taken to be zero for this problem.
print('Bayes Error: ', 0.0)

# One row per split: its error, computed as the complement of its recall.
for set_name, recall in scores.items():
    print(set_name, ': ', 1 - recall)

Bayes Error:  0.0
Training :  0.0
Train/dev :  0.030303030303030276
Validation :  0.07692307692307687
Testing :  0.08333333333333337


In [21]:
# Gap between the testing and validation error rates. Derived from `scores`
# instead of numbers pasted from the printed output, so the value stays correct
# whenever the notebook is re-run. float() keeps the displayed result a plain
# Python float regardless of the numeric type stored in `scores`.
float((1 - scores["Testing"]) - (1 - scores["Validation"]))

0.0064102564102564985

In [22]:
# Gap between the validation and train/dev error rates. Derived from `scores`
# instead of numbers pasted from the printed output, so the value stays correct
# whenever the notebook is re-run. float() keeps the displayed result a plain
# Python float regardless of the numeric type stored in `scores`.
float((1 - scores["Validation"]) - (1 - scores["Train/dev"]))

0.046620046620046596

Here, the Bayes error is assumed to be 0, considering that the classification
between a malignant and a benign mass is done by taking a biopsy of the mass.
Note that each error rate above is computed as 1 − recall. From the preceding
output, it can be concluded that the model performs exceptionally well for the
purpose of the study, considering that all error rates are close to 0, which is
the lowest possible error.

The largest difference in error rates is found between the train/dev set and the dev set, which would normally indicate data mismatch. However, taking into account that all the datasets come from the same distribution, this condition is instead treated as a high-variance issue, where adding more data to the training set should help reduce the error rate.