In [None]:
!pip install seaborn==0.11

## Breast Cancer Prediction

In [None]:
# Loading libraries needed for analysis
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.ensemble import EasyEnsembleClassifier
from imblearn.over_sampling import SMOTE
from numpy.random import RandomState
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
warnings.filterwarnings('ignore')

%matplotlib inline
sns.set_palette(['#06B1F0', '#FC4B60'])
random_seed = 63445

In [None]:
# Load the dataset; the first CSV column is used as the DataFrame index.
data_path = './data_updated.csv'
new_data = pd.read_csv(data_path, index_col=0)
data_shape = new_data.shape
print("DATA SHAPE: ", data_shape)
# Show the first rows as the cell's rich output.
new_data.head()

### Class distribution

In [None]:
# Number of rows for each value of the target column.
class_counts = new_data['Class'].value_counts()
class_counts

The problem we are trying to solve in this analysis is breast cancer prediction. Based on the features available, we are going to predict whether the tumor a patient has is benign or malignant. One of the biggest challenges of this analysis is dealing with an imbalanced dataset.

In [None]:
# Bar chart of the number of rows per target class.
# The Series is passed by keyword: seaborn 0.11 deprecates positional data
# arguments, and later releases no longer interpret the first one as `x`.
sns.countplot(x=new_data['Class'])
plt.show()

# Experiment setup

Choosing the evaluation metrics is also quite challenging when dealing with imbalanced datasets. Accuracy is not the right metric on its own, because a baseline model that classifies everything as the over-represented class already reaches 97% accuracy. Any model we build should beat this accuracy score.

In [None]:
def metrics(true, preds):
    """
    Print the headline classification scores for a set of predictions.

    parameters:
        true: ground-truth labels
        preds: predicted labels, aligned with `true`
    returns: nothing; accuracy, recall, precision and f1 are printed on one line
    """
    scorers = (accuracy_score, recall_score, precision_score, f1_score)
    # Evaluate in the same order in which the report template lists the scores.
    scores = [scorer(true, preds) for scorer in scorers]
    print ('accuracy: {}, recall: {}, precision: {}, f1-score: {}'.format(*scores))

## Baseline Model - Train and test without additional pre-processing

Using stratified sampling instead of random train test split. This splits the target proportionally between training and test set.

In [None]:
# Separate the predictors from the target column before splitting.
features = new_data.drop('Class', axis=1)
target = new_data['Class']

# 70/30 split, stratified on the target and seeded for repeatability.
x_train, x_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.30,
    random_state=random_seed,
    stratify=target,
)
print("TRAIN DATA SHAPE: ", x_train.shape)
print("TEST DATA SHAPE: ", x_test.shape)

# Baseline: a five-tree random forest fitted on the training split as-is.
rf = RandomForestClassifier(n_estimators=5, random_state=random_seed)
rf.fit(x_train, y_train)

### Applying to held-out set

In [None]:
# Predict on the held-out split and print the scores of the baseline forest.
preds = rf.predict(x_test)
metrics(true=y_test, preds=preds)

## Over-sampling the minority class in training set using SMOTE

In [None]:
# Oversample the training split only; x_test / y_test are not touched here.
# `fit_resample` is used because `fit_sample` was deprecated and subsequently
# removed from imbalanced-learn.
sm = SMOTE(random_state=random_seed)
X, Y = sm.fit_resample(x_train, y_train)
print ('Shape of oversampled data: {}'.format(X.shape))
print ('Shape of Y: {}'.format(Y.shape))

In [None]:
# Round every resampled feature value down and cast the result to int.
# NOTE(review): presumably the original features are integer-valued and the
# SMOTE-generated rows are not — confirm against the dataset.
X = np.floor(X).astype(int)

In [None]:
# Class counts of the oversampled training target.
# The data is passed by keyword: seaborn 0.11 deprecates positional data
# arguments, and later releases no longer interpret the first one as `x`.
sns.countplot(x=Y)
plt.title('Balanced training data')
plt.show()

In [None]:
# Cross-validate on X, Y, which were oversampled *before* being split into
# folds; one 5-fold run is made per scoring metric.
rf = RandomForestClassifier(n_estimators=5, random_state=random_seed)
recall_scores, f1_scores, accuracy_scores, precision_scores = [
    cross_val_score(rf, X, Y, scoring=scoring_name, cv=5)
    for scoring_name in ('recall', 'f1', 'accuracy', 'precision')
]

# Report the mean over the five folds for each metric.
report = (
    ('Average Recall score: {}', recall_scores),
    ('Average F1 scores: {}', f1_scores),
    ('Average Accuracy scores: {}', accuracy_scores),
    ('Average Precision scores: {}', precision_scores),
)
for template, fold_scores in report:
    print (template.format(np.mean(fold_scores)))

Woohoo! The cross-validation scores look amazing. Let's see how the model performs on the test data.

### Results for training set

In [None]:
# Out-of-fold predictions for every row of the oversampled training data.
preds = cross_val_predict(rf, X, Y, cv=5)

# Score those predictions against the oversampled target.
for template, scorer in (
    ('Accuracy score: {}', accuracy_score),
    ('Recall score: {}', recall_score),
    ('Precision score: {}', precision_score),
    ('f1-score: {}', f1_score),
):
    print (template.format(scorer(Y, preds)))

### Results for test set

In [None]:
# Refit a fresh forest on the whole oversampled training set, then score it
# on the held-out test split.
rf = RandomForestClassifier(n_estimators=5, random_state=random_seed).fit(X, Y)
test_preds = rf.predict(x_test)

test_report = (
    ('Accuracy score: {}', accuracy_score),
    ('Recall score: {}', recall_score),
    ('Precision score: {}', precision_score),
    ('f1-score: {}', f1_score),
)
for template, scorer in test_report:
    print (template.format(scorer(y_test, test_preds)))

What happened? The cross-validated scores are almost 100%, but the test recall is only 50%. It looks like the model is over-fitting. But we did cross-validation precisely to detect whether our model was overfitting.

Let's look at the distribution of features over 2 classes. 

## Distribution of features over malignant and benign tumors

In [None]:
# One panel per feature, with one kernel-density curve per class in each panel.
# The target column is dropped from the list: it is the grouping key, not a
# feature to plot.
ccols = new_data.columns.drop('Class')

grouped_data = new_data.groupby('Class')
sns.set(font_scale=1.8)
# Re-apply the notebook's two-colour palette after the sns.set() call.
sns.set_palette(['#06B1F0', '#FC4B60'])
fig, axes = plt.subplots(nrows=3, ncols=3, figsize=(20, 16),
                         tight_layout=True)
# zip() stops at the shorter input, so at most nine features (one per panel)
# are drawn.
for ax, p in zip(axes.ravel(), ccols):
    for k, v in grouped_data[p]:
        sns.kdeplot(v, ax=ax, label=str(k)+":"+v.name)
    # Show the legend explicitly so the "<class>:<feature>" labels are visible
    # whether or not kdeplot draws one itself.
    ax.legend()
plt.savefig('feature_distributions.png')

In the kernel density plots we are looking for variables with very little overlap between the malignant and benign tumors. Out of nine features, only two have high discriminatory power between class 0 and class 1.

So the features themselves did not contribute to model overfitting. It could be that there is something wrong with the way the oversampling and cross-validation were performed.

Looks like there was information bleed from the validation set to the training set.

![smote_before_cross_val](https://www.marcoaltini.com/uploads/1/3/2/3/13234002/2639934.jpg?401)

Here the issue is that the synthetic observations end up in both the training and validation sets during the same iteration. Models like Random Forest are able to recognise that these values are from the same distribution and the validation set predictions become very accurate.

Moreover, another intuitive issue is that during validation, the model is being both trained and validated on a **balanced** dataset, meaning both the train and validation sets are balanced. However, when we go ahead to the testing phase, we are essentially using a model trained on a balanced dataset and testing it on an unbalanced dataset. Therefore the validation step, which is supposed to be representative of the testing environment, no longer serves that purpose.

The ideal way would be to perform oversampling of the training data in each cross-validation iteration. This would prevent the data leakage from the validation set to the training set during cross-validation.

![smote with cross val](https://www.marcoaltini.com/uploads/1/3/2/3/13234002/9101820.jpg?372)

## Oversampling in each cross-validation loop

In [None]:
# shuffle=True is needed for random_state to have any effect: without it
# StratifiedKFold ignores the seed, and recent scikit-learn releases raise a
# ValueError for that combination.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_seed)
cross_val_f1_score_lst = []
cross_val_accuracy_lst = []
cross_val_recall_lst = []
cross_val_precision_lst = []

for train_index_ls, validation_index_ls in kf.split(x_train, y_train):
    # keeping validation set apart and oversampling in each iteration using smote
    train, validation = x_train.iloc[train_index_ls], x_train.iloc[validation_index_ls]
    target_train, target_val = y_train.iloc[train_index_ls], y_train.iloc[validation_index_ls]
    sm = SMOTE(random_state=random_seed)
    # fit_resample replaces fit_sample, which was deprecated and subsequently
    # removed from imbalanced-learn. Only the training folds are resampled.
    X_train_res, y_train_res = sm.fit_resample(train, target_train)
    print (X_train_res.shape, y_train_res.shape)

    # training the model on oversampled 4 folds of training set
    rf = RandomForestClassifier(n_estimators=5, random_state=random_seed)
    rf.fit(X_train_res, y_train_res)
    # testing on 1 fold of validation set, which was not oversampled
    validation_preds = rf.predict(validation)
    cross_val_recall_lst.append(recall_score(target_val, validation_preds))
    cross_val_accuracy_lst.append(accuracy_score(target_val, validation_preds))
    cross_val_precision_lst.append(precision_score(target_val, validation_preds))
    cross_val_f1_score_lst.append(f1_score(target_val, validation_preds))

# Average each metric over the five validation folds.
print ('Cross validated accuracy: {}'.format(np.mean(cross_val_accuracy_lst)))
print ('Cross validated recall score: {}'.format(np.mean(cross_val_recall_lst)))
print ('Cross validated precision score: {}'.format(np.mean(cross_val_precision_lst)))
print ('Cross validated f1_score: {}'.format(np.mean(cross_val_f1_score_lst)))

### By doing over-sampling under each cross-validation loop, the cross-validated scores are representative of test set scores above 

In [None]:
metric_columns = ['Recall', 'Precision', 'F1-score', 'Accuracy']

# Mean fold scores from the run where SMOTE was applied before cross-validation.
wrong_means = [
    np.mean(recall_scores),
    np.mean(precision_scores),
    np.mean(f1_scores),
    np.mean(accuracy_scores),
]
# Mean fold scores from the run where SMOTE was applied inside each fold.
right_means = [
    np.mean(cross_val_recall_lst),
    np.mean(cross_val_precision_lst),
    np.mean(cross_val_f1_score_lst),
    np.mean(cross_val_accuracy_lst),
]

# One single-row frame per approach, stacked row-wise.
metrics_df_wrong = pd.DataFrame([wrong_means], columns=metric_columns)
metrics_df_right = pd.DataFrame([right_means], columns=metric_columns)
metrics_df = pd.concat([metrics_df_wrong, metrics_df_right], axis=0)

In [None]:
# Rebuild the two-row frame from its sources before relabelling, so this cell
# gives the same result every time it is run. Relabelling and transposing
# `metrics_df` in place fails on a second run: after the transpose it has four
# rows but only two labels are assigned.
metrics_df = pd.concat([metrics_df_wrong, metrics_df_right], axis=0)
metrics_df.index = ['Cross validation and oversampling done wrong', 'Cross validation and oversampling done right']
# Metrics become rows and the two approaches become columns.
metrics_df = metrics_df.transpose()

In [None]:
# Grouped bar chart: for each metric, the "wrong" bar sits on the tick position
# and the "right" bar is offset by 0.8 to its right.
wrong_label = 'Cross validation and oversampling done wrong'
right_label = 'Cross validation and oversampling done right'

fig, ax = plt.subplots(figsize=(8,6))
ax.bar([0,2,4,6], metrics_df[wrong_label], color='deepskyblue', label=wrong_label)
ax.bar([0.8,2.8,4.8,6.8], metrics_df[right_label], color = 'red', label=right_label)
ax.legend(loc='upper right', bbox_to_anchor=(1.1, 1.2))
ax.set_xticks([0,2,4,6])
ax.set_xticklabels(['Recall', 'Precision', 'F1-score', 'Accuracy'], horizontalalignment='left')
ax.set_xlabel('Metrics')
ax.set_ylabel('Cross-validated score')
plt.show()

As we can see from the plot above, the performance of the model has improved to give a 50% recall score after oversampling, and more representative results for the test set were obtained when cross-validation and oversampling were done right.

## Balanced Ensembling Methods

#### Methods generating under-sampled subsets combined inside an ensemble.
For various algorithms: https://imbalanced-learn.readthedocs.io/en/stable/api.html#module-imblearn.ensemble 

In [None]:
# Recreate the same stratified 70/30 split (same seed as before) so this
# section starts from the original, non-oversampled training data.
x_train, x_test, y_train, y_test = train_test_split(
    new_data.drop('Class', axis=1),
    new_data['Class'],
    test_size=0.30,
    random_state=random_seed,
    stratify=new_data['Class'],
)
print ("TRAIN DATA SHAPE: ", x_train.shape)
print ("TEST DATA SHAPE: ", x_test.shape)

# An ensemble of 100 members, each a five-tree random forest.
# random_state is set on the ensemble as well as on the base forest, so the
# ensemble's own sampling is seeded and the results are repeatable.
# NOTE(review): `base_estimator` is the keyword used by the imbalanced-learn
# versions that this notebook otherwise targets; newer releases name it
# `estimator` — confirm against the installed version.
erf = EasyEnsembleClassifier(
    n_estimators=100,
    base_estimator=RandomForestClassifier(n_estimators=5, random_state=random_seed),
    random_state=random_seed,
)
erf.fit(x_train, y_train)

In [None]:
# Predict on the held-out split with the balanced ensemble and print the scores.
preds = erf.predict(x_test)
metrics(true=y_test, preds=preds)

For default combination of under and oversampling techniques: https://imbalanced-learn.readthedocs.io/en/stable/api.html#module-imblearn.combine

### MCC and AuPRC metrics look at not just the accuracy scores but also F1, precision and recall, which gives us an overall view of the model performance.