<center><img src="img/ufpa_logo.png" alt="drawing" width="120"/></center>

<h4 style="text-align: center;font-size:22px"><strong>Universidade Federal do Par&aacute;</strong></h4>
<h4 style="text-align: center;font-size:22px"><strong>Programa de P&oacute;s-gradua&ccedil;&atilde;o em Engenharia El&eacute;trica</strong></h4>
<h3 style="text-align: center;">Evaluation of computationally intelligent techniques for breast cancer diagnosis</h3>
<h4 style="text-align: center;font-size:24px"><strong>Published in the Neural Computing and Applications journal (2021)</strong></h4>
<p>&nbsp;</p>
<blockquote>
<ul>
<li><strong>Adilson</strong></li>
<li><strong>Cleverson</strong></li>
<li><strong>Felipe</strong></li>
<li><strong>Rodrigo</strong></li>
</ul>
</blockquote>

# Load dataset 

In [None]:
# Reading the dataset
import pandas as pd
import numpy as np
import seaborn as sns

import matplotlib.pyplot as plt
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, auc, roc_auc_score

# Names assigned to the CSV columns, in file order.
column_names = [
    'ID', 'Clump', 'U_Cell_size', 'U_Cell_shape', 'Marginal_Adhesion',
    'SE_epitelial_cell_size', 'Bare_nuclei', 'bland_chromatin',
    'Normal_Nucleoli', 'Mitoses', 'Class',
]
data_csv = "../breast-cancer-wisconsin.csv"
df = pd.read_csv(data_csv, names=column_names)
# Preview the first rows as the cell output.
df.head()

#  Attribute                     Domain
-- -----------------------------------------
1. Sample code number            id number
2. Clump Thickness               1 - 10
3. Uniformity of Cell Size       1 - 10
4. Uniformity of Cell Shape      1 - 10
5. Marginal Adhesion             1 - 10
6. Single Epithelial Cell Size   1 - 10
7. Bare Nuclei                   1 - 10
8. Bland Chromatin               1 - 10
9. Normal Nucleoli               1 - 10
10. Mitoses                       1 - 10
11. Class:                        (2 for benign, 4 for malignant in the raw file; remapped below to 0 for benign, 1 for malignant)

In [None]:
# Change bare nuclei values to numeric only: entries that cannot be parsed become NaN.
# This has to happen BEFORE looking for nulls, otherwise the unparsable entries are
# still plain strings and the null report below cannot see them.
df['Bare_nuclei'] = pd.to_numeric(df['Bare_nuclei'], errors='coerce', downcast='integer')

# Show a sample of the rows (restricted to the affected columns) that hold missing values
null_columns = df.columns[df.isnull().any()]
print(df[df.isnull().any(axis=1)][null_columns].head())
# Check the number of NaN in Bare_nuclei before cleaning
print(df['Bare_nuclei'].isnull().values.sum())

# Drop the lines with null values. The explicit copy gives an independent frame,
# so later column assignments do not act on a derived slice.
df = df.dropna().copy()
print(df['Bare_nuclei'].isnull().values.sum())

# Removing ID column since it won't be considered to the training.
# errors='ignore' keeps this cell re-runnable once the column is already gone.
df = df.drop(columns=['ID'], errors='ignore')

# So, the dataset removing null values and ID column has the size:
print('New dataset shape: ', df.shape)

In [None]:
# Values distribution: print the minimum and maximum of every column.
# DataFrame.items() is used because iteritems() was deprecated in pandas 1.5
# and removed in pandas 2.0; items() yields the same (name, Series) pairs.
for name, values in df.items():
    print (name, '\nMin Value:  ', np.min(values), '\nMax Value: ', np.max(values), '\n\n')

In [None]:
# Changing Class label from 2 or 4 to 0 or 1 (2 -> 0, anything else -> 1).
# Labels that are already 0 stay 0, so re-running this cell is harmless; the
# previous `== 2` test turned every sample into class 1 on a second run.
df.loc[:, 'Class'] = np.where(df['Class'].isin([0, 2]), 0, 1)
print ('Class\nMin Value:  ', np.min(df['Class']), '\nMax Value: ', np.max(df['Class']), '\n\n')

In [None]:
# Class balance summary: absolute count and percentage share of each class.
n_total = len(df)
print(f"Total     = {n_total} -> 100%")
n_benign = len(df[df.Class == 0])
print(f"Benign    = {n_benign} -> {n_benign/n_total *100}%")
n_malignant = len(df[df.Class == 1])
print(f"Malignant = {n_malignant} -> {n_malignant/n_total *100}%")

# Attribute correlation for benign and malignant samples

In [None]:
# Pairwise attribute correlation restricted to the benign samples (Class == 0);
# the label column itself is left out of the matrix.
benign_samples = df[df['Class'] == 0]
corr_mat_benign = benign_samples.drop(columns='Class').corr()
corr_mat_benign

In [None]:
# Heatmap of the benign-sample correlation matrix
fig, ax = plt.subplots()
sns.heatmap(corr_mat_benign, linewidth=0.5, ax=ax)
plt.show()

In [None]:
# Pairwise attribute correlation restricted to the malignant samples (Class == 1);
# the label column itself is left out of the matrix.
malignant_samples = df[df['Class'] == 1]
corr_mat_malignant = malignant_samples.drop(columns='Class').corr()
corr_mat_malignant

In [None]:
# Heatmap of the malignant-sample correlation matrix
fig, ax = plt.subplots()
sns.heatmap(corr_mat_malignant, linewidth=0.5, ax=ax)
plt.show()

In [None]:
# Separate entries from outputs
# The builtin int replaces np.int: np.int was only an alias of int, deprecated in
# NumPy 1.20 and removed in NumPy 1.24 (where it raises AttributeError).
dataset = df.to_numpy(dtype=int) # Converting from Pandas dataframe to Numpy
entries = dataset[:, 0:9]  # first nine columns: the input attributes
outputs = dataset[:, 9]    # tenth column: the class label
print(entries.shape)
print(outputs.shape)


In [None]:
# Hold out part of the samples for testing; the fixed seed keeps the split invariant
# between runs.
seed = 10
test_size = 0.34
x_train, x_test, y_train, y_test = train_test_split(
    entries,
    outputs,
    test_size=test_size,
    random_state=seed,
)
print('Train dataset shape:\nEntries: ', x_train.shape, '\nOutput: ', y_train.shape, '\n\n')
print('Test dataset shape:\nEntries: ', x_test.shape, '\nOutput: ', y_test.shape)

In [None]:
# Create K-fold validation set for training (K = 10).
# n_splits reads K instead of repeating the literal, so changing K actually
# changes the number of folds.
K = 10
kf = KFold(n_splits=K, shuffle=True, random_state=seed)

# Decision Tree Classifier

In [None]:
# Decision Tree classifier
# Only random_state is set (to the shared seed); every other hyper-parameter is
# left at the library default.
dt = tree.DecisionTreeClassifier(random_state=seed)

# Random Forest Classifier

In [None]:
# Random forest classifier
# n_estimators=20 sets the ensemble size; random_state is tied to the shared seed.
rf = RandomForestClassifier(n_estimators=20, random_state=seed)

# K-Nearest neighbor classifier

In [None]:
# K-Nearest neighbor classifier, constructed with no arguments (library defaults).
neigh = KNeighborsClassifier()

# Organizing classifiers

In [None]:
# (estimator, display label) pairs; the cells below iterate over this list.
classifiers = [
    (dt, "Decision tree"),
    (rf, "Random forest"),
    (neigh, "K-Nearest neighbor"),
]

# Classifiers training

In [None]:
# Training classifiers using cross-validation: for every fold, each classifier is
# refitted on the fold's training part and scored on its validation part.
for fold_number, (train_indexes, valid_indexes) in enumerate(kf.split(x_train), start=1):
    print("Fold ", fold_number)
    for classifier, label in classifiers:
        classifier.fit(x_train[train_indexes], y_train[train_indexes])
        y_valid_pred = classifier.predict(x_train[valid_indexes])
        print("Classifier type: ",label, ", Validation Accuracy = ", accuracy_score(y_train[valid_indexes], y_valid_pred))
    print('\n')

# Final fit on the whole training split. Without it, every estimator would stay
# fitted on the last fold's subset only, and all later evaluation cells would
# depend on that arbitrary subset.
for classifier, label in classifiers:
    classifier.fit(x_train, y_train)


In [None]:
# Score every classifier on the held-out test split
for model, model_name in classifiers:
    test_accuracy = accuracy_score(y_test, model.predict(x_test))
    print("Classifier type: ", model_name, ", Test Accuracy = ", test_accuracy)

# Confusion matrix calculation

In [None]:
# One row per classifier, stored as [TP, FN, FP, TN] with class 1 as the positive class.
# With its default label order, confusion_matrix(...).ravel() yields TN, FP, FN, TP;
# labels=[1, 0] flips both axes so the flattened order is TP, FN, FP, TN, which is
# the order assumed by the column names and by perf_metrics in this notebook.
# Passing labels also guarantees a 2x2 matrix even if one class is never predicted.
# NOTE(review): the matrix is computed over the entire dataset (entries/outputs),
# which includes the samples the classifiers were trained on.
confusion_matrixes = np.zeros((len(classifiers), 4))
for index, (classifier, label) in enumerate(classifiers):
    predictions = classifier.predict(entries)
    confusion_matrixes[index, :] = confusion_matrix(outputs, predictions, labels=[1, 0]).ravel()

In [None]:
def plot_metrics(dataframe, metric_indexes, indexes_results, orientation, xlabel, ylabel, classifier_labels=None):
    """Draw a grouped bar chart with one group per classifier and one bar per metric.

    Parameters
    ----------
    dataframe : array-like
        One row per classifier, one column per entry of ``metric_indexes``.
    metric_indexes : list of str
        Column names for ``dataframe``, in column order.
    indexes_results : list of str
        Subset of ``metric_indexes`` to plot.
    orientation : str
        "h" draws horizontal bars (values on the x axis); any other value draws
        vertical bars (values on the y axis).
    xlabel, ylabel : str
        Text for the x axis and the y axis, respectively.
    classifier_labels : list of str, optional
        One label per row of ``dataframe``. Defaults to the three classifiers
        used in this notebook.
    """
    if classifier_labels is None:
        classifier_labels = ["Decision tree", "Random forest", "K-Nearest neighbor"]

    # Long format: one row per (classifier, metric) pair, as expected by seaborn
    df_perf_results = pd.DataFrame(dataframe, columns=metric_indexes)
    df_perf_results.insert(0, 'classifier_type', classifier_labels, True)
    df_perf_results = pd.melt(df_perf_results, id_vars=['classifier_type'], value_vars=indexes_results, var_name='Metric')

    # Values go on the axis the bars extend along; grid lines follow the value axis
    if orientation=="h":
        x, y, gridon = "value", "classifier_type", "x"
    else:
        x, y, gridon = "classifier_type", "value", "y"

    # catplot is figure-level and creates its own figure, so no plt.figure() call
    # precedes it (that only left an empty extra figure behind).
    sns.catplot(data=df_perf_results, kind="bar", orient=orientation, x=x, y=y, hue="Metric", ci="sd", palette="dark", alpha=.6, height=6)
    # xlabel labels the x axis and ylabel the y axis (they were swapped before)
    plt.xlabel(xlabel, fontsize=14)
    plt.ylabel(ylabel, fontsize=14)
    plt.grid(axis=gridon)
    plt.show()

# Generate dataset to plot using seaborn package
# `indexes` names the four columns of confusion_matrixes in storage order, so it
# must match the order in which each row of that array was filled.
indexes = ["TP", "FN", "FP", "TN"]
# Columns to draw (passed to plot_metrics as the set of plotted metrics)
indexes_result1 = ["TP", "FP", "FN", "TN"]

plot_metrics(confusion_matrixes, indexes, indexes_result1, "v", "Classifiers", "No. of samples")

In [None]:
# Confusion-matrix counts as a table: one column per classifier after transposing
classifier_names = [name for _estimator, name in classifiers]
df_confusion_matrixes = pd.DataFrame(confusion_matrixes, index=classifier_names, columns=indexes)
df_confusion_matrixes.T

# Calculates the performance metrics 

In [None]:
def perf_metrics(confusion_values):
    """Derive classification metrics from one flattened confusion matrix.

    Parameters
    ----------
    confusion_values : sequence of 4 numbers
        Counts ordered as [TP, FN, FP, TN].

    Returns
    -------
    numpy.ndarray
        [accuracy, precision, specificity, TP rate (recall), FP rate, NPV,
        misclassification rate, F1 score], in that order.
    """
    # [0] = TP, [1] = FN , [2] = FP, [3] = TN
    tp = confusion_values[0]
    fn = confusion_values[1]
    fp = confusion_values[2]
    tn = confusion_values[3]
    total = np.sum(confusion_values)

    # 4.1 accuracy
    accuracy = (tp + tn) / total
    # 4.2 precision
    precision = tp / (tp + fp)
    # 4.3 specificity
    specificity = tn / (tn + fp)
    # 4.4 TP rate
    tp_rate = tp / (tp + fn)
    # 4.5 FP rate
    fp_rate = fp / (fp + tn)
    # 4.6 NPV
    npv = tn / (tn + fn)
    # 4.7 Rate of Misclassification
    misclassification_rate = (fp + fn) / total
    # 4.8 F1 Score: harmonic mean of precision and recall, 2PR / (P + R).
    # The factor 2 was missing, which halved every reported F1 value.
    f1_score = 2 * (precision * tp_rate) / (precision + tp_rate)

    return np.array([accuracy, precision, specificity, tp_rate, fp_rate, npv, misclassification_rate, f1_score])

# Evaluate the metrics row by row: one confusion-matrix row in, eight metrics out.
n_classifiers = confusion_matrixes.shape[0]
perf_results = np.zeros((n_classifiers, 8))
for row_idx, confusion_row in enumerate(confusion_matrixes):
    perf_results[row_idx, :] = perf_metrics(confusion_row)

In [None]:
# Column labels for perf_results. In order they stand for: Classification accuracy,
# Precision, Specificity, Recall/TP rate, False positive rate, Negative predictive
# value, Misclassification rate and F1.
metric_indexes = ["CA", "Pre", "Spec", "Rec", "FPR", "NPV", "RMC", "F1"]
# Metrics drawn in this figure
indexes_result2 = ["RMC", "FPR", "NPV", "F1", "Spec"]

# NOTE(review): the plotted values are ratios, not sample counts; confirm that
# "No. of samples" is the intended axis text.
plot_metrics(
    perf_results,
    metric_indexes,
    indexes_result2,
    orientation="h",
    xlabel="No. of samples",
    ylabel="Classifiers",
)

In [None]:
# Same metrics as a table: one column per classifier after transposing
classifier_names = [name for _estimator, name in classifiers]
df_perf_results = pd.DataFrame(perf_results, index=classifier_names, columns=metric_indexes)
df_perf_results.loc[:, indexes_result2].T

In [None]:
# Remaining metrics: classification accuracy, precision and recall
indexes_result3 = ["CA", "Pre", "Rec"]
plot_metrics(
    perf_results,
    metric_indexes,
    indexes_result3,
    orientation="h",
    xlabel="No. of samples",
    ylabel="Classifiers",
)

In [None]:
# Accuracy, precision and recall as a table: one column per classifier
classifier_names = [name for _estimator, name in classifiers]
df_perf_results = pd.DataFrame(perf_results, index=classifier_names, columns=metric_indexes)
df_perf_results.loc[:, indexes_result3].T

# Create ROC Curves

In [None]:
# Code to create ROC curve
# roc_curve needs a continuous score to sweep the decision threshold. Feeding it
# the hard 0/1 output of predict() leaves a single threshold, so every "curve"
# collapses to one point joined to the corners. The predicted probability of
# class 1 is used as the score instead.
# NOTE(review): like the confusion matrices, the curves are computed over the
# entire dataset (entries/outputs), which includes the training samples.

fpr = dict()
tpr = dict()
roc_auc = dict()

for classifier, label in classifiers:
    # Column 1 of predict_proba holds the probability of the second class in
    # classifier.classes_, i.e. label 1 when both classes were seen during fit.
    scores = classifier.predict_proba(entries)[:, 1]
    fpr[label], tpr[label], _ = roc_curve(outputs, scores)
    roc_auc[label] = auc(fpr[label], tpr[label])
    

# Plot ROC Curves

In [None]:
# Overlay the three ROC curves on one set of axes, each labelled with its AUC
plt.figure()
lw = 2
roc_colors = {
    "Decision tree": 'blue',
    "Random forest": 'deeppink',
    "K-Nearest neighbor": 'darkcyan',
}
for curve_name, curve_color in roc_colors.items():
    plt.plot(fpr[curve_name], tpr[curve_name], color=curve_color,
             lw=lw, label='%s ROC curve (area = %0.2f)' % (curve_name, roc_auc[curve_name]))

# Dashed diagonal as a visual reference
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([-0.05, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc="lower right")
plt.show()



# Plot ROC Curves in subplots

In [None]:
# One ROC panel per classifier on a 2x2 grid (the fourth slot stays empty)
plt.figure()
lw = 2

roc_panels = [
    ("Decision tree", 'blue'),
    ("Random forest", 'deeppink'),
    ("K-Nearest neighbor", 'darkcyan'),
]
for position, (panel_name, panel_color) in enumerate(roc_panels, start=1):
    plt.subplot(2, 2, position)
    plt.plot(fpr[panel_name], tpr[panel_name], color=panel_color,
             lw=lw, label=panel_name + ' ROC')
    # Dashed diagonal as a visual reference
    plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
    plt.xlim([-0.05, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend(loc="lower right")
plt.show()

# Paper negative points (let's make a list to use later)
- Does not mention the null values present in the `Bare_nuclei` attribute
- Does not address the proportion of benign and malignant samples in the training and test datasets
- We assume that the `ID` column is not used for training, but the paper seems to consider it
- Normalizing the input values could improve the performance of some methods, such as NN
- Using repeated K-fold cross-validation could improve the performance
- A grid search could be implemented to tune the decision tree parameters
- Does not specify any of the decision tree parameters used (or the authors just used the default implementation without setting any parameters)
- Calculates the confusion matrix over the entire dataset
- The authors didn't remove NaN values before calculating the correlation
- The authors used the wrong labels for the confusion matrices in Fig. 15; the right sequence is TN, FP, FN and TP.
- The authors plotted F1 twice, in both Figures 16 and 17, and forgot to plot precision, and also plot AUC

# Thanks slide