# Breast Cancer

**Data Set Information:**

There are two classes (binary classification): “no-recurrence-events” and “recurrence-events”, that describe whether the patient’s cancer reappeared after treatment. The other 9 attributes contain general information about the patients themselves as well as more specific information about their individual cancer diagnoses. Using this information the goal is to classify whether a patient will have breast cancer again, or not


**Attribute Information:**

**- Class:** Describes if a patient had recurrent tumors;<br>
**- age:** Age listed in Interval of 10 years;<br>
**- menopause:** Nominal Short text description;<br>
**- tumor-size:** Interval in which falls the diamater of tumor falls;<br>
**- inv-nodes:** Interval in which falls the number of lymph-nodes in close proximity of the tumor;<br>
**- node-caps:** Nominal Describe whenever there're metastases or not;<br>
**- deg-malig:** Numerical Describe how bad the cancer is;<br>
**- breast:** Nominal Describe the afflicted breast;<br>
**- breast-quad:** Nominal Text representing the location of  tumor in the breast.<br>
**- irradiate** Nominal yes/no Indicates whenever the patient underwent radiation therapy.

### Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
from scipy import stats
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, Normalizer
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.feature_selection import SelectPercentile, chi2, SelectFromModel
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.svm import SVC, LinearSVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.neural_network import MLPClassifier

### Get the data

In [None]:
data = pd.read_csv('breast-cancer.csv')

*Remove quotation marks*

In [None]:
for column in data:
    data[column] = data[column].map(lambda x: x.lstrip("'").rstrip("'"))

In [None]:
data.head()

### Basic Data Information

In [None]:
data.info()

In [None]:
data.describe()

### Check missing values

In [None]:
data.loc[(data['age'] == '?') | (data['menopause'] == '?') | \
         (data['tumor-size'] == '?') | (data['inv-nodes'] == '?') | \
         (data['node-caps'] == '?') | (data['deg-malig'] == '?') | \
         (data['breast'] == '?') | (data['breast-quad'] == '?') | \
         (data['irradiat'] == '?') | (data['Class'] == '?')] 

*Convert missing data (indicated by a ?) into NaN*

In [None]:
data.replace("?", np.nan, inplace = True)
print(data.isnull().sum())

age            0
menopause      0
tumor-size     0
inv-nodes      0
node-caps      8
deg-malig      0
breast         0
breast-quad    1
irradiat       0
Class          0
dtype: int64


*Fill missing value with median*

In [None]:
data = data.fillna(data.mode().iloc[0])
#data = data.dropna()

In [None]:
print(data.isnull().sum())

# Exploratory Data Analysis

In [None]:
c_palette = ['tab:red','tab:green']

*Countplot of the Target* 

In [None]:
sns.set_style('darkgrid')
ax = sns.countplot(x = data['Class'], palette=c_palette)

total = len(data['Class'])

for p in ax.patches:
    height = p.get_height()
    ax.text(p.get_x()+p.get_width()/2.,
            height + 3,
            '{:.1f}%'.format(100 * height/total),
            ha="center")

**Class on Age Interval**

In [None]:
plt.figure(figsize=(12, 5))
sns.countplot(y="age", hue="Class", data=data, palette=c_palette)
plt.show()

**Class on Menopause**

In [None]:
plt.figure(figsize=(12, 5))
sns.countplot(y="menopause", hue="Class", data=data, palette=c_palette)
plt.show()

**Class on Breast**

In [None]:
plt.figure(figsize=(12, 5))
sns.countplot(y="breast", hue="Class", data=data, palette=c_palette)
plt.show()

**Class on Breast-Quad**

*Divide par Left/Right breast*

In [None]:
right_b = data.loc[data['breast'] == 'right']
left_b = data.loc[data['breast'] == 'left']

In [None]:
fig = plt.figure(figsize = (15,10))
ax1 = fig.add_subplot(2,1,1)
sns.countplot(y=left_b['breast-quad'], hue="Class", data=data, ax = ax1, palette=c_palette)
ax1.set(ylabel='Left Breast')

ax2 = fig.add_subplot(2,1,2)
sns.countplot(y=right_b['breast-quad'], hue="Class", data=data, ax=ax2, palette=c_palette)
ax2.set(ylabel='Right Breast')

**Class on Degree of Malignancy**

In [None]:
fig = plt.figure(figsize = (15,10))
ax1 = fig.add_subplot(2,2,1)
deg_malig = data['deg-malig'].astype(float)
sns.violinplot(data = data, x='Class', y=deg_malig, ax=ax1, palette=c_palette)
sns.swarmplot(data = data, x='Class', y='deg-malig', color = 'k', alpha = 0.6, ax=ax1)

**Class on Lymph-Nodes**

In [None]:
plt.figure(figsize=(12, 5))
sns.countplot(y="inv-nodes", hue="Class", data=data, palette=c_palette)
plt.show()

**Class on Metastases**

In [None]:
plt.figure(figsize=(12, 5))
sns.countplot(y="node-caps", hue="Class", data=data, palette=c_palette)
plt.show()

**Class on Irradiate**

In [None]:
plt.figure(figsize=(12, 5))
sns.countplot(y="irradiat", hue="Class", data=data, palette=c_palette)
plt.show()

# Classifier 

#### Import Libraries

In [None]:
from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn import preprocessing

from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

#### General pre-processing

#### Encode Class Feature

In [None]:
#Encode the Class feature to be a binary one
data['Class'][data.Class == 'recurrence-events'] = 1
data['Class'][data.Class == 'no-recurrence-events'] = 0

#### Preprocess Ordinal Data (LabelEncoder)

In [None]:
#replace binary data
data = data.replace(to_replace=['no', 'yes'], value=['0', '1'])

#preprocess the ordinal data (hierarchy) 
data["age"] = data["age"].map({'20-29':0, '30-39':1, '40-49':2, '50-59':3, '60-69':4, '70-79':5 })

data["inv-nodes"] = data["inv-nodes"].map({'0-2':0, '3-5':1, '6-8':2, '9-11':3, '12-14':4, '15-17':5,'24-26':7})

data["tumor-size"] = data["tumor-size"].map({'0-4':0, '5-9':1, '10-14':2, '15-19':3, '20-24':4, '25-29':5, '30-34':5, '35-39':6, '40-44':7, 
        '45-49':8, '50-54':9 })
data.head()


#### Preprocess Non-Ordinal Data using OneHotEncoding

In [None]:
#processing label with no hierarchy
nominal = ["breast-quad", "breast", "menopause"]
for i in nominal:
    one_hot = pd.get_dummies(data[i])
    data = data.drop(i,axis = 1)
    data = data.join(one_hot)
    
data.head()

#### Split data in train and test

In [None]:
#split the data in attributes and class as well as training and test sets
X = data.drop('Class', axis=1)
y = data['Class']

##  2.1) Feature Selection 

**Choose one of the methods:**
1. SelectPercentile (chi2)
2. SelectFromModel (LinearSVC)
3. SelectFromModel (LogisticRegression)
3. SelectFromModel (ExtraTreesClassifier)

## 2.2) Data Pre-processing

**Choose one of the methods:**
1. Standardization (StandardScaler)
2. Standardization (RobustScaler)
3. MinMaxScaler
4. Normalization

## 2.3)Outliers Detection using mathematical function Z-Score


### All three features included in the following function:

In [None]:
def featureSelection(XCurr, yCurr, featureselection_method, preprocessing_method):
    '''if featureselection_method == 1:
        selection = SelectPercentile(chi2, percentile=5)
    elif featureselection_method == 2:
        clf = LinearSVC()
        #clf = LinearSVC(C=0.1, penalty="l1", dual=False).fit(X, y)
        selection = SelectFromModel(clf, prefit=False)
    elif featureselection_method == 3:
        clf = LogisticRegression()
        #clf = LogisticRegression(C=0.2, penalty="l2", dual=False, max_iter=200).fit(X, y)
        selection = SelectFromModel(clf, prefit=False)
    elif featureselection_method == 4:
        clf = ExtraTreesClassifier(n_estimators=50).fit(XCurr, yCurr)
        selection = SelectFromModel(clf, prefit=False)      

    clf.feature_importances_ 
    X_transformed = selection.fit_transform(XCurr, yCurr)
    columns = np.asarray(XCurr.columns.values)
    support = np.asarray(selection.get_support())
    columns_with_support = columns[support]
    print("X_transformed.shape",X_transformed.shape)'''

    
    #Data Pre-processing:
    if preprocessing_method == 1:
        scaler = StandardScaler()
        scaler = scaler.fit(XCurr)
        X_scaled = scaler.transform(XCurr)
    elif preprocessing_method == 2:
        scaler = RobustScaler()
        scaler = scaler.fit(XCurr)
        X_scaled = scaler.transform(XCurr)
    elif preprocessing_method == 3:
        scaler = preprocessing.MinMaxScaler()
        scaler = scaler.fit(XCurr)
        X_scaled = scaler.transform(XCurr)
    elif preprocessing_method == 4:
        scaler = preprocessing.Normalizer()
        scaler = scaler.fit(XCurr)
        X_scaled = scaler.transform(XCurr)
    print("X_scaled = ",XCurr)
    
    
    #Outlier Detection:
    z = np.abs(stats.zscore(X_scaled))
    threshold = 20
    outliers_rows = np.where(z > threshold)
    print("\n z > threshold = ", np.where(z > threshold))
    # The first array contains the list of row numbers and second array respective column numbers
    print("\n number of outliers = ",len(set(outliers_rows[0])))
    
    #Remove Ouliers
    #X_prepared = X_scaled[(np.abs(stats.zscore(X_scaled)) < threshold).all(axis=1)]
    #X_prepared.shape
    #y = y.to_numpy()
    
    #X_train, X_test, y_train, y_test = train_test_split(X_scaled, yCurr, test_size=0.30)
    #return(X_train, X_test, y_train, y_test)
    return(X_scaled, yCurr)

In [None]:
featureselection_method = 4
preprocessing_method = 1

In [None]:
'''print('\n-------------------both types ------------------\n')
X, y = featureSelection(X, y, featureselection_method, preprocessing_method)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)'''

## Models Prediction

**Types:**
- Linear Classifiers: Logistic Regression, Naive Bayes Classifier
- Nearest Neighbor
- Support Vector Machines
- Decision Trees
- Random Forest
- Neural Networks

In [None]:
NN = 1

d = {}

#d["Logistic Regression"] = LogisticRegression(max_iter=200)
#d["Gaussian Naive Bayes"] = GaussianNB()

d["KNearest Neighbors ("+ str(NN) + ")"] = KNeighborsClassifier(n_neighbors=NN)

d["SVM rbf"] = SVC()
#d["SGD Classifier"] = SGDClassifier()

d["Decision Tree"] = DecisionTreeClassifier()

#d["Random Forest"] = RandomForestClassifier()

#d["Multi-layer Perceptron Classifier"] = MLPClassifier(max_iter=1000)

### Find best classifier

In [None]:
scoreList = []
nameList = []

for name, clf in d.items():
    start = time.time()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    score = accuracy_score(y_test, y_pred)
    scoreList.append(accuracy_score(y_test, y_pred))
    nameList.append(name)

    end = time.time()
    print("\n--------------",name,"---------------\n")
    print("- Accuracy: %0.3f" % score, "- Time: %0.4f" % (end - start), "seconds")
    print("\n Number of mislabeled points out of a total %d points : %d \n\n"% (X_test.shape[0], (y_test != y_pred).sum()))
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))

print("###############")
ranking = sorted(zip(scoreList,nameList))[::-1]
#print(sorted(zip(scoreList,nameList)))
print("\nClassifiers from best to worst:")
for i in range(0, len(ranking)):
    print(i+1, ') {0:35} score: {1}'.format(ranking[i][1], ranking[i][0]))

## Hyperparameter Optimization

### SVM

In [None]:
param_grid = {'C': [1, 10], 
              'gamma': [0.001, 0.01, 1]
             }


start = time.time()
clf_gridsearch = GridSearchCV(d.get(ranking[0][1]), param_grid, verbose = 0)
clf_gridsearch.fit(X_train, y_train)
print(clf_gridsearch.best_params_)
predictions = clf_gridsearch.predict(X_test)
score_gridsearch = accuracy_score(y_test, predictions)
end = time.time()
print("GridSearchCV - Accuracy: %0.3f" % score_gridsearch, "- Time: %0.2f" % (end - start), "seconds")
print("Previous Accuracy: ", ranking[0][0])
print()
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

### Decision Tree

In [None]:
param_grid = {'criterion' : ['gini', 'entropy'],
              'max_depth': [10, 50, 100, None],
              'max_features' : ['auto', 'sqrt', 'log2', None],
              'splitter' : ['best', 'random'],
              'min_samples_leaf': [1, 2, 4]}

start = time.time()
clf_gridsearch = GridSearchCV(d.get(ranking[2][1]), param_grid, verbose = 0)
clf_gridsearch.fit(X_train, y_train)
print(clf_gridsearch.best_params_)
predictions = clf_gridsearch.predict(X_test)
score_gridsearch = accuracy_score(y_test, predictions)
end = time.time()
print("GridSearchCV - Accuracy: %0.3f" % score_gridsearch, "- Time: %0.2f" % (end - start), "seconds")
print("Previous Accuracy: ", ranking[2][0])
print()
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

### KNN

In [379]:
param_grid = param_grid = {'n_neighbors': range(1,25),
              'weights': ['uniform', 'distance'],
              'p': [1, 2]}

start = time.time()
clf_gridsearch = GridSearchCV(d.get(ranking[1][1]), param_grid, verbose = 0)
clf_gridsearch.fit(X_train, y_train)
print(clf_gridsearch.best_params_)
predictions = clf_gridsearch.predict(X_test)
score_gridsearch = accuracy_score(y_test, predictions)
end = time.time()
print("GridSearchCV - Accuracy: %0.3f" % score_gridsearch, "- Time: %0.2f" % (end - start), "seconds")
print("Previous Accuracy: ", ranking[1][0])
print()
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

{'n_neighbors': 2, 'p': 2, 'weights': 'uniform'}
GridSearchCV - Accuracy: 0.744 - Time: 1.84 seconds
Previous Accuracy:  0.6511627906976745

[[50 14]
 [17  5]]
              precision    recall  f1-score   support

           0       0.75      0.78      0.76        64
           1       0.26      0.23      0.24        22

    accuracy                           0.64        86
   macro avg       0.50      0.50      0.50        86
weighted avg       0.62      0.64      0.63        86



### Compare with k-fold cross validation

In [None]:
# Compare Algorithms
import pandas
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC


# prepare configuration for cross validation test harness
seed = 7
# prepare models
models = []

'''models.append(('KNN', KNeighborsClassifier(n_neighbors=9, p= 1, weights = 'uniform')))
models.append(('Decision Tree', DecisionTreeClassifier(criterion= 'gini', max_depth= 50, max_features= 'auto', min_samples_leaf= 4, splitter='random')))
models.append(('SVM', SVC(C= 10, gamma= 0.001)))'''

models.append(('KNN', KNeighborsClassifier(n_neighbors=1)))
models.append(('Decision Tree', DecisionTreeClassifier()))
models.append(('SVM', SVC()))

#models.append(('LRegression', LogisticRegression(max_iter=200)))

# evaluate each model in turn
results = []
names = []
scoring = 'accuracy'
for name, model in models:
	kfold = model_selection.KFold(n_splits=5)
	cv_results = model_selection.cross_val_score(model, X, y, cv=kfold, scoring=scoring)
	results.append(cv_results)
	names.append(name)
	msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
	print(msg)
# boxplot algorithm comparison
fig = plt.figure()
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(names)
plt.show()