In [40]:
import os, time, pickle

import numpy as np
np.random.seed(42)

import pandas as pd

import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
%matplotlib inline

In [41]:
# Data Attribute Information

# 1. Sample code number: id number 
# 2. Clump Thickness: 1 - 10 
# 3. Uniformity of Cell Size: 1 - 10 
# 4. Uniformity of Cell Shape: 1 - 10 
# 5. Marginal Adhesion: 1 - 10 
# 6. Single Epithelial Cell Size: 1 - 10 
# 7. Bare Nuclei: 1 - 10 
# 8. Bland Chromatin: 1 - 10 
# 9. Normal Nucleoli: 1 - 10 
# 10. Mitoses: 1 - 10 
# 11. Class: (2 for benign, 4 for malignant)

# Specifing header
header = ['sample_code','clump_tickness','uniformity_cell_size','uniformity_cell_shape',
          'marginal_adhesion','single_epithelial_cell_size','bare_nuclei','bland_chromatin',
          'normal_nucleoli','mitoses','target']

# Reading the data
folderName = '../data/' 
fileName   = 'breast-cancer-wisconsin.data'
model_file_name   = 'decisionTree_model.sav'

df = pd.read_csv(folderName+fileName,header=None,names=header)

# Printing some info about the data
print("[INFO]\nNrow: {}\nNcol: {}".format(df.shape[0],df.shape[1]))
df.head(2)

[INFO]
Nrow: 683
Ncol: 11


Unnamed: 0,sample_code,clump_tickness,uniformity_cell_size,uniformity_cell_shape,marginal_adhesion,single_epithelial_cell_size,bare_nuclei,bland_chromatin,normal_nucleoli,mitoses,target
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2


In [42]:
# Setting the target column
target = df.loc[:,df.columns[-1]].values
# target = np.where(target==4,1,0)

# If necessary, dropping cols
cols = df.columns # List of columns in dataframe
drop = [cols[-1],cols[0]] # List of columns to drop
df.drop(labels=drop,axis=1,inplace=True)
df.head(3)

Unnamed: 0,clump_tickness,uniformity_cell_size,uniformity_cell_shape,marginal_adhesion,single_epithelial_cell_size,bare_nuclei,bland_chromatin,normal_nucleoli,mitoses
0,5,1,1,1,2,1,3,1,1
1,5,4,4,5,7,10,3,2,1
2,3,1,1,1,2,2,3,1,1


In [43]:
# Convert all columns to int or float to avoid errors
def conv2num(df):
    cols = df.columns
    for col in cols:
        try:
            df[str(col)] = df[str(col)].astype(float) # Remember to specify type according to data specification
        except Exception as e:
            print('Column \'{}\' was not converted. Error: \n'.format(col), e, '\n')
            pass
    return df

df = conv2num(df) # Converting columns to number to avoid errors

In [44]:
from sklearn.linear_model import LogisticRegression
from sklearn import model_selection
from sklearn import metrics

## K-Fold Cross Validation

In [46]:
# Initialize the classifier
clf = LogisticRegression(penalty='l1',C=0.1)

# ----------- CROSS VALIDATION ----------- #
# ---------------- K-Fold ---------------- #
accuracy_train=[]
precision_train=[]
recall_train=[]
f1_train=[]
auc_train=[]

accuracy_test=[]
precision_test=[]
recall_test=[]
f1_test=[]
auc_test=[]
cv = model_selection.KFold(n_splits=10,shuffle=False) # K-fold Cross Validation method
for train_index, test_index in cv.split(df.values):
    X_train, X_test = df.values[train_index,:], df.values[test_index,:]
    y_train, y_test = target[train_index],target[test_index]  
    
    # Fitting the data into the model
    clf.fit(X_train, y_train)
    
    # FOR TRAINING
    predicted_train = clf.predict(X_train)
    accuracy_train.append(metrics.accuracy_score(y_train, predicted_train))
    precision_train.append(metrics.precision_score(y_train, predicted_train,pos_label=4))
    recall_train.append(metrics.recall_score(y_train, predicted_train,pos_label=4))
    f1_train.append(metrics.f1_score(y_train, predicted_train,pos_label=4))
    fpr, tpr, thresholds = metrics.roc_curve(y_train, predicted_train,pos_label=4)
    auc_train.append(metrics.auc(fpr, tpr))
    
    # FOR TESTING
    predicted_test = clf.predict(X_test)
    accuracy_test.append(metrics.accuracy_score(y_test, predicted_test))
    precision_test.append(metrics.precision_score(y_test, predicted_test,pos_label=4))
    recall_test.append(metrics.recall_score(y_test, predicted_test,pos_label=4))
    f1_test.append(metrics.f1_score(y_test, predicted_test,pos_label=4))
    fpr, tpr, thresholds = metrics.roc_curve(y_test, predicted_test,pos_label=4)
    auc_test.append(metrics.auc(fpr, tpr))

In [47]:
# Calculating the mean values for train and test scores
accuracy_train = np.array(accuracy_train).mean()
precision_train = np.array(precision_train).mean()
recall_train = np.array(recall_train).mean()
f1_train = np.array(f1_train).mean()
auc_train = np.array(auc_train).mean()

accuracy_test = np.array(accuracy_train).mean()
precision_test = np.array(precision_train).mean()
recall_test = np.array(recall_train).mean()
f1_test = np.array(f1_train).mean()
auc_test = np.array(auc_train).mean()

# Displaying the results
print('Accuracy Train = {}'.format(accuracy_train))
print('Precision Train = {}'.format(precision_train))
print('Recall Train = {}'.format(recall_train))
print('F1-score Train = {}'.format(f1_train))
print('AUC Train = {}'.format(auc_train))
print()
print('Accuracy Test = {}'.format(accuracy_test))
print('Precision Test = {}'.format(precision_test))
print('Recall Test = {}'.format(recall_test))
print('F1-score Test = {}'.format(f1_test))
print('AUC Test = {}'.format(auc_test))

Accuracy Train = 0.965837769126877
Precision Train = 0.9603238725324819
Recall Train = 0.9414534896009424
F1-score Train = 0.950777270184123
AUC Train = 0.9601816301978834

Accuracy Test = 0.965837769126877
Precision Test = 0.9603238725324819
Recall Test = 0.9414534896009424
F1-score Test = 0.950777270184123
AUC Test = 0.9601816301978834


## Hold-Out Cross Validation

In [48]:
# Initialize the classifier
clf = LogisticRegression(penalty='l1',C=0.1)

accuracy_train=[]
precision_train=[]
recall_train=[]
f1_train=[]
auc_train=[]

accuracy_test=[]
precision_test=[]
recall_test=[]
f1_test=[]
auc_test=[]

# ----------- CROSS VALIDATION ----------- #
# -------------- Hold-Out -------------- # When I have too many data
X_train, X_test, y_train, y_test  = model_selection.train_test_split(df.values, 
                                                                     target, 
                                                                     test_size=0.5,
                                                                     random_state=None,
                                                                     shuffle=False)

# Fitting the data into the model
clf.fit(X_train, y_train)

# FOR TRAINING
predicted_train = clf.predict(X_train)
accuracy_train.append(metrics.accuracy_score(y_train, predicted_train))
precision_train.append(metrics.precision_score(y_train, predicted_train,pos_label=4))
recall_train.append(metrics.recall_score(y_train, predicted_train,pos_label=4))
f1_train.append(metrics.f1_score(y_train, predicted_train,pos_label=4))
fpr, tpr, thresholds = metrics.roc_curve(y_train, predicted_train,pos_label=4)
auc_train.append(metrics.auc(fpr, tpr))

# FOR TESTING
predicted_test = clf.predict(X_test)
accuracy_test.append(metrics.accuracy_score(y_test, predicted_test))
precision_test.append(metrics.precision_score(y_test, predicted_test,pos_label=4))
recall_test.append(metrics.recall_score(y_test, predicted_test,pos_label=4))
f1_test.append(metrics.f1_score(y_test, predicted_test,pos_label=4))
fpr, tpr, thresholds = metrics.roc_curve(y_test, predicted_test,pos_label=4)
auc_test.append(metrics.auc(fpr, tpr))

In [49]:
# Calculating the mean values for train and test scores
accuracy_train = np.array(accuracy_train).mean()
precision_train = np.array(precision_train).mean()
recall_train = np.array(recall_train).mean()
f1_train = np.array(f1_train).mean()
auc_train = np.array(auc_train).mean()

accuracy_test = np.array(accuracy_train).mean()
precision_test = np.array(precision_train).mean()
recall_test = np.array(recall_train).mean()
f1_test = np.array(f1_train).mean()
auc_test = np.array(auc_train).mean()

# Displaying the results
print('Accuracy Train = {}'.format(accuracy_train))
print('Precision Train = {}'.format(precision_train))
print('Recall Train = {}'.format(recall_train))
print('F1-score Train = {}'.format(f1_train))
print('AUC Train = {}'.format(auc_train))
print()
print('Accuracy Test = {}'.format(accuracy_test))
print('Precision Test = {}'.format(precision_test))
print('Recall Test = {}'.format(recall_test))
print('F1-score Test = {}'.format(f1_test))
print('AUC Test = {}'.format(auc_test))

Accuracy Train = 0.9589442815249267
Precision Train = 0.9444444444444444
Recall Train = 0.9683544303797469
F1-score Train = 0.9562499999999999
AUC Train = 0.9595870512554472

Accuracy Test = 0.9589442815249267
Precision Test = 0.9444444444444444
Recall Test = 0.9683544303797469
F1-score Test = 0.9562499999999999
AUC Test = 0.9595870512554472


## Leave-One-Out Cross Validation

In [50]:
# Initialize the classifier
clf = LogisticRegression(penalty='l1',C=0.1)

# ----------- CROSS VALIDATION ----------- #
# ------------- Leave-One-Out ------------- #
accuracy_train=[]
precision_train=[]
recall_train=[]
f1_train=[]
auc_train=[]

pred_test=[]
accuracy_test=[]
precision_test=[]
recall_test=[]
f1_test=[]
auc_test=[]
cv = model_selection.LeaveOneOut() # K-fold Cross Validation method
for train_index, test_index in cv.split(df.values):
    X_train, X_test = df.values[train_index,:], df.values[test_index,:]
    y_train, y_test = target[train_index],target[test_index]  
    
    # Fitting the data into the model
    clf.fit(X_train, y_train)
        
    # FOR TRAINING
    predicted_train = clf.predict(X_train)
    accuracy_train.append(metrics.accuracy_score(y_train, predicted_train))
    precision_train.append(metrics.precision_score(y_train, predicted_train,pos_label=4))
    recall_train.append(metrics.recall_score(y_train, predicted_train,pos_label=4))
    f1_train.append(metrics.f1_score(y_train, predicted_train,pos_label=4))
    fpr, tpr, thresholds = metrics.roc_curve(y_train, predicted_train,pos_label=4)
    auc_train.append(metrics.auc(fpr, tpr))

    # Storing testing
    pred_test.append([y_test, clf.predict(X_test)])

# Converting to array
pred_test = np.array(pred_test)
    
# FOR TESTING
accuracy_test = metrics.accuracy_score(pred_test[:,0,0], pred_test[:,1,0])
precision_test = metrics.precision_score(pred_test[:,0,0], pred_test[:,1,0],pos_label=4)
recall_test = metrics.recall_score(pred_test[:,0,0], pred_test[:,1,0],pos_label=4)
f1_test = metrics.f1_score(pred_test[:,0,0], pred_test[:,1,0],pos_label=4)
fpr, tpr, thresholds = metrics.roc_curve(pred_test[:,0,0], pred_test[:,1,0],pos_label=4)
auc_test = metrics.auc(fpr, tpr)

In [51]:
# Calculating the mean values for train and test scores
accuracy_train = np.array(accuracy_train).mean()
precision_train = np.array(precision_train).mean()
recall_train = np.array(recall_train).mean()
f1_train = np.array(f1_train).mean()
auc_train = np.array(auc_train).mean()

# Displaying the results
print('Accuracy Train = {}'.format(accuracy_train))
print('Precision Train = {}'.format(precision_train))
print('Recall Train = {}'.format(recall_train))
print('F1-score Train = {}'.format(f1_train))
print('AUC Train = {}'.format(auc_train))
print()
print('Accuracy Test = {}'.format(accuracy_test))
print('Precision Test = {}'.format(precision_test))
print('Recall Test = {}'.format(recall_test))
print('F1-score Test = {}'.format(f1_test))
print('AUC Test = {}'.format(auc_test))

Accuracy Train = 0.9649553676852596
Precision Train = 0.9613779500143748
Recall Train = 0.937514501135458
F1-score Train = 0.9492958327611514
AUC Train = 0.9586204629169693

Accuracy Test = 0.9604685212298683
Precision Test = 0.9568965517241379
Recall Test = 0.9288702928870293
F1-score Test = 0.9426751592356688
AUC Test = 0.9531738851822533
