In [1]:
import pandas as pd
import numpy as np
import pickle as pk
import os
import matplotlib.pyplot as plt
from sklearn.metrics import *
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.neural_network import MLPClassifier
import tensorflow as tf

In [2]:
cols = [
    'ID',
    'diagnosis',
    'radius_m',
    'texture_m',
    'perimeter_m',
    'area_m',
    'smoothness_m',
    'compactness_m',
    'concavity_m',
    'concave_points_m',
    'symmetry_m',
    'fractal_dimension_m',
    'radius_se',
    'texture_se',
    'perimeter_se',
    'area_se',
    'smoothness_se',
    'compactness_se',
    'concavity_se',
    'concave_points_se',
    'symmetry_se',
    'fractal_dimension_se',
    'radius_w',
    'texture_w',
    'perimeter_w',
    'area_w',
    'smoothness_w',
    'compactness_w',
    'concavity_w',
    'concave_points_w',
    'symmetry_w',
    'fractal_dimension_w']

In [3]:
def serialize(obj):
    with open('pickle_files/'+namestr(obj)+'.pickle','wb') as f:
        pk.dump(obj,f,pk.HIGHEST_PROTOCOL)
        
def load(obj):
    with open('pickle_files/'+obj+'.pickle','rb') as f:
        return pk.load(f)

def namestr(obj):
    g = globals()
    return [name for name in g if g[name] is obj][0]

#### Load dataset from .data file

#### Load dataset from pickle file

In [4]:
data_set = load('data_set')

# M => 1, B => 0
data_set.replace('B', 0, inplace=True)
data_set.replace('M', 1, inplace=True)

X = data_set.iloc[:,2:]
y = data_set.iloc[:,1]

In [5]:
data_set

Unnamed: 0,ID,diagnosis,radius_m,texture_m,perimeter_m,area_m,smoothness_m,compactness_m,concavity_m,concave_points_m,...,radius_w,texture_w,perimeter_w,area_w,smoothness_w,compactness_w,concavity_w,concave_points_w,symmetry_w,fractal_dimension_w
0,89143601,0,11.37,18.89,72.17,396.0,0.08713,0.05008,0.02399,0.02173,...,12.36,26.14,79.29,459.3,0.11180,0.09708,0.07529,0.06203,0.3267,0.06994
1,871641,0,11.08,14.71,70.21,372.7,0.10060,0.05743,0.02363,0.02583,...,11.35,16.82,72.01,396.5,0.12160,0.08240,0.03938,0.04306,0.1902,0.07313
2,858970,0,10.17,14.88,64.55,311.9,0.11340,0.08061,0.01084,0.01290,...,11.02,17.45,69.86,368.6,0.12750,0.09866,0.02168,0.02579,0.3557,0.08020
3,89827,0,11.06,14.96,71.49,373.9,0.10330,0.09097,0.05397,0.03341,...,11.92,19.90,79.76,440.0,0.14180,0.22100,0.22990,0.10750,0.3301,0.09080
4,904689,0,12.96,18.29,84.18,525.2,0.07351,0.07899,0.04057,0.01883,...,14.13,24.61,96.31,621.9,0.09329,0.23180,0.16040,0.06608,0.3207,0.07247
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,855625,1,19.07,24.81,128.30,1104.0,0.09081,0.21900,0.21070,0.09961,...,24.09,33.17,177.40,1651.0,0.12470,0.74440,0.72420,0.24930,0.4670,0.10380
565,892604,0,12.46,19.89,80.43,471.3,0.08451,0.10140,0.06830,0.03099,...,13.46,23.07,88.13,551.3,0.10500,0.21580,0.19040,0.07625,0.2685,0.07764
566,924632,0,12.88,28.92,82.50,514.3,0.08123,0.05824,0.06195,0.02343,...,13.89,35.74,88.84,595.7,0.12270,0.16200,0.24390,0.06493,0.2372,0.07242
567,911320502,0,13.17,18.22,84.28,537.3,0.07466,0.05994,0.04859,0.02870,...,14.90,23.89,95.10,687.6,0.12820,0.19650,0.18760,0.10450,0.2235,0.06925


#### Confusion matrix functions

In [6]:
def tp(y, p):
    return sum(np.logical_and(y,p))

def fp(y, p):
    return sum(np.logical_and(np.logical_not(y),p))

def tn(y, p):
    return sum(np.logical_not(np.logical_or(y,p)))

def fn(y, p):
    return sum(np.logical_and(y,np.logical_not(p)))

#### Metrics

In [7]:
def accuracy(y_true, y_prediction):
    return sum(np.logical_not(np.logical_xor(y_true, y_prediction)))/len(y_true)

def precision(y_true, y_prediction):
    true_positive  = tp(y_true, y_prediction)
    false_positive = fp(y_true, y_prediction)
    return true_positive / (true_positive + false_positive)

def recall(y_true, y_prediction):
    true_positive  = tp(y_true, y_prediction)
    false_negative = fn(y_true, y_prediction)
    return true_positive / (true_positive + false_negative)

def specifity(y_true, y_prediction):
    false_positive = fp(y_true, y_prediction)
    true_negative  = tn(y_true, y_prediction)
    return true_negative / (false_positive + true_negative)

def roc_auc(y_true, y_prediction):
    tpr = recall(y_true, y_prediction)
    fpr = 1 - specifity(y_true, y_prediction)
    return (tpr*fpr/2)+((tpr+1)/2)*(1-fpr)

#### Training and Testing functions

In [8]:
def predict(model, x_train, y_train, x_test, y_test):
    model.fit(x_train,y_train)
    return model.predict(x_test)

def score(y_test, prediction):
    score = []
    score.append(accuracy(y_test, prediction))
    score.append(precision(y_test, prediction))
    score.append(recall(y_test, prediction))
    score.append(roc_auc(y_test, prediction))
    return np.array(score)

def score2(y_test, prediction):
    score = []
    score.append(accuracy_score(y_test, prediction))
    score.append(precision_score(y_test, prediction))
    score.append(recall_score(y_test, prediction))
    score.append(roc_auc_score(y_test, prediction))
    return np.array(score)

In [9]:
def cross_validation(X, y, n):
    options = []
    
    for i in range(n):
        X_folds = np.array_split(X, n)
        y_folds = np.array_split(y, n)
        
        x_test = X_folds.pop(i)
        y_test = y_folds.pop(i)

        options.append([pd.concat(X_folds),pd.concat(y_folds),x_test,y_test])
        
    return options

In [10]:
def result(model, X, y, n_fold):
    folds_option = cross_validation(X, y, n_fold)
    scores = []
    for i in folds_option:
        if len(scores):
            scores += score(i[3],predict(model, *i))
        else:
            scores = score(i[3],predict(model, *i))
    scores /= n_fold
    return {'accuracy' : scores[0], 
            'precision': scores[1], 
            'recall' : scores[2], 
            'auc' : scores[3]}

In [11]:
class MLP: 
    
    def __init__(self, 
                 num_perceptrons  = 128, 
                 activation       = tf.nn.relu, 
                 solver           = tf.train.AdamOptimizer, 
                 learning_rate    = .01, 
                 regularizer_rate = .1,
                 epochs           = 15, 
                 dropout_rate     = .6):
    
        self.num_perceptrons  = num_perceptrons
        self.activation       = activation
        self.solver           = solver
        self.learning_rate    = learning_rate
        self.regularizer_rate = regularizer_rate
        self.epochs           = epochs
        self.dropout_rate     = dropout_rate

    def fit(self, X, y):
        
        self.s = tf.get_default_session()
        if not self.s:
            self.s = tf.InteractiveSession()
            
        self.X_train = X
        self.y_train = y
        
        # initial data
        X      = np.array(X)     
        y      = np.array(y).reshape((-1,1))
        self.X_input = tf.placeholder('float', shape=(None, X.shape[1]), name='X')
        self.y_input = tf.placeholder('float', shape=(None, y.shape[1]), name='y')
        self.drop_rate = tf.placeholder('float')
        
        # initialize weights & bias
        weights_0 = tf.Variable(tf.random_normal([X.shape[1],self.num_perceptrons], 
                                                 stddev=(1/tf.sqrt(float(X.shape[1])))))
        bias_0    = tf.Variable(tf.random_normal([self.num_perceptrons]))

        weights_1 = tf.Variable(tf.random_normal([self.num_perceptrons,y.shape[1]], 
                                                 stddev=(1/tf.sqrt(float(self.num_perceptrons)))))
        bias_1    = tf.Variable(tf.random_normal([y.shape[1]]))
        
        # define layers
        hidden_output  = self.activation(tf.matmul(self.X_input, weights_0)+bias_0)
        dropped_output = tf.nn.dropout(hidden_output, rate=self.drop_rate)

        output         = tf.matmul(dropped_output,weights_1) + bias_1
        mean, std      = tf.nn.moments(output,0)
        self.predicted = tf.sigmoid((output - mean)/std)
        
        # optimization
        loss      = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=self.predicted,
                                                                              labels=self.y_input))\
        + self.regularizer_rate*(tf.reduce_sum(tf.square(bias_0)) + tf.reduce_sum(tf.square(bias_1)))
        
        optimizer = tf.train.AdamOptimizer(self.learning_rate).minimize(loss,var_list=[weights_0,
                                                                                       weights_1,
                                                                                       bias_0,
                                                                                       bias_1])
        # training
        self.s.run(tf.global_variables_initializer())
        for i in range(self.epochs):
            self.s.run(optimizer, {self.X_input:X, self.y_input:y, self.drop_rate:self.dropout_rate})
        

        
    def predict(self, x_test):
        prediction = self.s.run(self.predicted, {self.X_input:x_test, self.drop_rate:0})
        return np.where(prediction.reshape((-1,))>.5,1,0)

In [12]:
result(LogisticRegression(max_iter=3000), X, y, 5)

{'accuracy': 0.9560782487191428,
 'precision': 0.9488142235968324,
 'recall': 0.9203164950991038,
 'auc': 0.9461525971904619}

In [13]:
result(svm.SVC(), X, y, 5)

{'accuracy': 0.9051234280391244,
 'precision': 0.9552689520624303,
 'recall': 0.7688482531960792,
 'auc': 0.8746668422520214}

In [14]:
result(MLPClassifier(max_iter = 400, activation='relu'), X, y, 5)

{'accuracy': 0.934979040521658,
 'precision': 0.9168221625118177,
 'recall': 0.8984395314830097,
 'auc': 0.9249874059782378}

In [15]:
result(MLP(),X,y,5)

{'accuracy': 0.9086166744294364,
 'precision': 0.9204437066280956,
 'recall': 0.8186607595303247,
 'auc': 0.8913214614322161}