# Gradient Descent - Breast Cancer

In [1]:
# Breast Cancer

# Breast Cancer Dataset Training Standard Template
# auther: leechh

In [2]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.append('../')

import warnings
warnings.filterwarnings('ignore')

In [3]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import shuffle

from utils import cv

In [12]:
# Logistic Regression with Gradient Descent


class Logistic_gd(object):
    def __init__(self, learning_rate=0.001, num_iter=1e+3, early_stopping=True, tol=1e-4):
        self.learning_rate = learning_rate
        self.num_iter = int(num_iter)
        self.early_stopping = early_stopping
        self.tol = tol
        self.w = None
        self.b = None
    
    @staticmethod
    def sigmoid(x):
        return 1. / (1. + np.exp(-x))
        
    def fit(self, x, y):
        # array -> matrix
        x_m = np.matrix(np.c_[x, np.ones([len(x), 1])])
        y_m = np.matrix(np.expand_dims(y, axis=-1))
        # weight
        self.w = np.matrix(np.ones([x_m.shape[1], 1]))
        # gd
        for i in range(self.num_iter):
            y_pred = self.sigmoid(x_m * self.w)
            error = (y_m - y_pred)
            w_update = self.learning_rate * x_m.T * error
            if (self.early_stopping) & (np.abs(np.mean(w_update)) < self.tol):
                break
            else:
                self.w += w_update
            
    def predict(self, x):
        y_pred = np.matrix(np.c_[x, np.ones([len(x), 1])]) * self.w
        y_pred = np.round(self.sigmoid(y_pred))
        return y_pred.astype('int')
        

In [5]:
PATH = '../data/breast_cancer/breast-cancer-wisconsin.data'

FEATURE_NAME = [
    'sample_code_number',
    'clump_thickness',
    'cell_size',
    'cell_shape', 
    'marginal_adhesion',
    'single_epithelial_cell_size',
    'bare_nuclei',
    'bland_chromatin', 
    'normal_nucleoli', 
    'mitoses',
    'class'
]


data = pd.read_csv(PATH, names=FEATURE_NAME)
data = shuffle(data, random_state=2020).reset_index(drop=True)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 11 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   sample_code_number           699 non-null    int64 
 1   clump_thickness              699 non-null    int64 
 2   cell_size                    699 non-null    int64 
 3   cell_shape                   699 non-null    int64 
 4   marginal_adhesion            699 non-null    int64 
 5   single_epithelial_cell_size  699 non-null    int64 
 6   bare_nuclei                  699 non-null    object
 7   bland_chromatin              699 non-null    int64 
 8   normal_nucleoli              699 non-null    int64 
 9   mitoses                      699 non-null    int64 
 10  class                        699 non-null    int64 
dtypes: int64(10), object(1)
memory usage: 60.2+ KB


In [6]:
# LabelEncode class
le = LabelEncoder()
y = le.fit_transform(data['class'])

# fillna bare_nuclei
data['bare_nuclei'] = data['bare_nuclei'].apply(lambda x: 1 if (x == '?') else x)

In [7]:
# DataFrame -> array
data.drop(['class', 'sample_code_number'], axis=1, inplace=True)
x = data.values.astype('float32')
del data

In [8]:
# in this Demo, We will show how to use cv function to test model performance.
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()

cv(lr, x, y)

Total use time: 0.0346s, score: 0.9628263103802672


0.9628263103802672

In [13]:
lg = Logistic_gd()
cv(lg, x, y)

Total use time: 0.1554s, score: 0.9642446043165467


0.9642446043165467