## データの取得・確認

In [1]:
import sys
sys.path.append(r'C:/Users/koki5/Dropbox/Jupyter/ScratchML')

import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn import model_selection
from termcolor import cprint

from common.module.evaluation import score


# Load scikit-learn's bundled breast-cancer dataset and wrap it in pandas
# objects so the feature names travel with the data.
cancer = datasets.load_breast_cancer()
cancer_data = pd.DataFrame(data=cancer.data, columns=cancer.feature_names)
cancer_target = pd.Series(data=cancer.target, name='target')

# Fix the seed: without random_state the split (and every number computed
# from it below) changes on each "Restart & Run All".
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    cancer_data, cancer_target, random_state=0)

cprint('Cancer Data (samples: {}, features: {})'.format(*cancer_data.shape), 'blue', attrs=['bold'])
cprint('Cancer Target (samples: {})'.format(*cancer_target.shape), 'blue', attrs=['bold'])

[1m[34mCancer Data (samples: 569, features: 30)[0m
[1m[34mCancer Target (samples: 569)[0m


In [2]:
# Preview the first three rows of the feature table.
cancer_data.head(3)

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758


In [3]:
# List the distinct class labels present in the target.
cancer_target.unique()

array([0, 1], dtype=int64)

## データ前処理

In [4]:
def standardize(X):
    """Standardize every feature (column) to zero mean and unit variance.

    The reductions are taken explicitly along axis 0.  ``np.mean(X)`` on a
    DataFrame forwards ``axis=None``, which pandas 2.0+ reduces to a single
    scalar over the whole frame, so features would no longer be centred
    individually.

    X: data set of shape (samples, features); a DataFrame or a 2-D ndarray.

    Returns an object of the same type and shape as ``X``.  The population
    standard deviation (ddof=0) is used, matching ``np.std``.
    """
    mean = X.mean(axis=0)
    std = X.std(axis=0, ddof=0)
    return (X - mean) / std


def add_ones(X):
    """Return a copy of ``X`` with a constant bias column named '0'.

    The columns of the result are sorted by label, which places '0' ahead
    of alphabetic feature names.  Unlike an in-place ``X['0'] = 1``, the
    caller's frame is left untouched, so re-running a cell never changes
    data that an earlier cell produced.

    X: data set (DataFrame).
    """
    return X.assign(**{'0': 1}).sort_index(axis=1, ascending=True)


# Estimate the scaling statistics on the training split only and apply the
# same transform to both splits.  Standardizing the test split with its own
# mean/std would put the two splits on different scales and leak test-set
# statistics into the evaluation.
train_mean = X_train.mean(axis=0)
train_scale = X_train.std(axis=0, ddof=0)  # population std, as np.std uses
X_train_std = (X_train - train_mean) / train_scale
X_test_std = (X_test - train_mean) / train_scale

# Add the bias term
X_train_std = add_ones(X_train_std)
X_test_std = add_ones(X_test_std)

X_train_std.head(3)

Unnamed: 0,0,area error,compactness error,concave points error,concavity error,fractal dimension error,mean area,mean compactness,mean concave points,mean concavity,...,worst area,worst compactness,worst concave points,worst concavity,worst fractal dimension,worst perimeter,worst radius,worst smoothness,worst symmetry,worst texture
469,1,-0.272526,0.301856,0.973509,0.484863,0.468146,-0.722792,0.786415,0.148578,0.113624,...,-0.645872,0.169213,0.380564,0.153702,0.4403,-0.594714,-0.628299,2.021422,-0.411755,-0.027429
9,1,-0.355579,2.485595,0.346371,1.334069,2.191601,-0.527893,2.502149,0.923421,1.661809,...,-0.310078,5.01654,1.585425,3.776241,6.617726,-0.303618,-0.259686,2.344461,2.44356,2.486585
423,1,-0.452863,0.085715,0.104102,0.09048,0.021246,-0.239174,0.154994,-0.047018,0.046532,...,-0.314841,0.351098,0.366907,0.37205,0.208377,-0.188832,-0.249033,-0.779727,-0.271165,-0.010976


## モデルの構築・訓練

In [5]:
class LogisticRegression(object):
    """Binary logistic regression trained with batch gradient descent."""

    def __init__(self, alpha=0.01, eps=1e-6, max_iter=None, random_state=None, verbose=True):
        """
        ---Parameters-------
        alpha: learning rate
        eps: convergence threshold; training stops once the cost decreases
             by no more than this amount in one step
        max_iter: optional cap on the number of update steps
                  (None = no cap, the original behaviour)
        random_state: optional seed for the random weight initialisation
                      (None = draw from NumPy's global random state)
        verbose: print the progress line on every iteration when True
        --------------------

        ---Attributes-------
        _w: learned parameters (weights); None until fit() has run
        --------------------
        """
        self.alpha = alpha
        self.eps = eps
        self.max_iter = max_iter
        self.random_state = random_state
        self.verbose = verbose

        self._w = None

    def fit(self, X, y):
        """Learn the weights from the training data.

        ---Parameters-------
        X: training data set, shape (samples, features)
        y: targets (0 or 1), one per sample
        --------------------

        Returns self.  Raises ValueError when X is not 2-D, is empty, or
        does not have one row per target.
        """
        # Work on plain float arrays: positional semantics are explicit and
        # the loop avoids pandas overhead on every iteration.
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float).ravel()
        if X.ndim != 2:
            raise ValueError('X must be 2-dimensional (samples, features).')
        n_samples = len(y)
        if n_samples == 0 or X.shape[0] != n_samples:
            raise ValueError('X and y must be non-empty and have the same number of samples.')

        # Initial parameters are drawn at random; a seed makes runs repeatable.
        if self.random_state is None:
            theta = np.random.rand(X.shape[1])
        else:
            theta = np.random.RandomState(self.random_state).rand(X.shape[1])

        error = self.J(X, y, theta)
        count = 0
        diff = 1

        # Treat the run as converged once the cost barely decreases.  A step
        # that increases the cost (negative diff) also ends the loop.
        while diff > self.eps and (self.max_iter is None or count < self.max_iter):
            # Parameter update
            grad = (1 / n_samples) * np.dot(self.h(X, theta) - y, X)
            theta = theta - self.alpha * grad

            # Bookkeeping used to monitor convergence
            current_error = self.J(X, y, theta)
            diff = error - current_error
            error = current_error
            count += 1
            if self.verbose:
                print('\r【{}回目】Error: {}, Diff: {}, Grad: {}'.format(count, error, diff, grad.sum()), end='')

        self._w = theta
        return self

    def predict(self, X):
        """Predict class labels (0 or 1).

        X: data set to predict for, with the same columns used in fit()

        Raises RuntimeError when called before fit(); with untrained
        weights every sample would otherwise silently be labelled 1.
        """
        if self._w is None:
            raise RuntimeError('fit() must be called before predict().')
        return np.where(self.h(X, self._w) >= 0.5, 1, 0)

    def activate(self, z):
        """Logistic (sigmoid) activation function.

        z: linear score(s) to squash into (0, 1)
        """
        return 1 / (1 + np.exp(-z))

    def h(self, X, theta):
        """Hypothesis function: sigmoid of the linear score X . theta.

        X: data set
        theta: parameters (weights)
        """
        return self.activate(np.dot(X, theta))

    def J(self, X, y, theta):
        """Objective function: mean binary cross-entropy.

        X: data set
        y: targets
        theta: parameters (weights)
        """
        delta = 1e-7  # keeps np.log() away from log(0)
        p = self.h(X, theta)  # evaluate the hypothesis once and reuse it
        return - (1 / len(y)) * (np.sum(y * np.log(p + delta) + (1 - y) * np.log(1 - p + delta)))

In [None]:
# Train on the standardized training split with the default hyper-parameters
# (alpha=0.01, eps=1e-6); fit() prints a progress line on every iteration.
lr = LogisticRegression()
lr.fit(X_train_std, y_train)

【1回目】Error: 7.013570716436541, Diff: 0.06813152260477917, Grad: 14.154887387782905【2回目】Error: 6.945101811642295, Diff: 0.06846890479424594, Grad: 14.151994162606828【3回目】Error: 6.876311201901776, Diff: 0.06879060974051932, Grad: 14.148973440124276【4回目】Error: 6.807214870785211, Diff: 0.06909633111656444, Grad: 14.145819048986143【5回目】Error: 6.737828860153866, Diff: 0.06938601063134531, Grad: 14.142524493577229【6回目】Error: 6.668169067622933, Diff: 0.06965979253093302, Grad: 14.139082935870517【7回目】Error: 6.598251110878043, Diff: 0.06991795674488976, Grad: 14.13548717620702【8回目】Error: 6.5280902678817805, Diff: 0.07016084299626257, Grad: 14.131729632951679【9回目】Error: 6.457701485685561, Diff: 0.07038878219621925, Grad: 14.127802320976926【10回目】Error: 6.387099435386642, Diff: 0.07060205029891886, Grad: 14.123696828926722【11回目】Error: 6.316298580431852, Diff: 0.07080085495479072, Grad: 14.119404295214583【12回目】Error: 6.245313224724313, Diff: 0.07098535570753839, Grad: 14.114915382709755

【101回目】Error: 0.8563410128891209, Diff: 0.0235870964724203, Grad: 8.079404853708784【102回目】Error: 0.8337610391250696, Diff: 0.022579973764051298, Grad: 7.907504362526541【103回目】Error: 0.8121470148520914, Diff: 0.021614024272978205, Grad: 7.73859782432437【104回目】Error: 0.7914587058632717, Diff: 0.02068830898881968, Grad: 7.572789541244935【105回目】Error: 0.7716569781109676, Diff: 0.019801727752304155, Grad: 7.410147822453646【106回目】Error: 0.7527038659780964, Diff: 0.018953112132871208, Grad: 7.250720242669173【107回目】Error: 0.7345625846974858, Diff: 0.018141281280610544, Grad: 7.09454389772394【108回目】Error: 0.7171975146115543, Diff: 0.017365070085931467, Grad: 6.941651365176397【109回目】Error: 0.700574176038935, Diff: 0.01662333857261933, Grad: 6.792073340586309【110回目】Error: 0.6846592060773059, Diff: 0.01591496996162911, Grad: 6.645838925238686【111回目】Error: 0.669420342928076, Diff: 0.015238863149229953, Grad: 6.502974442583036【112回目】Error: 0.6548264191613994, Diff: 0.014593923766676564, 

【201回目】Error: 0.2616077143288176, Diff: 0.0012866952075384352, Grad: 1.7700906918883146【202回目】Error: 0.2603409476715517, Diff: 0.0012667666572658831, Grad: 1.7545040671764969【203回目】Error: 0.2590936158703315, Diff: 0.0012473318012202195, Grad: 1.7391790697332126【204回目】Error: 0.2578652414527691, Diff: 0.00122837441756235, Grad: 1.7241093259046658【205回目】Error: 0.2566553625196007, Diff: 0.001209878933168429, Grad: 1.7092886588056961【206回目】Error: 0.2554635321259705, Diff: 0.0011918303936301888, Grad: 1.6947110812440431【207回目】Error: 0.25428931769116353, Diff: 0.001174214434806975, Grad: 1.6803707889264319【208回目】Error: 0.25313230043532037, Diff: 0.0011570172558431624, Grad: 1.6662621539343436【209回目】Error: 0.25199207484175506, Diff: 0.0011402255935653027, Grad: 1.6523797184578848【210回目】Error: 0.25086824814357034, Diff: 0.0011238266981847267, Grad: 1.6387181887767361【211回目】Error: 0.2497604398333399, Diff: 0.0011078083102304404, Grad: 1.6252724294776537【212回目】Error: 0.248668281194693

【303回目】Error: 0.18741797374595873, Diff: 0.0004299214672242313, Grad: 0.9163851353563062【304回目】Error: 0.18699128387316874, Diff: 0.00042668987278998705, Grad: 0.9119681463606008【305回目】Error: 0.18656778427441656, Diff: 0.00042349959875218057, Grad: 0.9075918110280077【306回目】Error: 0.18614743436084097, Diff: 0.0004203499135755906, Grad: 0.9032555818814451【307回目】Error: 0.1857301942587584, Diff: 0.0004172401020825778, Grad: 0.8989589212847712【308回目】Error: 0.18531602479374604, Diff: 0.00041416946501235397, Grad: 0.8947013012209017【309回目】Error: 0.18490488747515035, Diff: 0.0004111373185956835, Grad: 0.8904822030759043【310回目】Error: 0.18449674448100914, Diff: 0.000408142994141214, Grad: 0.8863011174288835【311回目】Error: 0.1840915586433738, Diff: 0.0004051858376353523, Grad: 0.8821575438474832【312回目】Error: 0.18368929343401877, Diff: 0.0004022652093550183, Grad: 0.8780509906888286【313回目】Error: 0.18328991295052782, Diff: 0.00039938048349094646, Grad: 0.8739809749057449【314回目】Error: 0.182

【10316回目】Error: 0.040499610946735536, Diff: 1.0718224025194067e-06, Grad: 0.026174738935838243

## 予測・評価

In [None]:
# Predict class labels for the training split and wrap them in a one-column frame.
# NOTE(review): this frame gets a fresh 0..n-1 index while y_train keeps its
# shuffled original index — confirm that `score` compares positionally.
predict = pd.DataFrame(lr.predict(X_train_std), columns=['TARGET'])
predict.head(10)

In [None]:
# Score both splits with `score` from common.module.evaluation
# (metric not visible here — presumably accuracy, TODO confirm).
train_score = score(predict, y_train)
# NOTE(review): the column here is 'target' while the training predictions
# use 'TARGET' — confirm that `score` ignores the column name.
test_score = score(pd.DataFrame(lr.predict(X_test_std), columns=['target']), y_test)
cprint('train_score: {}\ntest_score: {}'.format(train_score, test_score), 'red', attrs=['bold'])