## データの取得・確認

In [1]:
import sys
sys.path.append(r'C:/Users/koki5/Dropbox/Jupyter/ScratchML')

import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn import model_selection
from termcolor import cprint

from common.module.evaluation import score


cancer = datasets.load_breast_cancer()
cancer_data = pd.DataFrame(data=cancer.data, columns=cancer.feature_names)
cancer_target = pd.Series(data=cancer.target, name='target')

X_train, X_test, y_train, y_test = model_selection.train_test_split(cancer_data, cancer_target)

cprint('Cancer Data (samples: {}, features: {})'.format(*cancer_data.shape), 'blue', attrs=['bold'])
cprint('Cancer Target (samples: {})'.format(*cancer_target.shape), 'blue', attrs=['bold'])

[1m[34mCancer Data (samples: 569, features: 30)[0m
[1m[34mCancer Target (samples: 569)[0m


In [2]:
cancer_data.head(3)

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758


In [3]:
cancer_target.unique()

array([0, 1], dtype=int64)

## データ前処理

In [4]:
def standardize(X):
    """標準化する.
    
    X: データセット
    """
    return (X - np.mean(X, axis=0)) / np.std(X, axis=0)


def add_ones(X):
    """特徴量の0列目に1を加える.
    
    X: データセット
    """
    X['0'] = 1
    return X.sort_index(axis=1, ascending=True)


X_train_std = standardize(X_train)
X_test_std = standardize(X_test)

# バイアス項を追加
X_train_std = add_ones(X_train_std)
X_test_std = add_ones(X_test_std)

X_train_std.head(3)

Unnamed: 0,0,area error,compactness error,concave points error,concavity error,fractal dimension error,mean area,mean compactness,mean concave points,mean concavity,...,worst area,worst compactness,worst concave points,worst concavity,worst fractal dimension,worst perimeter,worst radius,worst smoothness,worst symmetry,worst texture
169,1,-0.400498,-0.672352,-0.466936,-0.789417,-0.814867,0.046303,-0.513131,-0.336391,-0.805864,...,-0.197775,-0.59646,-0.496601,-1.007166,-1.087824,-0.125743,-0.083943,-0.494034,-0.794926,-0.446976
111,1,-0.486359,0.817553,1.765658,0.567971,0.813268,-0.525619,0.246843,0.219245,0.15703,...,-0.642229,-0.214045,-0.114658,-0.277967,0.019396,-0.574064,-0.639102,-0.192658,-1.075135,-0.042225
127,1,0.886275,0.086541,0.262997,0.238665,-0.433607,1.304531,-0.487286,0.121513,-0.007959,...,1.044461,-0.205311,0.053605,0.18789,-1.027029,1.127257,1.156178,-1.321757,-0.106998,0.000381


## モデルの構築・訓練

In [5]:
class LogisticRegression(object):
    """勾配降下法に基づくロジスティック回帰."""
    
    
    def __init__(self, alpha=0.01, eps=1e-6):
        """
        ---パラメータ-------
        alpha: 学習率
        eps: 収束の目安
        --------------------
        
        ---属性-------------
        _w: パラメータ（重み）
        --------------------
        """
        self.alpha = alpha
        self.eps = eps
        
        self._w = 0
        
        
    def fit(self, X, y):
        """訓練データで学習する.
        
        ---パラメータ-------
        X: 訓練データセット
        y: ターゲット
        --------------------
        
        """
        # パラメータの初期値はランダムに設定する
        theta = np.random.rand(X.shape[1])
        error = self.J(X, y, theta)
        count = 0
        diff = 1
        
        # コスト関数の値が殆ど変動しなくなったら収束とする
        while diff > self.eps:
            # パラメータの更新処理
            grad = (1 / len(y)) * np.dot(self.h(X, theta) - y, X)
            theta = theta - self.alpha * grad
            
            # 収束状況を確認するための処理
            current_error = self.J(X, y, theta)
            diff = error - current_error
            error = current_error
            count += 1
            print('\r【{}回目】Error: {}, Diff: {}, Grad: {}'.format(count, error, diff, grad.sum()), end='')
            
        self._w = theta
        return self
    
    
    def predict(self, X):
        """クラスラベルを予測する.
        
        X: テストデータセット
        """
        return np.where(self.h(X, self._w) >= 0.5, 1, 0)
        
    
    def activate(self, z):
        """ロジスティック関数（活性化関数）.
        
        z: 仮説関数の予測値
        """
        return 1 / (1 + np.exp(-z))
    

    def h(self, X, theta):
        """仮説関数.
        
        X: 訓練データセット
        theta: パラメータ（重み）
        """
        return self.activate(np.dot(X, theta))
    

    def J(self, X, y, theta):
        """目的関数.
        
        X: 訓練データセット
        y: ターゲット
        theta: パラメータ（重み）
        """
        delta = 1e-7 # np.log()に0が混在しないようにする
        return - (1 / len(y)) * (np.sum(y * np.log(self.h(X, theta) + delta) + (1 - y) * np.log(1 - self.h(X, theta) + delta)))

In [6]:
lr = LogisticRegression()
lr.fit(X_train_std, y_train)

【1回目】Error: 7.33626953443581, Diff: 0.06425020747832733, Grad: 14.251262727122024【2回目】Error: 7.271703013738498, Diff: 0.06456652069731206, Grad: 14.248506782407565【3回目】Error: 7.206834229877661, Diff: 0.06486878386083728, Grad: 14.245620185724883【4回目】Error: 7.141678224951188, Diff: 0.06515600492647255, Grad: 14.242597736634991【5回目】Error: 7.076250606928844, Diff: 0.06542761802234409, Grad: 14.23943419452612【6回目】Error: 7.0105670790872265, Diff: 0.06568352784161746, Grad: 14.236124288960532【7回目】Error: 6.944642962114239, Diff: 0.06592411697298761, Grad: 14.232662725404161【8回目】Error: 6.878492743576791, Diff: 0.06615021853744807, Grad: 14.229044184659728【9回目】Error: 6.812129685920384, Diff: 0.06636305765640671, Grad: 14.225263314342238【10回目】Error: 6.745565517020791, Diff: 0.06656416889959349, Grad: 14.221314710899419【11回目】Error: 6.678810222489192, Diff: 0.06675529453159879, Grad: 14.21719289100319【12回目】Error: 6.611871952586203, Diff: 0.06693826990298923, Grad: 14.212892251617502【1

【107回目】Error: 0.9856596059390735, Diff: 0.025701614086990454, Grad: 8.500771477186223【108回目】Error: 0.9608402532332098, Diff: 0.024819352705863773, Grad: 8.358044892657752【109回目】Error: 0.9368787075579664, Diff: 0.023961545675243334, Grad: 8.216505637867108【110回目】Error: 0.9137504028623875, Diff: 0.023128304695578916, Grad: 8.076276754644036【111回目】Error: 0.8914307861807701, Diff: 0.022319616681617416, Grad: 7.937466272216709【112回目】Error: 0.8698954334084891, Diff: 0.021535352772281025, Grad: 7.80016664993144【113回目】Error: 0.8491201503424423, Diff: 0.020775283066046812, Grad: 7.664455364941718【114回目】Error: 0.8290810549120353, Diff: 0.02003909543040694, Grad: 7.530396507015295【115回目】Error: 0.8097546383368592, Diff: 0.019326416575176086, Grad: 7.398043175372489【116回目】Error: 0.7911178048128183, Diff: 0.018636833524040886, Grad: 7.2674404133835955【117回目】Error: 0.7731478911175427, Diff: 0.017969913695275674, Grad: 7.138628375556816【118回目】Error: 0.7558226690933508, Diff: 0.017325222024

【205回目】Error: 0.26461082131260294, Diff: 0.0015243425298861468, Grad: 2.07241054144633【206回目】Error: 0.2631157801970476, Diff: 0.0014950411155553178, Grad: 2.0521138060271964【207回目】Error: 0.2616492551843708, Diff: 0.0014665250126768004, Grad: 2.0321674312191735【208回目】Error: 0.26021048701131533, Diff: 0.0014387681730554935, Grad: 2.0125630117341076【209回目】Error: 0.25879874147049337, Diff: 0.0014117455408219626, Grad: 1.9932923659309028【210回目】Error: 0.2574133084548896, Diff: 0.0013854330156037875, Grad: 1.9743475336818626【211回目】Error: 0.2560535010383565, Diff: 0.0013598074165330765, Grad: 1.9557207737814108【212回目】Error: 0.2547186545912378, Diff: 0.001334846447118676, Grad: 1.9374045609327666【213回目】Error: 0.2534081259302304, Diff: 0.0013105286610074485, Grad: 1.9193915823509076【214回目】Error: 0.2521212925015722, Diff: 0.0012868334286581518, Grad: 1.9016747340218045【215回目】Error: 0.25085755159662104, Diff: 0.0012637409049511827, Grad: 1.8842471166585784【216回目】Error: 0.24961631959887

【309回目】Error: 0.18748213209356718, Diff: 0.00036682669018317515, Grad: 0.980846949895638【310回目】Error: 0.1871187306728458, Diff: 0.0003634014207213876, Grad: 0.9756789708943596【311回目】Error: 0.18675870125559127, Diff: 0.0003600294172545204, Grad: 0.9705631847968279【312回目】Error: 0.18640199165804275, Diff: 0.0003567095975485213, Grad: 0.9654988383116738【313回目】Error: 0.18604855075250512, Diff: 0.0003534409055376275, Grad: 0.9604851916910603【314回目】Error: 0.1856983284418912, Diff: 0.0003502223106139346, Grad: 0.9555215184619267【315回目】Error: 0.18535127563495227, Diff: 0.00034705280693891893, Grad: 0.9506071051626046【316回目】Error: 0.1850073442221785, Diff: 0.00034393141277377914, Grad: 0.9457412510846794【317回目】Error: 0.18466648705234934, Diff: 0.0003408571698291507, Grad: 0.9409232680199828【318回目】Error: 0.18432865790971456, Diff: 0.0003378291426347768, Grad: 0.9361524800126122【319回目】Error: 0.18399381149178845, Diff: 0.00033484641792611014, Grad: 0.9314282231158688【320回目】Error: 0.1836

【415回目】Error: 0.16114940304767011, Diff: 0.00017374856813331951, Grad: 0.6253073763478678【416回目】Error: 0.1609765717611942, Diff: 0.00017283128647591073, Grad: 0.623162082626324【417回目】Error: 0.16080464881063708, Diff: 0.00017192295055712403, Grad: 0.6210314004116917【418回目】Error: 0.16063362537247938, Diff: 0.00017102343815769694, Grad: 0.6189151859246084【419回目】Error: 0.16046349274335373, Diff: 0.00017013262912565774, Grad: 0.6168132972356601【420回目】Error: 0.16029424233801917, Diff: 0.00016925040533455338, Grad: 0.6147255942355753【421回目】Error: 0.16012586568737572, Diff: 0.00016837665064345364, Grad: 0.6126519386060013【422回目】Error: 0.15995835443651807, Diff: 0.00016751125085764929, Grad: 0.610592193790848【423回目】Error: 0.1597917003428281, Diff: 0.00016665409368996076, Grad: 0.6085462249681832【424回目】Error: 0.1596258952741048, Diff: 0.00016580506872329592, Grad: 0.6065138990226724【425回目】Error: 0.15946093120673105, Diff: 0.0001649640673737629, Grad: 0.6044950845185462【426回目】Error: 0

【522回目】Error: 0.14651564328962172, Diff: 0.00010959762034781773, Grad: 0.4572530178233489【523回目】Error: 0.14640643190688582, Diff: 0.00010921138273589648, Grad: 0.4561135763956623【524回目】Error: 0.14629760420652493, Diff: 0.00010882770036088618, Grad: 0.4549799648702066【525回目】Error: 0.14618915765855744, Diff: 0.00010844654796748898, Grad: 0.4538521396758174【526回目】Error: 0.1460810897579368, Diff: 0.00010806790062065086, Grad: 0.4527300576627079【527回目】Error: 0.14597339802423576, Diff: 0.00010769173370103746, Grad: 0.4516136760974853【528回目】Error: 0.14586608000133583, Diff: 0.00010731802289992709, Grad: 0.4505029526582351【529回目】Error: 0.14575913325712112, Diff: 0.00010694674421471428, Grad: 0.449397845429675【530回目】Error: 0.14565255538317656, Diff: 0.0001065778739445522, Grad: 0.4482983128983783【531回目】Error: 0.14554634399449115, Diff: 0.00010621138868541213, Grad: 0.4472043139480622【532回目】Error: 0.14544049672916523, Diff: 0.00010584726532592015, Grad: 0.4461158078549423【533回目】Error

【11345回目】Error: 0.06650796901344012, Diff: 9.99970273890538e-07, Grad: 0.01997338633024684387

<__main__.LogisticRegression at 0x1d90d670a58>

## 予測・評価

In [7]:
predict = pd.DataFrame(lr.predict(X_train_std), columns=['TARGET'])
predict.head(10)

Unnamed: 0,TARGET
0,1
1,1
2,0
3,0
4,1
5,0
6,1
7,0
8,1
9,1


In [8]:
train_score = score(predict, y_train)
test_score = score(pd.DataFrame(lr.predict(X_test_std), columns=['target']), y_test)
cprint('train_score: {}\ntest_score: {}'.format(train_score, test_score), 'red', attrs=['bold'])

[1m[31mtrain_score: 0.9812206572769953
test_score: 0.9370629370629371[0m
