## データの取得・確認

In [1]:
import sys
sys.path.append(r'C:/Users/koki5/Dropbox/Jupyter/ScratchML')

import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn import model_selection
from termcolor import cprint

from common.module.evaluation import score


cancer = datasets.load_breast_cancer()
cancer_data = pd.DataFrame(data=cancer.data, columns=cancer.feature_names)
cancer_target = pd.Series(data=cancer.target, name='target')

X_train, X_test, y_train, y_test = model_selection.train_test_split(cancer_data, cancer_target)

cprint('Cancer Data (samples: {}, features: {})'.format(*cancer_data.shape), 'blue', attrs=['bold'])
cprint('Cancer Target (samples: {})'.format(*cancer_target.shape), 'blue', attrs=['bold'])

[1m[34mCancer Data (samples: 569, features: 30)[0m
[1m[34mCancer Target (samples: 569)[0m


In [2]:
cancer_data.head(3)

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758


In [3]:
cancer_target.unique()

array([0, 1], dtype=int64)

## データ前処理

In [4]:
def standardize(X):
    """標準化する.
    
    X: データセット
    """
    return (X - np.mean(X)) / np.std(X)


def add_ones(X):
    """特徴量の0列目に1を加える.
    
    X: データセット
    """
    X['0'] = 1
    return X.sort_index(axis=1, ascending=True)


X_train_std = standardize(X_train)
X_test_std = standardize(X_test)

# バイアス項を追加
X_train_std = add_ones(X_train_std)
X_test_std = add_ones(X_test_std)

X_train_std.head(3)

Unnamed: 0,0,area error,compactness error,concave points error,concavity error,fractal dimension error,mean area,mean compactness,mean concave points,mean concavity,...,worst area,worst compactness,worst concave points,worst concavity,worst fractal dimension,worst perimeter,worst radius,worst smoothness,worst symmetry,worst texture
155,1,-0.489176,-0.578268,-0.646886,-0.441796,-0.549499,-0.5269,-0.739532,-0.659755,-0.635642,...,-0.537623,-0.50619,-0.499802,-0.382754,-0.190608,-0.598685,-0.534452,-0.504332,0.311731,-0.084383
406,1,-0.376679,-0.752531,-0.511638,-0.502048,-0.896513,0.436505,-0.395377,-0.095895,-0.432883,...,0.155384,-0.54746,-0.033142,-0.21036,-0.798411,0.286973,0.337877,-0.551201,-0.209953,-1.008287
236,1,2.456249,0.139186,0.870911,0.483431,-0.320703,2.903865,1.175989,1.916754,1.32478,...,3.760577,0.955786,2.185725,1.438418,0.105153,3.010448,3.153889,0.620505,0.296158,1.437438


## モデルの構築・訓練

In [5]:
class LogisticRegression(object):
    """勾配降下法に基づくロジスティック回帰."""
    
    
    def __init__(self, alpha=0.01, eps=1e-6):
        """
        ---パラメータ-------
        alpha: 学習率
        eps: 収束の目安
        --------------------
        
        ---属性-------------
        _w: パラメータ（重み）
        --------------------
        """
        self.alpha = alpha
        self.eps = eps
        
        self._w = 0
        
        
    def fit(self, X, y):
        """訓練データで学習する.
        
        ---パラメータ-------
        X: 訓練データセット
        y: ターゲット
        --------------------
        
        """
        # パラメータの初期値はランダムに設定する
        theta = np.random.rand(X.shape[1])
        error = self.J(X, y, theta)
        count = 0
        diff = 1
        
        # コスト関数の値が殆ど変動しなくなったら収束とする
        while diff > self.eps:
            # パラメータの更新処理
            grad = (1 / len(y)) * np.dot(self.h(X, theta) - y, X)
            theta = theta - self.alpha * grad
            
            # 収束状況を確認するための処理
            current_error = self.J(X, y, theta)
            diff = error - current_error
            error = current_error
            count += 1
            print('\r【{}回目】Error: {}, Diff: {}, Grad: {}'.format(count, error, diff, grad.sum()), end='')
            
        self._w = theta
        return self
    
    
    def predict(self, X):
        """クラスラベルを予測する.
        
        X: テストデータセット
        """
        return np.where(self.h(X, self._w) >= 0.5, 1, 0)
        
    
    def activate(self, z):
        """ロジスティック関数（活性化関数）.
        
        z: 仮説関数の予測値
        """
        return 1 / (1 + np.exp(-z))
    

    def h(self, X, theta):
        """仮説関数.
        
        X: 訓練データセット
        theta: パラメータ（重み）
        """
        return self.activate(np.dot(X, theta))
    

    def J(self, X, y, theta):
        """目的関数.
        
        X: 訓練データセット
        y: ターゲット
        theta: パラメータ（重み）
        """
        delta = 1e-7 # np.log()に0が混在しないようにする
        return - (1 / len(y)) * (np.sum(y * np.log(self.h(X, theta) + delta) + (1 - y) * np.log(1 - self.h(X, theta) + delta)))

In [6]:
lr = LogisticRegression()
lr.fit(X_train_std, y_train)

【1回目】Error: 7.30546607016808, Diff: 0.058922206542226796, Grad: 13.917677223295918【2回目】Error: 7.2463135553998, Diff: 0.05915251476828054, Grad: 13.915322870571806【3回目】Error: 7.186934408785723, Diff: 0.059379146614076284, Grad: 13.91287304670836【4回目】Error: 7.127331692976238, Diff: 0.059602715809485396, Grad: 13.910324598067184【5回目】Error: 7.067507989703515, Diff: 0.05982370327272335, Grad: 13.907674308980999【6回目】Error: 7.007465533201905, Diff: 0.06004245650161, Grad: 13.904918891631594【7回目】Error: 6.9472063324281255, Diff: 0.06025920077377922, Grad: 13.902054971085263【8回目】Error: 6.886732273950643, Diff: 0.06047405847748255, Grad: 13.899079064996318【9回目】Error: 6.826045201019048, Diff: 0.0606870729315947, Grad: 13.89598755773589【10回目】Error: 6.765146968926263, Diff: 0.06089823209278489, Grad: 13.892776669001389【11回目】Error: 6.704039481556643, Diff: 0.06110748736962002, Grad: 13.889442417285258【12回目】Error: 6.642724716753761, Diff: 0.06131476480288267, Grad: 13.885980578895495【13回目

【113回目】Error: 0.8761640980122937, Diff: 0.024671974384168238, Grad: 8.28042062919418【114回目】Error: 0.8524821618630393, Diff: 0.0236819361492544, Grad: 8.11801510913676【115回目】Error: 0.8297637963409719, Diff: 0.022718365522067407, Grad: 7.956225596608762【116回目】Error: 0.8079810830147975, Diff: 0.02178271332617432, Grad: 7.795392842046518【117回目】Error: 0.7871049274501589, Diff: 0.02087615556463862, Grad: 7.635838289758315【118回目】Error: 0.7671053219915438, Diff: 0.01999960545861512, Grad: 7.477862439712721【119回目】Error: 0.7479515909898993, Diff: 0.019153731001644525, Grad: 7.32174416345605【120回目】Error: 0.7296126153120422, Diff: 0.018338975677857028, Grad: 7.167740738304957【121回目】Error: 0.7120570348900492, Diff: 0.01755558042199301, Grad: 7.016088381071332【122回目】Error: 0.6952534297331778, Diff: 0.016803605156871404, Grad: 6.867003051797519【123回目】Error: 0.679170481342239, Diff: 0.016082948390938823, Grad: 6.7206812562001295【124回目】Error: 0.6637771177124193, Diff: 0.01539336362981969, G

【228回目】Error: 0.238908945486607, Diff: 0.0009603481471095188, Grad: 1.6185156992030096【229回目】Error: 0.2379634805021326, Diff: 0.0009454649844743979, Grad: 1.6052812516031023【230回目】Error: 0.23703254821143538, Diff: 0.0009309322906972117, Grad: 1.5922492053107182【231回目】Error: 0.2361158085934726, Diff: 0.0009167396179627918, Grad: 1.579415230280964【232回目】Error: 0.23521293170774762, Diff: 0.0009028768857249625, Grad: 1.5667751102617749【233回目】Error: 0.23432359734162592, Diff: 0.0008893343661217079, Grad: 1.5543247394327968【234回目】Error: 0.23344749467159764, Diff: 0.0008761026700282726, Grad: 1.5420601191513328【235回目】Error: 0.232584321937878, Diff: 0.000863172733719636, Grad: 1.5299773548015725【236回目】Error: 0.2317337861317658, Diff: 0.0008505358061122181, Grad: 1.518072652743472【237回目】Error: 0.23089560269520928, Diff: 0.0008381834365565055, Grad: 1.5063423173577868【238回目】Error: 0.23006949523205333, Diff: 0.0008261074631559506, Grad: 1.4947827481839153【239回目】Error: 0.22925519523046

【337回目】Error: 0.1819324032504054, Diff: 0.0002974411203734706, Grad: 0.8414154087463064【338回目】Error: 0.18163713856627012, Diff: 0.0002952646841352935, Grad: 0.8376821002248047【339回目】Error: 0.1813440221309779, Diff: 0.00029311643529222886, Grad: 0.8339817709163018【340回目】Error: 0.18105302625164527, Diff: 0.00029099587933262283, Grad: 0.8303140017055957【341回目】Error: 0.1807641237192696, Diff: 0.00028890253237567887, Grad: 0.8266783804082891【342回目】Error: 0.1804772877983662, Diff: 0.0002868359209033944, Grad: 0.8230745016296375【343回目】Error: 0.18019249221686637, Diff: 0.00028479558149982487, Grad: 0.8195019666268196【344回目】Error: 0.17990971115626792, Diff: 0.00028278106059845265, Grad: 0.8159603831745339【345回目】Error: 0.17962891924203161, Diff: 0.00028079191423630023, Grad: 0.8124493654338251【346回目】Error: 0.1793500915342165, Diff: 0.00027882770781512134, Grad: 0.8089685338240536【347回目】Error: 0.17907320351834666, Diff: 0.00027688801586983613, Grad: 0.8055175148979214【348回目】Error: 0.1

【452回目】Error: 0.15748084414825872, Diff: 0.00015656265616242893, Grad: 0.5577179630269812【453回目】Error: 0.15732494877759068, Diff: 0.00015589537066804104, Grad: 0.5561102726134487【454回目】Error: 0.15716971532841512, Diff: 0.00015523344917556248, Grad: 0.5545123519791434【455回目】Error: 0.15701513849939794, Diff: 0.00015457682901717784, Grad: 0.5529241152797517【456回目】Error: 0.15686121305094589, Diff: 0.00015392544845205247, Grad: 0.5513454776516497【457回目】Error: 0.15670793380429524, Diff: 0.00015327924665065051, Grad: 0.5497763551980785【458回目】Error: 0.15655529564061654, Diff: 0.0001526381636786922, Grad: 0.5482166649755511【459回目】Error: 0.15640329350013513, Diff: 0.00015200214048141647, Grad: 0.5466663249804844【460回目】Error: 0.15625192238126723, Diff: 0.000151371118867899, Grad: 0.5451252541360567【461回目】Error: 0.15610117733977044, Diff: 0.00015074504149678591, Grad: 0.5435933722792795【462回目】Error: 0.15595105348790944, Diff: 0.0001501238518610004, Grad: 0.5420706001482878【463回目】Error:

【560回目】Error: 0.14363933061475098, Diff: 0.00010632035523869265, Grad: 0.4270712462480079【561回目】Error: 0.14353333207220734, Diff: 0.00010599854254364494, Grad: 0.42616812528115255【562回目】Error: 0.14342765352933998, Diff: 0.00010567854286736011, Grad: 0.42526922612655826【563回目】Error: 0.1433222931885775, Diff: 0.00010536034076247258, Grad: 0.42437451988940217【564回目】Error: 0.1432172492676227, Diff: 0.00010504392095481152, Grad: 0.42348397793029746【565回目】Error: 0.143112519999282, Diff: 0.00010472926834068086, Grad: 0.4225975718625098【566回目】Error: 0.143008103631297, Diff: 0.00010441636798499965, Grad: 0.4217152735492132【567回目】Error: 0.1429039984261786, Diff: 0.00010410520511841548, Grad: 0.4208370551007801【568回目】Error: 0.14280020266104282, Diff: 0.00010379576513577793, Grad: 0.419962888872107【569回目】Error: 0.14269671462744943, Diff: 0.00010348803359339076, Grad: 0.41909274745997327【570回目】Error: 0.14259353263124233, Diff: 0.00010318199620709678, Grad: 0.4182266037004365【571回目】Error

【666回目】Error: 0.13389929358151992, Diff: 7.99894048980343e-05, Grad: 0.350298162765794【667回目】Error: 0.13381949518217426, Diff: 7.979839934565969e-05, Grad: 0.3497201316327509【668回目】Error: 0.1337398869689638, Diff: 7.960821321045808e-05, Grad: 0.3491442811281727【669回目】Error: 0.1336604681278211, Diff: 7.941884114270881e-05, Grad: 0.34857059893136316【670回目】Error: 0.13358123784998144, Diff: 7.923027783965364e-05, Grad: 0.34799907281127457【671回目】Error: 0.13350219533193594, Diff: 7.904251804549678e-05, Grad: 0.3474296906257108【672回目】Error: 0.1334233397753856, Diff: 7.885555655035015e-05, Grad: 0.3468624403205379【673回目】Error: 0.13334467038719544, Diff: 7.866938819015012e-05, Grad: 0.3462973099289048【674回目】Error: 0.13326618637934937, Diff: 7.848400784607468e-05, Grad: 0.3457342875704705【675回目】Error: 0.13318788696890557, Diff: 7.829941044379396e-05, Grad: 0.3451733614506398【676回目】Error: 0.13310977137795213, Diff: 7.811559095344256e-05, Grad: 0.3446145198598089【677回目】Error: 0.1330318

【781回目】Error: 0.1257988380287042, Diff: 6.233646490599876e-05, Grad: 0.29559390131661323【782回目】Error: 0.12573662451678058, Diff: 6.221351192362001e-05, Grad: 0.2952035591393484【783回目】Error: 0.12567453353723032, Diff: 6.20909795502611e-05, Grad: 0.2948144172046994【784回目】Error: 0.12561256467160034, Diff: 6.196886562997994e-05, Grad: 0.2944264698140482【785回目】Error: 0.1255507175035788, Diff: 6.184716802154488e-05, Grad: 0.29403971130382334【786回目】Error: 0.1254889916189796, Diff: 6.172588459918416e-05, Grad: 0.29365413604523827【787回目】Error: 0.125427386605728, Diff: 6.160501325161438e-05, Grad: 0.2932697384440327【788回目】Error: 0.12536590205384573, Diff: 6.148455188226265e-05, Grad: 0.29288651294021556【789回目】Error: 0.1253045375554363, Diff: 6.136449840943303e-05, Grad: 0.2925044540078102【790回目】Error: 0.12524329270467072, Diff: 6.124485076558495e-05, Grad: 0.29212355615460267【791回目】Error: 0.1251821670977729, Diff: 6.1125606897805e-05, Grad: 0.2917438139218907【792回目】Error: 0.125121160

【887回目】Error: 0.11981468477733793, Diff: 5.129173932226505e-05, Grad: 0.2599486403347649【888回目】Error: 0.11976348103610883, Diff: 5.120374122910554e-05, Grad: 0.25965947266364764【889回目】Error: 0.11971236503703188, Diff: 5.1115999076950525e-05, Grad: 0.2593710552350815【890回目】Error: 0.11966133652525904, Diff: 5.1028511772840957e-05, Grad: 0.2590833849607839【891回目】Error: 0.11961039524702881, Diff: 5.0941278230229314e-05, Grad: 0.25879645876915636【892回目】Error: 0.11955954094966008, Diff: 5.085429736872982e-05, Grad: 0.2585102736051761【893回目】Error: 0.11950877338154574, Diff: 5.076756811434047e-05, Grad: 0.25822482643028694【894回目】Error: 0.11945809229214653, Diff: 5.0681089399207124e-05, Grad: 0.2579401142222934【895回目】Error: 0.11940749743198481, Diff: 5.0594860161720656e-05, Grad: 0.257656133975253【896回目】Error: 0.11935698855263845, Diff: 5.050887934636428e-05, Grad: 0.25737288269937036【897回目】Error: 0.11930656540673477, Diff: 5.042314590367192e-05, Grad: 0.25709035742089337【898回目】Erro

【989回目】Error: 0.11500425355527007, Diff: 4.347021512926541e-05, Grad: 0.23385060276452055【990回目】Error: 0.11496084994069583, Diff: 4.340361457423614e-05, Grad: 0.2336245143291566【991回目】Error: 0.11491751275607767, Diff: 4.33371846181585e-05, Grad: 0.23339893137451773【992回目】Error: 0.11487424183144679, Diff: 4.327092463088378e-05, Grad: 0.2331738520698162【993回目】Error: 0.11483103699746132, Diff: 4.320483398546904e-05, Grad: 0.232949274593121【994回目】Error: 0.1147878980854031, Diff: 4.313891205821874e-05, Grad: 0.23272519713130563【995回目】Error: 0.11474482492717474, Diff: 4.307315822836555e-05, Grad: 0.23250161787999674【996回目】Error: 0.11470181735529643, Diff: 4.300757187830628e-05, Grad: 0.23227853504352297【997回目】Error: 0.11465887520290295, Diff: 4.2942152393476984e-05, Grad: 0.23205594683486366【998回目】Error: 0.11461599830374058, Diff: 4.287689916236681e-05, Grad: 0.23183385147559887【999回目】Error: 0.11457318649216403, Diff: 4.2811811576559666e-05, Grad: 0.23161224719585863【1000回目】Error

【12260回目】Error: 0.05402013721594668, Diff: 9.99989878069929e-07, Grad: 0.024211643412183147875

<__main__.LogisticRegression at 0x1c5200e1198>

## 予測・評価

In [7]:
predict = pd.DataFrame(lr.predict(X_train_std), columns=['TARGET'])
predict.head(10)

Unnamed: 0,TARGET
0,1
1,1
2,0
3,1
4,0
5,0
6,0
7,0
8,0
9,1


In [8]:
train_score = score(predict, y_train)
test_score = score(pd.DataFrame(lr.predict(X_test_std), columns=['target']), y_test)
cprint('train_score: {}\ntest_score: {}'.format(train_score, test_score), 'red', attrs=['bold'])

[1m[31mtrain_score: 0.9859154929577465
test_score: 0.9790209790209791[0m
