In [1]:
import sklearn as sk
import numpy as np

In [3]:
from sklearn.datasets import fetch_covtype
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler  #최대=1, 최소=0 되도록 스케일링

covtype = fetch_covtype(shuffle=True, random_state=0)
X_covtype = covtype.data
y_covtype = covtype.target - 1
classes = np.unique(y_covtype)
X_train, X_test, y_train, y_test = train_test_split(X_covtype, y_covtype)

scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

def read_Xy(start, end):
    # 실무에서는 파일이나 데이터베이스에서 읽어온다.
    idx = list(range(start, min(len(y_train) - 1, end)))
    X = X_train[idx, :]
    y = y_train[idx]
    return X, y

## SGD

In [4]:
%%time

from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

model = SGDClassifier(random_state=0)
n_split = 10
n_X = len(y_train) // n_split
n_epoch = 10
for epoch in range(n_epoch):
    for n in range(n_split):
        X, y = read_Xy(n * n_X, (n + 1) * n_X)
        model.partial_fit(X, y, classes=classes)
    accuracy_train = accuracy_score(y_train, model.predict(X_train))
    accuracy_test = accuracy_score(y_test, model.predict(X_test))
    print("epoch={:d} train acc={:5.3f} test acc={:5.3f}".format(epoch, accuracy_train, accuracy_test))

epoch=0 train acc=0.708 test acc=0.707
epoch=1 train acc=0.712 test acc=0.712
epoch=2 train acc=0.713 test acc=0.713
epoch=3 train acc=0.714 test acc=0.713
epoch=4 train acc=0.713 test acc=0.712
epoch=5 train acc=0.713 test acc=0.712
epoch=6 train acc=0.713 test acc=0.711
epoch=7 train acc=0.712 test acc=0.711
epoch=8 train acc=0.712 test acc=0.711
epoch=9 train acc=0.712 test acc=0.711
Wall time: 9.48 s


## 나이브베이즈 모형

In [5]:
%%time

from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

model = BernoulliNB(alpha=0.1)

n_split = 10
n_X = len(y_train) // n_split
for n in range(n_split):
    X, y = read_Xy(n * n_X, (n + 1) * n_X)
    model.partial_fit(X, y, classes=classes)
    accuracy_train = accuracy_score(y_train, model.predict(X_train))
    accuracy_test = accuracy_score(y_test, model.predict(X_test)) 
    print("n={:d} train accuracy={:5.3f} test accuracy={:5.3f}".format(n, accuracy_train, accuracy_test))

n=0 train accuracy=0.634 test accuracy=0.631
n=1 train accuracy=0.634 test accuracy=0.631
n=2 train accuracy=0.633 test accuracy=0.630
n=3 train accuracy=0.633 test accuracy=0.629
n=4 train accuracy=0.633 test accuracy=0.630
n=5 train accuracy=0.633 test accuracy=0.630
n=6 train accuracy=0.632 test accuracy=0.629
n=7 train accuracy=0.634 test accuracy=0.631
n=8 train accuracy=0.632 test accuracy=0.629
n=9 train accuracy=0.632 test accuracy=0.629
Wall time: 4.04 s


## 그레디언트 부스팅

In [6]:
%%time

from lightgbm import train, Dataset
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

params = {
    'objective': 'multiclass',
    "num_class": len(classes),
    'learning_rate': 0.2,
    'seed': 0,
}

n_split = 10
n_X = len(y_train) // n_split
num_tree = 10
model = None
for n in range(n_split):
    X, y = read_Xy(n * n_X, (n + 1) * n_X)
    model = train(params, init_model=model, train_set=Dataset(X, y),
                  keep_training_booster=False, num_boost_round=num_tree)
    accuracy_train = accuracy_score(y_train, np.argmax(model.predict(X_train), axis=1))
    accuracy_test = accuracy_score(y_test, np.argmax(model.predict(X_test), axis=1)) 
    print("n={:d} train accuracy={:5.3f} test accuracy={:5.3f}".format(n, accuracy_train, accuracy_test))


You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2205
[LightGBM] [Info] Number of data points in the train set: 43575, number of used features: 50
[LightGBM] [Info] Start training from score -1.006656
[LightGBM] [Info] Start training from score -0.716293
[LightGBM] [Info] Start training from score -2.807880
[LightGBM] [Info] Start training from score -5.369033
[LightGBM] [Info] Start training from score -4.104378
[LightGBM] [Info] Start training from score -3.529187
[LightGBM] [Info] Start training from score -3.347257
n=0 train accuracy=0.788 test accuracy=0.785
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2204
[LightGBM] [Info] Number of data points in the train set: 43575, number of used features: 50
n=1 train accuracy=0.810 test accuracy=0.805
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2206
[LightGBM] [Info] Number of data points in the train set: 43575, numbe

## Random Forest

In [None]:
%%time

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

n_split = 10
n_X = len(y_train) // n_split
num_tree_ini = 10
num_tree_step = 10
model = RandomForestClassifier(n_estimators=num_tree_ini, warm_start=True)
for n in range(n_split):
    X, y = read_Xy(n * n_X, (n + 1) * n_X)
    model.fit(X, y)
    accuracy_train = accuracy_score(y_train, model.predict(X_train))
    accuracy_test = accuracy_score(y_test, model.predict(X_test))
    print("epoch={:d} train accuracy={:5.3f} test accuracy={:5.3f}".format(n, accuracy_train, accuracy_test))
    
    model.n_estimators += num_tree_step

epoch=0 train accuracy=0.867 test accuracy=0.853
epoch=1 train accuracy=0.891 test accuracy=0.872
epoch=2 train accuracy=0.899 test accuracy=0.879
epoch=3 train accuracy=0.903 test accuracy=0.883
epoch=4 train accuracy=0.905 test accuracy=0.885
epoch=5 train accuracy=0.906 test accuracy=0.886
epoch=6 train accuracy=0.907 test accuracy=0.887
epoch=7 train accuracy=0.907 test accuracy=0.887
epoch=8 train accuracy=0.908 test accuracy=0.888
