In [134]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
import numpy as np

In [135]:
cancer = load_breast_cancer()
cancer_data = cancer.data
cancer_label = cancer.target

# Data Analysis

In [136]:
print(dir(cancer))

['DESCR', 'data', 'feature_names', 'filename', 'target', 'target_names']


In [137]:
print(cancer.DESCR)

.. _breast_cancer_dataset:

Breast cancer wisconsin (diagnostic) dataset
--------------------------------------------

**Data Set Characteristics:**

    :Number of Instances: 569

    :Number of Attributes: 30 numeric, predictive attributes and the class

    :Attribute Information:
        - radius (mean of distances from center to points on the perimeter)
        - texture (standard deviation of gray-scale values)
        - perimeter
        - area
        - smoothness (local variation in radius lengths)
        - compactness (perimeter^2 / area - 1.0)
        - concavity (severity of concave portions of the contour)
        - concave points (number of concave portions of the contour)
        - symmetry 
        - fractal dimension ("coastline approximation" - 1)

        The mean, standard error, and "worst" or largest (mean of the three
        largest values) of these features were computed for each image,
        resulting in 30 features.  For instance, field 3 is Mean Radius, f

In [138]:
cancer_data.shape

(569, 30)

In [139]:
cancer.target.shape

(569,)

Data Split

In [140]:
X_train, X_test, y_train, y_test = train_test_split(cancer_data, 
                                                    cancer_label, 
                                                    test_size=0.2, 
                                                    random_state=7)

# 모델 생성 (ensemble: randomforest, svm, logist regression)

In [141]:
decision_tree = DecisionTreeClassifier(random_state=32)
decision_tree.fit(X_train, y_train)
y_pred = decision_tree.predict(X_test)

In [142]:
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from lightgbm import LGBMClassifier

from sklearn.metrics import accuracy_score

In [143]:
#개별 모델 생성
svm_ = svm.SVC()
randomforest = RandomForestClassifier(n_estimators = 100, random_state = 0)
logisticreg = LogisticRegression()

In [144]:
#각 모델 학습
svm_.fit(X_train, y_train)
randomforest.fit(X_train, y_train)
logisticreg.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [145]:
svm_pred = svm_.predict(X_train)
rf_pred = randomforest.predict(X_train)
lr_pred = logisticreg.predict(X_train)

In [146]:
new_data = np.array([svm_pred, rf_pred, lr_pred])
new_data.shape

(3, 455)

In [147]:
new_data = np.transpose(new_data)
new_data.shape

(455, 3)

In [148]:
#최종 모델
lgbm = LGBMClassifier()

In [149]:
lgbm.fit(new_data, y_train)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
               random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

# 모델테스트

In [150]:
sp = svm_.predict(X_test)
rp = randomforest.predict(X_test)
lp = logisticreg.predict(X_test)

new_test = np.array([sp, rp, lp])
new_test = np.transpose(new_test)
lgbm_pred = lgbm.predict(new_test)

정확도 측정 (classification_report)

In [151]:
print(classification_report(y_test, lgbm_pred))

              precision    recall  f1-score   support

           0       1.00      0.93      0.96        40
           1       0.96      1.00      0.98        74

    accuracy                           0.97       114
   macro avg       0.98      0.96      0.97       114
weighted avg       0.97      0.97      0.97       114



# Cf. 단일모델 정확도 측정

# SVM predict
svm prediction result

In [152]:
print(classification_report(y_test, sp))

              precision    recall  f1-score   support

           0       1.00      0.72      0.84        40
           1       0.87      1.00      0.93        74

    accuracy                           0.90       114
   macro avg       0.94      0.86      0.89       114
weighted avg       0.92      0.90      0.90       114



# Random Forest predict
randomforest prediction result

In [153]:
print(classification_report(y_test, sp))

              precision    recall  f1-score   support

           0       1.00      0.72      0.84        40
           1       0.87      1.00      0.93        74

    accuracy                           0.90       114
   macro avg       0.94      0.86      0.89       114
weighted avg       0.92      0.90      0.90       114



# Logistic Regression predict
logistic prediction result

In [154]:
print(classification_report(y_test, lp))

              precision    recall  f1-score   support

           0       1.00      0.82      0.90        40
           1       0.91      1.00      0.95        74

    accuracy                           0.94       114
   macro avg       0.96      0.91      0.93       114
weighted avg       0.94      0.94      0.94       114

