In [1]:
from sklearn.datasets import load_breast_cancer
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
import seaborn as sns

In [2]:
# Load the bundled breast-cancer dataset object; its `.data`, `.target` and
# `.feature_names` attributes are read by the cells that follow.
# (The feature names are extracted in the next cell, so the identical
# assignment that used to live here was redundant and has been dropped.)
breast_cancer_data = load_breast_cancer()

In [3]:
# Feature names shipped with the dataset; used as the column labels when the
# feature matrix is wrapped in a DataFrame.
df_features = breast_cancer_data.feature_names

In [4]:
# Feature matrix: labelled with the dataset's feature names, then converted
# back to a plain NumPy array for the estimators below.
df_data = pd.DataFrame(breast_cancer_data.data, columns=df_features).to_numpy()

# Target vector: built as a Series (not a one-column DataFrame) so that
# `.to_numpy()` yields a 1-D array. A one-column DataFrame produces an
# (n_samples, 1) column vector, which makes the estimators emit a
# "column-vector y was passed when a 1d array was expected" warning on fit.
df_target = pd.Series(breast_cancer_data.target, name='label').to_numpy()

In [5]:
# Show the feature matrix as the cell output (NumPy abbreviates the repr of a
# large array with ellipses).
df_data

array([[1.799e+01, 1.038e+01, 1.228e+02, ..., 2.654e-01, 4.601e-01,
        1.189e-01],
       [2.057e+01, 1.777e+01, 1.329e+02, ..., 1.860e-01, 2.750e-01,
        8.902e-02],
       [1.969e+01, 2.125e+01, 1.300e+02, ..., 2.430e-01, 3.613e-01,
        8.758e-02],
       ...,
       [1.660e+01, 2.808e+01, 1.083e+02, ..., 1.418e-01, 2.218e-01,
        7.820e-02],
       [2.060e+01, 2.933e+01, 1.401e+02, ..., 2.650e-01, 4.087e-01,
        1.240e-01],
       [7.760e+00, 2.454e+01, 4.792e+01, ..., 0.000e+00, 2.871e-01,
        7.039e-02]])

In [6]:
# Hold out 20% of the samples as a test set; the fixed seed keeps the split
# reproducible across runs.
(train_input, test_input,
 train_target, test_target) = train_test_split(
    df_data,
    df_target,
    test_size=0.2,
    random_state=42,
)

In [7]:
from sklearn.model_selection import cross_validate
from sklearn.ensemble import RandomForestClassifier

# Cross-validate a random forest on the training split and report the mean
# train score next to the mean validation score.
rf = RandomForestClassifier(random_state=42, n_jobs=-1)
scores = cross_validate(
    rf,
    train_input,
    train_target,
    return_train_score=True,
    n_jobs=-1,
)

print(scores['train_score'].mean(), scores['test_score'].mean())

1.0 0.9582417582417582


In [8]:
# Fit the forest on the whole training split, then show the per-feature
# importances it learned.
print(rf.fit(train_input, train_target).feature_importances_)

[0.04870337 0.01359088 0.05326975 0.04755501 0.00728533 0.01394433
 0.06800084 0.10620999 0.00377029 0.00388577 0.02013892 0.00472399
 0.01130301 0.02240696 0.00427091 0.00525322 0.00938583 0.00351326
 0.00401842 0.00532146 0.07798688 0.02174901 0.06711483 0.15389236
 0.01064421 0.02026604 0.0318016  0.14466327 0.01012018 0.00521012]


  rf.fit(train_input, train_target)


In [9]:
# Rebuild the forest with out-of-bag scoring switched on, fit it, and print
# the resulting out-of-bag score.
rf = RandomForestClassifier(
    oob_score=True,
    random_state=42,
    n_jobs=-1,
).fit(train_input, train_target)

print(rf.oob_score_)

  rf.fit(train_input, train_target)


0.9560439560439561


In [10]:
from sklearn.ensemble import ExtraTreesClassifier

# Same cross-validation protocol as for the random forest, this time with an
# extra-trees ensemble.
et = ExtraTreesClassifier(random_state=42, n_jobs=-1)
scores = cross_validate(
    estimator=et,
    X=train_input,
    y=train_target,
    return_train_score=True,
    n_jobs=-1,
)

print(*(np.mean(scores[key]) for key in ('train_score', 'test_score')))

1.0 0.9626373626373625


In [11]:
# Fit extra-trees on the full training split and display its feature
# importances.
print(et.fit(train_input, train_target).feature_importances_)

[0.07056276 0.02515277 0.03321301 0.04161836 0.01042362 0.02950286
 0.06475801 0.0735693  0.00678652 0.00757519 0.01761604 0.00618722
 0.0173427  0.03537469 0.00658643 0.00797924 0.00703864 0.01101909
 0.00652007 0.00719335 0.11657282 0.02730071 0.10130041 0.06646302
 0.02205361 0.02753929 0.04103085 0.08496076 0.01484804 0.01191061]


  et.fit(train_input, train_target)


In [12]:
from sklearn.ensemble import GradientBoostingClassifier

# Cross-validate gradient boosting with only the seed set explicitly.
gb = GradientBoostingClassifier(random_state=42)
scores = cross_validate(
    gb,
    train_input,
    train_target,
    n_jobs=-1,
    return_train_score=True,
)

print(scores['train_score'].mean(), scores['test_score'].mean())

1.0 0.9516483516483516


In [13]:
# Repeat the gradient-boosting cross-validation with explicit
# n_estimators=500 and learning_rate=0.2 instead of the library defaults.
gb = GradientBoostingClassifier(
    n_estimators=500,
    learning_rate=0.2,
    random_state=42,
)
scores = cross_validate(
    gb,
    train_input,
    train_target,
    return_train_score=True,
    n_jobs=-1,
)

print(*(np.mean(scores[key]) for key in ('train_score', 'test_score')))

1.0 0.9582417582417582


In [14]:
# Fit the configured gradient-boosting model on the training split and print
# its feature importances.
print(gb.fit(train_input, train_target).feature_importances_)

  y = column_or_1d(y, warn=True)


[1.71829240e-04 3.27848146e-02 9.93955307e-04 9.43738259e-04
 7.57469663e-03 2.12621664e-03 2.29912719e-03 4.36451103e-01
 3.81209153e-04 8.06194703e-05 8.87889973e-03 5.17917256e-03
 3.47782084e-04 5.60342182e-03 2.75539441e-04 7.42937562e-04
 5.41607976e-03 8.74113653e-05 4.98061015e-04 8.23740768e-03
 1.03839559e-01 2.05866017e-02 5.13172592e-02 4.94657519e-02
 5.10286733e-03 1.75884676e-04 2.34865823e-02 2.26572206e-01
 3.56138301e-04 2.31272732e-05]


In [15]:
from sklearn.ensemble import HistGradientBoostingClassifier

# Cross-validate the histogram-based gradient-boosting classifier and print
# mean train / validation scores.
hgb = HistGradientBoostingClassifier(random_state=42)
scores = cross_validate(
    estimator=hgb,
    X=train_input,
    y=train_target,
    return_train_score=True,
    n_jobs=-1,
)

print(scores['train_score'].mean(), scores['test_score'].mean())

1.0 0.9604395604395606


In [16]:
from sklearn.inspection import permutation_importance

# Fit the model on the training split, then measure permutation importance on
# that same split (10 shuffles per feature, seeded for reproducibility).
hgb.fit(train_input, train_target)
result = permutation_importance(
    hgb,
    train_input,
    train_target,
    n_repeats=10,
    random_state=42,
    n_jobs=-1,
)
print(result.importances_mean)

  y = column_or_1d(y, warn=True)


[0.         0.0043956  0.         0.         0.         0.
 0.         0.00791209 0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.00527473 0.0021978  0.
 0.         0.         0.         0.01076923 0.         0.        ]


In [17]:
# Permutation importance of the already-fitted model, evaluated on the
# held-out test split this time.
result = permutation_importance(
    hgb,
    test_input,
    test_target,
    n_repeats=10,
    random_state=42,
    n_jobs=-1,
)
print(result.importances_mean)

[0.         0.01140351 0.         0.00964912 0.00087719 0.00263158
 0.00350877 0.0122807  0.         0.         0.00438596 0.00087719
 0.         0.00877193 0.00087719 0.00175439 0.00175439 0.
 0.         0.         0.00350877 0.00877193 0.00877193 0.00964912
 0.00087719 0.         0.00175439 0.00789474 0.00263158 0.        ]


In [18]:
# Score of the fitted model on the held-out test split (shown as cell output).
hgb.score(X=test_input, y=test_target)

0.9736842105263158

In [19]:
from xgboost import XGBClassifier

# Cross-validate XGBoost with the histogram tree method on the training split.
xgb = XGBClassifier(
    tree_method='hist',
    random_state=42,
)
scores = cross_validate(
    xgb,
    train_input,
    train_target,
    return_train_score=True,
    n_jobs=-1,
)

print(*(np.mean(scores[key]) for key in ('train_score', 'test_score')))

1.0 0.9626373626373628


In [20]:
from lightgbm import LGBMClassifier

# Cross-validate LightGBM on the training split and print mean train /
# validation scores.
lgb = LGBMClassifier(random_state=42)
scores = cross_validate(
    lgb,
    train_input,
    train_target,
    n_jobs=-1,
    return_train_score=True,
)

print(scores['train_score'].mean(), scores['test_score'].mean())

1.0 0.964835164835165
