In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn

import xgboost as xgb
from xgboost import XGBClassifier, XGBRegressor
from xgboost import to_graphviz, plot_importance

from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import _hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingRegressor, HistGradientBoostingRegressor
%matplotlib inline

from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.metrics import confusion_matrix, classification_report, mean_absolute_error, mean_squared_error,r2_score
from sklearn.metrics import plot_confusion_matrix, plot_precision_recall_curve, plot_roc_curve

import keras
from keras.models import save_model, Sequential
from keras.layers import Activation, BatchNormalization, Dense
from keras.optimizers import Adam

pd.options.display.max_columns= None
#pd.options.display.max_rows = None

Using TensorFlow backend.


### Data Exploration

In [2]:
header = ['sample', 'thickness', 'size', 'shape', 'adhesion', 'epithelial', 'nuclei', 'chromatin', 'nucleoli', 'mitoses', 'status']

In [3]:
df = pd.read_csv("breast-cancer-wisconsin.data",header=None,names=header)

In [4]:
df

Unnamed: 0,sample,thickness,size,shape,adhesion,epithelial,nuclei,chromatin,nucleoli,mitoses,status
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2
...,...,...,...,...,...,...,...,...,...,...,...
694,776715,3,1,1,1,3,2,1,1,1,2
695,841769,2,1,1,1,2,1,1,1,1,2
696,888820,5,10,10,3,7,3,8,10,2,4
697,897471,4,8,6,4,3,4,10,6,1,4


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   sample      699 non-null    int64 
 1   thickness   699 non-null    int64 
 2   size        699 non-null    int64 
 3   shape       699 non-null    int64 
 4   adhesion    699 non-null    int64 
 5   epithelial  699 non-null    int64 
 6   nuclei      699 non-null    object
 7   chromatin   699 non-null    int64 
 8   nucleoli    699 non-null    int64 
 9   mitoses     699 non-null    int64 
 10  status      699 non-null    int64 
dtypes: int64(10), object(1)
memory usage: 60.2+ KB


In [6]:
df.describe()

Unnamed: 0,sample,thickness,size,shape,adhesion,epithelial,chromatin,nucleoli,mitoses,status
count,699.0,699.0,699.0,699.0,699.0,699.0,699.0,699.0,699.0,699.0
mean,1071704.0,4.41774,3.134478,3.207439,2.806867,3.216023,3.437768,2.866953,1.589413,2.689557
std,617095.7,2.815741,3.051459,2.971913,2.855379,2.2143,2.438364,3.053634,1.715078,0.951273
min,61634.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0
25%,870688.5,2.0,1.0,1.0,1.0,2.0,2.0,1.0,1.0,2.0
50%,1171710.0,4.0,1.0,1.0,1.0,2.0,3.0,1.0,1.0,2.0
75%,1238298.0,6.0,5.0,5.0,4.0,4.0,5.0,4.0,1.0,4.0
max,13454350.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,4.0


In [7]:
df.isnull().sum()

sample        0
thickness     0
size          0
shape         0
adhesion      0
epithelial    0
nuclei        0
chromatin     0
nucleoli      0
mitoses       0
status        0
dtype: int64

In [8]:
df.duplicated().sum()

8

In [9]:
df.shape

(699, 11)

In [10]:
df.drop_duplicates(inplace=True)

In [11]:
df['status'].value_counts()

2    453
4    238
Name: status, dtype: int64

In [12]:
df.drop(['sample'],axis=1,inplace=True)

In [13]:
df

Unnamed: 0,thickness,size,shape,adhesion,epithelial,nuclei,chromatin,nucleoli,mitoses,status
0,5,1,1,1,2,1,3,1,1,2
1,5,4,4,5,7,10,3,2,1,2
2,3,1,1,1,2,2,3,1,1,2
3,6,8,8,1,3,4,3,7,1,2
4,4,1,1,3,2,1,3,1,1,2
...,...,...,...,...,...,...,...,...,...,...
694,3,1,1,1,3,2,1,1,1,2
695,2,1,1,1,2,1,1,1,1,2
696,5,10,10,3,7,3,8,10,2,4
697,4,8,6,4,3,4,10,6,1,4


In [14]:
df['status'] = df['status'].map({2:0,4:1})

In [15]:
df

Unnamed: 0,thickness,size,shape,adhesion,epithelial,nuclei,chromatin,nucleoli,mitoses,status
0,5,1,1,1,2,1,3,1,1,0
1,5,4,4,5,7,10,3,2,1,0
2,3,1,1,1,2,2,3,1,1,0
3,6,8,8,1,3,4,3,7,1,0
4,4,1,1,3,2,1,3,1,1,0
...,...,...,...,...,...,...,...,...,...,...
694,3,1,1,1,3,2,1,1,1,0
695,2,1,1,1,2,1,1,1,1,0
696,5,10,10,3,7,3,8,10,2,1
697,4,8,6,4,3,4,10,6,1,1


In [16]:
df['nuclei'].replace(to_replace='?',value=0,inplace=True)

In [17]:
df['nuclei'].value_counts()

1     397
10    130
5      30
2      30
3      28
8      20
4      19
0      16
9       9
7       8
6       4
Name: nuclei, dtype: int64

In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 691 entries, 0 to 698
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   thickness   691 non-null    int64 
 1   size        691 non-null    int64 
 2   shape       691 non-null    int64 
 3   adhesion    691 non-null    int64 
 4   epithelial  691 non-null    int64 
 5   nuclei      691 non-null    object
 6   chromatin   691 non-null    int64 
 7   nucleoli    691 non-null    int64 
 8   mitoses     691 non-null    int64 
 9   status      691 non-null    int64 
dtypes: int64(9), object(1)
memory usage: 59.4+ KB


In [19]:
df['nuclei'] =pd.to_numeric(df['nuclei'])

In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 691 entries, 0 to 698
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   thickness   691 non-null    int64
 1   size        691 non-null    int64
 2   shape       691 non-null    int64
 3   adhesion    691 non-null    int64
 4   epithelial  691 non-null    int64
 5   nuclei      691 non-null    int64
 6   chromatin   691 non-null    int64
 7   nucleoli    691 non-null    int64
 8   mitoses     691 non-null    int64
 9   status      691 non-null    int64
dtypes: int64(10)
memory usage: 59.4 KB


### Model Training

In [21]:
X = df.iloc[:,0:9]
y = df.iloc[:,9]

In [22]:
X.values, y.values

(array([[ 5,  1,  1, ...,  3,  1,  1],
        [ 5,  4,  4, ...,  3,  2,  1],
        [ 3,  1,  1, ...,  3,  1,  1],
        ...,
        [ 5, 10, 10, ...,  8, 10,  2],
        [ 4,  8,  6, ..., 10,  6,  1],
        [ 4,  8,  8, ..., 10,  4,  1]], dtype=int64),
 array([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1,
        0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1,
        1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1,
        0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1,
        0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0,
        1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0,
        0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1,
        0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [24]:
X_train.shape, X_test.shape

((552, 9), (139, 9))

In [25]:
xgb = XGBClassifier(random_state=0, n_estimators=200, objective='binary:logistic')

In [26]:
xgb.fit(X_train,y_train,eval_set=[(X_test,y_test)],eval_metric='error',early_stopping_rounds=20)

[0]	validation_0-error:0.071942
Will train until validation_0-error hasn't improved in 20 rounds.
[1]	validation_0-error:0.057554
[2]	validation_0-error:0.100719
[3]	validation_0-error:0.071942
[4]	validation_0-error:0.071942
[5]	validation_0-error:0.064748
[6]	validation_0-error:0.064748
[7]	validation_0-error:0.064748
[8]	validation_0-error:0.064748
[9]	validation_0-error:0.064748
[10]	validation_0-error:0.064748
[11]	validation_0-error:0.064748
[12]	validation_0-error:0.064748
[13]	validation_0-error:0.071942
[14]	validation_0-error:0.064748
[15]	validation_0-error:0.064748
[16]	validation_0-error:0.071942
[17]	validation_0-error:0.071942
[18]	validation_0-error:0.071942
[19]	validation_0-error:0.071942
[20]	validation_0-error:0.071942
[21]	validation_0-error:0.071942
Stopping. Best iteration:
[1]	validation_0-error:0.057554



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=200, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [27]:
y_pred = xgb.predict(X_test)

In [28]:
y_pred

array([1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1,
       0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1,
       1, 0, 0, 0, 1, 0, 0], dtype=int64)

### Model Evaluation

In [29]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.98      0.93      0.96        92
           1       0.88      0.96      0.92        47

    accuracy                           0.94       139
   macro avg       0.93      0.95      0.94       139
weighted avg       0.95      0.94      0.94       139

