# Breast Cancer Classification Model

Attribute Information:

1) ID number
2) Diagnosis (M = malignant, B = benign)
3-32)

Ten real-valued features are computed for each cell nucleus:

a) radius (mean of distances from center to points on the perimeter)
b) texture (standard deviation of gray-scale values)
c) perimeter
d) area
e) smoothness (local variation in radius lengths)
f) compactness (perimeter^2 / area - 1.0)
g) concavity (severity of concave portions of the contour)
h) concave points (number of concave portions of the contour)
i) symmetry
j) fractal dimension ("coastline approximation" - 1)

Source: http://archive.ics.uci.edu/ml/datasets/breast+cancer+wisconsin+%28diagnostic%29

# Import required packages

In [1]:
from datetime import datetime
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator
from joblib import dump, load
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

# Set the parameters for the run

In [2]:
model_name = 'bc_classification_grid_search_object.joblib'

# Get the data

In [7]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
breast_cancer_wisconsin_diagnostic = fetch_ucirepo(id=17) 
  
# data (as pandas dataframes) 
X = pd.DataFrame(breast_cancer_wisconsin_diagnostic.data.features)
y = breast_cancer_wisconsin_diagnostic.data.targets 
  
# metadata 
print(breast_cancer_wisconsin_diagnostic.metadata) 
  
# variable information 
print(breast_cancer_wisconsin_diagnostic.variables) 

{'uci_id': 17, 'name': 'Breast Cancer Wisconsin (Diagnostic)', 'repository_url': 'https://archive.ics.uci.edu/dataset/17/breast+cancer+wisconsin+diagnostic', 'data_url': 'https://archive.ics.uci.edu/static/public/17/data.csv', 'abstract': 'Diagnostic Wisconsin Breast Cancer Database.', 'area': 'Health and Medicine', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 569, 'num_features': 30, 'feature_types': ['Real'], 'demographics': [], 'target_col': ['Diagnosis'], 'index_col': ['ID'], 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 1993, 'last_updated': 'Fri Nov 03 2023', 'dataset_doi': '10.24432/C5DW2B', 'creators': ['William Wolberg', 'Olvi Mangasarian', 'Nick Street', 'W. Street'], 'intro_paper': {'title': 'Nuclear feature extraction for breast tumor diagnosis', 'authors': 'W. Street, W. Wolberg, O. Mangasarian', 'published_in': 'Electronic imaging', 'year': 1993, 'url': 'https://www.semanticscholar.org/paper/53

In [8]:
# df = pd.read_csv('data.csv', names=['id', 'diagnosis', 'radius_mean', 'texture_mean', 'perimeter_mean',
#                                 'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean',
#                                 'concave_points_mean', 'symmetry_mean', 'fractal_dimension_mean', 
#                                 'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
#                                 'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
#                                 'fractal_dimension_se', 'radius_worst', 'texture_worst', 'perimeter_worst',
#                                 'area_worst', 'smoothness_worst', 'compactness_worst', 'concavity_worst',
#                                 'concave points_worst', 'symmetry_worst', 'fractal_dimension_worst'])

In [10]:
X.dtypes

radius1               float64
texture1              float64
perimeter1            float64
area1                 float64
smoothness1           float64
compactness1          float64
concavity1            float64
concave_points1       float64
symmetry1             float64
fractal_dimension1    float64
radius2               float64
texture2              float64
perimeter2            float64
area2                 float64
smoothness2           float64
compactness2          float64
concavity2            float64
concave_points2       float64
symmetry2             float64
fractal_dimension2    float64
radius3               float64
texture3              float64
perimeter3            float64
area3                 float64
smoothness3           float64
compactness3          float64
concavity3            float64
concave_points3       float64
symmetry3             float64
fractal_dimension3    float64
dtype: object

In [11]:
len(X)

569

# Train Test Split

In [39]:
# y = df['diagnosis']
# X = df.drop('diagnosis', axis=1)

In [14]:
# convert to binary
y = (y !='B').astype(int)

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=123)

# Set up the pipeline

In [21]:
X.columns

Index(['radius1', 'texture1', 'perimeter1', 'area1', 'smoothness1',
       'compactness1', 'concavity1', 'concave_points1', 'symmetry1',
       'fractal_dimension1', 'radius2', 'texture2', 'perimeter2', 'area2',
       'smoothness2', 'compactness2', 'concavity2', 'concave_points2',
       'symmetry2', 'fractal_dimension2', 'radius3', 'texture3', 'perimeter3',
       'area3', 'smoothness3', 'compactness3', 'concavity3', 'concave_points3',
       'symmetry3', 'fractal_dimension3'],
      dtype='object')

In [16]:
ignore_cols='id'

In [22]:
cols = ['radius1', 'texture1', 'perimeter1', 'area1', 'smoothness1',
       'compactness1', 'concavity1', 'concave_points1', 'symmetry1',
       'fractal_dimension1', 'radius2', 'texture2', 'perimeter2', 'area2',
       'smoothness2', 'compactness2', 'concavity2', 'concave_points2',
       'symmetry2', 'fractal_dimension2', 'radius3', 'texture3', 'perimeter3',
       'area3', 'smoothness3', 'compactness3', 'concavity3', 'concave_points3',
       'symmetry3', 'fractal_dimension3']

In [23]:
pipeline = Pipeline(steps=[
    ('preprocessing',
        ColumnTransformer(
            transformers=[
                ('scalar imputing mean', SimpleImputer(strategy='mean'), cols),
            ], remainder='drop')),
#      ,
#         ColumnTransformer(
#             transformers=[
#                 ('scalar scaling', MinMaxScaler(feature_range=(0, 1)))
#             ], remainder='drop')),
    ('GBT', GradientBoostingClassifier(verbose=1))
    ])



# Set up the grid Search

Set the parameters for a grid search over the selected family of models.

In [24]:
# Currently only contains default values.  Commented parameters needn't be used
# in a grid search.
dct_grid = {
    'GBT__n_estimators' : [100, 250, 350],
    'GBT__max_depth'    : [7, 8, 9]
}

Search for the best model.

In [25]:
grid_search = GridSearchCV(pipeline, dct_grid, cv=10, return_train_score=False
                   , scoring=['accuracy', 'precision', 'average_precision', 'neg_log_loss']
                   , refit='accuracy', n_jobs=-1 
                  )
grid_search.fit(X_train, y_train)

  return f(*args, **kwargs)


      Iter       Train Loss   Remaining Time 
         1           1.1190            0.50s
         2           0.9682            0.42s
         3           0.8447            0.41s
         4           0.7414            0.43s
         5           0.6538            0.45s
         6           0.5787            0.44s
         7           0.5137            0.42s
         8           0.4572            0.41s
         9           0.4077            0.40s
        10           0.3642            0.40s
        20           0.1249            0.35s
        30           0.0449            0.31s
        40           0.0164            0.27s
        50           0.0060            0.23s
        60           0.0022            0.18s
        70           0.0008            0.14s
        80           0.0003            0.09s
        90           0.0001            0.05s
       100           0.0000            0.00s


GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('preprocessing',
                                        ColumnTransformer(transformers=[('scalar '
                                                                         'imputing '
                                                                         'mean',
                                                                         SimpleImputer(),
                                                                         ['radius1',
                                                                          'texture1',
                                                                          'perimeter1',
                                                                          'area1',
                                                                          'smoothness1',
                                                                          'compactness1',
                                                               

# Time of Run

In [26]:
pd.DataFrame(grid_search.cv_results_)['mean_fit_time']

0    0.614556
1    1.536062
2    2.097075
3    0.647201
4    1.588529
5    2.144267
6    0.662922
7    1.640785
8    1.836546
Name: mean_fit_time, dtype: float64

In [27]:
time_of_run_in_hours = (pd.DataFrame(grid_search.cv_results_)['mean_fit_time'] * 10).sum() / 60 / 60
print('time of run in hours: {}'.format(time_of_run_in_hours))
hours_per_record = time_of_run_in_hours / 20000
print('hours per record: {}'.format(hours_per_record))
records_in_an_hour = 1 / hours_per_record
print('number of records in 1 hour {}'.format(records_in_an_hour))

time of run in hours: 0.03546651224295298
hours per record: 1.773325612147649e-06
number of records in 1 hour 563912.2297393057


# Analyze the Model Results

In [28]:
pd.DataFrame(grid_search.cv_results_)['mean_test_average_precision']

0    0.915115
1    0.917996
2    0.906755
3    0.915718
4    0.919277
5    0.908521
6    0.912262
7    0.918599
8    0.921433
Name: mean_test_average_precision, dtype: float64

In [29]:
#pd.DataFrame(grid_sesarch.cv_results_).columns

In [30]:
grid_search.best_estimator_.get_params

<bound method Pipeline.get_params of Pipeline(steps=[('preprocessing',
                 ColumnTransformer(transformers=[('scalar imputing mean',
                                                  SimpleImputer(),
                                                  ['radius1', 'texture1',
                                                   'perimeter1', 'area1',
                                                   'smoothness1',
                                                   'compactness1', 'concavity1',
                                                   'concave_points1',
                                                   'symmetry1',
                                                   'fractal_dimension1',
                                                   'radius2', 'texture2',
                                                   'perimeter2', 'area2',
                                                   'smoothness2',
                                                   'compactness2', 'conc

In [31]:
pd.DataFrame(grid_search.cv_results_)[['mean_fit_time', 'param_GBT__max_depth', 'param_GBT__n_estimators',
                                      'mean_test_accuracy', 'mean_test_precision', 'mean_test_average_precision',
                                      'mean_test_neg_log_loss']].T#.to_csv('model_training_performance_metrics.csv')

Unnamed: 0,0,1,2,3,4,5,6,7,8
mean_fit_time,0.614556,1.536062,2.097075,0.647201,1.588529,2.144267,0.662922,1.640785,1.836546
param_GBT__max_depth,7.0,7.0,7.0,8.0,8.0,8.0,9.0,9.0,9.0
param_GBT__n_estimators,100.0,250.0,350.0,100.0,250.0,350.0,100.0,250.0,350.0
mean_test_accuracy,0.937115,0.937115,0.937115,0.934615,0.934615,0.934615,0.942115,0.939615,0.939615
mean_test_precision,0.9202,0.9202,0.9202,0.914048,0.914048,0.914048,0.930655,0.924464,0.924464
mean_test_average_precision,0.915115,0.917996,0.906755,0.915718,0.919277,0.908521,0.912262,0.918599,0.921433
mean_test_neg_log_loss,-0.629262,-1.48841,-1.990283,-0.592485,-1.424668,-1.911735,-0.5688,-1.352895,-1.812718


# Feature Importance

In [76]:
feature_imp = pd.DataFrame({'column': cols,
                            'feature_importance': grid_search.best_estimator_.named_steps["GBT"].feature_importances_})
feature_imp = feature_imp.sort_values('feature_importance', ascending=False)
#feature_imp.to_csv('feature_importance_90k_lapse.csv')

In [77]:
feature_imp.head(15)

Unnamed: 0,column,feature_importance
7,concave_points_mean,0.712714
21,texture_worst,0.078024
23,area_worst,0.076143
3,area_mean,0.043814
26,concavity_worst,0.017637
1,texture_mean,0.011739
13,area_se,0.010126
24,smoothness_worst,0.010103
15,compactness_se,0.008045
25,compactness_worst,0.007376


# Save the Model Output

In [None]:
dump(grid_search, '{}.joblib'.format(model_name))

# Write the Predictions

Score on the training set

In [None]:
# results = pd.DataFrame({'policy_agreement_id': df_train.index,
#                         'probability': grid_sesarch.best_estimator_.predict_proba(X_train)[:, 1],
#                         'prediction': grid_sesarch.best_estimator_.predict(X_train),
#                         'actual': Y_train})

In [None]:
#results.to_csv('results_training_set.csv')

Evaluate on the Test set

In [83]:
from sklearn.metrics import average_precision_score, log_loss, accuracy_score, precision_score
print('Test ({} samples) performance metrics'.format(len(y_test)))
print('average precision: {}'.format(average_precision_score(y_test, 
                                                            grid_search.predict_proba(X_test)[:, 1])))
print('log loss: {}'.format(log_loss(y_test, grid_search.predict_proba(X_test)[:, 1])))
print('accuracy: {}'.format(accuracy_score(y_test, grid_search.predict(X_test))))
print('precision: {}'.format(precision_score(y_test, grid_search.predict(X_test))))


out of sample (171) performance metrics
average precision: 0.9925298187863119
log loss: 0.257514191077136
accuracy: 0.9473684210526315
precision: 0.890625


Save the results

In [102]:
test_scores = pd.DataFrame({'id': X_test['id'],
                            'probability': grid_search.predict_proba(X_test)[:, 1],
                            'prediction': grid_search.predict(X_test),
                             'actual': y_test})

In [103]:
test_scores.head()

Unnamed: 0,actual,id,prediction,probability
458,0,9112367,0,0.00014
177,0,872608,0,0.000138
288,0,8913,0,0.000138
54,1,857392,1,0.999775
430,0,90769602,0,0.000138


In [113]:
#test_scores_with_data = pd.concat([test_scores, X_train.drop('id', axis=1)], axis=1)

In [114]:
#test_scores_with_data.head()

In [98]:
#test_scores_with_data.to_csv('{}_scores_with_data.csv'.format(model_name))

# Predcitions Confusion Matrix

In [115]:
test_scores['actual']

458    0
177    0
288    0
54     1
430    0
429    0
159    0
75     0
201    0
522    1
369    1
166    0
517    1
48     1
273    1
202    1
9      1
361    0
454    0
131    0
503    0
85     0
483    0
172    1
5      1
469    1
252    0
558    0
196    0
338    1
      ..
542    0
249    0
471    0
518    1
524    0
263    1
547    0
26     1
478    0
199    1
383    0
485    0
155    0
105    0
258    1
114    0
236    0
521    0
530    0
345    0
230    1
49     0
541    0
460    0
22     0
157    1
203    1
145    0
101    1
227    0
Name: actual, Length: 171, dtype: int64

In [93]:
cm = confusion_matrix(test_scores_with_data['actual'], test_scores_with_data['prediction'])
cm

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [368]:
total = len(out_of_sample_results)
perc_vals = [[cm[0][0]/total*100, cm[0][1]/total*100], [cm[1][0]/total*100, cm[1][1]/total*100]]
perc_vals

[[78.49000000000001, 3.27], [5.795, 12.445]]