# Breast Cancer Classification Model

Attribute Information:

1) ID number
2) Diagnosis (M = malignant, B = benign)
3-32)

Ten real-valued features are computed for each cell nucleus:

a) radius (mean of distances from center to points on the perimeter)
b) texture (standard deviation of gray-scale values)
c) perimeter
d) area
e) smoothness (local variation in radius lengths)
f) compactness (perimeter^2 / area - 1.0)
g) concavity (severity of concave portions of the contour)
h) concave points (number of concave portions of the contour)
i) symmetry
j) fractal dimension ("coastline approximation" - 1)

Source: http://archive.ics.uci.edu/ml/datasets/breast+cancer+wisconsin+%28diagnostic%29

# Import required packages

In [28]:
from datetime import datetime
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator
from joblib import dump, load
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

# Set the parameters for the run

In [81]:
model_name = 'bc_classification_grid_search_object.joblib'

# Get the data

In [13]:
df = pd.read_csv('data.csv', names=['id', 'diagnosis', 'radius_mean', 'texture_mean', 'perimeter_mean',
                                'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean',
                                'concave_points_mean', 'symmetry_mean', 'fractal_dimension_mean', 
                                'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
                                'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
                                'fractal_dimension_se', 'radius_worst', 'texture_worst', 'perimeter_worst',
                                'area_worst', 'smoothness_worst', 'compactness_worst', 'concavity_worst',
                                'concave points_worst', 'symmetry_worst', 'fractal_dimension_worst'])

In [14]:
df.dtypes

id                          object
diagnosis                   object
radius_mean                float64
texture_mean               float64
perimeter_mean             float64
area_mean                  float64
smoothness_mean            float64
compactness_mean           float64
concavity_mean             float64
concave_points_mean        float64
symmetry_mean              float64
fractal_dimension_mean     float64
radius_se                  float64
texture_se                 float64
perimeter_se               float64
area_se                    float64
smoothness_se              float64
compactness_se             float64
concavity_se               float64
concave points_se          float64
symmetry_se                float64
fractal_dimension_se       float64
radius_worst               float64
texture_worst              float64
perimeter_worst            float64
area_worst                 float64
smoothness_worst           float64
compactness_worst          float64
concavity_worst     

In [15]:
len(df)

570

# Train Test Split

In [39]:
y = df['diagnosis']
X = df.drop('diagnosis', axis=1)

In [40]:
y = (y !='B').astype(int)

In [42]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=123)

# Set up the pipeline

In [33]:
ignore_cols='id'

In [50]:
cols = ['radius_mean', 'texture_mean', 'perimeter_mean',
        'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean',
        'concave_points_mean', 'symmetry_mean', 'fractal_dimension_mean', 
        'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
        'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
        'fractal_dimension_se', 'radius_worst', 'texture_worst', 'perimeter_worst',
        'area_worst', 'smoothness_worst', 'compactness_worst', 'concavity_worst',
        'concave points_worst', 'symmetry_worst', 'fractal_dimension_worst']

In [66]:
pipeline = Pipeline(steps=[
    ('preprocessing',
        ColumnTransformer(
            transformers=[
                ('scalar imputing mean', SimpleImputer(strategy='mean'), cols),
            ], remainder='drop')),
#      ,
#         ColumnTransformer(
#             transformers=[
#                 ('scalar scaling', MinMaxScaler(feature_range=(0, 1)))
#             ], remainder='drop')),
    ('GBT', GradientBoostingClassifier(verbose=1))
    ])



# Set up the grid Search

Set the parameters for a grid search over the selected family of models.

In [67]:
# Currently only contains default values.  Commented parameters needn't be used
# in a grid search.
dct_grid = {
    'GBT__n_estimators' : [100, 250, 350],
    'GBT__max_depth'    : [7, 8, 9]
}

Search for the best model.

In [68]:
grid_search = GridSearchCV(pipeline, dct_grid, cv=10, return_train_score=False
                   , scoring=['accuracy', 'precision', 'average_precision', 'neg_log_loss']
                   , refit='accuracy', n_jobs=-1 
                  )
grid_search.fit(X_train, y_train)



      Iter       Train Loss   Remaining Time 
         1           1.1439            0.76s
         2           0.9915            0.71s
         3           0.8659            0.70s
         4           0.7606            0.70s
         5           0.6710            0.69s
         6           0.5940            0.69s
         7           0.5274            0.68s
         8           0.4695            0.67s
         9           0.4187            0.67s
        10           0.3741            0.67s
        20           0.1283            0.63s
        30           0.0461            0.60s
        40           0.0168            0.58s
        50           0.0062            0.56s
        60           0.0023            0.53s
        70           0.0009            0.49s
        80           0.0005            0.44s
        90           0.0004            0.38s
       100           0.0004            0.33s
       200           0.0004            0.06s


GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('preprocessing',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='drop',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('scalar '
                                                                         'imputing '
                                                                         'mean',
                                                                         SimpleImputer(add_indicator=False,
                                                                                       copy=True,
                                                                               

# Time of Run

In [78]:
pd.DataFrame(grid_search.cv_results_)['mean_fit_time']

0    0.326132
1    0.377651
2    0.414694
3    0.318390
4    0.406536
5    0.432444
6    0.326364
7    0.386804
8    0.423554
Name: mean_fit_time, dtype: float64

In [79]:
time_of_run_in_hours = (pd.DataFrame(grid_search.cv_results_)['mean_fit_time'] * 10).sum() / 60 / 60
print('time of run in hours: {}'.format(time_of_run_in_hours))
hours_per_record = time_of_run_in_hours / 20000
print('hours per record: {}'.format(hours_per_record))
records_in_an_hour = 1 / hours_per_record
print('number of records in 1 hour {}'.format(records_in_an_hour))

time of run in hours: 0.00947935938835144
hours per record: 4.7396796941757194e-07
number of records in 1 hour 2109847.2144200676


# Analyze the Model Results

In [80]:
pd.DataFrame(grid_search.cv_results_)['mean_test_average_precision']

0    0.972465
1    0.974277
2    0.973894
3    0.972370
4    0.973677
5    0.974807
6    0.971562
7    0.971830
8    0.971253
Name: mean_test_average_precision, dtype: float64

In [72]:
#pd.DataFrame(grid_sesarch.cv_results_).columns

In [73]:
grid_search.best_estimator_.get_params

<bound method Pipeline.get_params of Pipeline(memory=None,
         steps=[('preprocessing',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('scalar imputing mean',
                                                  SimpleImputer(add_indicator=False,
                                                                copy=True,
                                                                fill_value=None,
                                                                missing_values=nan,
                                                                strategy='mean',
                                                                verbose=0),
                                                  ['radius_mean',
                                                   'texture_mean',
                                  

In [74]:
pd.DataFrame(grid_search.cv_results_)[['mean_fit_time', 'param_GBT__max_depth', 'param_GBT__n_estimators',
                                      'mean_test_accuracy', 'mean_test_precision', 'mean_test_average_precision',
                                      'mean_test_neg_log_loss']].T#.to_csv('model_training_performance_metrics.csv')

Unnamed: 0,0,1,2,3,4,5,6,7,8
mean_fit_time,0.326132,0.377651,0.414694,0.31839,0.406536,0.432444,0.326364,0.386804,0.423554
param_GBT__max_depth,7.0,7.0,7.0,8.0,8.0,8.0,9.0,9.0,9.0
param_GBT__n_estimators,100.0,250.0,350.0,100.0,250.0,350.0,100.0,250.0,350.0
mean_test_accuracy,0.927318,0.929825,0.929825,0.924812,0.927318,0.924812,0.922306,0.912281,0.919799
mean_test_precision,0.915701,0.917929,0.929802,0.9118,0.914154,0.9118,0.900069,0.888337,0.89941
mean_test_average_precision,0.972465,0.974277,0.973894,0.97237,0.973677,0.974807,0.971562,0.97183,0.971253
mean_test_neg_log_loss,-0.483263,-0.464406,-0.45439,-0.483334,-0.476228,-0.464766,-0.499751,-0.493519,-0.498142


# Feature Importance

In [76]:
feature_imp = pd.DataFrame({'column': cols,
                            'feature_importance': grid_search.best_estimator_.named_steps["GBT"].feature_importances_})
feature_imp = feature_imp.sort_values('feature_importance', ascending=False)
#feature_imp.to_csv('feature_importance_90k_lapse.csv')

In [77]:
feature_imp.head(15)

Unnamed: 0,column,feature_importance
7,concave_points_mean,0.712714
21,texture_worst,0.078024
23,area_worst,0.076143
3,area_mean,0.043814
26,concavity_worst,0.017637
1,texture_mean,0.011739
13,area_se,0.010126
24,smoothness_worst,0.010103
15,compactness_se,0.008045
25,compactness_worst,0.007376


# Save the Model Output

In [None]:
dump(grid_search, '{}.joblib'.format(model_name))

# Write the Predictions

Score on the training set

In [None]:
# results = pd.DataFrame({'policy_agreement_id': df_train.index,
#                         'probability': grid_sesarch.best_estimator_.predict_proba(X_train)[:, 1],
#                         'prediction': grid_sesarch.best_estimator_.predict(X_train),
#                         'actual': Y_train})

In [None]:
#results.to_csv('results_training_set.csv')

Evaluate on the Test set

In [83]:
from sklearn.metrics import average_precision_score, log_loss, accuracy_score, precision_score
print('Test ({} samples) performance metrics'.format(len(y_test)))
print('average precision: {}'.format(average_precision_score(y_test, 
                                                            grid_search.predict_proba(X_test)[:, 1])))
print('log loss: {}'.format(log_loss(y_test, grid_search.predict_proba(X_test)[:, 1])))
print('accuracy: {}'.format(accuracy_score(y_test, grid_search.predict(X_test))))
print('precision: {}'.format(precision_score(y_test, grid_search.predict(X_test))))


out of sample (171) performance metrics
average precision: 0.9925298187863119
log loss: 0.257514191077136
accuracy: 0.9473684210526315
precision: 0.890625


Save the results

In [102]:
test_scores = pd.DataFrame({'id': X_test['id'],
                            'probability': grid_search.predict_proba(X_test)[:, 1],
                            'prediction': grid_search.predict(X_test),
                             'actual': y_test})

In [103]:
test_scores.head()

Unnamed: 0,actual,id,prediction,probability
458,0,9112367,0,0.00014
177,0,872608,0,0.000138
288,0,8913,0,0.000138
54,1,857392,1,0.999775
430,0,90769602,0,0.000138


In [113]:
#test_scores_with_data = pd.concat([test_scores, X_train.drop('id', axis=1)], axis=1)

In [114]:
#test_scores_with_data.head()

In [98]:
#test_scores_with_data.to_csv('{}_scores_with_data.csv'.format(model_name))

# Predcitions Confusion Matrix

In [115]:
test_scores['actual']

458    0
177    0
288    0
54     1
430    0
429    0
159    0
75     0
201    0
522    1
369    1
166    0
517    1
48     1
273    1
202    1
9      1
361    0
454    0
131    0
503    0
85     0
483    0
172    1
5      1
469    1
252    0
558    0
196    0
338    1
      ..
542    0
249    0
471    0
518    1
524    0
263    1
547    0
26     1
478    0
199    1
383    0
485    0
155    0
105    0
258    1
114    0
236    0
521    0
530    0
345    0
230    1
49     0
541    0
460    0
22     0
157    1
203    1
145    0
101    1
227    0
Name: actual, Length: 171, dtype: int64

In [93]:
cm = confusion_matrix(test_scores_with_data['actual'], test_scores_with_data['prediction'])
cm

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [368]:
total = len(out_of_sample_results)
perc_vals = [[cm[0][0]/total*100, cm[0][1]/total*100], [cm[1][0]/total*100, cm[1][1]/total*100]]
perc_vals

[[78.49000000000001, 3.27], [5.795, 12.445]]