In [1]:
# Imports and data loading
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import LinearSVC, SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report

# Supress the warnings from final output
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)

column_names = ['ID number', 'Diagnosis', 
               'Mean radius','Mean texture','Mean perimeter','Mean area','Mean smoothness','Mean compactness','Mean concavity','Mean concave points','Mean symmetry','Mean fractal dimension',
               'Standard error radius','Standard error texture','Standard error perimeter','Standard error area','Standard error smoothness','Standard error compactness','Standard error concavity','Standard error concave points','Standard error symmetry','Standard error fractal dimension',
               'Largest radius','Largest texture','Largest perimeter','Largest area','Largest smoothness','Largest compactness','Largest concavity','Largest concave points','Largest symmetry','Largest fractal dimension']

wdbc_data = pd.read_csv('../input/wdbc.data', header=None, names=column_names)
target_data = wdbc_data['Diagnosis']
feature_data = wdbc_data.iloc[:,2:]

encoder = LabelEncoder()
encoded_target_data = encoder.fit_transform(target_data)

In [2]:
# Train/test split
x_train, x_test, y_train, y_test = train_test_split(feature_data, encoded_target_data, train_size=0.7, test_size=0.3, random_state=4)

# Directly confirm their sizes 
print("x_train.shape: {}".format(x_train.shape))
print("x_test.shape: {}".format(x_test.shape))
print("y_train.shape: {}".format(y_train.shape))
print("y_test.shape: {}".format(y_test.shape))

# Standardisation
scaler = StandardScaler().fit(x_train)
standardised_x_train = scaler.transform(x_train)
standardised_x_test = scaler.transform(x_test)

# Directly confirm their sizes
print("standardised_x_train.shape: {}".format(standardised_x_train.shape))
print("standardised_x_test.shape: {}".format(standardised_x_test.shape))

x_train.shape: (398, 30)
x_test.shape: (171, 30)
y_train.shape: (398,)
y_test.shape: (171,)
standardised_x_train.shape: (398, 30)
standardised_x_test.shape: (171, 30)


In [3]:
# Lets start with sklearn's LinearSVC
# Create estimator and get predictions
lin_svc = LinearSVC(random_state=4)
lin_svc.fit(standardised_x_train, y_train)
lin_svc_preds = lin_svc.predict(standardised_x_test)
# Test predictions and print output
print('OOB Accuracy Score: {}'.format(accuracy_score(y_test, lin_svc_preds)))
print('OOB Classification Report:')
print(classification_report(y_test, lin_svc_preds))

OOB Accuracy Score: 0.9532163742690059
OOB Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.94      0.96       117
           1       0.88      0.98      0.93        54

   micro avg       0.95      0.95      0.95       171
   macro avg       0.94      0.96      0.95       171
weighted avg       0.96      0.95      0.95       171



Wow, for out of box usage that is really good. A solid 95.3% accuracy. Interestingly, while the recall is similar for both, and the precision for benign (0) is very high, the precision for malignant (1) is only 0.88. Normally, I would think this a bad thing, but since the objective here is to find out who needs cancer treatment I think that - combined with a recall of 0.98, that's fine. If they are found to likely have malignant tumours, is is better that this test flags up harmless cases for further analysis rather than fails to flag up actual malignant tumours - one results in unnecessary further medical examinations, the other kills people. 

In [4]:
# Next, lets try a KNeighborsClassifier
# Create estimator and get predictions
knc = KNeighborsClassifier()
knc.fit(standardised_x_train, y_train)
knc_pred = knc.predict(standardised_x_test)
# Test predictions and print output
print('OOB Accuracy Score: {}'.format(accuracy_score(y_test, knc_pred)))
print('OOB Classification Report:')
print(classification_report(y_test, knc_pred))

OOB Accuracy Score: 0.9649122807017544
OOB Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.97      0.97       117
           1       0.93      0.96      0.95        54

   micro avg       0.96      0.96      0.96       171
   macro avg       0.96      0.96      0.96       171
weighted avg       0.97      0.96      0.97       171



An even better accuracy score! 96.5% is very good. Additionally, while the precision score for benign (0) is 0.01 worse, it's recall is 0.03 higher. Unfortunately the recall of malignant (1) fell by 0.02, though it's precision rose by 0.05. Overall the f1-scores were both higher. 

In [5]:
# Next, lets try SVC
# Create estimator and get predictions
svc = SVC(random_state=4)
svc.fit(standardised_x_train, y_train)
svc_pred = svc.predict(standardised_x_test)
# Test predictions and print output
print('OOB Accuracy Score: {}'.format(accuracy_score(y_test, svc_pred)))
print('OOB Classification Report:')
print(classification_report(y_test, svc_pred))

OOB Accuracy Score: 0.9766081871345029
OOB Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.97      0.98       117
           1       0.93      1.00      0.96        54

   micro avg       0.98      0.98      0.98       171
   macro avg       0.97      0.98      0.97       171
weighted avg       0.98      0.98      0.98       171



Yet another increase in accuracy - this time to 97.7%! Even better, the recall of the malignant (1) case is 1.00, which means that not a single actually malignant tumour in the test set was missed! On the other hand, 7% of diagnoses of malignant were incorrect, which means a lot of unnecessary, uncomfortable medical tests. But hey, at least it's 7% uncomfortable and not 7% dead because they got their cancer misclassified. 

In [6]:
# Carrying on with SVC, I'll try a GridSearchCV to see if the first >97.5% classifier can get any better. 
# Create param dict
parameters = {'kernel': ['rbf','poly','sigmoid'],
              'degree': np.arange(0,5).tolist(),
              'C': np.logspace(-3,2,6).tolist()}

# Create and run gridsearch
svc_grid = GridSearchCV(estimator=SVC(random_state=4), param_grid=parameters, refit=True, cv=5, scoring='accuracy')
svc_grid.fit(standardised_x_train, y_train)

# Get best score, estimator, and classification report
print('GridSearchCV best estimator: {}\n'.format(svc_grid.best_estimator_))
print('GridSearchCV best estimator accuracy score: {}'.format(accuracy_score(y_test, svc_grid.best_estimator_.predict(standardised_x_test))))
print('GridSearchCV best estimator classification Report:')
print(classification_report(y_test, svc_grid.best_estimator_.predict(standardised_x_test)))

GridSearchCV best estimator: SVC(C=10.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=0, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=4,
  shrinking=True, tol=0.001, verbose=False)

GridSearchCV best estimator accuracy score: 0.9824561403508771
GridSearchCV best estimator classification Report:
              precision    recall  f1-score   support

           0       1.00      0.97      0.99       117
           1       0.95      1.00      0.97        54

   micro avg       0.98      0.98      0.98       171
   macro avg       0.97      0.99      0.98       171
weighted avg       0.98      0.98      0.98       171



Turns out you can get better! What does it take? More regularisation! 10 times more, to be exact.  
With an accuracy score of 98.2%, and with an incrase of both classes f1-scores, this is looking to be a very useful classifier. 

## Summary
A dataset made of pre-engineered continuous ratio medical data, with no individually linearly separable features, is classifiable to a very high level of accuracy: 98.25%.  
With this level of accuracy - and in particular with this level of recall on the malignant (1) class, I don't see the need to engage in any further optimisation. 