# SVM Breast Cancer Data

### Basic libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# setting style
sns.set_style(style='whitegrid')

In [3]:
# loading data
from sklearn.datasets import load_breast_cancer

In [4]:
# setting up data
cancer = load_breast_cancer()

In [5]:
# checking keys
cancer.keys()

dict_keys(['data', 'target', 'target_names', 'DESCR', 'feature_names'])

In [6]:
# showing description
print(cancer['DESCR'])

Breast Cancer Wisconsin (Diagnostic) Database

Notes
-----
Data Set Characteristics:
    :Number of Instances: 569

    :Number of Attributes: 30 numeric, predictive attributes and the class

    :Attribute Information:
        - radius (mean of distances from center to points on the perimeter)
        - texture (standard deviation of gray-scale values)
        - perimeter
        - area
        - smoothness (local variation in radius lengths)
        - compactness (perimeter^2 / area - 1.0)
        - concavity (severity of concave portions of the contour)
        - concave points (number of concave portions of the contour)
        - symmetry 
        - fractal dimension ("coastline approximation" - 1)

        The mean, standard error, and "worst" or largest (mean of the three
        largest values) of these features were computed for each image,
        resulting in 30 features.  For instance, field 3 is Mean Radius, field
        13 is Radius SE, field 23 is Worst Radius.

        

In [7]:
# setting and showing data frame
df = pd.DataFrame(cancer['data'],columns=cancer['feature_names'])
df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [8]:
# Saving target variable
y = cancer['target']

In [16]:
# setting as data frame
y = pd.DataFrame(y,columns=['target'])

In [21]:
# adding to df to balance sample
df['target'] = y['target']

## Balancing Sample

In [22]:
# :Class Distribution: 212 - Malignant, 357 - Benign
df.target.value_counts()

1    357
0    212
Name: target, dtype: int64

In [19]:
# importing lib
from sklearn.utils import resample

In [24]:
# Separate majority and minority classes
df_majority = df[df.target==1]
df_minority = df[df.target==0]

In [25]:
# generating equal numbers of variables
df_minority_upsampled = resample(df_minority,
                                 replace=True,
                                 n_samples=357,
                                 random_state=123) 

df_upsampled = pd.concat([df_majority, df_minority_upsampled])

In [26]:
# checking function
df_upsampled.target.value_counts()

1    357
0    357
Name: target, dtype: int64

## Logistic Regression Model

### Importing Model Libraries

In [27]:
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import cross_val_predict
from sklearn.cross_validation import cross_val_score
from sklearn.linear_model import LogisticRegression



In [28]:
# setting up logistic regression model
lr = LogisticRegression()

In [30]:
# Separate input features (X) and target variable (y)
y = df_upsampled.target
X = df_upsampled.drop('target', axis=1)

# splitting data into test and training group
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

# Train model
lrc = LogisticRegression().fit(X_train, y_train)
 
# Predict on training set
cross_val_score(lrc, X, y, cv= 5)

array([ 0.97916667,  0.97222222,  0.95774648,  0.95774648,  0.94366197])

## RFE

### importing Libraries

In [32]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA 
from sklearn.feature_selection import RFE

In [33]:
# See all feature ranks
nfeatures = 1

In [34]:
# Setting Up Selector
rfe = RFE(lr,nfeatures)

In [36]:
# Fitting rfe
fit = rfe.fit(X,y)

In [63]:
# Display of Features Importants
result_RFE = pd.DataFrame(list(fit.ranking_),index=X.columns)

In [66]:
# displaying r score
lrc.score(X,y)

0.96918767507002801

In [67]:
# saving copy of ranked features
sorted_rfe = result_RFE[0].sort_values()

In [124]:
sorted_rfe.head(5)

worst concavity         1
worst concave points    2
mean concavity          3
worst compactness       4
worst radius            5
Name: 0, dtype: int64

In [83]:
# taking 16 best features
RFE_x = df_upsampled[['mean radius','mean compactness','mean concavity',
                      'mean concave points','mean symmetry','radius error',
                      'texture error', 'perimeter error', 'area error',
                      'concave points error','worst radius','worst texture',
                      'mean texture','target']]

## Re-Fit Model

In [84]:
# Separate input features (X) and target variable (y)
y = df_upsampled.target
X = RFE_x.drop('target',axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

# Train model
lrc = LogisticRegression().fit(X_train, y_train)
 
# Predict on training set
cross_val_score(lrc, X, y, cv= 5)

array([ 0.93055556,  0.92361111,  0.94366197,  0.94366197,  0.92253521])

In [86]:
# 14 features less and still 93%
lrc.score(X,y)

0.9327731092436975

We were able to remove 14 Features with out losing about 3% of the important data.

## Applying PCA

In [88]:
# standardizing data
ss = StandardScaler().fit_transform(X_train)

In [94]:
# using pca to select 2 components from our 14 remaining features
sklearn_pca = PCA(n_components=2)

# fitting our x training data to pca
Y_sklearn = sklearn_pca.fit_transform(ss)

# adding results to a new data frame
X_PCA = pd.DataFrame(Y_sklearn)

print(
    'The percentage of total variance in the dataset explained by each',
    'component from Sklearn PCA.\n',
    sklearn_pca.explained_variance_ratio_
)

The percentage of total variance in the dataset explained by each component from Sklearn PCA.
 [ 0.52092134  0.14509753]


## Linear Model

In [95]:
# Train model
lrc = LogisticRegression().fit(X_PCA, y_train)
 
# Predict on training set
cross_val_score(lrc, X, y, cv= 5)

array([ 0.93055556,  0.92361111,  0.94366197,  0.94366197,  0.92253521])

## Random Forest Model

### Importing Libraries

In [107]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.grid_search import GridSearchCV



In [114]:
# setting up random forest model
RFC = RandomForestClassifier()

In [115]:
# creating parameters to test
param_grid = {'n_estimators':[10,25,50,75,100,125,150,175,200,300,400,500]}

In [116]:
# fitting grid with setting
grid = GridSearchCV(RFC,param_grid,verbose=3)

In [117]:
# fitting grid with data
grid.fit(X_PCA, y_train)

Fitting 3 folds for each of 12 candidates, totalling 36 fits
[CV] n_estimators=10 .................................................
[CV] ........................ n_estimators=10, score=0.906250 -   0.0s
[CV] n_estimators=10 .................................................
[CV] ........................ n_estimators=10, score=0.937107 -   0.0s
[CV] n_estimators=10 .................................................
[CV] ........................ n_estimators=10, score=0.962264 -   0.0s
[CV] n_estimators=25 .................................................
[CV] ........................ n_estimators=25, score=0.918750 -   0.0s
[CV] n_estimators=25 .................................................
[CV] ........................ n_estimators=25, score=0.937107 -   0.0s
[CV] n_estimators=25 .................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s


[CV] ........................ n_estimators=25, score=0.968553 -   0.0s
[CV] n_estimators=50 .................................................
[CV] ........................ n_estimators=50, score=0.906250 -   0.0s
[CV] n_estimators=50 .................................................
[CV] ........................ n_estimators=50, score=0.937107 -   0.0s
[CV] n_estimators=50 .................................................
[CV] ........................ n_estimators=50, score=0.962264 -   0.0s
[CV] n_estimators=75 .................................................
[CV] ........................ n_estimators=75, score=0.906250 -   0.0s
[CV] n_estimators=75 .................................................
[CV] ........................ n_estimators=75, score=0.937107 -   0.0s
[CV] n_estimators=75 .................................................
[CV] ........................ n_estimators=75, score=0.962264 -   0.0s
[CV] n_estimators=100 ................................................
[CV] .

[Parallel(n_jobs=1)]: Done  36 out of  36 | elapsed:   11.3s finished


GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': [10, 25, 50, 75, 100, 125, 150, 175, 200, 300, 400, 500]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=3)

In [118]:
# best parameter
grid.best_params_

{'n_estimators': 25}

In [122]:
# applying new parameter
RFC = RandomForestClassifier(n_estimators=25)

In [123]:
# testing model over 5 folds
cross_val_score(RFC.fit(X_PCA, y_train), X,y, cv=5)

array([ 0.96527778,  0.97222222,  0.98591549,  1.        ,  0.98591549])

## Conclusion

In this journal, we were able to train multiple models to accurately predict weither a beast tumer was Malignant or Benign.  Starting with a linear regression model, we were able to find our features coefficents and rank our features by importants. Then we were able to remove 14 features.  After that, we used principal component analysis to reduce our remaining 16 features into 2 components.  With these two components we were able to update our regression model to have arounnd 93% accuracy without over fitting.  Then applying our 2 components to a Random Forest model we could improve our accuracy even further getting up to about 98% accuracy. 