## Code

reference: https://www.kaggle.com/jcrowe/model-comparison-for-breast-cancer-diagnosis/notebook

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns  # used for plot interactive graph

from sklearn import preprocessing
from sklearn.model_selection import train_test_split  # to split the data into two parts
from sklearn.model_selection import cross_val_score

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn import metrics  # for the check the error and accuracy of the model

from sklearn.cross_validation import KFold  # used for cross validation

from scipy.stats import randint as sp_randint

%matplotlib inline
import matplotlib.pyplot as plt



In [2]:
df = pd.read_csv("data/data.csv", header=0)  # header=0 means the 0-th row is header

In [3]:
df.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,


In [4]:
# Remove unnecessary columns
df.drop('id',axis=1,inplace=True)
df.drop('Unnamed: 32',axis=1,inplace=True)

In [5]:
# Change categorical strings to numerical values
df['diagnosis']=df['diagnosis'].map({'M':1,'B':0})

---

## 1. Feature Selection

데이터의 차원이 총 30개로 너무 많았기 때문에 feature를 선택할 필요가 있었다. 그래서 feature 간의 상관관계를 분석했다.

### 1.1 Collinearity

In [6]:
prediction_var = list(df)[1:]

In [7]:
# corr = df[prediction_var].corr() # .corr is used for find corelation
# plt.figure(figsize=(30,30))
# sns.heatmap(corr, cbar = True,  square = True, annot=True, fmt= '.2f',annot_kws={'size': 15},
#            xticklabels= prediction_var, yticklabels= prediction_var,
#            cmap= 'coolwarm') # for more on heatmap you can visit Link(http://seaborn.pydata.org/generated/seaborn.heatmap.html)

### 관찰결과

+ 상관관계 0.9 이상 
    + (radius_mean, perimeter_mean, area_mean)
    + (radius_se, perimeter_se, area_se)
    + (radius_worst, perimeter_worst, area_worst)


#### Trimming Data

From observing the graphs and PCA data above: fractal_dimension_mean, smoothness_mean and symmetry_mean are not very useful in predicting the type of cancer. To aid in the learning process and remove noise, these columns will be dropped.

---

## 1. Data

In [17]:
df2 = df.iloc[:,:]
# Remove unnecessary columns
# df2.drop(['fractal_dimension_mean', 'smoothness_mean', 'symmetry_mean'],axis=1,inplace=True)

In [18]:
# now split our data into train and test
train, test = train_test_split(df2, test_size = 0.3) # in this our main data is splitted into train and test
# we can check their dimension
print(train.shape)
print(test.shape)

(398, 31)
(171, 31)


In [19]:
prediction_var = list(df2)[1:]
len(prediction_var)

30

In [20]:
train_X = train[prediction_var]  # taking the training data input 
train_y = train.diagnosis  # This is output of our training data
# same we have to do for test
test_X = test[prediction_var]  # taking test data inputs
test_y = test.diagnosis  # output value of test data

---

## 2. Classification Method

본 데이터의 classification을 위해 3가지 알고리즘(2.1 Logistic Regression, 2.2 Random Forest, 2.3 Neural Network)을 사용했다.
그리고 label이 없다고 가정하고 2.4 K-means clustering 알고리즘을 사용해서 label 없이도 악성 종양과 양성 종양을 잘 구분하는지 알아보았다.

### 2.1 Logistic Regression

+ Linear Regression: What are the coefficients of the Linear Regression

In [12]:
model = LogisticRegression()
model.fit(train_X, train_y)

model_scores = cross_val_score(model, train_X, train_y, cv=10, scoring='accuracy')
print ('cross_val_score: %f' % model_scores.mean())

prediction = model.predict(test_X)
print ('test_score: %f' % metrics.accuracy_score(prediction,test_y))

cross_val_score: 0.937179
test_score: 0.970760


### 2.1 Linear Classifier

+ Linear Regression: What are the coefficients of the Linear Regression

In [13]:
train_X = train_X.as_matrix()

In [14]:
train_y = train_y.reshape(len(train_y),1)
train_y

  """Entry point for launching an IPython kernel.


array([[0],
       [0],
       [1],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [1],
       [0],
       [1],
       [1],
       [1],
       [1],
       [0],
       [0],
       [1],
       [0],
       [1],
       [1],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [1],
       [0],
       [1],
       [0],
       [1],
       [0],
       [1],
       [1],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [0],
       [0],
       [1],
       [0],
       [0],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [0],
       [0],
    

In [15]:
import tensorflow as tf

# train_X = train_X.as_matrix()
# train_y = train_y.as_matrix().reshape([len(train), 1])

x = tf.placeholder(tf.float32, [None, 30])
w = tf.Variable(tf.zeros([30,1]))
w0 = tf.Variable(tf.zeros([1]))
f = tf.matmul(x,w) + w0
p = tf.sigmoid(f)

y = tf.placeholder(tf.float32, [None,1])
loss = -tf.reduce_sum(y*tf.log(p) + (1-y)*tf.log(1-p))
train_step = tf.train.AdamOptimizer().minimize(loss)

correct_prediction = tf.equal(tf.sign(p-0.5), tf.sign(y-0.5))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

sess = tf.Session()
sess.run(tf.global_variables_initializer())

i = 0
for _ in range(20000):
    i += 1
    sess.run(train_step, feed_dict={x: train_X, y:train_y})
    if i% 2000 == 0:
        loss_val, acc_val = sess.run([loss, accuracy], feed_dict={x:train_X, y:train_y})
        print ('Step: %d, Loss: %f, Accuracy: %f' % (i, loss_val, acc_val))

InternalError: Blas GEMV launch failed:  m=30, n=398
	 [[Node: MatMul = MatMul[T=DT_FLOAT, transpose_a=false, transpose_b=false, _device="/job:localhost/replica:0/task:0/gpu:0"](_arg_Placeholder_0_0/_3, Variable/read)]]

Caused by op u'MatMul', defined at:
  File "/home/snu/anaconda2/lib/python2.7/runpy.py", line 174, in _run_module_as_main
    "__main__", fname, loader, pkg_name)
  File "/home/snu/anaconda2/lib/python2.7/runpy.py", line 72, in _run_code
    exec code in run_globals
  File "/home/snu/anaconda2/lib/python2.7/site-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/home/snu/anaconda2/lib/python2.7/site-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/home/snu/anaconda2/lib/python2.7/site-packages/ipykernel/kernelapp.py", line 477, in start
    ioloop.IOLoop.instance().start()
  File "/home/snu/anaconda2/lib/python2.7/site-packages/zmq/eventloop/ioloop.py", line 177, in start
    super(ZMQIOLoop, self).start()
  File "/home/snu/anaconda2/lib/python2.7/site-packages/tornado/ioloop.py", line 888, in start
    handler_func(fd_obj, events)
  File "/home/snu/anaconda2/lib/python2.7/site-packages/tornado/stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "/home/snu/anaconda2/lib/python2.7/site-packages/zmq/eventloop/zmqstream.py", line 440, in _handle_events
    self._handle_recv()
  File "/home/snu/anaconda2/lib/python2.7/site-packages/zmq/eventloop/zmqstream.py", line 472, in _handle_recv
    self._run_callback(callback, msg)
  File "/home/snu/anaconda2/lib/python2.7/site-packages/zmq/eventloop/zmqstream.py", line 414, in _run_callback
    callback(*args, **kwargs)
  File "/home/snu/anaconda2/lib/python2.7/site-packages/tornado/stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "/home/snu/anaconda2/lib/python2.7/site-packages/ipykernel/kernelbase.py", line 283, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "/home/snu/anaconda2/lib/python2.7/site-packages/ipykernel/kernelbase.py", line 235, in dispatch_shell
    handler(stream, idents, msg)
  File "/home/snu/anaconda2/lib/python2.7/site-packages/ipykernel/kernelbase.py", line 399, in execute_request
    user_expressions, allow_stdin)
  File "/home/snu/anaconda2/lib/python2.7/site-packages/ipykernel/ipkernel.py", line 196, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/home/snu/anaconda2/lib/python2.7/site-packages/ipykernel/zmqshell.py", line 533, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/home/snu/anaconda2/lib/python2.7/site-packages/IPython/core/interactiveshell.py", line 2717, in run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "/home/snu/anaconda2/lib/python2.7/site-packages/IPython/core/interactiveshell.py", line 2821, in run_ast_nodes
    if self.run_code(code, result):
  File "/home/snu/anaconda2/lib/python2.7/site-packages/IPython/core/interactiveshell.py", line 2881, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-15-c2aa60775e27>", line 9, in <module>
    f = tf.matmul(x,w) + w0
  File "/home/snu/anaconda2/lib/python2.7/site-packages/tensorflow/python/ops/math_ops.py", line 1816, in matmul
    a, b, transpose_a=transpose_a, transpose_b=transpose_b, name=name)
  File "/home/snu/anaconda2/lib/python2.7/site-packages/tensorflow/python/ops/gen_math_ops.py", line 1217, in _mat_mul
    transpose_b=transpose_b, name=name)
  File "/home/snu/anaconda2/lib/python2.7/site-packages/tensorflow/python/framework/op_def_library.py", line 767, in apply_op
    op_def=op_def)
  File "/home/snu/anaconda2/lib/python2.7/site-packages/tensorflow/python/framework/ops.py", line 2506, in create_op
    original_op=self._default_original_op, op_def=op_def)
  File "/home/snu/anaconda2/lib/python2.7/site-packages/tensorflow/python/framework/ops.py", line 1269, in __init__
    self._traceback = _extract_stack()

InternalError (see above for traceback): Blas GEMV launch failed:  m=30, n=398
	 [[Node: MatMul = MatMul[T=DT_FLOAT, transpose_a=false, transpose_b=false, _device="/job:localhost/replica:0/task:0/gpu:0"](_arg_Placeholder_0_0/_3, Variable/read)]]


### 2.2 Random Forest Classification

+ Random Forest: What 5 features are the most informative and what are their information strength (i.e., information gain)?

In [21]:
model = RandomForestClassifier(n_estimators=100)
model.fit(train_X,train_y)

model_scores = cross_val_score(model, train_X, train_y, cv=10, scoring='accuracy')
print ('cross_val_score: %f' % model_scores.mean())

prediction = model.predict(test_X)
print ('test_score: %f' % metrics.accuracy_score(prediction,test_y))

cross_val_score: 0.962164
test_score: 0.970760


In [22]:
featimp = pd.Series(model.feature_importances_, index=prediction_var).sort_values(ascending=False)
print(featimp) # this is the property of Random Forest classifier that it provide us the importance 
# of the features used

area_worst                 0.148913
perimeter_worst            0.142155
radius_worst               0.125344
concave points_worst       0.094542
concave points_mean        0.078914
area_se                    0.056114
area_mean                  0.047055
radius_mean                0.041929
concavity_mean             0.032679
perimeter_mean             0.024948
perimeter_se               0.024159
smoothness_worst           0.021156
compactness_worst          0.020882
texture_worst              0.020055
concavity_worst            0.016413
radius_se                  0.015360
texture_mean               0.013630
compactness_mean           0.010909
fractal_dimension_worst    0.009985
symmetry_worst             0.009287
concave points_se          0.006876
symmetry_se                0.005462
texture_se                 0.005176
smoothness_mean            0.004843
smoothness_se              0.004814
fractal_dimension_se       0.004599
concavity_se               0.003912
compactness_se             0

### 2.3 Neural Network

+ Neural Network: How does the classification accuracy change with different number of hidden layers?

In [None]:
import tensorflow as tf
import math

n_classes = 2
n_features = 30
n_hidden_1 = 8
n_hidden_2 = 8

learning_rate = 0.01
max_iter = 50000

### define graph
x = tf.placeholder(tf.float32, shape=[None, n_features], name="X")
y_label = tf.placeholder(tf.int32, shape=[None], name="Y_label")
# one-hot encoding
y = tf.one_hot(indices=y_label, depth=n_classes)

# hidden 1
with tf.name_scope('hidden1'):
  weights1 = tf.Variable(
      tf.truncated_normal([n_features, n_hidden_1],
                          stddev=1.0),
      name='weights')
  biases1 = tf.Variable(
      tf.zeros([n_hidden_1]),
      name='biases')
  hidden1 = tf.nn.relu(tf.matmul(x, weights1) + biases1)
# hidden 2
with tf.name_scope('hidden2'):
  weights2 = tf.Variable(
      tf.truncated_normal([n_hidden_1, n_hidden_2],
                          stddev=1.0),
      name='weights')
  biases2 = tf.Variable(
      tf.zeros([n_hidden_2]),
      name='biases')
  hidden2 = tf.nn.relu(tf.matmul(hidden1, weights2) + biases2)
# Linear
with tf.name_scope('softmax_linear'):
  weights3 = tf.Variable(
      tf.truncated_normal([n_hidden_2, n_classes],
                          stddev=1.0),
      name='weights')
  biases3 = tf.Variable(
      tf.zeros([n_classes]),
      name='biases')
  logits = tf.matmul(hidden2, weights3) + biases3

pred = tf.cast( tf.argmax(logits, 1), tf.int32 )
accuracy = tf.reduce_mean( tf.cast( tf.equal(pred, y_label), tf.float32 ))


### define loss
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=y))
train_step = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss)



### Starting sessions
with tf.Session() as sess:
  ## initialize variables
  init = tf.global_variables_initializer()
  sess.run(init)

  ## training
  for i in range(max_iter):
    _, accuracy_val, pred_val, loss_val = sess.run(
      [train_step, accuracy, pred, loss], 
      feed_dict={x: train_X, y_label: train_y})

    if i % 10000 == 0:
      print('=========== Epoch: %d ===========' % i)
      print('Loss', loss_val)
      print('Accuracy', accuracy_val)
      # print('Y_prediction ', pred_val[:10])
      # print('True', train_y[:10])
      
      # accuracy for testset
      test_accuracy, test_pred = sess.run( 
        [accuracy, pred],
        feed_dict={x: test_X, y_label: test_y})
      print('---- evaluation ----')
      print('acc: %.4f' % test_accuracy)
      # print('pred', test_pred)
      # print('true', test_y)


    if math.isnan(loss_val):
      print('LOSS is NAN!')
      break

In [None]:
model = MLPClassifier(hidden_layer_sizes=(100,30))
model.fit(train_X,train_y)

model_scores = cross_val_score(model, train_X, train_y, cv=10, scoring='accuracy')
print ('cross_val_score: %f' % model_scores.mean())

prediction = model.predict(test_X)
print ('test_score: %f' % metrics.accuracy_score(prediction,test_y))

## 2.4 K-means clustering

In [None]:
df3 = df2.iloc[:,1:]

X = df3
y = df2.iloc[:,0]

In [None]:
from sklearn.cluster import KMeans
from mpl_toolkits.mplot3d import Axes3D

estimators = [('k=2', KMeans(n_clusters=2)),
              ('k=4', KMeans(n_clusters=4)),
              ('k=5', KMeans(n_clusters=5)),
              ('k=7', KMeans(n_clusters=7)),
              ('k=10', KMeans(n_clusters=10))]

# print metric measure headers
print ("Estimator\tHomogeneity\tCompleteness\tV-means\t\tARI\t\tAMI\t\tSilhouette")

fignum = 1
titles = ['2 clusters', '4 clusters', '5 clusters', '7 clusters', '10 clusters']

for name, est in estimators:
    est.fit(X)
    labels = est.labels_
    
    # measure cluster qualities
    print('%s\t\t%.3f\t\t%.3f\t\t%.3f\t\t%.3f\t\t%.3f\t\t%.3f'
          % (name,
             metrics.homogeneity_score(y, est.labels_),
             metrics.completeness_score(y, est.labels_),
             metrics.v_measure_score(y, est.labels_),
             metrics.adjusted_rand_score(y, est.labels_),
             metrics.adjusted_mutual_info_score(y, est.labels_),
             metrics.silhouette_score(X, est.labels_, metric='euclidean')))