# Lab: Machine Learning

## Setup

In [1]:
import tensorflow as tf
import numpy as np
tf.set_random_seed(777)  # for reproducibility

## Load Data

In [2]:
def read_training_data(fname, D=None):
    file = open(fname)
    params = ["radius", "texture", "perimeter","area","smoothness","compactness","concavity","concave points","symmetry","fractal dimension"];
    stats = ["(mean)", "(stderr)", "(worst)"]
    feature_labels = set([y+x for x in stats for y in params])
    feature_map = {params[i]+stats[j]:j*len(params)+i for i in range(len(params)) for j in range(len(stats))}
    
    patient_ids = []
    feature_vectors = []
    patient_diagnoses = []
    for line in file:
        row = line.split(",")
        patient_ID = int(row[0])
        patient_ids.append(patient_ID)
        
        patient_diagnoses.append([0 if row[1]=='B' else +1])
        feature_vectors.append([float(row[feature_map[key]+2]) for key in feature_labels])
    return patient_ids, feature_labels, np.array(feature_vectors, dtype=np.float32), np.array(patient_diagnoses, dtype=np.float32)

In [3]:
ids_train, labels_train, feature_vectors_train, patient_diagnoses_train = read_training_data('train.data')

In [4]:
print("label: ", labels_train)
print("feature_vectors: ", feature_vectors_train)
print("patient_diagnoses: ", patient_diagnoses_train)

label:  {'compactness(worst)', 'area(mean)', 'concave points(mean)', 'smoothness(mean)', 'perimeter(stderr)', 'fractal dimension(stderr)', 'perimeter(worst)', 'fractal dimension(mean)', 'area(stderr)', 'compactness(mean)', 'smoothness(worst)', 'compactness(stderr)', 'concavity(mean)', 'texture(mean)', 'concavity(worst)', 'concavity(stderr)', 'radius(mean)', 'symmetry(mean)', 'perimeter(mean)', 'texture(stderr)', 'concave points(stderr)', 'symmetry(stderr)', 'texture(worst)', 'symmetry(worst)', 'radius(stderr)', 'area(worst)', 'fractal dimension(worst)', 'smoothness(stderr)', 'concave points(worst)', 'radius(worst)'}
feature_vectors:  [[  6.65600002e-01   1.00100000e+03   1.47100002e-01 ...,   6.39900006e-03
    2.65399992e-01   2.53799992e+01]
 [  1.86600000e-01   1.32600000e+03   7.01700002e-02 ...,   5.22499997e-03
    1.86000004e-01   2.49899998e+01]
 [  4.24499989e-01   1.20300000e+03   1.27900004e-01 ...,   6.14999980e-03
    2.43000001e-01   2.35699997e+01]
 ..., 
 [  7.97400028e

In [5]:
x_data_train = feature_vectors_train
y_data_train = patient_diagnoses_train
print(x_data_train.shape, y_data_train.shape)
print(x_data_train, y_data_train)

(300, 30) (300, 1)
[[  6.65600002e-01   1.00100000e+03   1.47100002e-01 ...,   6.39900006e-03
    2.65399992e-01   2.53799992e+01]
 [  1.86600000e-01   1.32600000e+03   7.01700002e-02 ...,   5.22499997e-03
    1.86000004e-01   2.49899998e+01]
 [  4.24499989e-01   1.20300000e+03   1.27900004e-01 ...,   6.14999980e-03
    2.43000001e-01   2.35699997e+01]
 ..., 
 [  7.97400028e-02   4.31100006e+02   3.51499990e-02 ...,   5.59599977e-03
    7.15999976e-02   1.33599997e+01]
 [  2.16700003e-01   6.33099976e+02   1.37400003e-02 ...,   3.16899992e-03
    7.53000006e-02   1.62199993e+01]
 [  8.61399993e-02   3.34200012e+02   1.87500007e-02 ...,   1.01699997e-02
    3.12500000e-02   1.09300003e+01]] [[ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 0.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]


In [6]:
ids_test, labels_test, feature_vectors_test, patient_diagnoses_test = read_training_data('validate.data')

In [7]:
x_data_test = feature_vectors_test
y_data_test = patient_diagnoses_test
print(x_data_test.shape, y_data_test.shape)
print(x_data_test, y_data_test)

(260, 30) (260, 1)
[[  4.11599994e-01   1.21700000e+03   1.06200002e-01 ...,   6.05600001e-03
    1.97999999e-01   2.59300003e+01]
 [  2.15800002e-01   4.71299988e+02   3.09900008e-02 ...,   6.52999990e-03
    7.62500018e-02   1.34600000e+01]
 [  3.39100003e-01   1.24700000e+03   1.28000006e-01 ...,   7.96400011e-03
    1.92300007e-01   2.36800003e+01]
 ..., 
 [  7.15800002e-02   2.71299988e+02   0.00000000e+00 ...,   1.15900002e-02
    0.00000000e+00   1.04899998e+01]
 [  3.17099988e-01   6.57099976e+02   3.73600014e-02 ...,   4.24200017e-03
    1.10500000e-01   1.54799995e+01]
 [  2.51700014e-01   4.03500000e+02   4.10499983e-02 ...,   8.20000004e-03
    9.65299979e-02   1.24799995e+01]] [[ 1.]
 [ 0.]
 [ 1.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 1.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 1.]
 [ 0.]
 [ 1.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 1.]
 [ 0.]
 [ 1.]
 [ 0.]
 [ 1.]
 [ 0.]
 [ 0.]
 [ 0.]


## Non normalized

In [8]:
# placeholders for a tensor that will be always fed.
X = tf.placeholder(tf.float32, shape=[None, 30])
Y = tf.placeholder(tf.float32, shape=[None, 1])

W = tf.Variable(tf.random_normal([30, 1]), name='weight')
b = tf.Variable(tf.random_normal([1]), name='bias')

# Hypothesis using sigmoid: tf.div(1., 1. + tf.exp(tf.matmul(X, W)))
hypothesis = tf.sigmoid(tf.matmul(X, W) + b)

# cost/loss function
cost = -tf.reduce_mean(Y * tf.log(hypothesis) + (1 - Y) * tf.log(1 - hypothesis))

train = tf.train.GradientDescentOptimizer(learning_rate=0.01).minimize(cost)

# Accuracy computation
# True if hypothesis>0.5 else False
predicted = tf.cast(hypothesis > 0.5, dtype=tf.float32)
accuracy = tf.reduce_mean(tf.cast(tf.equal(predicted, Y), dtype=tf.float32))

# Launch graph
with tf.Session() as sess:
    # Initialize TensorFlow variables
    sess.run(tf.global_variables_initializer())

    for step in range(10001):
        cost_val, _ = sess.run([cost, train], feed_dict={X: x_data_train, Y: y_data_train})
        if step % 1000 == 0:
            print(step, cost_val)

    # Accuracy report
    a = sess.run(accuracy, feed_dict={X: x_data_train, Y: y_data_train})
    print("Train Accuracy: ", a)
    a = sess.run(accuracy, feed_dict={X: x_data_test, Y: y_data_test})
    print("Test Accuracy: ", a)

0 nan
1000 nan
2000 nan
3000 nan
4000 nan
5000 nan
6000 nan
7000 nan
8000 nan
9000 nan
10000 nan
Train Accuracy:  0.513333
Test Accuracy:  0.769231


## Normalized

In [9]:
def MinMaxScaler(data):
    numerator = data - np.min(data, 0)
    denominator = np.max(data, 0) - np.min(data, 0)
    # noise term prevents the zero division
    return numerator / (denominator + 1e-7)

In [10]:
x_data_train = MinMaxScaler(feature_vectors_train)
y_data_train = patient_diagnoses_train
print(x_data_train.shape, y_data_train.shape)
print(x_data_train, y_data_train)

(300, 30) (300, 1)
[[ 0.61929148  0.3640416   0.73111296 ...,  0.1592951   0.91202718
   0.6927352 ]
 [ 0.15456334  0.50201654  0.3487573  ...,  0.11938635  0.63917506
   0.67725289]
 [ 0.38537508  0.44979835  0.63568562 ...,  0.15083063  0.83505124
   0.62088132]
 ..., 
 [ 0.05088725  0.12209722  0.17470171 ...,  0.13199805  0.24604802
   0.21556173]
 [ 0.18376651  0.20785394  0.06829023 ...,  0.04949502  0.25876281
   0.32909882]
 [ 0.05709656  0.08095946  0.09319081 ...,  0.28748584  0.10738828
   0.1190949 ]] [[ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 0.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 0.]
 [ 1.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 1.]
 [ 1.]
 [ 0.]
 [ 1.]
 [ 1.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 1.]
 [ 0.]
 [ 1.]
 [ 1.]
 [ 0.]
 [ 0.]
 [ 0.

In [11]:
x_data_test = MinMaxScaler(feature_vectors_test)
y_data_test = patient_diagnoses_test
print(x_data_test.shape, y_data_test.shape)
print(x_data_test, y_data_test)

(260, 30) (260, 1)
[[ 0.41171205  0.44906887  0.55514866 ...,  0.17740576  0.71843231
   0.63050944]
 [ 0.19285063  0.12910837  0.16199678 ...,  0.20221847  0.27666903
   0.17476791]
 [ 0.33067298  0.46194109  0.66910577 ...,  0.27728483  0.69775021
   0.54827863]
 ..., 
 [ 0.03164436  0.04329357  0.         ...,  0.46709692  0.          0.0662232 ]
 [ 0.3060818   0.20883033  0.19529524 ...,  0.08244736  0.40094328
   0.24859291]
 [ 0.23297898  0.10001716  0.21458429 ...,  0.28963885  0.35025388
   0.13895179]] [[ 1.]
 [ 0.]
 [ 1.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 1.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 1.]
 [ 0.]
 [ 1.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 1.]
 [ 0.]
 [ 1.]
 [ 0.]
 [ 1.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 1.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 1.]
 [ 1.]
 [ 0.]
 [ 1.]


In [12]:
# placeholders for a tensor that will be always fed.
X = tf.placeholder(tf.float32, shape=[None, 30])
Y = tf.placeholder(tf.float32, shape=[None, 1])

W = tf.Variable(tf.random_normal([30, 1]), name='weight')
b = tf.Variable(tf.random_normal([1]), name='bias')

# Hypothesis using sigmoid: tf.div(1., 1. + tf.exp(tf.matmul(X, W)))
hypothesis = tf.sigmoid(tf.matmul(X, W) + b)

# cost/loss function
cost = -tf.reduce_mean(Y * tf.log(hypothesis) + (1 - Y) * tf.log(1 - hypothesis))

train = tf.train.GradientDescentOptimizer(learning_rate=0.01).minimize(cost)

# Accuracy computation
# True if hypothesis>0.5 else False
predicted = tf.cast(hypothesis > 0.5, dtype=tf.float32)
accuracy = tf.reduce_mean(tf.cast(tf.equal(predicted, Y), dtype=tf.float32))

# Launch graph
with tf.Session() as sess:
    # Initialize TensorFlow variables
    sess.run(tf.global_variables_initializer())

    for step in range(10001):
        cost_val, _ = sess.run([cost, train], feed_dict={X: x_data_train, Y: y_data_train})
        if step % 1000 == 0:
            print(step, cost_val)

    # Accuracy report
    a = sess.run(accuracy, feed_dict={X: x_data_train, Y: y_data_train})
    print("Train Accuracy: ", a)
    a = sess.run(accuracy, feed_dict={X: x_data_test, Y: y_data_test})
    print("Test Accuracy: ", a)

0 0.80712
1000 0.457697
2000 0.380548
3000 0.333983
4000 0.302688
5000 0.28004
6000 0.262765
7000 0.249065
8000 0.23787
9000 0.228504
10000 0.220517
Train Accuracy:  0.946667
Test Accuracy:  0.942308


## Multi Layers

In [13]:
# placeholders for a tensor that will be always fed.
X = tf.placeholder(tf.float32, shape=[None, 30])
Y = tf.placeholder(tf.float32, shape=[None, 1])

W1 = tf.Variable(tf.random_normal([30, 100]), name='weight1')
b1 = tf.Variable(tf.random_normal([100]), name='bias1')
layer1 = tf.sigmoid(tf.matmul(X, W1) + b1)

W2 = tf.Variable(tf.random_normal([100, 100]), name='weight2')
b2 = tf.Variable(tf.random_normal([100]), name='bias2')
layer2 = tf.sigmoid(tf.matmul(layer1, W2) + b2)

W3 = tf.Variable(tf.random_normal([100, 100]), name='weight3')
b3 = tf.Variable(tf.random_normal([100]), name='bias3')
layer3 = tf.sigmoid(tf.matmul(layer2, W3) + b3)

W4 = tf.Variable(tf.random_normal([100, 1]), name='weight4')
b4 = tf.Variable(tf.random_normal([1]), name='bias4')
hypothesis = tf.sigmoid(tf.matmul(layer3, W4) + b4)


# cost/loss function
cost = -tf.reduce_mean(Y * tf.log(hypothesis) + (1 - Y) * tf.log(1 - hypothesis))

train = tf.train.GradientDescentOptimizer(learning_rate=0.01).minimize(cost)

# Accuracy computation
# True if hypothesis>0.5 else False
predicted = tf.cast(hypothesis > 0.5, dtype=tf.float32)
accuracy = tf.reduce_mean(tf.cast(tf.equal(predicted, Y), dtype=tf.float32))

# Launch graph
with tf.Session() as sess:
    # Initialize TensorFlow variables
    sess.run(tf.global_variables_initializer())

    for step in range(10001):
        cost_val, _ = sess.run([cost, train], feed_dict={X: x_data_train, Y: y_data_train})
        if step % 1000 == 0:
            print(step, cost_val)

    # Accuracy report
    a = sess.run(accuracy, feed_dict={X: x_data_train, Y: y_data_train})
    print("Train Accuracy: ", a)
    a = sess.run(accuracy, feed_dict={X: x_data_test, Y: y_data_test})
    print("Test Accuracy: ", a)

0 8.63053
1000 0.138003
2000 0.108111
3000 0.0949948
4000 0.0868118
5000 0.0808928
6000 0.0761045
7000 0.0718798
8000 0.0682603
9000 0.0653758
10000 0.0629778
Train Accuracy:  0.98
Test Accuracy:  0.973077
