# Lab: Machine Learning

## Setup

In [1]:
import tensorflow as tf
import numpy as np
tf.set_random_seed(777)  # for reproducibility

## Load Data

In [31]:
def read_training_data(fname, D=None):
    file = open(fname)
    params = ["radius", "texture", "perimeter","area","smoothness","compactness","concavity","concave points","symmetry","fractal dimension"];
    stats = ["(mean)", "(stderr)", "(worst)"]
    feature_labels = set([y+x for x in stats for y in params])
    feature_map = {params[i]+stats[j]:j*len(params)+i for i in range(len(params)) for j in range(len(stats))}
    
    patient_ids = []
    feature_vectors = []
    patient_diagnoses = []
    for line in file:
        row = line.split(",")
        patient_ID = int(row[0])
        patient_ids.append(patient_ID)
        
        patient_diagnoses.append([0 if row[1]=='B' else +1])
        feature_vectors.append([float(row[feature_map[key]+2]) for key in feature_labels])
    return patient_ids, feature_labels, np.array(feature_vectors, dtype=np.float32), np.array(patient_diagnoses, dtype=np.float32)

In [32]:
ids, labels, feature_vectors, patient_diagnoses = read_training_data('train.data')

In [33]:
print("label: ", labels)
print("feature_vectors: ", feature_vectors)
print("patient_diagnoses: ", patient_diagnoses)

label:  {'symmetry(worst)', 'radius(mean)', 'radius(stderr)', 'area(worst)', 'concavity(mean)', 'radius(worst)', 'compactness(stderr)', 'smoothness(stderr)', 'fractal dimension(worst)', 'smoothness(worst)', 'concave points(stderr)', 'symmetry(stderr)', 'smoothness(mean)', 'area(stderr)', 'symmetry(mean)', 'fractal dimension(stderr)', 'texture(mean)', 'fractal dimension(mean)', 'compactness(worst)', 'area(mean)', 'perimeter(stderr)', 'concave points(worst)', 'perimeter(worst)', 'concave points(mean)', 'compactness(mean)', 'concavity(stderr)', 'texture(worst)', 'concavity(worst)', 'perimeter(mean)', 'texture(stderr)'}
feature_vectors:  [[  4.60099995e-01   1.79899998e+01   1.09500003e+00 ...,   7.11899996e-01
    1.22800003e+02   9.05300021e-01]
 [  2.75000006e-01   2.05699997e+01   5.43500006e-01 ...,   2.41600007e-01
    1.32899994e+02   7.33900011e-01]
 [  3.61299992e-01   1.96900005e+01   7.45599985e-01 ...,   4.50399995e-01
    1.30000000e+02   7.86899984e-01]
 ..., 
 [  1.97799996e

In [36]:
x_data = feature_vectors
y_data = patient_diagnoses
print(x_data.shape, y_data.shape)
print(x_data, y_data)

(300, 30) (300, 1)
[[  4.60099995e-01   1.79899998e+01   1.09500003e+00 ...,   7.11899996e-01
    1.22800003e+02   9.05300021e-01]
 [  2.75000006e-01   2.05699997e+01   5.43500006e-01 ...,   2.41600007e-01
    1.32899994e+02   7.33900011e-01]
 [  3.61299992e-01   1.96900005e+01   7.45599985e-01 ...,   4.50399995e-01
    1.30000000e+02   7.86899984e-01]
 ..., 
 [  1.97799996e-01   1.17600002e+01   6.44999981e-01 ...,   6.12000003e-02
    7.50000000e+01   2.10500002e+00]
 [  2.63599992e-01   1.42600002e+01   2.30000004e-01 ...,   1.56499997e-01
    9.12200012e+01   6.69000030e-01]
 [  2.22700000e-01   1.05100002e+01   2.86799997e-01 ...,   4.15799990e-02
    6.68499985e+01   1.14300001e+00]] [[ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 0.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]


In [47]:
# placeholders for a tensor that will be always fed.
X = tf.placeholder(tf.float32, shape=[None, 30])
Y = tf.placeholder(tf.float32, shape=[None, 1])

W = tf.Variable(tf.random_normal([30, 1]), name='weight')
b = tf.Variable(tf.random_normal([1]), name='bias')

# Hypothesis using sigmoid: tf.div(1., 1. + tf.exp(tf.matmul(X, W)))
hypothesis = tf.sigmoid(tf.matmul(X, W) + b)

# cost/loss function
cost = -tf.reduce_mean(Y * tf.log(hypothesis) + (1 - Y) *
                       tf.log(1 - hypothesis))

train = tf.train.GradientDescentOptimizer(learning_rate=0.01).minimize(cost)

# Accuracy computation
# True if hypothesis>0.5 else False
predicted = tf.cast(hypothesis > 0.5, dtype=tf.float32)
accuracy = tf.reduce_mean(tf.cast(tf.equal(predicted, Y), dtype=tf.float32))

# Launch graph
with tf.Session() as sess:
    # Initialize TensorFlow variables
    sess.run(tf.global_variables_initializer())

#     for step in range(10001):
#         cost_val, _ = sess.run([cost, train], feed_dict={X: x_data, Y: y_data})
#         if step % 200 == 0:
#             print(step, cost_val)
        
    for step in range(10):
        print(sess.run(W))
        print(sess.run(hypothesis, feed_dict={X: x_data, Y: y_data}))
        print(sess.run(Y * tf.log(hypothesis) + (1 - Y) * tf.log(1 - hypothesis), feed_dict={X: x_data, Y: y_data}))
        cost_val, _ = sess.run([cost, train], feed_dict={X: x_data, Y: y_data})
        print(sess.run(W))
        print(step, cost_val)

    # Accuracy report
    h, c, a = sess.run([hypothesis, predicted, accuracy],
                       feed_dict={X: x_data, Y: y_data})
    print("\nHypothesis: ", h, "\nCorrect (Y): ", c, "\nAccuracy: ", a)

[[-0.99579185]
 [ 0.92669129]
 [-0.46672109]
 [ 1.32939219]
 [-0.19609638]
 [ 0.15245219]
 [ 0.40904137]
 [-1.4403795 ]
 [-0.40789711]
 [-0.98190165]
 [-0.7080192 ]
 [ 0.81205606]
 [ 0.18874829]
 [-1.90909159]
 [-1.63768077]
 [ 1.89433742]
 [-0.25371096]
 [ 0.49597964]
 [ 0.61322391]
 [-0.7822234 ]
 [-1.50169277]
 [-1.52879131]
 [ 0.43449357]
 [-0.80352122]
 [ 0.11317345]
 [-1.98990202]
 [-0.37527871]
 [-0.01020248]
 [ 0.37048405]
 [-0.63944954]]
[[ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 