# Lab: Machine Learning

## Setup

In [1]:
import tensorflow as tf
import numpy as np
tf.set_random_seed(777)  # for reproducibility

## Load Data

In [2]:
def read_training_data(fname, D=None):
    file = open(fname)
    params = ["radius", "texture", "perimeter","area","smoothness","compactness","concavity","concave points","symmetry","fractal dimension"];
    stats = ["(mean)", "(stderr)", "(worst)"]
    feature_labels = set([y+x for x in stats for y in params])
    feature_map = {params[i]+stats[j]:j*len(params)+i for i in range(len(params)) for j in range(len(stats))}
    
    patient_ids = []
    feature_vectors = []
    patient_diagnoses = []
    for line in file:
        row = line.split(",")
        patient_ID = int(row[0])
        patient_ids.append(patient_ID)
        
        patient_diagnoses.append([0 if row[1]=='B' else +1])
        feature_vectors.append([float(row[feature_map[key]+2]) for key in feature_labels])
    return patient_ids, feature_labels, np.array(feature_vectors, dtype=np.float32), np.array(patient_diagnoses, dtype=np.float32)

In [3]:
ids_train, labels_train, feature_vectors_train, patient_diagnoses_train = read_training_data('train.data')

In [4]:
print("label: ", labels_train)
print("feature_vectors: ", feature_vectors_train)
print("patient_diagnoses: ", patient_diagnoses_train)

label:  {'symmetry(mean)', 'concavity(mean)', 'fractal dimension(worst)', 'concave points(worst)', 'compactness(stderr)', 'compactness(worst)', 'concave points(mean)', 'perimeter(mean)', 'symmetry(stderr)', 'texture(mean)', 'fractal dimension(mean)', 'radius(stderr)', 'area(worst)', 'symmetry(worst)', 'concavity(stderr)', 'smoothness(worst)', 'fractal dimension(stderr)', 'radius(mean)', 'area(stderr)', 'concavity(worst)', 'perimeter(stderr)', 'area(mean)', 'texture(stderr)', 'compactness(mean)', 'concave points(stderr)', 'smoothness(stderr)', 'texture(worst)', 'smoothness(mean)', 'perimeter(worst)', 'radius(worst)'}
feature_vectors:  [[  2.41899997e-01   3.00099999e-01   1.18900001e-01 ...,   1.18400000e-01
    1.84600006e+02   2.53799992e+01]
 [  1.81199998e-01   8.69000033e-02   8.90199989e-02 ...,   8.47399980e-02
    1.58800003e+02   2.49899998e+01]
 [  2.06900001e-01   1.97400004e-01   8.75800028e-02 ...,   1.09600000e-01
    1.52500000e+02   2.35699997e+01]
 ..., 
 [  1.61899999e

In [5]:
x_data_train = feature_vectors_train
y_data_train = patient_diagnoses_train
print(x_data_train.shape, y_data_train.shape)
print(x_data_train, y_data_train)

(300, 30) (300, 1)
[[  2.41899997e-01   3.00099999e-01   1.18900001e-01 ...,   1.18400000e-01
    1.84600006e+02   2.53799992e+01]
 [  1.81199998e-01   8.69000033e-02   8.90199989e-02 ...,   8.47399980e-02
    1.58800003e+02   2.49899998e+01]
 [  2.06900001e-01   1.97400004e-01   8.75800028e-02 ...,   1.09600000e-01
    1.52500000e+02   2.35699997e+01]
 ..., 
 [  1.61899999e-01   2.68500000e-02   6.91500008e-02 ...,   9.96799991e-02
    8.50999985e+01   1.33599997e+01]
 [  1.63499996e-01   2.47499999e-02   7.67600015e-02 ...,   6.57600015e-02
    1.05800003e+02   1.62199993e+01]
 [  1.69499993e-01   2.49499995e-02   6.77699968e-02 ...,   1.01499997e-01
    7.00999985e+01   1.09300003e+01]] [[ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 0.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]


In [6]:
ids_test, labels_test, feature_vectors_test, patient_diagnoses_test = read_training_data('validate.data')

In [7]:
x_data_test = feature_vectors_test
y_data_test = patient_diagnoses_test
print(x_data_test.shape, y_data_test.shape)
print(x_data_test, y_data_test)

(260, 30) (260, 1)
[[  1.79199994e-01   2.19699994e-01   9.92899984e-02 ...,   1.15000002e-01
    1.71100006e+02   2.59300003e+01]
 [  1.78100005e-01   6.83000013e-02   7.76399970e-02 ...,   8.45099986e-02
    8.81299973e+01   1.34600000e+01]
 [  2.24900007e-01   2.28300005e-01   9.46900025e-02 ...,   1.08000003e-01
    1.58800003e+02   2.36800003e+01]
 ..., 
 [  1.74199998e-01   0.00000000e+00   6.96899965e-02 ...,   8.12299997e-02
    6.65000000e+01   1.04899998e+01]
 [  1.45400003e-01   1.02899998e-01   8.00400004e-02 ...,   8.47299993e-02
    1.05900002e+02   1.54799995e+01]
 [  1.38799995e-01   1.11199997e-01   8.73199999e-02 ...,   9.26100016e-02
    8.22799988e+01   1.24799995e+01]] [[ 1.]
 [ 0.]
 [ 1.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 1.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 1.]
 [ 0.]
 [ 1.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 1.]
 [ 0.]
 [ 1.]
 [ 0.]
 [ 1.]
 [ 0.]
 [ 0.]
 [ 0.]


## Non normalized

In [8]:
# placeholders for a tensor that will be always fed.
X = tf.placeholder(tf.float32, shape=[None, 30])
Y = tf.placeholder(tf.float32, shape=[None, 1])

W = tf.Variable(tf.random_normal([30, 1]), name='weight')
b = tf.Variable(tf.random_normal([1]), name='bias')

# Hypothesis using sigmoid: tf.div(1., 1. + tf.exp(tf.matmul(X, W)))
hypothesis = tf.sigmoid(tf.matmul(X, W) + b)

# cost/loss function
cost = -tf.reduce_mean(Y * tf.log(hypothesis) + (1 - Y) * tf.log(1 - hypothesis))

train = tf.train.GradientDescentOptimizer(learning_rate=0.01).minimize(cost)

# Accuracy computation
# True if hypothesis>0.5 else False
predicted = tf.cast(hypothesis > 0.5, dtype=tf.float32)
accuracy = tf.reduce_mean(tf.cast(tf.equal(predicted, Y), dtype=tf.float32))

# Launch graph
with tf.Session() as sess:
    # Initialize TensorFlow variables
    sess.run(tf.global_variables_initializer())

    for step in range(10001):
        cost_val, _ = sess.run([cost, train], feed_dict={X: x_data_train, Y: y_data_train})
        if step % 200 == 0:
            print(step, cost_val)

    # Accuracy report
    a = sess.run(accuracy, feed_dict={X: x_data_train, Y: y_data_train})
    print("Train Accuracy: ", a)
    a = sess.run(accuracy, feed_dict={X: x_data_test, Y: y_data_test})
    print("Test Accuracy: ", a)

0 nan
200 nan
400 nan
600 nan
800 nan
1000 nan
1200 nan
1400 nan
1600 nan
1800 nan
2000 nan
2200 nan
2400 nan
2600 nan
2800 nan
3000 nan
3200 nan
3400 nan
3600 nan
3800 nan
4000 nan
4200 nan
4400 nan
4600 nan
4800 nan
5000 nan
5200 nan
5400 nan
5600 nan
5800 nan
6000 nan
6200 nan
6400 nan
6600 nan
6800 nan
7000 nan
7200 nan
7400 nan
7600 nan
7800 nan
8000 nan
8200 nan
8400 nan
8600 nan
8800 nan
9000 nan
9200 nan
9400 nan
9600 nan
9800 nan
10000 nan
Train Accuracy:  0.513333
Test Accuracy:  0.769231


## Normalized

In [9]:
def MinMaxScaler(data):
    numerator = data - np.min(data, 0)
    denominator = np.max(data, 0) - np.min(data, 0)
    # noise term prevents the zero division
    return numerator / (denominator + 1e-7)

In [10]:
x_data_train = MinMaxScaler(feature_vectors_train)
y_data_train = patient_diagnoses_train
print(x_data_train.shape, y_data_train.shape)
print(x_data_train, y_data_train)

(300, 30) (300, 1)
[[ 0.668446    0.70313948  0.41886368 ...,  0.68000889  0.78754622
   0.6927352 ]
 [ 0.34436712  0.2036082   0.22287799 ...,  0.2704705   0.6361289
   0.67725289]
 [ 0.48158008  0.4625116   0.21343292 ...,  0.57294005  0.59915489
   0.62088132]
 ..., 
 [ 0.24132393  0.06291001  0.09254882 ...,  0.45224422  0.20359175
   0.21556173]
 [ 0.24986637  0.05798968  0.14246352 ...,  0.03954251  0.32507777
   0.32909882]
 [ 0.2819005   0.05845828  0.08349725 ...,  0.474388    0.11555842
   0.1190949 ]] [[ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 0.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 0.]
 [ 1.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 1.]
 [ 1.]
 [ 0.]
 [ 1.]
 [ 1.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 1.]
 [ 0.]
 [ 1.]
 [ 1.]
 [ 0.]
 [ 0.]
 [ 0.]

In [11]:
x_data_test = MinMaxScaler(feature_vectors_test)
y_data_test = patient_diagnoses_test
print(x_data_test.shape, y_data_test.shape)
print(x_data_test, y_data_test)

(260, 30) (260, 1)
[[ 0.43118548  0.60440147  0.51803905 ...,  0.50241554  0.59280163
   0.63050944]
 [ 0.42313287  0.18789542  0.26360288 ...,  0.18895839  0.17101315
   0.17476791]
 [ 0.76573873  0.6280604   0.46397877 ...,  0.43045098  0.53027302
   0.54827863]
 ..., 
 [ 0.39458233  0.          0.1701725  ...,  0.15523787  0.06105434
   0.0662232 ]
 [ 0.18374801  0.28308108  0.29180831 ...,  0.19122015  0.2613492
   0.24859291]
 [ 0.13543175  0.30591464  0.37736467 ...,  0.2722317   0.14127395
   0.13895179]] [[ 1.]
 [ 0.]
 [ 1.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 1.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 1.]
 [ 0.]
 [ 1.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 1.]
 [ 0.]
 [ 1.]
 [ 0.]
 [ 1.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 1.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 1.]
 [ 1.]
 [ 0.]
 [ 1.]

In [12]:
# placeholders for a tensor that will be always fed.
X = tf.placeholder(tf.float32, shape=[None, 30])
Y = tf.placeholder(tf.float32, shape=[None, 1])

W = tf.Variable(tf.random_normal([30, 1]), name='weight')
b = tf.Variable(tf.random_normal([1]), name='bias')

# Hypothesis using sigmoid: tf.div(1., 1. + tf.exp(tf.matmul(X, W)))
hypothesis = tf.sigmoid(tf.matmul(X, W) + b)

# cost/loss function
cost = -tf.reduce_mean(Y * tf.log(hypothesis) + (1 - Y) * tf.log(1 - hypothesis))

train = tf.train.GradientDescentOptimizer(learning_rate=0.01).minimize(cost)

# Accuracy computation
# True if hypothesis>0.5 else False
predicted = tf.cast(hypothesis > 0.5, dtype=tf.float32)
accuracy = tf.reduce_mean(tf.cast(tf.equal(predicted, Y), dtype=tf.float32))

# Launch graph
with tf.Session() as sess:
    # Initialize TensorFlow variables
    sess.run(tf.global_variables_initializer())

    for step in range(10001):
        cost_val, _ = sess.run([cost, train], feed_dict={X: x_data_train, Y: y_data_train})
        if step % 200 == 0:
            print(step, cost_val)

    # Accuracy report
    a = sess.run(accuracy, feed_dict={X: x_data_train, Y: y_data_train})
    print("Train Accuracy: ", a)
    a = sess.run(accuracy, feed_dict={X: x_data_test, Y: y_data_test})
    print("Test Accuracy: ", a)

0 0.990356
200 0.720044
400 0.659994
600 0.614962
800 0.576458
1000 0.543203
1200 0.514316
1400 0.489071
1600 0.466872
1800 0.447232
2000 0.429756
2200 0.414118
2400 0.40005
2600 0.387334
2800 0.375787
3000 0.365256
3200 0.355613
3400 0.34675
3600 0.338577
3800 0.331014
4000 0.323996
4200 0.317463
4400 0.311367
4600 0.305663
4800 0.300315
5000 0.295288
5200 0.290554
5400 0.286086
5600 0.281862
5800 0.277862
6000 0.274067
6200 0.270461
6400 0.267029
6600 0.263759
6800 0.260638
7000 0.257657
7200 0.254804
7400 0.252072
7600 0.249452
7800 0.246937
8000 0.24452
8200 0.242196
8400 0.239958
8600 0.237801
8800 0.235722
9000 0.233714
9200 0.231775
9400 0.2299
9600 0.228086
9800 0.22633
10000 0.224629
Train Accuracy:  0.96
Test Accuracy:  0.961538
