# Lab: Machine Learning

## Setup

In [1]:
import tensorflow as tf
import numpy as np
tf.set_random_seed(777)  # for reproducibility

## Load Data

In [11]:
def read_training_data(fname, D=None):
    file = open(fname)
    params = ["radius", "texture", "perimeter","area","smoothness","compactness","concavity","concave points","symmetry","fractal dimension"];
    stats = ["(mean)", "(stderr)", "(worst)"]
    feature_labels = set([y+x for x in stats for y in params])
    feature_map = {params[i]+stats[j]:j*len(params)+i for i in range(len(params)) for j in range(len(stats))}
    
    patient_ids = []
    feature_vectors = []
    patient_diagnoses = []
    for line in file:
        row = line.split(",")
        patient_ID = int(row[0])
        patient_ids.append(patient_ID)
        
        patient_diagnoses.append([0 if row[1]=='B' else +1])
        feature_vectors.append([float(row[feature_map[key]+2]) for key in feature_labels])
    return patient_ids, feature_labels, np.array(feature_vectors, dtype=np.float32), np.array(patient_diagnoses, dtype=np.float32)

In [15]:
ids_train, labels_train, feature_vectors_train, patient_diagnoses_train = read_training_data('train.data')

In [16]:
print("label: ", labels_train)
print("feature_vectors: ", feature_vectors_train)
print("patient_diagnoses: ", patient_diagnoses_train)

label:  {'radius(stderr)', 'radius(worst)', 'texture(worst)', 'fractal dimension(mean)', 'perimeter(stderr)', 'concave points(stderr)', 'symmetry(mean)', 'concave points(worst)', 'symmetry(stderr)', 'concavity(mean)', 'radius(mean)', 'perimeter(worst)', 'smoothness(mean)', 'smoothness(worst)', 'texture(mean)', 'fractal dimension(worst)', 'area(worst)', 'texture(stderr)', 'area(mean)', 'concavity(stderr)', 'symmetry(worst)', 'compactness(mean)', 'concavity(worst)', 'compactness(stderr)', 'compactness(worst)', 'smoothness(stderr)', 'fractal dimension(stderr)', 'concave points(mean)', 'area(stderr)', 'perimeter(mean)'}
feature_vectors:  [[  1.09500003e+00   2.53799992e+01   1.73299999e+01 ...,   1.47100002e-01
    1.53399994e+02   1.22800003e+02]
 [  5.43500006e-01   2.49899998e+01   2.34099998e+01 ...,   7.01700002e-02
    7.40800018e+01   1.32899994e+02]
 [  7.45599985e-01   2.35699997e+01   2.55300007e+01 ...,   1.27900004e-01
    9.40299988e+01   1.30000000e+02]
 ..., 
 [  6.44999981e

In [17]:
x_data_train = feature_vectors_train
y_data_train = patient_diagnoses_train
print(x_data_train.shape, y_data_train.shape)
print(x_data_train, y_data_train)

(300, 30) (300, 1)
[[  1.09500003e+00   2.53799992e+01   1.73299999e+01 ...,   1.47100002e-01
    1.53399994e+02   1.22800003e+02]
 [  5.43500006e-01   2.49899998e+01   2.34099998e+01 ...,   7.01700002e-02
    7.40800018e+01   1.32899994e+02]
 [  7.45599985e-01   2.35699997e+01   2.55300007e+01 ...,   1.27900004e-01
    9.40299988e+01   1.30000000e+02]
 ..., 
 [  6.44999981e-01   1.33599997e+01   2.33899994e+01 ...,   3.51499990e-02
    4.91100006e+01   7.50000000e+01]
 [  2.30000004e-01   1.62199993e+01   2.52600002e+01 ...,   1.37400003e-02
    2.05599995e+01   9.12200012e+01]
 [  2.86799997e-01   1.09300003e+01   2.42199993e+01 ...,   1.87500007e-02
    2.05599995e+01   6.68499985e+01]] [[ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 0.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]


In [18]:
ids_test, labels_test, feature_vectors_test, patient_diagnoses_test = read_training_data('validate.data')

In [19]:
x_data_test = feature_vectors_test
y_data_test = patient_diagnoses_test
print(x_data_test.shape, y_data_test.shape)
print(x_data_test, y_data_test)

(260, 30) (260, 1)
[[  1.11099994e+00   2.59300003e+01   2.62399998e+01 ...,   1.06200002e-01
    1.33000000e+02   1.29500000e+02]
 [  3.64199996e-01   1.34600000e+01   2.30699997e+01 ...,   3.09900008e-02
    2.83199997e+01   8.04300003e+01]
 [  1.07200003e+00   2.36800003e+01   2.94300003e+01 ...,   1.28000006e-01
    1.30800003e+02   1.34699997e+02]
 ..., 
 [  5.37500024e-01   1.04899998e+01   3.42400017e+01 ...,   0.00000000e+00
    2.91100006e+01   5.92599983e+01]
 [  2.25400001e-01   1.54799995e+01   2.72700005e+01 ...,   3.73600014e-02
    1.95400009e+01   9.63899994e+01]
 [  2.38800004e-01   1.24799995e+01   3.71599998e+01 ...,   4.10499983e-02
    1.69699993e+01   7.45199966e+01]] [[ 1.]
 [ 0.]
 [ 1.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 1.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 1.]
 [ 0.]
 [ 1.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 1.]
 [ 0.]
 [ 1.]
 [ 0.]
 [ 1.]
 [ 0.]
 [ 0.]
 [ 0.]


## Non normalized

In [20]:
# placeholders for a tensor that will be always fed.
X = tf.placeholder(tf.float32, shape=[None, 30])
Y = tf.placeholder(tf.float32, shape=[None, 1])

W = tf.Variable(tf.random_normal([30, 1]), name='weight')
b = tf.Variable(tf.random_normal([1]), name='bias')

# Hypothesis using sigmoid: tf.div(1., 1. + tf.exp(tf.matmul(X, W)))
hypothesis = tf.sigmoid(tf.matmul(X, W) + b)

# cost/loss function
cost = -tf.reduce_mean(Y * tf.log(hypothesis) + (1 - Y) * tf.log(1 - hypothesis))

train = tf.train.GradientDescentOptimizer(learning_rate=0.01).minimize(cost)

# Accuracy computation
# True if hypothesis>0.5 else False
predicted = tf.cast(hypothesis > 0.5, dtype=tf.float32)
accuracy = tf.reduce_mean(tf.cast(tf.equal(predicted, Y), dtype=tf.float32))

# Launch graph
with tf.Session() as sess:
    # Initialize TensorFlow variables
    sess.run(tf.global_variables_initializer())

    for step in range(10001):
        cost_val, _ = sess.run([cost, train], feed_dict={X: x_data_train, Y: y_data_train})
        if step % 200 == 0:
            print(step, cost_val)

    # Accuracy report
    a = sess.run(accuracy, feed_dict={X: x_data_train, Y: y_data_train})
    print("Train Accuracy: ", a)
    a = sess.run(accuracy, feed_dict={X: x_data_test, Y: y_data_test})
    print("Test Accuracy: ", a)

0 nan
200 nan
400 nan
600 nan
800 nan
1000 nan
1200 nan
1400 nan
1600 nan
1800 nan
2000 nan
2200 nan
2400 nan
2600 nan
2800 nan
3000 nan
3200 nan
3400 nan
3600 nan
3800 nan
4000 nan
4200 nan
4400 nan
4600 nan
4800 nan
5000 nan
5200 nan
5400 nan
5600 nan
5800 nan
6000 nan
6200 nan
6400 nan
6600 nan
6800 nan
7000 nan
7200 nan
7400 nan
7600 nan
7800 nan
8000 nan
8200 nan
8400 nan
8600 nan
8800 nan
9000 nan
9200 nan
9400 nan
9600 nan
9800 nan
10000 nan
Train Accuracy:  0.513333
Test Accuracy:  0.769231


## Normalized

In [7]:
def MinMaxScaler(data):
    numerator = data - np.min(data, 0)
    denominator = np.max(data, 0) - np.min(data, 0)
    # noise term prevents the zero division
    return numerator / (denominator + 1e-7)

In [8]:
x_data_train = MinMaxScaler(feature_vectors_train)
y_data_train = patient_diagnoses_train
print(x_data_train.shape, y_data.shape)
print(x_data, y_data)

(300, 30) (300, 1)
[[ 0.35525984  0.6927352   0.14152451 ...,  0.73111296  0.28062698
   0.54598856]
 [ 0.15527433  0.67725289  0.3035714  ...,  0.3487573   0.12732053
   0.61578327]
 [ 0.22856003  0.62088132  0.36007464 ...,  0.63568562  0.16587907
   0.59574324]
 ..., 
 [ 0.19208035  0.21556173  0.30303836 ...,  0.17470171  0.07905953
   0.21567273]
 [ 0.04159264  0.32909882  0.35287845 ...,  0.06829023  0.02387924
   0.327759  ]
 [ 0.0621895   0.1190949   0.32515988 ...,  0.09319081  0.02387924
   0.15935318]] [[ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 0.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 1.]
 [ 0.]
 [ 1.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 1.]
 [ 1.]
 [ 0.]
 [ 1.]
 [ 1.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 0.]
 [ 1.]
 [ 0.]
 [ 1.]
 [ 1.]
 [ 0.]
 [ 0.]
 [ 0.

In [10]:
# placeholders for a tensor that will be always fed.
X = tf.placeholder(tf.float32, shape=[None, 30])
Y = tf.placeholder(tf.float32, shape=[None, 1])

W = tf.Variable(tf.random_normal([30, 1]), name='weight')
b = tf.Variable(tf.random_normal([1]), name='bias')

# Hypothesis using sigmoid: tf.div(1., 1. + tf.exp(tf.matmul(X, W)))
hypothesis = tf.sigmoid(tf.matmul(X, W) + b)

# cost/loss function
cost = -tf.reduce_mean(Y * tf.log(hypothesis) + (1 - Y) * tf.log(1 - hypothesis))

train = tf.train.GradientDescentOptimizer(learning_rate=0.01).minimize(cost)

# Accuracy computation
# True if hypothesis>0.5 else False
predicted = tf.cast(hypothesis > 0.5, dtype=tf.float32)
accuracy = tf.reduce_mean(tf.cast(tf.equal(predicted, Y), dtype=tf.float32))

# Launch graph
with tf.Session() as sess:
    # Initialize TensorFlow variables
    sess.run(tf.global_variables_initializer())

    for step in range(10001):
        cost_val, _ = sess.run([cost, train], feed_dict={X: x_data, Y: y_data})
        if step % 200 == 0:
            print(step, cost_val)

    # Accuracy report
    a = sess.run(accuracy, feed_dict={X: x_data, Y: y_data})
    print("Accuracy: ", a)

0 1.32704
200 0.812567
400 0.729623
600 0.676384
800 0.631587
1000 0.593282
1200 0.560325
1400 0.531777
1600 0.506871
1800 0.484988
2000 0.465626
2200 0.448383
2400 0.432932
2600 0.419006
2800 0.40639
3000 0.394903
3200 0.384396
3400 0.374747
3600 0.365849
3800 0.357616
4000 0.349971
4200 0.342852
4400 0.336202
4600 0.329975
4800 0.324129
5000 0.318627
5200 0.313438
5400 0.308535
5600 0.303892
5800 0.299488
6000 0.295303
6200 0.29132
6400 0.287524
6600 0.283901
6800 0.280438
7000 0.277123
7200 0.273948
7400 0.270901
7600 0.267975
7800 0.265162
8000 0.262456
8200 0.259848
8400 0.257335
8600 0.254909
8800 0.252567
9000 0.250303
9200 0.248113
9400 0.245994
9600 0.243941
9800 0.241951
10000 0.240021
Accuracy:  0.933333
