In [37]:
import numpy as np
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt

In [1]:


raw_csv_data = np.loadtxt('Audiobooks-data (2).csv', delimiter = ',')

unscaled_inputs_all = raw_csv_data[:,1:-1]
targets_all = raw_csv_data[:,-1]

In [2]:
#Balance the Dataset- count all targets as '1'
# Keep as many '0s' that are '1s'.

num_one_targets = int(np.sum(targets_all))
zero_targets_counter = 0
indices_to_remove = []

for i in range(targets_all.shape[0]):
    if targets_all[i]== 0:
        zero_targets_counter += 1
        if zero_targets_counter > num_one_targets:
            indices_to_remove.append(i)
            
unscaled_inputs_equal_priors = np.delete(unscaled_inputs_all, indices_to_remove, axis = 0)
targets_equal_priors = np.delete(targets_all, indices_to_remove, axis = 0)    



In [3]:
# STANDARDIZE THE INPUTS

scaled_inputs = preprocessing.scale(unscaled_inputs_equal_priors)




In [4]:
# SHUFFLE INPUTS FOR THE BATCHING METHOD CAN WORK. We don't want them in order, 
#Shuffling data serves the purpose of reducing variance and making sure that models remain general and overfit less.

shuffled_indices = np.arange(scaled_inputs.shape[0])
np.random.shuffle(shuffled_indices)

shuffled_inputs = scaled_inputs[shuffled_indices]
shuffled_targets = targets_equal_priors[shuffled_indices]


In [5]:
samples_count = shuffled_inputs.shape[0]

train_samples_count = int(0.8*samples_count)
validation_samples_count = int(0.1*samples_count)
test_samples_count = samples_count - train_samples_count - validation_samples_count

train_inputs = shuffled_inputs[:train_samples_count]
train_targets = shuffled_targets[:train_samples_count]


validation_inputs = shuffled_inputs[train_samples_count:train_samples_count+validation_samples_count]
validation_targets = shuffled_targets[train_samples_count:train_samples_count+validation_samples_count]

test_inputs = shuffled_inputs[train_samples_count+validation_samples_count:]
test_targets = shuffled_targets[train_samples_count+validation_samples_count:]

print(np.sum(train_targets), train_samples_count, np.sum(train_targets)/train_samples_count)
print(np.sum(validation_targets), validation_samples_count, np.sum(validation_targets)/validation_samples_count)
print(np.sum(test_targets), test_samples_count, np.sum(test_targets)/test_samples_count)



1800.0 3579 0.5029337803855826
216.0 447 0.48322147651006714
221.0 448 0.49330357142857145


In [6]:
np.savez('Audiobooks_data_train', inputs=train_inputs, targets=train_targets)
np.savez('Audiobooks_data_validation', inputs=validation_inputs,targets=validation_targets)
np.savez('Audiobooks_data_test', inputs=test_inputs, targets=test_targets)

In [7]:
class Audiobooks_Data_Reader():
    def __init__(self, dataset, batch_size = None):
        
        npz = np.load('Audiobooks_data_{0}.npz'.format(dataset))
        
        self.inputs,self.targets = npz['inputs'].astype(np.float), npz['targets'].astype(np.int)
        
        
        if batch_size is None:
            self.batch_size = self.inputs.shape[0]
        else:
            self.batch_size = batch_size
        self.curr_batch = 0
        self.batch_count = self_count = self.inputs.shape[0] //self.batch_size
        
        
    def __next__(self):
        if self.curr_batch >= self.batch_count:
            self.curr_batch = 0
            raise StopIteration()
            
        batch_slice = slice(self.curr_batch * self.batch_size, (self.curr_batch + 1) * self.batch_size)
        inputs_batch = self.inputs[batch_slice]
        targets_batch = self.targets[batch_slice]
        self.curr_batch +=1
        
        classes_num = 2
        targets_one_hot = np.zeros((targets_batch.shape[0], classes_num))
        targets_one_hot[range(targets_batch.shape[0]), targets_batch] =1
        
        return inputs_batch, targets_one_hot
    
    
    def __iter__(self):
        return self
            

In [8]:
import tensorflow as tf


input_size = 10
output_size = 2
# Use same hidden layer size for both hidden layers. Not a necessity.
hidden_layer_size = 50

# Reset any variables left in memory from previous runs.
tf.reset_default_graph()

# As in the previous example - declare placeholders where the data will be fed into.
inputs = tf.placeholder(tf.float32, [None, input_size])
targets = tf.placeholder(tf.int32, [None, output_size])

# Weights and biases for the first linear combination between the inputs and the first hidden layer.
# Use get_variable in order to make use of the default TensorFlow initializer which is Xavier.
weights_1 = tf.get_variable("weights_1", [input_size, hidden_layer_size])
biases_1 = tf.get_variable("biases_1", [hidden_layer_size])

# Operation between the inputs and the first hidden layer.
# We've chosen ReLu as our activation function. You can try playing with different non-linearities.
outputs_1 = tf.nn.relu(tf.matmul(inputs, weights_1) + biases_1)

# Weights and biases for the second linear combination.
# This is between the first and second hidden layers.
weights_2 = tf.get_variable("weights_2", [hidden_layer_size, hidden_layer_size])
biases_2 = tf.get_variable("biases_2", [hidden_layer_size])

# Operation between the first and the second hidden layers. Again, we use ReLu.
outputs_2 = tf.nn.relu(tf.matmul(outputs_1, weights_2) + biases_2)

# Weights and biases for the final linear combination.
# That's between the second hidden layer and the output layer.
weights_3 = tf.get_variable("weights_3", [hidden_layer_size, output_size])
biases_3 = tf.get_variable("biases_3", [output_size])

# Operation between the second hidden layer and the final output.
# Notice we have not used an activation function because we'll use the trick to include it directly in 
# the loss function. This works for softmax and sigmoid with cross entropy.
outputs = tf.matmul(outputs_2, weights_3) + biases_3

# Calculate the loss function for every output/target pair.
# The function used is the same as applying softmax to the last layer and then calculating cross entropy
# with the function we've seen in the lectures. This function, however, combines them in a clever way, 
# which makes it both faster and more numerically stable (when dealing with very small numbers).
# Logits here means: unscaled probabilities (so, the outputs, before they are scaled by the softmax)
# Naturally, the labels are the targets.
loss = tf.nn.softmax_cross_entropy_with_logits(logits=outputs, labels=targets)

# Get the average loss
mean_loss = tf.reduce_mean(loss)

# Define the optimization step. Using adaptive optimizers such as Adam in TensorFlow
# is as simple as that.
optimize = tf.train.AdamOptimizer(learning_rate=0.001).minimize(mean_loss)

# Get a 0 or 1 for every input in the batch indicating whether it output the correct answer out of the 10.
out_equals_target = tf.equal(tf.argmax(outputs, 1), tf.argmax(targets, 1))

# Get the average accuracy of the outputs.
accuracy = tf.reduce_mean(tf.cast(out_equals_target, tf.float32))

# Declare the session variable.
sess = tf.InteractiveSession()

# Initialize the variables. Default initializer is Xavier.
initializer = tf.global_variables_initializer()
sess.run(initializer)

# Batching
batch_size = 100



# Basic early stopping. Set a miximum number of epochs.
max_epochs = 50

# Keep track of the validation loss of the previous epoch.
# If the validation loss becomes increasing, we want to trigger early stopping.
# We initially set it at some arbitrarily high number to make sure we don't trigger it
# at the first epoch
prev_validation_loss = 9999999.

train_data = Audiobooks_Data_Reader('train', batch_size)

validation_data = Audiobooks_Data_Reader('validation')


  from ._conv import register_converters as _register_converters


Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See @{tf.nn.softmax_cross_entropy_with_logits_v2}.



In [9]:
for epoch_counter in range(max_epochs):
    curr_epoch_loss = 0.
    
    for input_batch, target_batch in train_data:
        _, batch_loss = sess.run([optimize, mean_loss],
            feed_dict= {inputs: input_batch, targets: target_batch})
        
        curr_epoch_loss += batch_loss
        
    curr_epoch_loss /= train_data.batch_count
    
    validation_loss = 0.
    validation_accuracy = 0.
    
    for input_batch, target_batch in validation_data:
        validation_loss, validation_accuracy = sess.run([mean_loss, accuracy],
        feed_dict= {inputs: input_batch, targets: target_batch})
        
    print('Epoch '+str(epoch_counter+1)+
         '. Training loss: '+'{0:.3f}'.format(curr_epoch_loss)+
         '. Validation loss: '+'{0:.3f}'.format(validation_loss)+
         '. Validation accuracy: '+'{0:.2f}'.format(validation_accuracy * 100.)+'%')
    
    if validation_loss > prev_validation_loss:
        break
        
    prev_validation_loss = validation_loss
    
print('End of training')
        

Epoch 1. Training loss: 0.716. Validation loss: 0.558. Validation accuracy: 66.44%
Epoch 2. Training loss: 0.496. Validation loss: 0.465. Validation accuracy: 77.18%
Epoch 3. Training loss: 0.438. Validation loss: 0.422. Validation accuracy: 78.30%
Epoch 4. Training loss: 0.408. Validation loss: 0.398. Validation accuracy: 79.19%
Epoch 5. Training loss: 0.390. Validation loss: 0.383. Validation accuracy: 79.87%
Epoch 6. Training loss: 0.377. Validation loss: 0.374. Validation accuracy: 80.54%
Epoch 7. Training loss: 0.367. Validation loss: 0.366. Validation accuracy: 79.64%
Epoch 8. Training loss: 0.359. Validation loss: 0.361. Validation accuracy: 79.64%
Epoch 9. Training loss: 0.353. Validation loss: 0.357. Validation accuracy: 80.09%
Epoch 10. Training loss: 0.348. Validation loss: 0.354. Validation accuracy: 80.31%
Epoch 11. Training loss: 0.344. Validation loss: 0.353. Validation accuracy: 80.31%
Epoch 12. Training loss: 0.340. Validation loss: 0.351. Validation accuracy: 80.09%
E

In [16]:
test_data = Audiobooks_Data_Reader('test')
for input_batch, target_batch in test_data:
    test_loss, test_accuracy = sess.run([mean_loss, accuracy],
                feed_dict= {inputs: input_batch, targets: target_batch})

    print('Test loss: '+'{0:.3f}'.format(validation_loss)+
         '. Test accuracy: '+'{0:.2f}'.format(validation_accuracy * 100.)+'%')

Test loss: 0.341. Test accuracy: 80.09%


In [29]:
prob = sess.run(outputs, feed_dict= {inputs: input_batch, targets: target_batch})

In [38]:
pred = prob.argmax(axis=1)
true = target_batch.argmax(axis=1)

In [39]:
confusion_matrix(true, pred)

array([[171,  56],
       [ 26, 195]], dtype=int64)