### Import libraries 
### Note: we are going to be using numpy rather than pandas because it makes the data easier to process in this case

In [48]:
import numpy as np
from sklearn import preprocessing

### Load in the data using numpy. We ignore the first column because it is just the IDs which are of no use to the algorithm. Split the inputs and the targets into different arrays (targets are the last column so this can easily be done)

In [49]:
# Parse the comma-separated file into a single 2-D numeric array.
DATA_FILE = 'Audiobooks_data.csv'
raw_csv_data = np.loadtxt(DATA_FILE, delimiter=',')

# Features are every column except the first (the ID) and the last (the target).
feature_columns = slice(1, -1)
unscaled_inputs_all = raw_csv_data[:, feature_columns]

# The target is the last column.
targets_all = raw_csv_data[:, -1]

In [199]:
# Spot check: display the first target value.
targets_all[0]

0.0

### We shuffle the data here because the data was collected sequentially (by date) and we want the entries to be spread out randomly. This will help us out later when we train the model in batches

In [50]:
# Seed the global NumPy RNG so this shuffle, and every later np.random call in
# the same top-to-bottom run, produces the same order after a kernel restart.
SHUFFLE_SEED = 42
np.random.seed(SHUFFLE_SEED)

# Build one random permutation of the row positions ...
shuffled_indices = np.arange(unscaled_inputs_all.shape[0])
np.random.shuffle(shuffled_indices)

# ... and apply it to inputs and targets alike so the rows stay aligned.
unscaled_inputs_all = unscaled_inputs_all[shuffled_indices]
targets_all = targets_all[shuffled_indices]

### It is important that our dataset is balanced. This involves having roughly an equal number of entries in each class: in this case, as many entries where the target is 0 (not a return customer) as entries where the target is 1 (a return customer)

In [51]:
# Count of rows whose target is 1 (assumes targets are only 0 or 1, so the sum is the count).
num_one_targets = int(np.sum(targets_all))

# Row positions of every zero-target entry, in their current order.
zero_target_positions = np.flatnonzero(targets_all == 0)
zero_targets_counter = int(zero_target_positions.size)

# Keep the first `num_one_targets` zero-target rows; every later one is surplus.
indices_to_remove = zero_target_positions[max(num_one_targets, 0):].tolist()

# Drop the surplus rows from inputs and targets together so they stay aligned.
unscaled_inputs_equal_priors = np.delete(unscaled_inputs_all, indices_to_remove, axis=0)
targets_equal_priors = np.delete(targets_all, indices_to_remove, axis=0)

### We calculate the number of target values that equal one using np.sum(). Then we go through all the targets and keep track of the number of targets that have value 0. Once we have found as many targets equal to 0 as there are targets equal to 1, we discard the remaining entries whose target is 0.

### Standardize the inputs using the scale function from the preprocessing module imported from sklearn earlier

In [52]:
# Standardize each feature column (axis 0): subtract its mean, divide by its standard deviation.
# NOTE(review): the statistics are computed over all rows, including those that are later
# assigned to the validation and test splits.
scaled_inputs = preprocessing.scale(
    unscaled_inputs_equal_priors,
    axis=0,
    with_mean=True,
    with_std=True,
)

### Re-Shuffle the data, same way as before

In [53]:
# Draw a fresh random ordering of the balanced, scaled rows.
balanced_row_count = len(scaled_inputs)
shuffled_indices = np.arange(balanced_row_count)
np.random.shuffle(shuffled_indices)

# Reorder inputs and targets with the same permutation so each row keeps its label.
shuffled_inputs, shuffled_targets = (
    scaled_inputs[shuffled_indices],
    targets_equal_priors[shuffled_indices],
)

In [54]:
# Total number of rows available for the train/validation/test split.
samples_count = len(shuffled_inputs)

In [55]:
# Show how many samples there are (the row count of shuffled_inputs).
print(samples_count)

4474


### Before putting the data into the model we must split it into training, validation and testing data. While it varies case to case, I am going with an 80/10/10 split (training, validation, testing). A simple way of doing this is to multiply 0.8 by the total number of entries and use that number as the index where the training data ends. Each following boundary is then the previous index plus the number of entries in the next set (for validation this is the number of training entries + the number of validation entries); everything after that is the test data.

In [56]:
samples_count = shuffled_inputs.shape[0]

# 80% for training, 10% for validation, and whatever remains for testing.
train_samples_count = int(0.8 * samples_count)
validation_samples_count = int(0.1 * samples_count)
test_samples_count = samples_count - train_samples_count - validation_samples_count

# Row positions where the training part ends and where the validation part ends.
split_points = [train_samples_count, train_samples_count + validation_samples_count]

# Cut inputs and targets at the same two positions so rows and labels stay aligned.
train_inputs, validation_inputs, test_inputs = np.split(shuffled_inputs, split_points)
train_targets, validation_targets, test_targets = np.split(shuffled_targets, split_points)

### We can verify that the targets are roughly split up equally between values of 1 and 0.

In [57]:
# For each split print: number of 1-targets, split size, and the fraction of 1-targets.
for split_targets, split_size in (
    (train_targets, train_samples_count),
    (validation_targets, validation_samples_count),
    (test_targets, test_samples_count),
):
    ones_in_split = np.sum(split_targets)
    print(ones_in_split, split_size, ones_in_split / split_size)

1794.0 3579 0.501257334450964
214.0 447 0.47874720357941836
229.0 448 0.5111607142857143


### Import tensorflow which we will use to make our Neural Net

In [59]:
import tensorflow as tf

### Create the model with an input size of 10 and an output size of 2, because there are only two possibilities: 0 or 1. While the hidden layer size does not have to be the same in every layer, I am going to make them all 50. Early stopping is a callback passed to the model that, regardless of how many epochs are specified, tells the model to stop training once the monitored loss (the validation loss by default) stops improving; the patience parameter is how many epochs without improvement it tolerates before stopping.

In [214]:
# NOTE(review): input_size is declared but not passed to any layer below.
input_size = 10
hidden_layer_size = 50
output_size = 2

# Callback handed to model.fit; patience=5, every other setting left at its default.
early_stopping = tf.keras.callbacks.EarlyStopping(patience=5)

model = tf.keras.Sequential()

# Two identical hidden blocks: a ReLU dense layer followed by 20% dropout.
for hidden_block in range(2):
    model.add(tf.keras.layers.Dense(hidden_layer_size, activation='relu'))
    model.add(tf.keras.layers.Dropout(0.2))

# Output layer: one unit per class.
# NOTE(review): sigmoid treats the two units independently, so their outputs need not sum
# to 1; softmax is the usual pairing with a sparse categorical cross-entropy loss.
model.add(tf.keras.layers.Dense(output_size, activation='sigmoid'))


### The hidden layers use the ReLU activation function, whereas on the output layer I chose to use a sigmoid function, whose outputs range between 0 and 1, which works well when working with a binary outcome (0 or 1). A softmax function would also work well in place of the sigmoid function; it additionally makes the two outputs sum to 1, which is the conventional pairing with the sparse categorical cross-entropy loss used below.

In [215]:
# Adam optimizer; the sparse categorical loss takes the targets as class indices
# rather than one-hot vectors; accuracy is reported during training.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [216]:
BATCH_SIZE = 128
MAX_EPOCHS = 100  # upper bound; the early-stopping callback may end training sooner

# Train on the training split and evaluate on the validation split after each epoch.
# verbose=2 prints one summary line per epoch.
model.fit(
    train_inputs,
    train_targets,
    batch_size=BATCH_SIZE,
    epochs=MAX_EPOCHS,
    callbacks=[early_stopping],
    validation_data=(validation_inputs, validation_targets),
    verbose=2,
)


Epoch 1/100
28/28 - 0s - loss: 0.6637 - accuracy: 0.6150 - val_loss: 0.6005 - val_accuracy: 0.7315
Epoch 2/100
28/28 - 0s - loss: 0.5674 - accuracy: 0.7390 - val_loss: 0.5051 - val_accuracy: 0.7964
Epoch 3/100
28/28 - 0s - loss: 0.4869 - accuracy: 0.7608 - val_loss: 0.4419 - val_accuracy: 0.8076
Epoch 4/100
28/28 - 0s - loss: 0.4506 - accuracy: 0.7664 - val_loss: 0.4143 - val_accuracy: 0.8098
Epoch 5/100
28/28 - 0s - loss: 0.4327 - accuracy: 0.7734 - val_loss: 0.4019 - val_accuracy: 0.8054
Epoch 6/100
28/28 - 0s - loss: 0.4244 - accuracy: 0.7664 - val_loss: 0.3911 - val_accuracy: 0.8143
Epoch 7/100
28/28 - 0s - loss: 0.4126 - accuracy: 0.7776 - val_loss: 0.3900 - val_accuracy: 0.8076
Epoch 8/100
28/28 - 0s - loss: 0.4134 - accuracy: 0.7807 - val_loss: 0.3924 - val_accuracy: 0.8031
Epoch 9/100
28/28 - 0s - loss: 0.4038 - accuracy: 0.7913 - val_loss: 0.3820 - val_accuracy: 0.8121
Epoch 10/100
28/28 - 0s - loss: 0.3981 - accuracy: 0.7921 - val_loss: 0.3766 - val_accuracy: 0.8233
Epoch 11/

<tensorflow.python.keras.callbacks.History at 0x7fac7441f310>

In [217]:
# Predicted class for each test row = index of the larger of the two output units.
# Sequential.predict_classes was deprecated and later removed from TensorFlow; for an
# output layer with more than one unit it returned exactly this argmax over the last axis.
predictions = np.argmax(model.predict(test_inputs), axis=-1)

In [218]:
from sklearn.metrics import classification_report,confusion_matrix

In [219]:
# Per-class precision, recall and F1 of the predictions against the test targets.
test_report = classification_report(test_targets, predictions)
print(test_report)

              precision    recall  f1-score   support

         0.0       0.74      0.85      0.79       219
         1.0       0.83      0.71      0.76       229

    accuracy                           0.78       448
   macro avg       0.78      0.78      0.78       448
weighted avg       0.78      0.78      0.78       448



In [220]:
# Rows are the true classes, columns the predicted classes.
test_confusion = confusion_matrix(test_targets, predictions)
print(test_confusion)

[[186  33]
 [ 67 162]]


### At the end of training and testing we can see the model ended up with roughly 78% accuracy on the test set. That means for nearly 4 out of 5 audiobook customers our model will be able to determine whether they will be repeat customers, so we can focus more energy either on enhancing those customers' experience or on the customers who are predicted not to return, to try to get them to return