## Machine learning model for audiobook app dataset 


In [3]:
import numpy as np
from sklearn import preprocessing


In [2]:
# Read the whole comma-separated file into one numeric array.
raw_data = np.loadtxt('Audiobooks_data.csv', delimiter=',')

# Targets: every row of the last column.
target_all = raw_data[:, -1]
# Inputs: every row, every column except the first and the last
# (the last column is the target extracted above).
unscaled_input_all = raw_data[:, 1:-1]

In [16]:
# Balance the dataset by discarding the surplus of target == 0 rows.

# Number of rows whose target is 1. The targets are summed, so this relies on
# them being 0/1 values.
count_one_targets = int(np.sum(target_all))

# Row indices of every target == 0 observation, in their original order,
# found in one vectorized pass instead of a Python-level loop.
zero_target_indices = np.flatnonzero(target_all == 0)
counter_zero_targets = int(zero_target_indices.size)

# Keep the first `count_one_targets` zero-target rows and mark the rest for
# removal. Slicing past the end yields an empty selection, so nothing is
# dropped when zeros do not outnumber ones.
# NOTE(review): only surplus zeros are trimmed; if ones were the majority the
# result would stay imbalanced.
to_be_remove = zero_target_indices[count_one_targets:]

# axis=0 deletes rows (axis=1 would delete columns). The same indices are used
# for both arrays so inputs and targets stay aligned.
balanced_unscaled_input_all = np.delete(unscaled_input_all, to_be_remove, axis=0)
balanced_target_all = np.delete(target_all, to_be_remove, axis=0)


In [22]:
# Standardise the balanced inputs. The keyword arguments spell out the
# library defaults: each column (axis=0) is centred on its mean and scaled to
# unit standard deviation.
# NOTE(review): the statistics are computed over all balanced rows, including
# those later assigned to the validation and test subsets - confirm this is
# acceptable for the intended evaluation.
scaled_bal_input = preprocessing.scale(
    balanced_unscaled_input_all,
    axis=0,
    with_mean=True,
    with_std=True,
)

In [23]:
# Inputs and targets live in separate arrays, so both must be reordered with
# the same permutation: if the first observation's inputs land at index 10,
# its target must land at index 10 as well. We therefore permute the row
# indices once and use them to index both arrays.
#
# The permutation comes from a locally seeded generator so that
# Restart & Run All reproduces the same shuffle (and hence the same
# train/validation/test split) without touching NumPy's global random state.
SHUFFLE_SEED = 42  # any fixed integer works; change it to draw another split
shuffle_rng = np.random.default_rng(SHUFFLE_SEED)

shuffled_indices = shuffle_rng.permutation(scaled_bal_input.shape[0])
shuffled_inputs = scaled_bal_input[shuffled_indices]
shuffled_targets = balanced_target_all[shuffled_indices]


In [24]:
# Total number of observations available after shuffling.
samples_count = shuffled_inputs.shape[0]

# Subset sizes for an 80-10-10 train/validation/test split. int() truncates,
# so the test subset absorbs whatever rows remain after the other two.
train_samples_count = int(0.8 * samples_count)
validation_samples_count = int(0.1 * samples_count)
test_samples_count = samples_count - train_samples_count - validation_samples_count

# Consecutive, non-overlapping row ranges: train first, then validation, and
# everything after that is test.
validation_end = train_samples_count + validation_samples_count
train_rows = slice(None, train_samples_count)
validation_rows = slice(train_samples_count, validation_end)
test_rows = slice(validation_end, None)

# Apply the same row range to inputs and targets so they stay paired.
train_inputs = shuffled_inputs[train_rows]
train_targets = shuffled_targets[train_rows]

validation_inputs = shuffled_inputs[validation_rows]
validation_targets = shuffled_targets[validation_rows]

test_inputs = shuffled_inputs[test_rows]
test_targets = shuffled_targets[test_rows]



In [25]:
# Write each subset to its own .npz archive (np.savez appends the ".npz"
# extension when the file name does not already have it). The file names
# share a common prefix and every archive stores its arrays under the same
# two keys, "inputs" and "targets", so the subsets can be loaded uniformly.
for npz_name, subset_inputs, subset_targets in (
    ('Audiobooks_data_train', train_inputs, train_targets),
    ('Audiobooks_data_validation', validation_inputs, validation_targets),
    ('Audiobooks_data_test', test_inputs, test_targets),
):
    np.savez(npz_name, inputs=subset_inputs, targets=subset_targets)