In [2]:
import numpy as np
from sklearn import preprocessing
import tensorflow as tf

In [3]:
# Load the comma-separated dataset into a 2-D float array.
rawData = np.loadtxt('Audiobooks_data.csv', delimiter=',')

# The last column holds the targets; the first column is skipped
# (presumably a row/customer identifier -- TODO confirm against the CSV schema).
lastColumn = rawData.shape[1] - 1

targets = rawData[:, lastColumn]
unscaledInputs = rawData[:, 1:lastColumn]

In [4]:
# Shuffle the rows before balancing, so that the zero-target rows dropped by
# the balancing step are a random subset rather than simply the ones that
# appear last in the file.
#
# A dedicated, seeded generator makes the permutation reproducible across
# kernel restarts without touching the global np.random state.
shuffleRng = np.random.RandomState(42)

shuffledIndices = shuffleRng.permutation(unscaledInputs.shape[0])

# Apply the same permutation to inputs and targets so rows stay aligned.
shuffledInputs = unscaledInputs[shuffledIndices]
shuffledTargets = targets[shuffledIndices]

In [5]:
# Balance the classes: keep every row whose target is 1 and only the first
# `numOneTargets` rows (in current row order) whose target is 0.
# Assumes targets are encoded as 0/1, so their sum is the number of 1-targets.
# If 1-targets outnumber 0-targets, nothing is removed and the data stays unbalanced.
numOneTargets = int(np.sum(shuffledTargets))

# Row positions of all zero-target rows, in ascending order.
zeroTargetIndices = np.flatnonzero(shuffledTargets == 0)
zeroTargetsCount = int(zeroTargetIndices.size)

# Every zero-target row beyond the first `numOneTargets` is surplus.
indicesToRemove = zeroTargetIndices[numOneTargets:]

# Drop the surplus rows from inputs and targets alike to keep them aligned.
unscaledInputsEqualPriors = np.delete(shuffledInputs, indicesToRemove, axis=0)
targetsEqualPriors = np.delete(shuffledTargets, indicesToRemove, axis=0)

In [6]:
# Standardize the balanced inputs column-wise (axis 0): centre on the mean
# and scale to unit variance.
# NOTE(review): the statistics are computed over all rows, i.e. before the
# train/validation/test split, so held-out rows influence the scaling --
# confirm this is intended.
scaledInputs = preprocessing.scale(
    unscaledInputsEqualPriors,
    axis=0,
    with_mean=True,
    with_std=True,
)

In [7]:
# Shuffle again after balancing. The balancing step removes only the
# zero-target rows that come after the first `numOneTargets` zeros, so the
# tail of the array is left with 1-targets only; the split below slices by
# position and would otherwise get skewed subsets.
#
# A dedicated, seeded generator makes the split membership reproducible
# across kernel restarts without touching the global np.random state.
reshuffleRng = np.random.RandomState(43)

shuffledIndices = reshuffleRng.permutation(scaledInputs.shape[0])

# Apply the same permutation to inputs and targets so rows stay aligned.
reshuffledInputs = scaledInputs[shuffledIndices]
reshuffledTargets = targetsEqualPriors[shuffledIndices]

In [8]:
# Split into train / validation / test by row position.
samplesCount = reshuffledInputs.shape[0]

# 80% train, 10% validation (both truncated to whole rows); the test set
# takes whatever remains, so the three sizes always add up to samplesCount.
trainSamplesCount = int(0.8 * samplesCount)
validationSamplesCount = int(0.1 * samplesCount)
testSamplesCount = samplesCount - trainSamplesCount - validationSamplesCount

# Row offsets where the validation part and the test part begin.
splitPoints = [trainSamplesCount, trainSamplesCount + validationSamplesCount]

# Cut inputs and targets at the same offsets so rows stay aligned.
trainInputs, validationInputs, testInputs = np.split(reshuffledInputs, splitPoints)
trainTargets, validationTargets, testTargets = np.split(reshuffledTargets, splitPoints)

In [9]:
# Class-balance check for each split: sum of targets, split size, and their
# ratio (close to 0.5 means the split is roughly balanced).
for splitTargets, splitSize in (
    (trainTargets, trainSamplesCount),
    (validationTargets, validationSamplesCount),
    (testTargets, testSamplesCount),
):
    targetsSum = np.sum(splitTargets)
    print(targetsSum, splitSize, targetsSum / splitSize)

1758.0 3579 0.4911986588432523
232.0 447 0.5190156599552572
247.0 448 0.5513392857142857


In [10]:
# Persist each split as its own archive, storing the arrays under the keys
# `inputs` and `targets`.
for archiveName, archiveInputs, archiveTargets in (
    ('audiobooksDataTrain', trainInputs, trainTargets),
    ('audiobooksDataValidation', validationInputs, validationTargets),
    ('audiobooksDataTest', testInputs, testTargets),
):
    np.savez(archiveName, inputs=archiveInputs, targets=archiveTargets)