In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

In [2]:
col_names = ['id', 'total book length', 'book length avg',
            'total price', 'price avg', 'review', 
            'review score', 'min listened', 'completion', 
            'support requests', 'last visit since first purchase', 'targets' ]
raw_data = pd.read_csv('Audiobooks_data.csv', names = col_names )

In [3]:
#A closer look at the data
raw_data.describe()


Unnamed: 0,id,total book length,book length avg,total price,price avg,review,review score,min listened,completion,support requests,last visit since first purchase,targets
count,14084.0,14084.0,14084.0,14084.0,14084.0,14084.0,14084.0,14084.0,14084.0,14084.0,14084.0,14084.0
mean,16772.491551,1591.281685,1678.608634,7.103791,7.543805,0.16075,8.909795,0.125659,189.888983,0.070222,61.935033,0.158833
std,9691.807248,504.340663,654.838599,4.931673,5.560129,0.367313,0.643406,0.241206,371.08401,0.472157,88.207634,0.365533
min,2.0,216.0,216.0,3.86,3.86,0.0,1.0,0.0,0.0,0.0,0.0,0.0
25%,8368.0,1188.0,1188.0,5.33,5.33,0.0,8.91,0.0,0.0,0.0,0.0,0.0
50%,16711.5,1620.0,1620.0,5.95,6.07,0.0,8.91,0.0,0.0,0.0,11.0,0.0
75%,25187.25,2160.0,2160.0,8.0,8.0,0.0,8.91,0.13,194.4,0.0,105.0,0.0
max,33683.0,2160.0,7020.0,130.94,130.94,1.0,10.0,1.0,2160.0,30.0,464.0,1.0


In [4]:
##There are no missing values
raw_data.isnull().any()

id                                 False
total book length                  False
book length avg                    False
total price                        False
price avg                          False
review                             False
review score                       False
min listened                       False
completion                         False
support requests                   False
last visit since first purchase    False
targets                            False
dtype: bool

In [5]:
#check the types of features we have (They are all numerical values; no categories)
#6 are float64 which suggests they are continous; the 5 int64 are descrete variables excluding 'id' column
raw_data.dtypes

id                                   int64
total book length                  float64
book length avg                      int64
total price                        float64
price avg                          float64
review                               int64
review score                       float64
min listened                       float64
completion                         float64
support requests                     int64
last visit since first purchase      int64
targets                              int64
dtype: object

In [6]:
#Removing 'id' columns since we have indexed rows
df = raw_data.copy()
df.drop('id', axis = 1, inplace = True)

In [7]:
#as shown, the dataset needs to be balanced
target_count = df['targets'].value_counts()
target_total = target_count[0] + target_count[1]

In [8]:
#I want a 80-10-10 split for my train, val, and test data
#To keep it balanced, I need to keep the same ratio for both 0 and 1 target values
print(target_count[0]/ target_total, target_count[1]/ target_total)

0.8411672820221529 0.1588327179778472


In [9]:
#will sepearte my data and the targets
targets = df.targets
df.drop('targets',axis = 1, inplace = True)

In [10]:
#Creating training and temp (will be val and test) data
x_train, x_temp, y_train, y_temp = train_test_split(df, targets,
                                    test_size = .2, shuffle = 42, stratify = targets)

In [11]:
#Now I'll split my temp data into my new val and test data
x_val, x_test, y_val, y_test = train_test_split(x_temp, y_temp,
                                    test_size = .5, stratify = y_temp)

In [12]:
#As shown, both train, val and test are now all balanced
y_train_counter = np.unique(y_train, return_counts = True)[1]
y_train_total = y_train_counter[0] + y_train_counter[1]

y_test_counter = np.unique(y_test, return_counts = True)[1]
y_test_total = y_test_counter[0] + y_test_counter[1]

y_val_counter = np.unique(y_val, return_counts = True)[1]
y_val_total = y_val_counter[0] + y_val_counter[1]

print('Training data set: ', y_train_counter[0]/ y_train_total, y_train_counter[1]/ y_train_total)
print('Total size: ', y_train_total, round(y_train_total / (y_train_total + y_test_total + y_val_total), 3))


print('test data set: ', y_test_counter[0]/ y_test_total, y_test_counter[1]/ y_test_total)
print('Total size: ', y_test_total, round(y_test_total / (y_train_total + y_test_total + y_val_total), 3))

print('val data set: ', y_val_counter[0]/ y_val_total, y_val_counter[1]/ y_val_total)
print('Total size: ', y_val_total, round(y_val_total / (y_train_total + y_test_total +y_val_total), 3))

Training data set:  0.8411289606816367 0.15887103931836336
Total size:  11267 0.8
test data set:  0.8410220014194464 0.1589779985805536
Total size:  1409 0.1
val data set:  0.8416193181818182 0.15838068181818182
Total size:  1408 0.1


In [13]:
#Now I will standardize my features by my training data
my_scaler = preprocessing.StandardScaler()
new_x_train = my_scaler.fit_transform(x_train)


In [14]:
#This same scaler will now be applied to my val and test data
new_x_val = my_scaler.transform(x_val)
new_x_test = my_scaler.transform(x_test)

In [15]:
np.savez('audiobook_train_data', inputs = new_x_train, targets = y_train )
np.savez('audiobook_val_data', inputs = new_x_val, targets = y_val)
np.savez('audiobook_test_data', inputs = new_x_test, targets = y_test )