## Imports

In [1]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils, to_categorical
import numpy as np
import pandas as pd
import pickle

## Paths

In [2]:
# Locations used by this notebook:
#   fdata     -> CSV file read below as the input table
#   save_path -> directory that receives the pickled outputs
paths = dict(
    fdata="../../../assets/audio_sentiment_data_v2/data_features/data_features_and_labels.csv",
    save_path="../../../assets/audio_sentiment_data_v2/pickles",
)

## Loading and Splitting data into Train, Validation and Test sets

In [3]:
# Load the CSV referenced by paths["fdata"]; its `labels` column is used as the
# target further down and every other column is used as an input feature.
df = pd.read_csv(paths["fdata"])

In [4]:
# Target share of the full data set for each split.
train_ratio = 0.75
validation_ratio = 0.15
test_ratio = 0.10

# Inputs are every column except `labels`; `labels` is the target.
features = df.drop(columns="labels")
targets = df["labels"]

# Step 1: shuffle (fixed seed) and keep `train_ratio` of the rows for training;
# the remainder is temporarily stored under the *_test names.
holdout_fraction = 1 - train_ratio
X_train, X_test, y_train, y_test = train_test_split(
    features,
    targets,
    test_size=holdout_fraction,
    shuffle=True,
    random_state=42,
)

# Step 2: cut the held-out remainder, in its existing order, into validation
# and test so that they end up as 15% and 10% of the original data.
test_fraction_of_holdout = test_ratio/(test_ratio + validation_ratio)
X_val, X_test, y_val, y_test = train_test_split(
    X_test,
    y_test,
    test_size=test_fraction_of_holdout,
    shuffle=False,
)

In [8]:
# Sanity check: one line with the feature-split shapes, one with the target-split shapes.
print(*(split.shape for split in (X_train, X_val, X_test)))
print(*(split.shape for split in (y_train, y_val, y_test)))

(2445, 37) (489, 37) (326, 37)
(2445,) (489,) (326,)


## Scaling the data

In [6]:
# Fit the scaler on the training split only, then apply that same fitted
# transform to all three splits so validation/test never influence the fit.
sc = StandardScaler().fit(X_train)
X_train, X_val, X_test = (sc.transform(split) for split in (X_train, X_val, X_test))

In [7]:
# Convert each split's targets (pandas Series taken from df.labels) to NumPy arrays.
y_train, y_val, y_test = map(np.array, (y_train, y_val, y_test))

## Converting the targets to one-hot vectors by encoding the labels

In [9]:
# Fit the encoder ONCE, on the training labels, and reuse that mapping for the
# validation and test labels. Re-fitting on every split (fit_transform three
# times) can give the same class a different integer code whenever a split is
# missing a class, and it leaves `lb` fitted on the test labels -- which is the
# object that gets pickled further down.
lb = LabelEncoder()
lb.fit(y_train)

# Pin the one-hot width to the number of known classes so every split has the
# same number of columns even if its highest-coded class happens to be absent.
num_classes = len(lb.classes_)

# `transform` raises ValueError on a label that was not seen in training,
# surfacing the problem instead of silently remapping classes.
y_train = to_categorical(lb.transform(y_train), num_classes=num_classes)
y_val = to_categorical(lb.transform(y_val), num_classes=num_classes)
y_test = to_categorical(lb.transform(y_test), num_classes=num_classes)
print(lb.classes_)

['angry' 'fear' 'happy' 'sad' 'surprise']


## Pickling out the scaler, labels and split data for future use

In [10]:
# Persist the fitted label encoder, the fitted scaler and every data split for
# future use. Each entry is (file stem, object); files are written in this order
# as <save_path>/<file stem>.pickle.
artifacts = (
    ("labels", lb),
    ("scaler", sc),
    ("X_train", X_train),
    ("X_val", X_val),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_val", y_val),
    ("y_test", y_test),
)

for stem, obj in artifacts:
    # `with` closes the file even if pickle.dump raises, so no handle is leaked.
    with open(f'{paths["save_path"]}/{stem}.pickle', "wb") as outfile:
        pickle.dump(obj, outfile)