In [26]:
import os
import shutil
import glob
import numpy as np
import pandas as pd

df = pd.read_csv('Categories.csv')

In [23]:
df.sample(5)

Unnamed: 0,FileID,ActorID,SentenceID,Emotion,Intensity
3067,1047_DFA_SAD_XX,1047,DFA,SAD,XX
5260,1079_TSI_ANG_XX,1079,TSI,ANG,XX
3556,1054_IWL_ANG_XX,1054,IWL,ANG,XX
4845,1073_MTI_NEU_XX,1073,MTI,NEU,XX
5572,1084_IWW_DIS_XX,1084,IWW,DIS,XX


# Create the Train-Test split
I'm not using train_test_split because that would distribute the sound clips from every actor in _both_ the train and test set. I have to do this "by hand".

In [16]:
test_size = 0.2

# Create an rng object and feed a seed
rng = np.random.default_rng(500)

# Obtain the list of actors
actors = df.ActorID.unique()
nActors = len(actors)

# Actors for the test set
num_test = round(nActors*test_size)
actors_test = rng.choice(actors, size=num_test, replace=False)
actors_test.sort()

# Actors for the train set
actors_train = list(set(actors)-set(actors_test))
actors_train.sort()


In [17]:
print('Num of Test  actors: %i' %(len(actors_test)))
print('Num of Train actors: %i' %(len(actors_train)))
print('Sum: %i' %(len(actors_test)+len(actors_train)))
print('Original number of actors: %i' %nActors)
print('Num of actors in both train and test set')
print('(This should be 0): %i' %(len(set(actors_test) & set(actors_train))))

Num of Test  actors: 18
Num of Train actors: 73
Sum: 91
Original number of actors: 91
Num of actors in both train and test set
(This should be 0): 0


In [20]:
print('Training set:')
print(actors_train)
print()

print('Test set:')
print(actors_test)

Training set:
[1001, 1002, 1003, 1004, 1005, 1007, 1009, 1010, 1011, 1012, 1014, 1015, 1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023, 1024, 1025, 1026, 1027, 1030, 1031, 1033, 1035, 1036, 1037, 1038, 1041, 1042, 1044, 1045, 1046, 1047, 1048, 1050, 1052, 1053, 1054, 1055, 1056, 1057, 1058, 1059, 1061, 1062, 1063, 1064, 1065, 1067, 1069, 1071, 1072, 1073, 1074, 1075, 1076, 1077, 1078, 1079, 1081, 1082, 1083, 1084, 1085, 1086, 1087, 1088, 1090, 1091]

Test set:
[1006 1008 1013 1028 1029 1032 1034 1039 1040 1043 1049 1051 1060 1066
 1068 1070 1080 1089]


### Move the files
Now that I have separated the actors into train and test, I'll copy the audio files to their respective folders.

In [30]:
# Create list of train files
if not os.path.isdir('Train'):
    os.mkdir('Train')

files_actors_train = []
for actor in actors_train:
    files_actors_train = files_actors_train + (glob.glob('../../AudioWAV/%s_*.wav' %actor))

# Create list of test files
if not os.path.isdir('Test'):
    os.mkdir('Test')

files_actors_test = []
for actor in actors_test:
    files_actors_test = files_actors_test + (glob.glob('../../AudioWAV/%s_*.wav' %actor))


In [31]:
# Copy train files
for file in files_actors_train:
    shutil.copy(file, 'Train')

# Copy test files
for file in files_actors_test:
    shutil.copy(file, 'Test')

In [32]:
numFilesTrain = len(files_actors_train)
numFilesTest  = len(files_actors_test)
numFiles = numFilesTrain+numFilesTest

print('Number of TRAIN files: %i' %(numFilesTrain))
print('Percentage of TRAIN files: %1.2f%%' %(100*numFilesTrain/numFiles))
print('Number of TEST files: %i' %(numFilesTest))
print('Percentage of TEST files:  %1.2f%%' %( 100*numFilesTest/numFiles))

Number of TRAIN files: 4877
Percentage of TRAIN files: 80.25%
Number of TEST files: 1200
Percentage of TEST files:  19.75%


## Create Categories
Split the categories dataframe in train and test sets

In [35]:
df_train = df.loc[df.ActorID.isin(actors_train)]
df_test = df.loc[df.ActorID.isin(actors_test)]

df_train.to_csv('Categories_train.csv', index=False)
df_test.to_csv('Categories_test.csv', index=False)

In [56]:
print(actors_train.sort() == df_train.ActorID.unique().tolist().)
print(actors_test.sort() == df_test.ActorID.unique().tolist())


True
[ True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True]
