In [1]:
import os
import shutil, glob
import time
import pandas as pd

import numpy as np

## Actor distribution
Here I create a dataframe with the actor code, the sentence spoken, the emotion conveyed, and the intensity in each clip. I'll dump it in a .csv file so that we don't need to do this again.

I also display the unique values in each column and count how many each one appears.

## Part 0: Remove intensity
Here I move the files with intensities to a new location

In [4]:
# Create new folder for the clips that have intensity
if not os.path.isdir('../../AudioWAV_Intensity'):
    os.mkdir('../../AudioWAV_Intensity')

folder = '../../AudioWAV'
files_intensity = glob.glob('%s/*_LO.wav' %folder)
files_intensity = files_intensity + glob.glob('%s/*_MD.wav' %folder)
files_intensity = files_intensity + glob.glob('%s/*_HI.wav' %folder)

for file in files_intensity:
    shutil.move(file, '../../AudioWAV_Intensity/')

### First part: Without intensities
I removed the files with intensities from the folder. Here I process only the files with unspecified intensity (XX).

In [5]:
files = os.listdir('../../AudioWAV')

time_start = time.time()
df = pd.DataFrame(columns = ['FileID', 'ActorID', 'SentenceID', 'Emotion', 'Intensity'],
                  dtype='category')

for file in files:
    fileID = file.split('.')[0]
    cats = fileID.split('_')
    
    new_row = pd.DataFrame([[fileID] + cats],
                        columns = ['FileID', 'ActorID', 'SentenceID', 'Emotion', 'Intensity'])
    df = pd.concat((df,new_row), axis=0, ignore_index=True)

# Turn columns into factors (except for FileID)
for col in df.columns:
    if col != 'FileID':
        df[col] = (df[col]).astype('category')

time_end = time.time()
time_total = time_end-time_start
print('Number of files: %i' %(len(files)))
if time_total <= 60:
    print('Duration: %f (s)' %time_total)
elif 60 < time_total <= 3600:
    print('Duration: %f (min)' %(time_total/60))
else:
    print('Duration: %f (h)' %(time_total/3600))

Number of files: 6077
Duration: 4.497929 (s)


In [6]:
df.describe()

Unnamed: 0,FileID,ActorID,SentenceID,Emotion,Intensity
count,6077,6077,6077,6077,6077
unique,6077,91,12,6,1
top,1050_IWW_HAP_XX,1001,DFA,NEU,XX
freq,1,67,546,1087,6077


In [7]:
print('Actors:')
print(np.unique(df.ActorID))
print()
print('Sentence:')
print(np.unique(df.SentenceID))
print()
print('Emotions:')
print(np.unique(df.Emotion))
print()
print('Intensities:')
print(np.unique(df.Intensity))

Actors:
['1001' '1002' '1003' '1004' '1005' '1006' '1007' '1008' '1009' '1010'
 '1011' '1012' '1013' '1014' '1015' '1016' '1017' '1018' '1019' '1020'
 '1021' '1022' '1023' '1024' '1025' '1026' '1027' '1028' '1029' '1030'
 '1031' '1032' '1033' '1034' '1035' '1036' '1037' '1038' '1039' '1040'
 '1041' '1042' '1043' '1044' '1045' '1046' '1047' '1048' '1049' '1050'
 '1051' '1052' '1053' '1054' '1055' '1056' '1057' '1058' '1059' '1060'
 '1061' '1062' '1063' '1064' '1065' '1066' '1067' '1068' '1069' '1070'
 '1071' '1072' '1073' '1074' '1075' '1076' '1077' '1078' '1079' '1080'
 '1081' '1082' '1083' '1084' '1085' '1086' '1087' '1088' '1089' '1090'
 '1091']

Sentence:
['DFA' 'IEO' 'IOM' 'ITH' 'ITS' 'IWL' 'IWW' 'MTI' 'TAI' 'TIE' 'TSI' 'WSI']

Emotions:
['ANG' 'DIS' 'FEA' 'HAP' 'NEU' 'SAD']

Intensities:
['XX']


In [8]:
pd.set_option('display.max_rows', None)
print('ActorID')
print(df.ActorID.value_counts())
print()

print('SentenceID')
print(df.SentenceID.value_counts().sort_index())
print()

print('Emotion')
print(df.Emotion.value_counts().sort_index())
print()

print('Intensity')
print(df.Intensity.value_counts().sort_index())
print()

ActorID
1001    67
1047    67
1067    67
1066    67
1065    67
1064    67
1063    67
1062    67
1061    67
1060    67
1059    67
1058    67
1057    67
1056    67
1055    67
1054    67
1053    67
1052    67
1051    67
1050    67
1049    67
1068    67
1069    67
1070    67
1082    67
1090    67
1089    67
1088    67
1087    67
1086    67
1085    67
1084    67
1083    67
1081    67
1071    67
1080    67
1079    67
1078    67
1077    67
1075    67
1074    67
1073    67
1072    67
1048    67
1046    67
1023    67
1014    67
1045    67
1022    67
1021    67
1020    67
1018    67
1017    67
1016    67
1015    67
1013    67
1025    67
1012    67
1011    67
1010    67
1007    67
1006    67
1005    67
1004    67
1003    67
1024    67
1091    67
1026    67
1035    67
1044    67
1043    67
1042    67
1041    67
1040    67
1039    67
1038    67
1037    67
1027    67
1036    67
1034    67
1033    67
1032    67
1031    67
1030    67
1029    67
1028    67
1076    66
1002    66
1009    61
1008    61
10

#### Important!
The distribution of actors, sentences, and emotions is not completely uniform. There are some missing cilps, but those were already missing in the CREMA-D Github and Kaggle repositories. Nothing we can do, but we should know about this.

In [75]:
df.to_csv('Data/Categories.csv', index=False)

In [11]:
df

Unnamed: 0,FileID,ActorID,SentenceID,Emotion,Intensity
0,1001_DFA_ANG_XX,1001,DFA,ANG,XX
1,1001_DFA_DIS_XX,1001,DFA,DIS,XX
2,1001_DFA_FEA_XX,1001,DFA,FEA,XX
3,1001_DFA_HAP_XX,1001,DFA,HAP,XX
4,1001_DFA_NEU_XX,1001,DFA,NEU,XX
5,1001_DFA_SAD_XX,1001,DFA,SAD,XX
6,1001_IEO_NEU_XX,1001,IEO,NEU,XX
7,1001_IOM_ANG_XX,1001,IOM,ANG,XX
8,1001_IOM_DIS_XX,1001,IOM,DIS,XX
9,1001_IOM_FEA_XX,1001,IOM,FEA,XX


### Second part: With intensities
For reference, I also process the files with intensities.

In [10]:
files = os.listdir('../../AudioWAV_Intensity')

time_start = time.time()
df2 = pd.DataFrame(columns = ['FileID', 'ActorID', 'SentenceID', 'Emotion', 'Intensity'],
                  dtype='category')

for file in files:
    fileID = file.split('.')[0]
    cats = fileID.split('_')
    
    new_row = pd.DataFrame([[fileID] + cats],
                        columns = ['FileID', 'ActorID', 'SentenceID', 'Emotion', 'Intensity'])
    df2 = pd.concat((df2,new_row), axis=0)

# Turn columns into factors (except for FileID)
for col in df.columns:
    if col != 'FileID':
        df2[col] = (df2[col]).astype('category')

time_end = time.time()
time_total = time_end-time_start
print('Number of files: %i' %(len(files)))
if time_total <= 60:
    print('Duration: %f (s)' %time_total)
elif 60 < time_total <= 3600:
    print('Duration: %f (min)' %(time_total/60))
else:
    print('Duration: %f (h)' %(time_total/3600))

Number of files: 1365
Duration: 1.004620 (s)


In [12]:
df2.describe()

Unnamed: 0,FileID,ActorID,SentenceID,Emotion,Intensity
count,1365,1365,1365,1365,1365
unique,1365,91,1,5,3
top,1082_IEO_SAD_HI,1001,IEO,ANG,HI
freq,1,15,1365,273,455


In [13]:
print('Actors:')
print(np.unique(df2.ActorID))
print()
print('Sentence:')
print(np.unique(df2.SentenceID))
print()
print('Emotions:')
print(np.unique(df2.Emotion))
print()
print('Intensities:')
print(np.unique(df2.Intensity))

Actors:
['1001' '1002' '1003' '1004' '1005' '1006' '1007' '1008' '1009' '1010'
 '1011' '1012' '1013' '1014' '1015' '1016' '1017' '1018' '1019' '1020'
 '1021' '1022' '1023' '1024' '1025' '1026' '1027' '1028' '1029' '1030'
 '1031' '1032' '1033' '1034' '1035' '1036' '1037' '1038' '1039' '1040'
 '1041' '1042' '1043' '1044' '1045' '1046' '1047' '1048' '1049' '1050'
 '1051' '1052' '1053' '1054' '1055' '1056' '1057' '1058' '1059' '1060'
 '1061' '1062' '1063' '1064' '1065' '1066' '1067' '1068' '1069' '1070'
 '1071' '1072' '1073' '1074' '1075' '1076' '1077' '1078' '1079' '1080'
 '1081' '1082' '1083' '1084' '1085' '1086' '1087' '1088' '1089' '1090'
 '1091']

Sentence:
['IEO']

Emotions:
['ANG' 'DIS' 'FEA' 'HAP' 'SAD']

Intensities:
['HI' 'LO' 'MD']


In [14]:
pd.set_option('display.max_rows', None)
print('ActorID')
print(df2.ActorID.value_counts().sort_index().sort_values())
print()

print('SentenceID')
print(df2.SentenceID.value_counts().sort_index())
print()

print('Emotion')
print(df2.Emotion.value_counts().sort_index())
print()

print('Intensity')
print(df2.Intensity.value_counts().sort_index())
print()

ActorID
1001    15
1066    15
1065    15
1064    15
1063    15
1062    15
1061    15
1060    15
1059    15
1058    15
1057    15
1056    15
1055    15
1054    15
1053    15
1052    15
1051    15
1050    15
1049    15
1048    15
1067    15
1047    15
1068    15
1070    15
1089    15
1088    15
1087    15
1086    15
1085    15
1084    15
1083    15
1082    15
1081    15
1080    15
1079    15
1078    15
1077    15
1076    15
1075    15
1074    15
1073    15
1072    15
1071    15
1069    15
1090    15
1046    15
1044    15
1020    15
1019    15
1018    15
1017    15
1016    15
1015    15
1014    15
1013    15
1012    15
1011    15
1010    15
1009    15
1008    15
1007    15
1006    15
1005    15
1004    15
1003    15
1002    15
1021    15
1045    15
1022    15
1024    15
1043    15
1042    15
1041    15
1040    15
1039    15
1038    15
1037    15
1036    15
1035    15
1034    15
1033    15
1032    15
1031    15
1030    15
1029    15
1028    15
1027    15
1026    15
1025    15
1023    15
10