# Testing the pyAudioAnalysis module
We used this file to explore the functions available in `pyAudioAnalysis`. We didn't save or use the results in this file in any other place. It is here purely for reference and exploration of the module.

In [None]:
from pyAudioAnalysis import ShortTermFeatures as aF
from pyAudioAnalysis import audioBasicIO as aIO 
import numpy as np 
import plotly.graph_objs as go 
import plotly
import IPython

import matplotlib.pyplot as plt

Load a couple of files. Here we are looking at the duration of the clips and the size of the arrays `s1` and `s2`.

In [None]:
# Read the first clip; judging by how they are used below, `fs1` is the
# sampling rate and `s1` the array of samples.
clip1_path = '../Data/Train/1001_DFA_SAD_XX.wav'
fs1, s1 = aIO.read_audio_file(clip1_path)
print(fs1, s1.shape, sep='\n')

# Clip length in seconds = number of samples / sampling rate.
duration = len(s1) / float(fs1)
print(f'duration = {duration} seconds')

In [None]:
# Read the second clip the same way as the first one.
clip2_path = '../Data/Train/1002_IEO_SAD_LO.wav'
fs2, s2 = aIO.read_audio_file(clip2_path)
print(fs2, s2.shape, sep='\n')

# Clip length in seconds = number of samples / sampling rate.
# Note: this overwrites the `duration` computed for the first clip.
duration = len(s2) / float(fs2)
print(f'duration = {duration} seconds')

Since `s2` is longer than `s1`, we add enough zeros to `s1` to match the length of `s2`.

**We might change this step depending on future analysis**

In [None]:
# Append zeros to the end of `s1` (none at the start) so that it ends up with
# as many samples as `s2`. np.pad rejects a negative width, so this relies on
# `s2` being the longer clip.
before, after = 0, len(s2) - len(s1)
s1_pad = np.pad(s1, (before, after), mode='constant', constant_values=0)

print(s1_pad.shape)

Listen to the audio with extra zeros

In [None]:
# Play the zero-padded samples at the first clip's sampling rate.
IPython.display.Audio(s1_pad, rate=fs1)

Listen to the original audio

In [None]:
# Play the clip straight from the .wav file for comparison.
# NOTE(review): this reads from 'AudioWAV/' while the same clip was loaded from
# '../Data/Train/' above -- confirm both locations hold the same file.
original_clip = IPython.display.Audio('AudioWAV/1001_DFA_SAD_XX.wav')
IPython.display.display(original_clip)

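Note: The original audio seems quieter than the zero-padded version. This is most likely because `IPython.display.Audio` rescales (normalizes) array data to the full amplitude range by default, whereas audio loaded from a file is played back as stored.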

### Short term features
Use pyAudioAnalysis to extract features from the sounds. Pay attention to the number of frames obtained.

In [None]:
# Extract short-term features using 25 msec non-overlapping windows
# (window length == step length, so consecutive frames do not overlap).
#
# NOTE(review): this cell originally read `s` and `fs`, which are only defined
# in a later cell, so it raised NameError on a fresh kernel. They are bound
# here to the first clip loaded above -- confirm this is the intended clip.
fs, s = fs1, s1
# Keep `duration` in sync with the clip that is actually analysed; it was last
# computed for the second clip.
duration = len(s) / float(fs)

win, step = 0.025, 0.025
[f, fn] = aF.feature_extraction(s, fs, int(fs * win),
                                int(fs * step))
# `f` has one row per feature and one column per frame; `fn` lists the names.
print(f'{f.shape[1]} frames, {f.shape[0]} short-term features')
print('Feature names:')
for i, nam in enumerate(fn):
    print(f'{i}:{nam}')

In [None]:
# plot short-term energy
# Create the time axis (seconds) from the number of extracted frames, one
# entry per column of `f`. Deriving it from `duration` is fragile: `duration`
# may belong to a different clip than `f`, and the float range can come out
# one element short of the frame count.
# NOTE: the name `time` is kept because a later cell reads it.
time = np.arange(f.shape[1]) * step
# get the feature whose name is 'energy'
energy = f[fn.index('energy'), :]
mylayout = go.Layout(yaxis=dict(title="frame energy value"),
                     xaxis=dict(title="time (sec)"))
plotly.offline.iplot(go.Figure(data=[go.Scatter(x=time,
                                                y=energy)],
                               layout=mylayout))


## Explore more
I got the above code from https://hackernoon.com/intro-to-audio-analysis-recognizing-sounds-using-machine-learning-qy2r3ufl. In this section, I want to understand the data types that the functions return.

In [None]:
# Extract short-term features using 50 msec non-overlapping windows.
#
# NOTE(review): this cell originally read `s` and `fs`, which are only defined
# in a later cell (NameError on a fresh kernel). It now uses the first clip
# loaded above (`s1`, `fs1`) -- confirm this is the intended clip.
win, step = 0.050, 0.050
# Window and step lengths converted from seconds to samples.
win_t = int(fs1 * win)
step_t = int(fs1 * step)

print('fs = %f' %fs1)
print('win_t = %i' %win_t)
print('step_t = %i' %step_t)

# Reuse the sample counts computed above instead of recomputing them inline.
[f, fn] = aF.feature_extraction(s1, fs1, win_t, step_t)

In [None]:
# Show the container type and size of both objects returned by
# aF.feature_extraction: the feature-name list `fn` and the feature matrix `f`.
for label, obj, size in (('fn', fn, len(fn)), ('f', f, f.shape)):
    print(label)
    print(type(obj))
    print(size)

In [None]:
e_idx = fn.index('energy')
# Build the x axis from the current feature matrix. The `time` array from the
# earlier plotting cell was created for 25 msec frames, while `f` now holds
# 50 msec frames, so reusing it gives x and y different lengths and makes
# plt.plot raise a ValueError.
frame_times = np.arange(f.shape[1]) * step
plt.plot(frame_times, f[e_idx, :])
plt.xlabel('time (sec)')
plt.ylabel('frame energy value');

## Another clip
The above works as I expected: `aF.feature_extraction` gives me a matrix with 68 features (rows) and a varying number of frames (columns). I'm still not sure where the number of frames comes from, but everything else makes sense. Now I'll try another clip -- possibly with a different duration -- and see if I again get a 68-by-40 matrix.

In [None]:
# Load a different clip, offer it for playback, and report its length.
clip_path = 'AudioWAV/1002_IEO_HAP_HI.wav'
fs, s = aIO.read_audio_file(clip_path)
IPython.display.display(IPython.display.Audio(clip_path))

# Clip length in seconds = number of samples / sampling rate.
duration = len(s) / float(fs)
print(f'duration = {duration} seconds')

In [None]:
# Extract short-term features with 25 msec non-overlapping windows
# (win == step, so adjacent frames do not overlap).
win, step = 0.025, 0.025
win_samples = int(fs * win)    # window length in samples
step_samples = int(fs * step)  # hop length in samples
[f, fn] = aF.feature_extraction(s, fs, win_samples, step_samples)

# Rows of `f` are features, columns are frames.
n_features, n_frames = f.shape[0], f.shape[1]
print(f'{n_frames} frames, {n_features} short-term features')

No, in this new clip, I get 100 frames instead of 80.

In [None]:
# plot short-term energy
# Create the time axis (seconds) with exactly one entry per extracted frame.
# A range derived from `duration` can end up one element short of the frame
# count (e.g. when the clip length is an exact multiple of the window).
# The axis is not called `time`, so it does not clash with the `time` module
# imported in the next section.
time_axis = np.arange(f.shape[1]) * step
# get the feature whose name is 'energy'
energy = f[fn.index('energy'), :]
mylayout = go.Layout(yaxis=dict(title="frame energy value"),
                     xaxis=dict(title="time (sec)"))
plotly.offline.iplot(go.Figure(data=[go.Scatter(x=time_axis,
                                                y=energy)],
                               layout=mylayout))


## Different lengths...
OK, so different clips give matrices of different sizes. Next, I'll import a small batch of clips (10, 50, 100?) and see how long it takes to extract their features.

In [None]:
import time
from os import listdir

# os.listdir returns names in arbitrary order. Sorting makes "the first N
# files" the same subset on every run and every machine, so timings and
# results for a batch are reproducible.
files = sorted(listdir('AudioWAV'))

In [None]:
# Extract features for a batch of clips and time the whole run.
freqs = []      # sampling rate of each clip
signals = []    # raw samples of each clip
features = []   # feature matrix of each clip

win, step = 0.050, 0.050
# Slicing instead of indexing with range(500) avoids an IndexError when the
# directory holds fewer than 500 files.
batch = files[:500]

# `time` is the module (imported with `import time`), so the clock must be
# read with time.time(); calling time() directly raises TypeError.
time_start = time.time()
for filename in batch:
    # NOTE(review): the names in `files` were listed from 'AudioWAV', but the
    # clips are read from '../../CREMA-D/AudioWAV/' -- confirm both refer to
    # the same directory.
    fs, s = aIO.read_audio_file('../../CREMA-D/AudioWAV/%s' %filename)
    [f, fn] = aF.feature_extraction(s, fs, int(fs * win), int(fs * step))

    freqs.append(fs)
    signals.append(s)
    features.append(f)

time_end = time.time()
time_total = time_end-time_start

# Report the elapsed time in the most readable unit.
if time_total <= 60:
    print('Duration: %f (s)' %time_total)
elif time_total <= 3600:
    print('Duration: %f (min)' %(time_total/60))
else:
    print('Duration: %f (h)' %(time_total/3600))

Note: Duration per number of files
- 10 files: 0.411219 (s)
- 100 files: 5.666346 (s)
- 500 files: 28.749217 (s)

## Actor distribution
Here I create a dataframe with the actor code, the sentence spoken, the emotion conveyed, and the intensity in each clip. I'll dump it in a .csv file so that we don't need to do this again.

In [None]:
import os
import time
import pandas as pd

In [None]:
# Build a table with one row per clip by splitting each file name of the form
# "<actor>_<sentence>_<emotion>_<intensity>.<ext>" into its four fields.
columns = ['actor', 'sentence', 'emotion', 'intensity']
files = os.listdir('AudioWAV')

time_start = time.time()
# Collect all rows first and create the DataFrame once. Growing it with
# pd.concat inside the loop copies the whole frame on every iteration
# (quadratic time) and leaves every row with index label 0.
records = []
for file in files:
    cats = file.split('.')[0].split('_')
    if len(cats) != len(columns):
        # A name that does not split into four fields is still an error, as
        # it was before, but the message now names the offending file.
        raise ValueError('Unexpected file name %r: expected %i "_"-separated fields'
                         % (file, len(columns)))
    records.append(cats)
df = pd.DataFrame(records, columns=columns)

time_end = time.time()
time_total = time_end-time_start
print('Number of files: %i' %(len(files)))
# Report the elapsed time in the most readable unit.
if time_total <= 60:
    print('Duration: %f (s)' %time_total)
elif time_total <= 3600:
    print('Duration: %f (min)' %(time_total/60))
else:
    print('Duration: %f (h)' %(time_total/3600))

In [None]:
# Save the table so the file names do not have to be parsed again.
# The DataFrame index is written as the first (unnamed) column.
df.to_csv(path_or_buf='Categories.csv')

In [None]:
# Display pandas' summary statistics for the columns of `df`.
df.describe()

In [None]:
# List the distinct values found in each column of the table.
# The 'Actors:' heading previously printed the sentence codes; each heading
# now shows its own column, and sentences get their own heading.
print('Actors:')
print(np.unique(df.actor))
print()
print('Sentences:')
print(np.unique(df.sentence))
print()
print('Emotions:')
print(np.unique(df.emotion))
print()
print('Intensities:')
print(np.unique(df.intensity))

In [None]:
# Count the clips whose intensity code is 'XX'.
print((df.intensity == 'XX').sum())

In [None]:
# NOTE(review): hand-computed clip count, presumably 91 actors x (11 sentences
# x 6 emotions + 1 extra clip each) -- confirm what this is compared against.
91 * (11 * 6 + 1)