# Speech Emotion Recognition Training

### Downloading dataset and arranging files
We are going to use the RAVDESS dataset for this purpose. 
The dataset is available [here](https://zenodo.org/record/1188976#.XJtURKczbCI).
We will use the audio-only datasets. There are 2 of them speech based and song based.
We will use speech bases system. There are ~ 1400 files with different tags.

In [3]:
import os
import requests
import sys
import zipfile
import shutil

os.getcwd()
dataset_path = os.getcwd() + '/RawData/'
dataset_url = 'https://zenodo.org/record/1188976/files/Audio_Speech_Actors_01-24.zip'

def download_file(url, folder):
    print('Downloading dataset ...')
    local_filename = url.split('/')[-1]
    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        with open(folder + local_filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192): 
                if chunk: # filter out keep-alive new chunks
                    f.write(chunk)
                    # f.flush()
    return local_filename

def unzip(filename, folder):
    print('Finished downloading dataset. Now extracting files')
    zip_ref = zipfile.ZipFile(dataset_path + "Audio_Speech_Actors_01-24.zip", 'r')
    zip_ref.extractall(dataset_path)
    zip_ref.close()
    
# Check if the folder already exists 
if os.path.isdir(dataset_path) and os.path.exists(dataset_path):
    print(dataset_path + ' exists, checking if the audio files are there')
else:
    os.makedirs(dataset_path)
    if os.path.isdir(dataset_path) and os.path.exists(dataset_path):
        print('Created '+ dataset_path)
    else:
        print('Failed to create directory: ' + dataset_path + ' Check if you have enough permissions')
        sys.exit()

# Check if the directory has files or not
audio_files = os.listdir(dataset_path)
if(len(audio_files)<1):
    local_filename = download_file(dataset_url, dataset_path)
    unzip(dataset_path + local_filename, dataset_path)
    os.remove(dataset_path + local_filename)
    print(dataset_path + local_filename + "File Removed!")

# The audio files are in /RawData/Actor_**/ folders
# Let's move all the audio files to /RawData/ folder 
# And delete all the /RawData/Actor_**/ folders.
for folder in os.listdir(dataset_path):
    if os.path.isdir(dataset_path + folder): 
        for f in os.listdir(dataset_path + folder):
            shutil.move(dataset_path + folder + '/' + f, dataset_path)
        os.rmdir(dataset_path + folder)
        continue
    else:
        continue

audio_files = os.listdir(dataset_path)
print('Number of audio files = ' + str(len(audio_files)))
if len(audio_files)>1400:
    print('Successfully prepared dataset')

Created /home/absin/git/sentenceSimilarity/speech/RawData/
Downloading dataset ...
Finished downloading dataset. Now extracting files
/home/absin/git/sentenceSimilarity/speech/RawData/Audio_Speech_Actors_01-24.zipFile Removed!
Number of audio files = 1440
Successfully prepared dataset


## Filename identifiers 

* Modality (01 = full-AV, 02 = video-only, 03 = audio-only).
* Vocal channel (01 = speech, 02 = song).
* Emotion (01 = neutral, 02 = calm, 03 = happy, 04 = sad, 05 = angry, 06 = fearful, 07 = disgust, 08 = surprised).
* Emotional intensity (01 = normal, 02 = strong). NOTE: There is no strong intensity for the 'neutral' emotion.
* Statement (01 = "Kids are talking by the door", 02 = "Dogs are sitting by the door").
* Repetition (01 = 1st repetition, 02 = 2nd repetition).
* Actor (01 to 24. Odd numbered actors are male, even numbered actors are female).

In [6]:
def get_file_metadata(file_name):
    print(file_name + ' Characteristics: ')
    split = file_name.split('-')
    if(split[0] == '01'):
        print('Modality: full-AV')
    elif(split[0] == '02'):
        print('Modality: video-only')
    elif(split[0] == '03'):
        print('Modality: audio-only')
        
    if(split[1] == '01'):
        print('Vocal channel: speech')
    elif(split[1] == '02'):
        print('Vocal channel: song')
        
    if(split[2] == '01'):
        print('Emotion: neutral')
    elif(split[2] == '02'):
        print('Emotion: calm')
    elif(split[2] == '03'):
        print('Emotion: happy')
    elif(split[2] == '04'):
        print('Emotion: sad')
    elif(split[2] == '05'):
        print('Emotion: angry')
    elif(split[2] == '06'):
        print('Emotion: fearful')
    elif(split[2] == '07'):
        print('Emotion: disgust')
    elif(split[2] == '08'):
        print('Emotion: surprised')
    
    if(split[3] == '01'):
        print('Emotional intensity: normal')
    elif(split[3] == '02'):
        print('Emotional intensity: strong')
    
    if(split[4] == '01'):
        print('Statement: Kids are talking by the door')
    elif(split[4] == '02'):
        print('Statement: Dogs are sitting by the door')
        
    if(split[5] == '01'):
        print('Repetition: 1st repetition')
    elif(split[5] == '02'):
        print('Repetition: 2nd repetition')
        
    print('Actor: ' + split[5])
    
# Let's understand what the name of a RAVDESS file-represents
get_file_metadata('03-01-01-01-01-01-03.wav')

03-01-01-01-01-01-03.wav Characteristics: 
Modality: audio-only
Vocal channel: speech
Emotion: neutral
Emotional intensity: normal
Statement: Kids are talking by the door
Repetition: 1st repetition
Actor: 01


### Feature extraction
We are