# Extract mid-term features
Extract the mid-term features from the training and test files, using the function `directory_feature_extraction` from `pyAudioAnalysis`, and save them as CSV files.

## Load modules and files

In [None]:
from pyAudioAnalysis import ShortTermFeatures as aF
from pyAudioAnalysis import audioBasicIO as aIO 
from pyAudioAnalysis import MidTermFeatures as mF
import numpy as np 
import plotly.graph_objs as go 
import plotly
import IPython

import pandas as pd

import os, shutil
import time

import matplotlib.pyplot as plt

import random

In [None]:
# List the entries of ../Data/Train.
# os.listdir returns names in arbitrary order; sort them so the listing is
# deterministic from run to run.
# NOTE(review): entries that are not audio files (if any) are still included.
files = sorted(os.listdir('../Data/Train'))

# Training files
### Extract features

In [None]:
# Compute the averaged mid-term features for the audio files in ../Data/Train.
#
# Arguments passed below:
#   directory path (str)  - folder to scan (positional)
#   mid_window   (s)      - mid-term window length
#   mid_step     (s)      - mid-term step
#   short_window (s)      - short-term window length
#   short_step   (s)      - short-term step
#   compute_beat (bool)   - whether beat features are computed
#
# The call returns three values, unpacked as
#   features_train, file_list, mid_feature_names
features_train, file_list, mid_feature_names = mF.directory_feature_extraction(
    '../Data/Train',
    mid_window=0.5,
    mid_step=0.1,
    short_window=0.05,
    short_step=0.025,
    compute_beat=False,
)

In [None]:
# Show the container type and the dimensions of the training feature matrix.
# Both are printed explicitly: a bare expression that is not the last
# statement of a cell is evaluated and its value silently discarded.
print(type(features_train))
print(features_train.shape)

### Create a dataframe
Create a dataframe with the features obtained above. We also add identifying information from the file names.

In [None]:
# Build the columns of the training data frame: file ID, actor ID, emotion,
# sentence ID, and one column per mid-term feature.
#
# The identifying columns are derived from `file_list`, the list returned
# together with `features_train` by the feature-extraction call, instead of
# from a separate (unordered) directory listing.
# NOTE(review): assumes `file_list` holds the Train paths at this point (this
# cell runs right after the Train extraction) and that entry i of `file_list`
# corresponds to row i of `features_train` - confirm against pyAudioAnalysis.

# File ID = file name without directory and extension.
# basename/splitext remove exactly the directory part and the extension;
# str.lstrip/str.strip remove *sets of characters* and can eat into the ID.
nice_names = [os.path.splitext(os.path.basename(path))[0] for path in file_list]

# Split every file ID on '_':
#   field 0 -> actor ID, field 1 -> sentence, field 2 -> emotion
actor_ids = []
emotion = []
sentence = []
for name in nice_names:
    name_parts = name.split('_')
    actor_ids.append(name_parts[0])
    sentence.append(name_parts[1])
    emotion.append(name_parts[2])

# Dict whose keys become the column names of the data frame.
train_dict = {
    'FileID': nice_names,
    'ActorID': actor_ids,
    'Emotion': emotion,
    'SentenceID': sentence,
}

# Add one column per mid-term feature: column i of the feature matrix is
# stored under the name mid_feature_names[i].
for i, feature_name in enumerate(mid_feature_names):
    train_dict[feature_name] = features_train[:, i]


In [None]:
# Turn the column dict into the training data frame and preview the first rows.
data_train = pd.DataFrame(data=train_dict)
data_train.head(n=5)

### Visualize the data frame

In [None]:
# Print a concise summary of the training data frame.
data_train.info()

In [None]:
# Number of training rows for each value of the 'Emotion' column.
data_train['Emotion'].value_counts()

In [None]:
# Number of training rows for each value of the 'SentenceID' column.
data_train['SentenceID'].value_counts()

In [None]:
# Number of training rows for each actor.
# The column is created under the key 'ActorID'; there is no 'actorID'
# attribute, so the lower-case spelling raises AttributeError.
data_train['ActorID'].value_counts()

In [None]:
# Write the training data frame to a CSV file, leaving out the index column.
data_train.to_csv(
    path_or_buf='../Feature Extraction/midFeaturesTrainSet.csv',
    index=False,
)

# Test files
### Extract features

In [None]:
# Compute the averaged mid-term features for the audio files in ../Data/Test.
#
# Arguments passed below:
#   directory path (str)  - folder to scan (positional)
#   mid_window   (s)      - mid-term window length
#   mid_step     (s)      - mid-term step
#   short_window (s)      - short-term window length
#   short_step   (s)      - short-term step
#   compute_beat (bool)   - whether beat features are computed
#
# The call returns three values, unpacked as
#   features_test, file_list, mid_feature_names
# Note that `file_list` and `mid_feature_names` are rebound here.
features_test, file_list, mid_feature_names = mF.directory_feature_extraction(
    '../Data/Test',
    mid_window=0.5,
    mid_step=0.1,
    short_window=0.05,
    short_step=0.025,
    compute_beat=False,
)

In [None]:
# Show the container type and the dimensions of the test feature matrix.
# The matrix is bound to `features_test`; no variable called `features`
# is defined in this notebook, so referring to it raises NameError.
# Both values are printed explicitly because a bare expression that is not the
# last statement of a cell is evaluated and its value silently discarded.
print(type(features_test))
print(features_test.shape)

### Create a dataframe
Create a dataframe with the features obtained above. We also add identifying information from the file names.

In [None]:
# Build the columns of the test data frame: file ID, actor ID, emotion,
# sentence ID, and one column per mid-term feature.
#
# The identifying columns are derived from `file_list`, the list returned
# together with `features_test` by the feature-extraction call on
# ../Data/Test. The global `files` must not be used here: it holds the
# listing of ../Data/Train, so it would label test rows with training IDs.
# NOTE(review): assumes `file_list` holds the Test paths at this point (this
# cell runs right after the Test extraction) and that entry i of `file_list`
# corresponds to row i of `features_test` - confirm against pyAudioAnalysis.

# File ID = file name without directory and extension.
# basename/splitext remove exactly the directory part and the extension;
# str.lstrip/str.strip remove *sets of characters* and can eat into the ID.
nice_names = [os.path.splitext(os.path.basename(path))[0] for path in file_list]

# Split every file ID on '_':
#   field 0 -> actor ID, field 1 -> sentence, field 2 -> emotion
actor_ids = []
emotion = []
sentence = []
for name in nice_names:
    name_parts = name.split('_')
    actor_ids.append(name_parts[0])
    sentence.append(name_parts[1])
    emotion.append(name_parts[2])

# Dict whose keys become the column names of the data frame.
test_dict = {
    'FileID': nice_names,
    'ActorID': actor_ids,
    'Emotion': emotion,
    'SentenceID': sentence,
}

# Add one column per mid-term feature: column i of the feature matrix is
# stored under the name mid_feature_names[i].
for i, feature_name in enumerate(mid_feature_names):
    test_dict[feature_name] = features_test[:, i]


In [None]:
# Turn the column dict into the test data frame and preview the first rows.
data_test = pd.DataFrame(data=test_dict)
data_test.head(n=5)

### Visualize the data frame

In [None]:
# Print a concise summary of the test data frame.
data_test.info()

In [None]:
# Number of test rows for each value of the 'Emotion' column.
data_test['Emotion'].value_counts()

In [None]:
# Number of test rows for each value of the 'SentenceID' column.
data_test['SentenceID'].value_counts()

In [None]:
# Number of test rows for each actor.
# The column is created under the key 'ActorID'; there is no 'actorID'
# attribute, so the lower-case spelling raises AttributeError.
data_test['ActorID'].value_counts()

In [None]:
# Write the test data frame to a CSV file, leaving out the index column.
data_test.to_csv(
    path_or_buf='../Feature Extraction/midFeaturesTestSet.csv',
    index=False,
)