# Get correctly identified clips
In the CREMA-D [paper](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4313618/), the audio clips were given to human raters to identify the expressed emotion but not all clips were rated correctly. This script adds a column to our dataframes indicating whether human raters correctly identified the clips or not.

In [61]:
import numpy as np
import pandas as pd

import os, shutil

import random

## Load dataframes

In [63]:
df_train = pd.read_csv('../Data/Mid_features/midFeaturesTrainFinal2.csv')
df_test = pd.read_csv('../Data/Mid_features/midFeaturesTestFinal2.csv')

df_train.sort_values(by=['FileID']);
df_test.sort_values(by=['FileID']);

In [64]:
summary = pd.read_csv('../Data/CREMADsummaryTable.csv', index_col=0)
summary

Unnamed: 0,FileName,VoiceVote,VoiceLevel,FaceVote,FaceLevel,MultiModalVote,MultiModalLevel
1,1001_IEO_NEU_XX,N,69.1,N,92.22,N,64.78
2,1001_IEO_HAP_LO,N,71.67,H,57,H,57.38
3,1001_IEO_HAP_MD,N,67.71,H,62.62,H,56.56
4,1001_IEO_HAP_HI,H,63.5,H,68.25,H,73.2
5,1001_IEO_SAD_LO,N,73.71,N,73.5,N,74.8
...,...,...,...,...,...,...,...
7438,1091_WSI_HAP_XX,N,68.12,H,65.12,H,64.8
7439,1091_WSI_SAD_XX,N,55.67,N,71.57,N,48.8
7440,1091_WSI_ANG_XX,A,34.4,A,50.43,A,67.11
7441,1091_WSI_FEA_XX,S,39.67,F,68.29,F,51.71


## Calculating the "correctness"
The column VoiceVote in summary is the emotion that human raters chose. It is codified as the initial of the emotion only, so we will compare the file name with the vote to determine if the human raters were correct or not.

In [65]:
# Creating a new column and saving the real label code as a single letter
# Filename format:
    # NNNN_SEN_EMO_XX
    # NNNN = actor code
    # SEN = sentence code
    # EMO = emotion
# The initial of the emotion is in position 9 of the file name
summary['filecode'] = ''
for i in summary.index:
    summary.loc[i,'filecode'] = summary.loc[i,'FileName'][9]

# The raters were correct if the filecode coincides with the vote
# We multiply by 1 because we prefer a 0/1 encoding rather than False/True
summary['is_correct'] = 1*(summary['filecode'] == summary['VoiceVote'])

In [60]:
summary

Unnamed: 0,FileName,VoiceVote,VoiceLevel,FaceVote,FaceLevel,MultiModalVote,MultiModalLevel,filecode,is_correct,is_correct_2,is_correct_3
1,1001_IEO_NEU_XX,N,69.1,N,92.22,N,64.78,N,1.0,True,1
2,1001_IEO_HAP_LO,N,71.67,H,57,H,57.38,H,0.0,False,0
3,1001_IEO_HAP_MD,N,67.71,H,62.62,H,56.56,H,0.0,False,0
4,1001_IEO_HAP_HI,H,63.5,H,68.25,H,73.2,H,1.0,True,1
5,1001_IEO_SAD_LO,N,73.71,N,73.5,N,74.8,S,0.0,False,0
...,...,...,...,...,...,...,...,...,...,...,...
7438,1091_WSI_HAP_XX,N,68.12,H,65.12,H,64.8,H,0.0,False,0
7439,1091_WSI_SAD_XX,N,55.67,N,71.57,N,48.8,S,0.0,False,0
7440,1091_WSI_ANG_XX,A,34.4,A,50.43,A,67.11,A,1.0,True,1
7441,1091_WSI_FEA_XX,S,39.67,F,68.29,F,51.71,F,0.0,False,0


In [66]:
# Save them all into csv
summary.to_csv('../Data/Mid_features/summary_corrLabel.csv', index=False)

## Add correctness to the midFeature dataframe

In [26]:
# Train set
df_train.insert(loc = 4,
              column = 'is_correct',
              value = np.nan)

for i in df_train.index:
    x = df_train.loc[i,'FileID']
    y = summary.loc[summary['FileName'] == x]
    df_train.loc[i,'is_correct'] = y['is_correct'].values[0]

# Test set
df_test.insert(loc = 4,
              column = 'is_correct',
              value = np.nan)

for i in df_test.index:
    x = df_test.loc[i,'FileID']
    y = summary.loc[summary['FileName'] == x]
    df_test.loc[i,'is_correct'] = y['is_correct'].values[0]

In [84]:
df_train.head()

Unnamed: 0,FileID,actorID,Emotion,SentenceID,zcr_mean,energy_mean,energy_entropy_mean,spectral_centroid_mean,spectral_spread_mean,spectral_entropy_mean,...,delta chroma_5_std,delta chroma_6_std,delta chroma_7_std,delta chroma_8_std,delta chroma_9_std,delta chroma_10_std,delta chroma_11_std,delta chroma_12_std,delta chroma_std_std,is_correct
0,1001_DFA_ANG_XX,1001,ANG,DFA,0.12174,0.010421,2.983526,0.216327,0.225219,0.932025,...,0.018907,0.020462,0.016111,0.006406,0.00784,0.013986,0.014026,0.003441,0.00775,1.0
1,1001_DFA_DIS_XX,1001,DIS,DFA,0.161743,0.00653,2.940205,0.246532,0.212951,1.265724,...,0.024405,0.024236,0.024355,0.00888,0.00261,0.004799,0.011605,0.003827,0.010198,1.0
2,1001_DFA_FEA_XX,1001,FEA,DFA,0.158708,0.015425,2.973619,0.252136,0.225729,1.245681,...,0.003273,0.00796,0.058401,0.010373,0.00383,0.009172,0.025511,0.005837,0.017773,1.0
3,1001_DFA_HAP_XX,1001,HAP,DFA,0.159097,0.00576,2.937929,0.229749,0.208469,1.379728,...,0.006455,0.007594,0.043598,0.007653,0.011884,0.015029,0.013349,0.014063,0.012297,0.0
4,1001_DFA_NEU_XX,1001,NEU,DFA,0.164732,0.008302,2.892321,0.264956,0.227461,1.380184,...,0.014561,0.020798,0.051023,0.011482,0.004178,0.002889,0.015255,0.007335,0.016231,1.0


In [85]:
df_test.head()

Unnamed: 0,FileID,actorID,Emotion,SentenceID,zcr_mean,energy_mean,energy_entropy_mean,spectral_centroid_mean,spectral_spread_mean,spectral_entropy_mean,...,delta chroma_5_std,delta chroma_6_std,delta chroma_7_std,delta chroma_8_std,delta chroma_9_std,delta chroma_10_std,delta chroma_11_std,delta chroma_12_std,delta chroma_std_std,is_correct
0,1006_DFA_ANG_XX,1006,ANG,DFA,0.158988,0.019054,2.838967,0.254748,0.231145,1.284158,...,0.029961,0.00997,0.039871,0.001132,0.010264,0.016142,0.023298,0.007521,0.013553,1.0
1,1006_DFA_DIS_XX,1006,DIS,DFA,0.158327,0.008196,2.790725,0.241138,0.211998,0.922673,...,0.034746,0.037144,0.045503,0.002213,0.005307,0.009227,0.011744,0.008643,0.016935,1.0
2,1006_DFA_FEA_XX,1006,FEA,DFA,0.136295,0.018828,2.762704,0.20928,0.214652,0.719738,...,0.01631,0.036891,0.039144,0.004587,0.012374,0.00599,0.019046,0.005988,0.014511,0.0
3,1006_DFA_HAP_XX,1006,HAP,DFA,0.134154,0.010857,2.881997,0.231003,0.220957,0.989411,...,0.020426,0.015004,0.033402,0.02334,0.012938,0.019577,0.021913,0.00782,0.01302,1.0
4,1006_DFA_NEU_XX,1006,NEU,DFA,0.144614,0.016931,2.826965,0.21985,0.200363,0.913796,...,0.011041,0.031599,0.027577,0.015176,0.01189,0.008234,0.014857,0.014921,0.012871,1.0


Save results to disk

In [154]:
df_train.to_csv('../Data/Mid_features/df_train_corrLabel.csv',
                index=False)
df_test.to_csv('../Data/Mid_features/df_test_all.csv',
               index=False)

### Save dataframes with only the correct responses

In [157]:
df_test_corrlabel = df_test[df_test['is_correct'] == 1]
df_test_corrlabel

Unnamed: 0,FileID,actorID,Emotion,SentenceID,is_correct,zcr_mean,energy_mean,energy_entropy_mean,spectral_centroid_mean,spectral_spread_mean,...,delta chroma_4_std,delta chroma_5_std,delta chroma_6_std,delta chroma_7_std,delta chroma_8_std,delta chroma_9_std,delta chroma_10_std,delta chroma_11_std,delta chroma_12_std,delta chroma_std_std
0,1006_DFA_ANG_XX,1006,ANG,DFA,1.0,0.205303,0.026108,2.936040,0.295857,0.220094,...,0.009290,0.011798,0.012141,0.005553,0.001191,0.005477,0.013535,0.017165,0.008230,0.008214
1,1006_DFA_DIS_XX,1006,DIS,DFA,1.0,0.142912,0.009082,2.815900,0.227427,0.198710,...,0.043360,0.034025,0.037411,0.016621,0.002165,0.006136,0.005255,0.008078,0.006201,0.013113
3,1006_DFA_HAP_XX,1006,HAP,DFA,1.0,0.117750,0.012498,2.945889,0.220303,0.219790,...,0.033142,0.021647,0.012535,0.028002,0.018135,0.012021,0.022196,0.020446,0.008418,0.012261
4,1006_DFA_NEU_XX,1006,NEU,DFA,1.0,0.118134,0.016632,2.802065,0.193803,0.183517,...,0.026941,0.010169,0.038335,0.015219,0.011879,0.015302,0.008216,0.013460,0.016431,0.012039
5,1006_DFA_SAD_XX,1006,SAD,DFA,1.0,0.115795,0.014708,2.844136,0.204687,0.210093,...,0.022490,0.013506,0.030509,0.026745,0.006870,0.014035,0.009864,0.018840,0.012973,0.013929
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1192,1089_TSI_NEU_XX,1089,NEU,TSI,1.0,0.301379,0.019511,2.851780,0.379223,0.250684,...,0.028692,0.024063,0.006848,0.013698,0.002913,0.013846,0.009565,0.003859,0.003258,0.009791
1194,1089_WSI_ANG_XX,1089,ANG,WSI,1.0,0.229983,0.045743,2.994412,0.316311,0.226412,...,0.002990,0.001033,0.004155,0.003678,0.001318,0.019570,0.003408,0.002347,0.026694,0.009494
1195,1089_WSI_DIS_XX,1089,DIS,WSI,1.0,0.383122,0.012807,2.857422,0.432665,0.261348,...,0.004918,0.013561,0.010522,0.009827,0.005967,0.010294,0.010915,0.004520,0.010917,0.005369
1198,1089_WSI_NEU_XX,1089,NEU,WSI,1.0,0.139094,0.008446,2.883838,0.221700,0.209398,...,0.023018,0.010559,0.011100,0.021006,0.007919,0.003927,0.009988,0.009861,0.020112,0.007493


In [None]:
df_test_corrlabel.to_csv('../Data/Mid_features/df_test_only_corrLabel.csv', index=False)

Percentage of correct responses in both the train and test sets

In [161]:
sum(df_train['is_correct'])/len(df_train['is_correct'])

0.4273995077932732

In [162]:
sum(df_test['is_correct'])/len(df_test['is_correct'])

0.4658333333333333

## Do the same for Categories.csv files

In [86]:
df_cat_train = pd.read_csv('../Data/Categories_train.csv')
df_cat_test = pd.read_csv('../Data/Categories_test.csv')
df_cat = pd.read_csv('../Data/Categories.csv')

df_cat_train.sort_values(by=['FileID']);
df_cat_test.sort_values(by=['FileID']);
df_cat.sort_values(by=['FileID']);

In [91]:
# Train test
df_cat_train.insert(loc = 5,
              column = 'is_correct',
              value = np.nan)

for i in df_cat_train.index:
    x = df_cat_train.loc[i,'FileID']
    y = summary.loc[summary['FileName'] == x]
    df_cat_train.loc[i,'is_correct'] = y['is_correct'].values[0]

# Test set
df_cat_test.insert(loc = 5,
              column = 'is_correct',
              value = np.nan)

for i in df_cat_test.index:
    x = df_cat_test.loc[i,'FileID']
    y = summary.loc[summary['FileName'] == x]
    df_cat_test.loc[i,'is_correct'] = y['is_correct'].values[0]

# Full dataset
df_cat.insert(loc = 5,
              column = 'is_correct',
              value = summary['is_correct'])

In [96]:
# Save to disk
df_cat_train.to_csv('../Data/Categories_train_corr.csv', index=False)
df_cat_test.to_csv('../Data/Categories_test_corr.csv', index=False)
df_cat.to_csv('../Data/Categories_corr.csv', index=False)

#### Visualize datasets

In [94]:
df_cat_train

Unnamed: 0,FileID,ActorID,SentenceID,Emotion,Intensity,is_correct,Old_Dur,New_Dur
0,1001_DFA_ANG_XX,1001,DFA,ANG,XX,1.0,2.275562,1.66
1,1001_DFA_DIS_XX,1001,DFA,DIS,XX,1.0,2.335688,1.84
2,1001_DFA_FEA_XX,1001,DFA,FEA,XX,1.0,2.168813,1.58
3,1001_DFA_HAP_XX,1001,DFA,HAP,XX,0.0,1.868500,1.66
4,1001_DFA_NEU_XX,1001,DFA,NEU,XX,1.0,2.035375,1.74
...,...,...,...,...,...,...,...,...
4871,1091_WSI_DIS_XX,1091,WSI,DIS,XX,0.0,2.502500,1.96
4872,1091_WSI_FEA_XX,1091,WSI,FEA,XX,0.0,2.502500,1.66
4873,1091_WSI_HAP_XX,1091,WSI,HAP,XX,0.0,2.168813,1.90
4874,1091_WSI_NEU_XX,1091,WSI,NEU,XX,1.0,2.335688,1.88


In [95]:
df_cat_test

Unnamed: 0,FileID,ActorID,SentenceID,Emotion,Intensity,is_correct,Old_Dur,New_Dur
0,1006_DFA_ANG_XX,1006,DFA,ANG,XX,1.0,2.602625,2.22
1,1006_DFA_DIS_XX,1006,DFA,DIS,XX,1.0,2.769437,2.42
2,1006_DFA_FEA_XX,1006,DFA,FEA,XX,0.0,2.369000,1.78
3,1006_DFA_HAP_XX,1006,DFA,HAP,XX,1.0,2.302313,1.82
4,1006_DFA_NEU_XX,1006,DFA,NEU,XX,1.0,2.569250,1.84
...,...,...,...,...,...,...,...,...
1195,1089_WSI_DIS_XX,1089,WSI,DIS,XX,1.0,4.070750,2.68
1196,1089_WSI_FEA_XX,1089,WSI,FEA,XX,0.0,2.302313,1.64
1197,1089_WSI_HAP_XX,1089,WSI,HAP,XX,0.0,2.202187,1.82
1198,1089_WSI_NEU_XX,1089,WSI,NEU,XX,1.0,2.569250,1.66


In [159]:
df_cat

Unnamed: 0,FileID,ActorID,SentenceID,Emotion,Intensity,is_correct
0,1001_DFA_ANG_XX,1001,DFA,ANG,XX,1.0
1,1001_DFA_DIS_XX,1001,DFA,DIS,XX,0.0
2,1001_DFA_FEA_XX,1001,DFA,FEA,XX,0.0
3,1001_DFA_HAP_XX,1001,DFA,HAP,XX,1.0
4,1001_DFA_NEU_XX,1001,DFA,NEU,XX,0.0
...,...,...,...,...,...,...
6071,1091_WSI_DIS_XX,1091,WSI,DIS,XX,1.0
6072,1091_WSI_FEA_XX,1091,WSI,FEA,XX,1.0
6073,1091_WSI_HAP_XX,1091,WSI,HAP,XX,0.0
6074,1091_WSI_NEU_XX,1091,WSI,NEU,XX,1.0
