In [82]:
import os
import numpy as np
from pathlib import Path
import tarfile
import pandas as pd
from IPython.display import Audio
import matplotlib.pyplot as plt

Emotional content- angry, happy, sad, neutral, frustrated, excited, fearful, 

The recorded dialogs are either improvisations of affective scenarios, or performances of theatrical scripts. They have been manually segmented into utterances. Each utterance from either of the actors in the interaction has been evaluated categorically over the set of: {angry, happy, sad, neutral, frustrated, excited, fearful, surprised, disgusted, other} by at least three different annotators, and dimensionally over the axes of: valence (positive vs. negative); activation (calm vs. excited); and dominance (passive vs. aggressive) by at least two different annotators. 

In each recording of a session only one actor wears MoCap markers while both are being recorded by microphones and cameras. Thus there are available MoCap data (facial expression, head and hand movement) for one actor per recording, while there are wavefile and videos for both actors. The naming convention regarding the data is e.g., **Ses01F_impro01** while indicates Session1, where the Female actor is wearing the markers and actors are performing improvisation 1. The release contains two formats: **dialog format** which contains data from the entire dyadic interaction and the **sentence format** where the data per dialog (recording) have been further segmented into utterances (see folders SessionX/dialog and SessionX/sentences respectively). For the utterance format the naming is as follows: Ses01F_impro01_M000 indicates first session, Female actor is wearing markers, actors are performing improvisation 1 and this is the first utterance of the Male actor. The timing of the sentences in each dialog can be found in the lab files in SessionX/dialog/lab

We are also distributing wavefiles (**sentence** and dialog format), the videos of the recordings (SessionX/dialog/avi/), and transcriptions of the dialogs (SessionX/dialog/transcriptions/). For the segmented utterances we are also providing the results of forced alignment which contain detail phoneme, syllable and word level timing information (SessionX/sentences/ForcedAlignment/).

The evaluations (emotional annotations) for each recording and each uterance are contained in folder SessionX/dialog/Evaluation/. Each file provides the detailed evaluation reports for the categorical evaluators (e.g., C-E1), the dimensional evaluators (e.g., A-E1), and the self-evaluatiors (e.g., C-F1 or C-M1, A-F1 or A-M1). The utterance-level information can be found in the first line of an utterance summary.  The first entry represents the start and end times for the utterance.  The second entry is the utterance name (e.g., Ses01_impro01_F003).  The third entry is the ground truth (if no majority ground truth could be assigned, the ground truth is labeled xxx).  The final engry is the average dimensional evaluation (over the evaluators, except the self-evaluators).

We are providing  a full release of this data in the hopes that it will provide a valuable resource to the emotion recognition community. We request that any published work using IEMOCAP should cite the paper entitled: "IEMOCAP: Interactive emotional dyadic motion capture database" (JLRE, 2008). If you do any further evaluation of the data, we request that you send us the detailed results so that we may provide a more detailed resource to the community. Also, please feel free to send us feedback regarding the database: how it is being used, if the information provided is sufficient, and how you have decided to utilize the evaluation information. Thanks again.

In [81]:
folder = os.path.join(os.getcwd(), 'IEMOCAP')

conv_id = 0

df = []
for session in os.listdir(folder):
    session_path = os.path.join(folder, session)
    # 'dialogue' folder contains Emotions and Transcripts
    # 'sentences' folder contains Audios

    trans_path = os.path.join(session_path, 'dialog', 'transcriptions')

    for trans in os.listdir(trans_path):
        if trans[:2] != '._':
            emo_path = os.path.join(session_path, 'dialog', 'EmoEvaluation', trans)
            with open(os.path.join(trans_path, trans), encoding='utf8') as f, open(emo_path, encoding='utf8') as e:
                conv_id += 1
                turn_id = 0
                for line in f:
                    name, text = line.split(':')
                    if trans.split('.')[0] in name:
                        turn_id += 1

                        wav_path = os.path.join(session_path, 'sentences', 'wav', trans.split('.')[0], name.split(' ')[0] + '.wav')

                        reached = False
                        count_em = {'Anger': 0, 'Happiness': 0, 'Sadness': 0, 'Neutral': 0, 'Frustration': 0, 'Excited': 0, 'Fear': 0, 'Surprise': 0, 'Disgust': 0, 'Other': 0}
                        for line_e in e:
                            try:
                                if name.split(' ')[0] in line_e:
                                    emotion, vad = line_e.split('\t')[-2:]
                                    vad = vad[1:-2].split(',')
                                    reached = True
                                elif line_e[0] == 'C' and reached:
                                    evaluator = line_e.split(':')[0]
                                    
                                    emotions = line_e.split(':')[1].split('(')[0].split(';')
                                    emotions = [em.strip() for em in emotions]
                                    for em in emotions:
                                        if em != '':
                                            count_em[em] += 1
                                elif reached:
                                    e.seek(0)
                                    break
                            except:
                                print(line_e)
                                    

                        row = {'conv_id': conv_id, 
                                'turn_id': turn_id, 
                                'sentence': text.strip(),
                                'path': wav_path,
                                'emotion': emotion,
                                'valence': float(vad[0]),
                                'activation': float(vad[1]),
                                'dominance': float(vad[2])
                                }
                        
                        df.append(dict(**row, **count_em))

df = pd.DataFrame(df)
df.head()

        

Unnamed: 0,conv_id,turn_id,sentence,path,emotion,valence,activation,dominance,Anger,Happiness,Sadness,Neutral,Frustration,Excited,Fear,Surprise,Disgust,Other
0,1,1,Excuse me.,c:\Users\User\OneDrive\Desktop\NLP\IEMOCAP\Ses...,neu,2.5,2.5,2.5,0,0,0,4,0,0,0,0,0,0
1,1,2,Do you have your forms?,c:\Users\User\OneDrive\Desktop\NLP\IEMOCAP\Ses...,fru,2.5,2.0,2.5,0,0,0,1,3,0,0,0,0,1
2,1,3,Yeah.,c:\Users\User\OneDrive\Desktop\NLP\IEMOCAP\Ses...,neu,2.5,2.5,2.5,1,0,0,4,0,0,0,0,0,0
3,1,4,Let me see them.,c:\Users\User\OneDrive\Desktop\NLP\IEMOCAP\Ses...,fru,2.5,2.0,2.5,0,0,0,0,3,0,0,0,0,1
4,1,5,Is there a problem?,c:\Users\User\OneDrive\Desktop\NLP\IEMOCAP\Ses...,neu,2.5,2.5,2.5,1,0,0,3,0,0,0,1,0,0


In [28]:
for em in np.unique(df.loc[:, ['emotion']]):    
    print(f"emotion {em}: {np.sum(df['emotion'] == em)}")

emotion ang: 214
emotion exc: 1946
emotion fea: 285
emotion fru: 985
emotion hap: 293
emotion neu: 3107
emotion sad: 851
emotion xxx: 2406


In [None]:
for idx in range(df.shape[0]):
    print()

In [68]:
Audio(df.iloc[5, 3])