### Speech Audio to Emotion

In [3]:
import math
from pydub import AudioSegment
from speechbrain.pretrained.interfaces import foreign_class

class AudioPitchClassifier():
    """Cut a WAV recording into fixed-length chunks and label each chunk.

    The recording is loaded once at construction time. Every chunk is
    exported as its own WAV file next to the original and handed to the
    supplied classifier, whose highest-probability class index is mapped to
    one of the names in ``self.labels``.
    """

    def __init__(self, folder, filename, classifier):
        """Load ``folder//filename`` and remember the classifier to use.

        Args:
            folder: Directory holding the recording; chunk files are written
                here as well.
            filename: Name of the WAV file inside ``folder``.
            classifier: Object exposing ``classify_file(path)`` that returns a
                4-item result whose first item supports ``.tolist()``.
        """
        self.filename = filename
        self.folder = folder
        self.classifier = classifier
        self.audio = AudioSegment.from_wav(self._path(self.filename))
        # Class index -> reported label. Two indices deliberately share
        # 'distress'. NOTE(review): presumably this follows the model's label
        # encoder order (neutral, angry, happy, sad) - confirm against the
        # model card. The spelling 'excitment' is kept because downstream
        # cells count predictions by this exact string.
        self.labels = ['neutral', 'distress', 'excitment', 'distress']

    def _path(self, name):
        """Return the path of ``name`` inside ``self.folder``."""
        return self.folder + "//" + name

    def get_duration(self):
        """Return the length of the loaded recording in seconds."""
        return self.audio.duration_seconds

    def single_split(self, from_sec, to_sec, split_filename):
        """Export the ``[from_sec, to_sec)`` slice as ``split_filename`` (WAV).

        The audio object is sliced in milliseconds, hence the ``* 1000``.
        """
        start_ms = from_sec * 1000
        end_ms = to_sec * 1000
        self.audio[start_ms:end_ms].export(self._path(split_filename), format="wav")

    def predict_pitch_class(self, audio_file):
        """Classify ``audio_file`` (relative to ``self.folder``).

        Returns:
            The entry of ``self.labels`` at the index of the largest
            probability in the first row of the classifier output.
        """
        # Only the probabilities are needed; the remaining items are ignored.
        out_prob, *_ = self.classifier.classify_file(self._path(audio_file))
        probs = out_prob.tolist()[0]
        # First index holding the maximum (same tie-breaking as list.index).
        best_index = max(range(len(probs)), key=probs.__getitem__)
        return self.labels[best_index]

    def multiple_split(self, sec_per_split):
        """Split the whole recording into chunks and classify each of them.

        Chunk ``i`` starts at ``i`` seconds and is written to
        ``<i>_<filename>`` inside ``self.folder``; the final chunk may be
        shorter than ``sec_per_split``.

        Args:
            sec_per_split: Chunk length in whole seconds; must be positive.

        Returns:
            List of predicted labels, one per chunk, in chronological order.

        Raises:
            ValueError: If ``sec_per_split`` is zero or negative.
        """
        if sec_per_split <= 0:
            raise ValueError("sec_per_split must be a positive number of seconds")

        # Round up so a trailing partial second still gets its own chunk.
        total_secs = math.ceil(self.get_duration())
        predictions = []
        for start_sec in range(0, total_secs, sec_per_split):
            split_fn = str(start_sec) + '_' + self.filename
            self.single_split(start_sec, start_sec + sec_per_split, split_fn)
            predictions.append(self.predict_pitch_class(split_fn))

        # Report completion once the loop is done. Checking the loop index
        # against total_secs - sec_per_split only fires when the duration is
        # an exact multiple of the chunk length.
        if predictions:
            print('All splited successfully')

        return predictions

  from .autonotebook import tqdm as notebook_tqdm
torchvision is not available - cannot save figures


In [4]:
# Load the pretrained SpeechBrain emotion-recognition model via its custom interface.
classifier = foreign_class(
    source="speechbrain/emotion-recognition-wav2vec2-IEMOCAP",
    pymodule_file="custom_interface.py",
    classname="CustomEncoderWav2vec2Classifier",
)

# Recording to analyse: <folder>//<audio_file>.
folder = "meeting_id//speaker_2"
audio_file = "audio.wav"

# Cut the recording into 10-second chunks and classify each chunk.
split_wav = AudioPitchClassifier(folder=folder, filename=audio_file, classifier=classifier)
predictions = split_wav.multiple_split(10)

Downloading: 100%|██████████| 1.64k/1.64k [00:00<00:00, 1.87MB/s]
Downloading: 100%|██████████| 6.01k/6.01k [00:00<00:00, 4.41MB/s]
The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


Moving 0 files to the new cache system


0it [00:00, ?it/s]
Downloading: 100%|██████████| 159/159 [00:00<00:00, 136kB/s]
Downloading: 100%|██████████| 1.84k/1.84k [00:00<00:00, 484kB/s]
Downloading: 100%|██████████| 380M/380M [01:14<00:00, 5.11MB/s] 
Some weights of the model checkpoint at facebook/wav2vec2-base were not used when initializing Wav2Vec2Model: ['quantizer.weight_proj.bias', 'project_hid.weight', 'project_q.bias', 'quantizer.weight_proj.weight', 'project_hid.bias', 'quantizer.codevectors', 'project_q.weight']
- This IS expected if you are initializing Wav2Vec2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
speechbrain.lobes.models.huggingface_wav2vec - wav2

In [8]:
# Report how many chunks were assigned to each label, one line per label.
# The label spellings must match the strings produced by the classifier class.
print(
    "\n".join(
        "There are {} {} events.".format(predictions.count(name), name)
        for name in ("neutral", "excitment", "distress")
    )
)

There are 93 neutral events.
There are 7 excitment events.
There are 8 distress events.
