In [3]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [30]:
from pathlib import Path
import math

import torch
import numpy as np
import pandas as pd
import scipy
import textgrids
import librosa
import python_speech_features as psf

import matplotlib.pyplot as plt

%matplotlib inline

In [5]:
from load_data import *

In [6]:
# Look up the audio file and the matching annotation file for one recording.
# NOTE(review): the argument is presumably a path relative to the dataset
# root used by load_data.get_file_paths — confirm there.
audio_path, ann_path = get_file_paths("Female/TMIT/SI2220")

In [11]:
# Load the waveform together with its sampling rate, then report the number
# of samples and the rate, one value per line.
signal, frequency = read_audio(audio_path)
print(len(signal), frequency, sep="\n")

64492
22050


In [13]:
annotations = read_annotation(ann_path)

# Expand the (duration, label) annotations into one label per audio sample.
#
# Segment boundaries are rounded on the *cumulative* time instead of rounding
# every segment length on its own. Rounding each segment independently lets
# up to half a sample of error accumulate per segment; rounding the running
# boundary keeps the total label count equal to
# round(total_duration * frequency), however many segments there are.
#
# NOTE(review): a remaining mismatch with len(signal) would mean the
# annotations do not span the whole recording — decide whether to pad or trim
# in that case.
signal_labels = []
elapsed = 0.0        # total annotated time so far (same unit as `duration`)
prev_boundary = 0    # sample index where the previous segment ended
for (duration, label) in annotations:
    elapsed += duration
    boundary = int(np.round(elapsed * frequency))
    num_dur_samples = boundary - prev_boundary
    # A non-positive count yields an empty list, i.e. the segment adds nothing.
    signal_labels.extend([label] * num_dur_samples)
    prev_boundary = boundary

print(len(signal_labels))

64492


In [42]:
# Framing parameters handed to python_speech_features (in seconds there).
winlen = 0.025   # length of one analysis window
winstep = 0.01   # hop between the starts of consecutive windows

# Number of mel filters, i.e. the width of the feature row produced per frame.
num_mel_filters = 32

# Number of consecutive frames grouped into a single spectrogram of the
# sequence. Every frame yields one row of num_mel_filters (= 32) values;
# num_seq_frames (= 32) such rows stacked on top of each other form one
# image (spectrogram).
num_seq_frames = 32

In [62]:
def calc_nfft(frequency, win_len=None):
    """Return the FFT size for one analysis frame.

    The result is the smallest power of two that is greater than or equal to
    the frame length in samples, ``round(win_len * frequency)``.

    Args:
        frequency: Sampling rate of the signal, in samples per second.
        win_len: Frame duration in seconds. When omitted, the notebook-level
            ``winlen`` is used, so existing ``calc_nfft(frequency)`` calls
            behave as before.

    Returns:
        int: A power of two >= the frame length in samples.

    Raises:
        ValueError: If the frame would contain no samples (the rounded frame
            length is smaller than 1).
    """
    if win_len is None:
        # Fall back to the notebook configuration for backward compatibility.
        win_len = winlen

    est_frame_len = int(np.round(win_len * frequency))
    if est_frame_len < 1:
        raise ValueError(
            f"frame length must be at least 1 sample, got {est_frame_len} "
            f"(win_len={win_len}, frequency={frequency})"
        )

    # For n >= 1, (n - 1).bit_length() equals ceil(log2(n)); doing it on
    # integers avoids floating-point log2 altogether.
    return 1 << (est_frame_len - 1).bit_length()

def vote_for_label(labels):
    """Return the most frequent value in ``labels`` as a plain ``int``.

    The vote is taken with ``scipy.stats.mode`` along axis 0, so ties are
    resolved however that function resolves them.
    """
    majority = scipy.stats.mode(labels, axis=0).mode
    return int(majority.item())

In [28]:
# Log mel-filterbank energies, one feature row per analysis frame.
# Resulting shape: (num_frames, num_mel_filters)
frame_emb = psf.base.logfbank(
    signal,
    samplerate=frequency,
    winlen=winlen,
    winstep=winstep,
    nfilt=num_mel_filters,
    nfft=calc_nfft(frequency),  # power of two covering one full frame
    lowfreq=0,
    highfreq=None,  # leave the upper band edge at the library default
)
print(frame_emb.shape)

(291, 32)


In [31]:
# Cut the per-sample labels into frames using the same window length and hop
# as the features, so row i here lines up with row i of frame_emb.
# winfunc=np.ones: presumably an all-ones window so the framing leaves the
# label values unchanged — confirm against python_speech_features.
# NOTE(review): framesig appears to zero-pad the final partial frame; check
# that a padded 0 cannot win the vote for a real label in the last frame.
# Shape: (num_frames, frame_len)
frame_labels = psf.sigproc.framesig(
    signal_labels,
    frame_len=winlen * frequency,
    frame_step=winstep * frequency,
    winfunc=np.ones,
)
print(frame_labels.shape)

# Majority vote inside every frame; keepdims keeps the result two-dimensional.
# Shape: (num_frames, 1)
frame_labels = scipy.stats.mode(frame_labels, axis=1, keepdims=True).mode
print(frame_labels.shape)

(291, 551)
(291, 1)


In [63]:
num_frames = len(frame_labels)
print(f"{num_frames=}")

# Number of spectrograms in the sequence: num_frames / num_seq_frames rounded
# up, computed as an integer ceiling division.
seq_len = (num_frames + num_seq_frames - 1) // num_seq_frames
print(f"{seq_len=}")

num_frames=291
seq_len=10


In [None]:
# Walk over the frames in consecutive chunks of num_seq_frames. Each chunk
# yields one spectrogram (its feature rows) and one label (the majority vote
# over the chunk's per-frame labels).
# NOTE: when num_frames is not a multiple of num_seq_frames the last chunk is
# shorter than the others (e.g. 291 frames -> a final chunk of 3 rows).
X_seq, Y_seq = [], []
for start in range(0, num_frames, num_seq_frames):
    stop = start + num_seq_frames
    X_seq.append(frame_emb[start:stop, :])
    Y_seq.append(vote_for_label(frame_labels[start:stop, 0]))