In [3]:
# Base
import itertools
from glob import glob
from tqdm import tqdm
import math

# ML
import torch
import torch.nn.functional as F
from torch.utils.data import DistributedSampler, DataLoader
from torch.utils.tensorboard import SummaryWriter

# Local
from utils.misc import dict_to_object, plot_specgram, plot_waveform
from utils.audio import spectogram, load_mono_audio
from utils.alignment import init_alignment, alignment

In [4]:
# Everything in this cell is placed on the CPU.
device = "cpu"
# Set up the project's alignment helper for that device
# (presumably loads the aligner model — TODO confirm in utils.alignment).
init_alignment(device)
# Load a single LJ Speech clip as mono audio. The 16000 is presumably the
# target sample rate in Hz — confirm against utils.audio.load_mono_audio.
waveform = load_mono_audio("./external_datasets/lj-speech-1.1/wavs/LJ005-0006.wav", 16000).to(device)
# Text passed to alignment() together with the waveform in the next cell
# (presumably the transcript of this clip — verify against the dataset metadata).
text = "who handed a shilling to the escort warder to provide her with a hackney coach; but this functionary pocketed the cash, and obliged the woman to walk"

In [5]:
# Align `text` against `waveform`; 16000 is the same value that was passed to load_mono_audio.
# NOTE(review): the output below is a list of (word, start, end) tuples whose start/end
# values look like fractions of the clip rather than seconds — confirm in utils.alignment.
alignment(waveform, text, 16000)

[('who', 0.0, 0.006493506493506494),
 ('handed', 0.017316017316017316, 0.05411255411255411),
 ('a', 0.06060606060606061, 0.06277056277056277),
 ('shilling', 0.08008658008658008, 0.11688311688311688),
 ('to', 0.12337662337662338, 0.12987012987012986),
 ('the', 0.1341991341991342, 0.14285714285714285),
 ('escort', 0.16017316017316016, 0.2077922077922078),
 ('warder', 0.22077922077922077, 0.2619047619047619),
 ('to', 0.27705627705627706, 0.2857142857142857),
 ('provide', 0.29004329004329005, 0.33766233766233766),
 ('her', 0.341991341991342, 0.354978354978355),
 ('with', 0.36363636363636365, 0.3787878787878788),
 ('a', 0.38095238095238093, 0.38311688311688313),
 ('hackney', 0.38961038961038963, 0.43506493506493504),
 ('coach', 0.44155844155844154, 0.487012987012987),
 ('but', 0.5562770562770563, 0.5692640692640693),
 ('this', 0.5735930735930735, 0.5909090909090909),
 ('functionary', 0.6038961038961039, 0.6666666666666666),
 ('pocketed', 0.6774891774891775, 0.7316017316017316),
 ('the', 0.7

In [8]:
def get_slopes_power_of_2(n):
    """Build ``n`` slopes as a geometric sequence.

    The first slope is ``2 ** (-(2 ** -(log2(n) - 3)))`` and that same value
    doubles as the common ratio, so element ``k`` equals ``first * first ** k``.
    As the name says, this is meant to be called with ``n`` a power of two.

    Args:
        n: Number of slopes to produce. ``math.log2`` is applied to it, so a
            non-positive value raises ``ValueError``.

    Returns:
        A list of ``n`` floats.
    """
    exponent = -(2 ** -(math.log2(n) - 3))
    first = 2 ** exponent
    step = first  # the common ratio is the first term itself
    slopes = []
    for k in range(n):
        slopes.append(first * step ** k)
    return slopes
def get_slopes(n):
    """Return ``n`` per-head slopes for any positive number of heads.

    In the paper, only models with 2^a heads (for some a) are trained, and the
    slope sequence has some good properties that only occur when the input is
    a power of 2. To maintain that even when the number of heads is not a
    power of 2, a workaround is used: take all slopes of the closest smaller
    power of two, then append every other slope of the next power of two
    until ``n`` slopes have been collected.

    Args:
        n: Number of attention heads; must be positive.

    Returns:
        A list of ``n`` floats.

    Raises:
        ValueError: If ``n`` is not positive.
    """
    if n <= 0:
        # math.log2 would otherwise fail with an opaque "math domain error".
        raise ValueError(f"n must be a positive integer, got {n!r}")

    # Exact power of two: the closed-form sequence applies directly.
    if math.log2(n).is_integer():
        return get_slopes_power_of_2(n)

    # Otherwise fall back to the workaround described in the docstring.
    closest_power_of_2 = 2 ** math.floor(math.log2(n))
    base_slopes = get_slopes_power_of_2(closest_power_of_2)
    # 2 * closest_power_of_2 is itself a power of two, so this recursion
    # terminates after a single level.
    extra_slopes = get_slopes(2 * closest_power_of_2)[0::2][: n - closest_power_of_2]
    return base_slopes + extra_slopes

In [15]:
# Example: 16 is a power of two, so get_slopes returns get_slopes_power_of_2(16) directly.
get_slopes(16)

[0.7071067811865476,
 0.5000000000000001,
 0.35355339059327384,
 0.25000000000000006,
 0.17677669529663692,
 0.12500000000000006,
 0.08838834764831849,
 0.06250000000000004,
 0.044194173824159244,
 0.03125000000000002,
 0.022097086912079626,
 0.01562500000000001,
 0.011048543456039816,
 0.007812500000000007,
 0.005524271728019908,
 0.003906250000000004]