In [2]:
import sys
# Put the local `src` directory on the module search path. The path is
# relative, so it is resolved against the kernel's current working directory.
# NOTE(review): presumably this is what makes `timething` importable below —
# confirm, or install the package into the environment instead.
sys.path.append('src')

In [104]:
import itertools
import typing
from pathlib import Path

import IPython.display as ipd
import matplotlib.pyplot as plt
import numpy as np
import torch
from torch.utils.data import DataLoader

from timething import dataset, job, text, utils, align  # type: ignore

# Inputs and model configuration for the alignment experiment.
# NOTE(review): `metadata` is an absolute path on one developer's machine; the
# notebook will not run elsewhere until this points at a local copy.
device = "cpu"
metadata = Path("/Users/purzelrakete/src/scratch/keanu/text.csv")
cfg = utils.load_config("english")

# `metadata` is already a Path, so it is handed over without re-wrapping.
ds = dataset.SpeechDataset(metadata, cfg.sampling_rate)
aligner = align.Aligner.build(device, cfg)

# One recording per batch, served in dataset order by a single worker.
loader = DataLoader(
    ds,
    batch_size=1,
    shuffle=False,
    num_workers=1,
    collate_fn=dataset.collate_fn,
)

In [4]:
# Pull only the first batch from the loader. The loader was built with
# batch_size=1 and shuffle=False, so this is a single item taken in dataset order.
batch = next(iter(loader))

In [7]:
# Unpack the four fields of the collated batch and inspect the shape of the first.
# NOTE(review): field meanings are inferred from later usage — xs is sliced along
# its last axis as audio samples and ys[0] is lower-cased as reference text;
# ys_original and ids are not used in the cells shown. Confirm against
# dataset.collate_fn.
xs, ys, ys_original, ids = batch
xs.shape

torch.Size([1, 1, 26759040])

In [143]:
# Cut a 30-second excerpt that starts 10 minutes into the recording.
# Sample counts are derived from cfg.sampling_rate (the rate passed to
# SpeechDataset and used for playback) rather than a hard-coded 16000, so the
# excerpt stays aligned with the audio if the configured rate ever changes.
sampling_rate = int(cfg.sampling_rate)
offset = 10 * 60 * sampling_rate    # start position, in samples
duration = 30 * sampling_rate       # excerpt length, in samples
window = xs[:, :, offset:(offset + duration)]
window

tensor([[[-3.5329e-05, -6.2365e-05,  2.2021e-05,  ..., -1.3205e-01,
          -1.5090e-01, -1.6871e-01]]])

In [144]:
# Play the excerpt back to check by ear which passage was selected.
# squeeze() drops the size-1 dimensions of the slice before it goes to the player.
ipd.Audio(window.squeeze(), rate=cfg.sampling_rate)

In [145]:
# Run the aligner's model over the excerpt and display the result.
# NOTE(review): judging by the name, the printed values and the argmax over
# dim=2 in the decoding cell, logp presumably returns per-frame log-probabilities
# shaped (batch, frames, vocab) — confirm in align.Aligner.
logprobs = aligner.logp(window)
logprobs



tensor([[[-5.8443e-04, -2.5318e+01, -2.5248e+01,  ..., -1.5519e+01,
          -1.2485e+01, -1.4603e+01],
         [-5.5550e-05, -3.0979e+01, -3.1144e+01,  ..., -1.7851e+01,
          -1.6716e+01, -1.7544e+01],
         [-1.2131e-03, -2.6593e+01, -2.6581e+01,  ..., -1.5895e+01,
          -1.3523e+01, -1.4987e+01],
         ...,
         [-2.9542e-02, -2.1105e+01, -2.1053e+01,  ..., -1.3409e+01,
          -1.0146e+01, -1.3352e+01],
         [-5.0626e+00, -1.9250e+01, -1.9189e+01,  ..., -1.2708e+01,
          -9.8022e+00, -1.3852e+01],
         [-6.1057e-01, -1.9205e+01, -1.9169e+01,  ..., -1.2520e+01,
          -4.9877e+00, -1.3373e+01]]])

In [146]:
# NOTE(review): `transcript` is only assigned in the following cell, so this
# cell raises NameError under Restart & Run All. The output shown below it does
# not match the excerpt decoded in the next cell and appears to come from an
# earlier run; move this cell after the decoding cell or delete it.
transcript

"<pad>|pr<pad>o<pad>c<pad>es<pad>s<pad>i<pad>n<pad>g<pad>|<pad>o<pad>r<pad>|<pad>t<pad>e<pad>x<pad>t<pad>|<pad>s<pad>t<pad>il<pad>l<pad>|<pad>pr<pad>o<pad>c<pad>e<pad>s<pad>s<pad>i<pad>n<pad>g<pad>|<pad>t<pad>o<pad>|<pad>f<pad>i<pad>ve<pad>-hund<pad>r<pad>ed<pad>|f<pad>i<pad>ve<pad>-h<pad>un<pad>d<pad>r<pad>e<pad>d<pad>|<pad>a<pad>l<pad>e<pad>x<pad>|p<pad>a<pad>p<pad>a<pad>d<pad>e<pad>m<pad>u<pad>s<pad>|<pad>you've|writen|a|b<pad>o<pad>o<pad>k<pad>y<pad>e<pad>|<pad>h<pad>ow<pad>|<pad>a<pad>b<pad>o<pad>u<pad>t|<pad>th<pad>a<pad>t<pad>it<pad>'s|<pad>c<pad>a<pad>l<pad>l<pad>e<pad>d<pad>|<pad>k<pad>i<pad>a<pad>n<pad>a<pad>r<pad>e<pad>v<pad>'s|m<pad>o<pad>s<pad>t<pad>|t<pad>r<pad>i<pad>u<pad>m<pad>ph<pad>a<pad>n<pad>t<pad>|<pad>t<pad>he<pad>|<pad>m<pad>o<pad>v<pad>i<pad>e<pad>s<pad>|<pad>an<pad>d<pad>|<pad>m<pad>e<pad>a<pad>n<pad>i<pad>n<pad>g<pad>|<pad>o<pad>f<pad>|an|<pad>i<pad>r<pad>r<pad>e<pad>p<pad>re<pad>s<pad>s<pad>i<pad>b<pad>le<pad>|<pad>i<pad>c<pad>o<pad>n<pad>b<pad>ut<pad>|i|have

In [147]:
# Greedy decoding of the excerpt: take the most likely token for every frame,
# collapse runs of repeated tokens, then strip the id-0 token (shown as "<pad>"
# in the raw transcript) and turn the "|" word delimiter into spaces.
d = {v: k for (k, v) in aligner.vocab().items()}  # token id -> token string
x = torch.argmax(logprobs, dim=2)

# squeeze(0) removes only the batch dimension; a bare squeeze() would also
# collapse a one-frame result into a 0-d tensor that cannot be iterated.
# tolist() converts all ids in one call instead of one .item() per frame.
tokens = [d[code] for code in x.squeeze(0).tolist()]
transcript = ''.join(c for c, _ in itertools.groupby(tokens))
cleaned = " ".join(transcript.replace(d[0], '').split("|"))
cleaned

"kurt was a different kind of rock star kianoo's a different kind of rock star too andhe has a lot of big hits he has speed which sortofconfirms him like point breakers like this guy could be an action star and ten speed comes l o this guy is an action starthenhe runs in the other directioit is a very genex thing to do when confronted with the opportunity to be superfamous hdoes hamlet canadaand he's clearly unsatisf"

In [148]:
def windows(text: str, n_chars=80) -> typing.List[str]:
    """Split ``text`` into overlapping windows of at most ``n_chars`` characters.

    Consecutive windows start half a window apart, so each window overlaps its
    neighbour by about 50%. The last window may be shorter than ``n_chars``.

    Args:
        text: The string to split.
        n_chars: Maximum window length in characters; must be a positive int.

    Returns:
        The windows in order of their start offset. Empty ``text`` gives an
        empty list. Non-empty text shorter than half a window is returned as a
        single short window instead of being dropped.

    Raises:
        ValueError: If ``n_chars`` is not positive.
    """
    if n_chars <= 0:
        raise ValueError("n_chars must be positive")
    if not text:
        return []

    # Two window starts per n_chars of text, but never fewer than one window,
    # otherwise short inputs would silently vanish from the comparison.
    n = max(1, (2 * len(text)) // n_chars)

    def offset(i):
        # Integer arithmetic keeps offsets exact for odd n_chars and long texts.
        return (i * n_chars) // 2

    return [text[offset(i):offset(i) + n_chars] for i in range(n)]
    
def shingle(text: str, k = 5):
    """Return the set of distinct ``k``-character shingles (substrings) of ``text``.

    Only full-length shingles are produced. Iterating over every start index
    would also emit the trailing fragments of length ``k-1`` down to 1, and
    those short fragments inflate the overlap between unrelated strings.

    Args:
        text: The string to shingle.
        k: Shingle length in characters; must be positive.

    Returns:
        A set of substrings of length ``k``. Text no longer than ``k`` yields a
        single shingle containing the whole text; empty text yields an empty set.

    Raises:
        ValueError: If ``k`` is not positive.
    """
    if k <= 0:
        raise ValueError("k must be positive")
    if len(text) <= k:
        return {text} if text else set()
    return {text[i:i + k] for i in range(len(text) - k + 1)}

def jaquard(a: set, b: set) -> float:
    """Return the Jaccard similarity ``|a ∩ b| / |a ∪ b|`` of two sets.

    Args:
        a: First set.
        b: Second set.

    Returns:
        A value between 0.0 and 1.0. Two empty sets are defined to have
        similarity 0.0, so that empty inputs never count as a match and the
        division by an empty union is avoided.
    """
    union = a.union(b)
    if not union:
        return 0.0
    return len(a.intersection(b)) / len(union)

def similarity(queries, candidates, threshold=0.4):
    """Print every (query, candidate) pair whose shingle sets are similar enough.

    Each string is turned into a shingle set with ``shingle`` and pairs are
    scored with ``jaquard``. For every pair scoring strictly above
    ``threshold`` the two indices are printed, followed by the query ("Q: ")
    and the candidate ("X: ").

    Args:
        queries: Iterable of query strings.
        candidates: Iterable of candidate strings.
        threshold: Similarity a pair must exceed to be reported. Defaults to
            0.4.

    Returns:
        None; results are printed.
    """
    # Shingle each candidate once up front instead of once per query.
    shingled_candidates = [
        (j, candidate, shingle(candidate))
        for j, candidate in enumerate(candidates)
    ]
    for i, query in enumerate(queries):
        query_shingles = shingle(query)  # hoisted out of the inner loop
        for j, candidate, candidate_shingles in shingled_candidates:
            if jaquard(query_shingles, candidate_shingles) > threshold:
                print(i, j)
                print("Q: ", query)
                print("X: ", candidate)
                
# Compare overlapping windows of the decoded excerpt against windows of the
# lower-cased reference text of the first batch item. Matching pairs are printed
# as "query-index candidate-index" followed by both windows.
similarity(windows(cleaned), windows(ys[0].lower()))

1 239
Q:  ianoo's a different kind of rock star too andhe has a lot of big hits he has spe
X:  different kind of rock star too. and he has a lot of big hits. he has “speed,” w
6 245
Q:  s an action starthenhe runs in the other directioit is a very genex thing to do 
X:  then he runs in the other direction. it is a very gen x thing to do when confron
7 246
Q:   directioit is a very genex thing to do when confronted with the opportunity to 
X:  is a very gen x thing to do when confronted with the opportunity to be super fam
8 247
Q:  when confronted with the opportunity to be superfamous hdoes hamlet canadaand he
X:  ted with the opportunity to be super famous. and he does hamlet in canada. and h
