In [None]:
from collections import Counter, defaultdict
import glob
import json

import IPython.display as ipd
import librosa
import librosa.display
import numpy as np
import matplotlib.pyplot as plt
import soundfile as sf
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

In [None]:
def get_mfcc(audio):
    """Compute 13 MFCCs for a signal and standardize each coefficient over time.

    Frames are 512 samples long (Hann window) with a hop of 160 samples.
    After extraction, every coefficient row is shifted to zero mean and
    scaled to unit standard deviation across the time axis.

    Args:
        audio: 1-D array of audio samples, interpreted at the global ``SR``.

    Returns:
        The normalized MFCC matrix, one row per coefficient and one column
        per frame.
    """
    n_fft = 512
    hop_length = 160   # sr * .01
    # The signal must be passed as `y=`: librosa >= 0.10 made it keyword-only,
    # so the positional form raises TypeError there.
    mfcc = librosa.feature.mfcc(y=audio, sr=SR, n_mfcc=13, n_fft=n_fft, hop_length=hop_length, win_length=n_fft, window='hann')

    mean = mfcc.mean(axis=1, keepdims=True)
    std = mfcc.std(axis=1, keepdims=True)
    # A coefficient that never varies has std == 0; dividing by 1 instead
    # yields zeros for that row rather than NaN/inf.
    std[std == 0] = 1.0
    mfcc = (mfcc - mean) / std
    return mfcc

In [None]:
def get_spectrogram(audio):
    """Return the power spectrogram of ``audio`` in decibels.

    The STFT uses a 2048-sample Hann window with a hop of 160 samples. The
    squared magnitude is converted to dB with the spectrogram's own maximum
    as the reference, so the loudest bin sits at 0 dB.

    Args:
        audio: 1-D array of audio samples.

    Returns:
        2-D array of dB values, frequency bins by frames.
    """
    window_size = 2048
    frame_step = 160   # sr * .01
    stft = librosa.stft(
        audio,
        n_fft=window_size,
        hop_length=frame_step,
        win_length=window_size,
        window='hann',
    )
    power = np.abs(stft) ** 2
    return librosa.power_to_db(power, ref=np.max)

In [None]:
# Morse timing, in units:
#   dot = 1, dash = 3
#   gap between symbols of one letter = 1
#   gap between letters = 3
#   gap between words = 7

# Dot/dash sequence for each letter, written without the intra-letter gaps.
_MORSE_SYMBOLS = {
    'A': '.-',   'B': '-...', 'C': '-.-.', 'D': '-..',  'E': '.',
    'F': '..-.', 'G': '--.',  'H': '....', 'I': '..',   'J': '.---',
    'K': '-.-',  'L': '.-..', 'M': '--',   'N': '-.',   'O': '---',
    'P': '.--.', 'Q': '--.-', 'R': '.-.',  'S': '...',  'T': '-',
    'U': '..-',  'V': '...-', 'W': '.--',  'X': '-..-', 'Y': '-.--',
    'Z': '--..',
}

# Character -> Morse string. Symbols inside a letter are separated by a single
# space (the 1-unit gap); the space character maps to the 7-unit word gap.
CHAR2CODE = {letter: ' '.join(symbols) for letter, symbols in _MORSE_SYMBOLS.items()}
CHAR2CODE[' '] = ' ' * 7

SR = 16000       # sample rate in Hz
TONE_HZ = 880    # pitch of the Morse tone

unit = 0.10      # duration of one Morse time unit, in seconds
_unit_samples = int(round(unit * SR))

# endpoint=False keeps the sample spacing at exactly 1/SR. Including the
# endpoint would stretch the spacing to unit/(N-1), which shifts the tone
# slightly above TONE_HZ and makes the dash differ from three back-to-back dots.
u = np.linspace(0, unit, _unit_samples, endpoint=False)
u3 = np.linspace(0, unit*3, 3 * _unit_samples, endpoint=False)
dot = np.sin(2 * np.pi * TONE_HZ * u)     # 1 unit of tone
dash = np.sin(2 * np.pi * TONE_HZ * u3)   # 3 units of tone
space = np.zeros_like(u)                  # 1 unit of silence

# Morse symbol -> waveform segment to concatenate for it.
CODE2AUDIO = {
    '.': dot,
    '-': dash,
    ' ': space,
}

def text2code(text):
    """Translate text into a Morse string plus per-word spans.

    The text is upper-cased and split on single spaces. Letters within a word
    are joined by 3 spaces, words by 7 spaces, and the whole string is padded
    with 3 spaces on each side. In the result every '.' and ' ' lasts one time
    unit and every '-' lasts three.

    Args:
        text: String whose characters (other than spaces) are all keys of
            ``CHAR2CODE`` after upper-casing; anything else raises KeyError.

    Returns:
        ``(code, offsets)`` where ``offsets`` is a list of
        ``(word, start, end)`` tuples in time units measured from the start
        of the padded string. Each span is widened by the 3-unit margin on
        both sides, so the first word starts at 0.
    """
    margin = 3
    word_gap = 7

    word_codes = []
    offsets = []
    cursor = margin   # time unit at which the next word's first symbol begins
    for word in text.upper().split(' '):
        word_code = '   '.join(CHAR2CODE[char] for char in word)
        # A dash occupies 3 units but only one character, so expand it
        # before measuring the word's duration.
        duration = len(word_code.replace('-', '...'))
        offsets.append((word, cursor - margin, cursor + duration + margin))
        word_codes.append(word_code)
        cursor += duration + word_gap

    padding = ' ' * margin
    return padding + (' ' * word_gap).join(word_codes) + padding, offsets

def code2audio(code):
    """Render a Morse string to a waveform.

    Each character of ``code`` ('.', '-' or ' ') is looked up in
    ``CODE2AUDIO`` and the resulting segments are joined end to end.

    Args:
        code: Morse string as produced by ``text2code``.

    Returns:
        1-D numpy array containing the concatenated segments.
    """
    segments = []
    for symbol in code:
        segments.append(CODE2AUDIO[symbol])
    return np.concatenate(segments)
    
def text2audio(text):
    """Synthesize Morse audio for ``text`` and locate each word in it.

    Args:
        text: Text to encode; see ``text2code`` for the accepted characters.

    Returns:
        ``(audio, offsets)`` where ``audio`` is the waveform from
        ``code2audio`` and ``offsets`` is a list of
        ``(word, start_sec, end_sec)`` tuples, i.e. the unit-based spans
        from ``text2code`` scaled by the unit duration.
    """
    code, unit_offsets = text2code(text)
    waveform = code2audio(code)
    second_offsets = []
    for word, start, end in unit_offsets:
        second_offsets.append((word, start * unit, end * unit))
    return waveform, second_offsets

In [None]:
def visualize(spec, hop_length=160):
    """Plot a frame-based feature matrix on a time axis in seconds.

    Args:
        spec: 2-D array with one column per frame, such as the output of
            ``get_spectrogram`` or ``get_mfcc``.
        hop_length: Number of samples between successive frames of ``spec``.
            Defaults to 160, the hop used by ``get_mfcc`` and
            ``get_spectrogram``.
    """
    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(15, 5))
    # specshow assumes hop_length=512 when it is not given, which would
    # stretch the time axis by 512/160 for features computed with hop 160.
    librosa.display.specshow(spec, sr=SR, hop_length=hop_length, x_axis='time', ax=ax, cmap='inferno')

In [None]:
# Encode a sample phrase: `code` is the padded Morse string and `offsets`
# holds one (word, start, end) tuple per word, measured in Morse time units.
code, offsets = text2code('earth species project')

In [None]:
# Inspect the Morse string together with the per-word unit offsets.
code, offsets

In [None]:
# Synthesize the same phrase. Note that `offsets` is rebound here: the spans
# are now (word, start_sec, end_sec) in seconds rather than time units.
audio, offsets = text2audio('earth species project')

In [None]:
# Per-word (word, start_sec, end_sec) spans returned by text2audio.
offsets

In [None]:
# Power spectrogram (in dB) of the synthesized Morse audio.
spec = get_spectrogram(audio)

In [None]:
# Plot the spectrogram of the synthesized phrase.
visualize(spec)

In [None]:
# Play back the whole synthesized phrase.
ipd.Audio(audio, rate=SR)

In [None]:
# Play only the first word. offsets[0] is (word, start_sec, end_sec);
# multiplying the second values by SR converts them to sample indices.
ipd.Audio(audio[int(offsets[0][1]*SR):int(offsets[0][2]*SR)], rate=SR)

In [None]:
# Play only the second word, sliced by its (start_sec, end_sec) span times SR.
ipd.Audio(audio[int(offsets[1][1]*SR):int(offsets[1][2]*SR)], rate=SR)

In [None]:
# Play only the third word, sliced by its (start_sec, end_sec) span times SR.
ipd.Audio(audio[int(offsets[2][1]*SR):int(offsets[2][2]*SR)], rate=SR)

## Generate dataset for LibriSpeech (dev-clean)

In [None]:
# Build word offsets for every dev-clean utterance from the LibriSpeech
# transcript files. Each transcript line is "<utterance-id> <text>".

# Set to True to also render each utterance to a wav file under
# data/LibriMorse/dev-clean/ (the directory must already exist).
write_wav = False

metadata = {}
# sorted(): glob returns paths in arbitrary order; sorting keeps the order of
# the metadata entries reproducible between runs.
for filename in sorted(glob.glob('data/LibriSpeech/dev-clean/**/*.trans.txt', recursive=True)):
    print(filename)
    with open(filename) as f:
        for line in f:
            line = line.strip()
            if not line:
                continue   # a blank line would otherwise fail the two-way unpack below
            prefix, trans = line.split(' ', maxsplit=1)
            trans = trans.lower()
            trans = trans.replace("'", '')     # TODO: remove apostrophes .. is this good?
            audio, offsets = text2audio(trans)
            if write_wav:
                # Same '<subset>/<utterance-id>' layout as the metadata keys.
                sf.write(f'data/LibriMorse/dev-clean/{prefix}.wav', audio, SR)
            metadata['dev-clean/'+prefix] = offsets

In [None]:
# Save the dev-clean offsets as JSON Lines: one {"prefix", "offsets"} object
# per utterance, one object per line.
with open('data/LibriMorse/dev-clean/metadata.jsonl', mode='w') as f:
    for prefix, offsets in metadata.items():
        f.write(json.dumps(dict(prefix=prefix, offsets=offsets)) + '\n')

## Generate dataset for LibriSpeech (train-clean-100, first 100 transcript files)

In [None]:
# Build word offsets for a subset of train-clean-100: only the first
# `max_files` transcript files are processed. Each transcript line is
# "<utterance-id> <text>".

max_files = 100

# Set to True to also render each utterance to a wav file under
# data/LibriMorse/train-clean-100/ (the directory must already exist).
write_wav = False

metadata = {}
count = 0
# sorted(): glob returns paths in arbitrary order, so without sorting the
# files that make it into the subset could differ between runs or machines.
for filename in sorted(glob.glob('data/LibriSpeech/train-clean-100/**/*.trans.txt', recursive=True)):
    print(filename)
    with open(filename) as f:
        for line in f:
            line = line.strip()
            if not line:
                continue   # a blank line would otherwise fail the two-way unpack below
            prefix, trans = line.split(' ', maxsplit=1)
            trans = trans.lower()
            trans = trans.replace("'", '')     # TODO: remove apostrophes .. is this good?
            audio, offsets = text2audio(trans)
            if write_wav:
                sf.write(f'data/LibriMorse/train-clean-100/{prefix}.wav', audio, SR)
            metadata['train-clean-100/'+prefix] = offsets
    count += 1
    if count == max_files:
        break

In [None]:
# Save the train-clean-100 offsets as JSON Lines: one {"prefix", "offsets"}
# object per utterance, one object per line.
with open('data/LibriMorse/train-clean-100/metadata.jsonl', mode='w') as f:
    for prefix, offsets in metadata.items():
        f.write(json.dumps(dict(prefix=prefix, offsets=offsets)) + '\n')

In [None]:
# Load one generated dev-clean utterance at SR; the returned sample rate is
# discarded. The wav must already exist on disk: as written, the generation
# cell above does not write wav files.
audio, _ = librosa.load('data/LibriMorse/dev-clean/2078-142845-0000.wav', sr=SR)

In [None]:
# Spectrogram of the loaded utterance.
visualize(get_spectrogram(audio))

In [None]:
# Normalized MFCCs of the whole loaded utterance.
visualize(get_mfcc(audio))

In [None]:
# MFCCs of the first word only, computed on the word's own audio slice.
#
# The offsets are re-read from the saved dev-clean JSONL because `metadata`
# was rebound to the train-clean-100 entries above, and its keys carry a
# '<subset>/' prefix, so metadata['2078-142845-0000'] raises KeyError.
with open('data/LibriMorse/dev-clean/metadata.jsonl') as f:
    dev_offsets = {rec['prefix']: rec['offsets'] for rec in map(json.loads, f)}

# Each entry is (word, start_sec, end_sec); seconds * SR gives sample indices.
word, start_sec, end_sec = dev_offsets['dev-clean/2078-142845-0000'][0]
visualize(get_mfcc(audio[int(start_sec*SR):int(end_sec*SR)]))

In [None]:
# MFCCs of one word, cut out of the whole-utterance MFCC matrix by frame index.
#
# The offsets are re-read from the saved dev-clean JSONL because `metadata`
# was rebound to the train-clean-100 entries above, and its keys carry a
# '<subset>/' prefix, so metadata['2078-142845-0002'] raises KeyError.
with open('data/LibriMorse/dev-clean/metadata.jsonl') as f:
    dev_offsets = {rec['prefix']: rec['offsets'] for rec in map(json.loads, f)}

# get_mfcc uses a hop of 160 samples, i.e. SR / 160 = 100 frames per second.
frames_per_sec = SR // 160

# NOTE(review): these offsets belong to utterance ...-0002, but `audio` was
# loaded from ...-0000 -- confirm which utterance is intended here.
word, start_sec, end_sec = dev_offsets['dev-clean/2078-142845-0002'][3]
visualize(get_mfcc(audio)[:, int(start_sec*frames_per_sec):int(end_sec*frames_per_sec)])