## Dependencies/Packages/Drive

In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
!pip install swig
!pip3 install sequitur-g2p
!git clone --recursive https://github.com/petronny/g2p
!unzip ./drive/MyDrive/ifadataset.zip -d ./

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: ./ifadata/boundaries/F40L1FR1C_0.phoneme.boundary.npy  
  inflating: ./ifadata/boundaries/M66O2PS8A_0.phoneme.boundary.npy  
  inflating: ./ifadata/boundaries/F60E1FPA2VOW_3.phoneme.boundary.npy  
  inflating: ./ifadata/boundaries/F60E1VI6C_2.phoneme.boundary.npy  
  inflating: ./ifadata/boundaries/M66O1FY2A_7.phoneme.boundary.npy  
  inflating: ./ifadata/boundaries/M40K2FPB2VOW_11.phoneme.boundary.npy  
  inflating: ./ifadata/boundaries/M40K1FS84A_0.phoneme.boundary.npy  
  inflating: ./ifadata/boundaries/F60E1FPA2NUM_6.phoneme.boundary.npy  
  inflating: ./ifadata/boundaries/M66O2VR1BN_3.phoneme.boundary.npy  
  inflating: ./ifadata/boundaries/F20N1FT6I_3.phoneme.boundary.npy  
  inflating: ./ifadata/boundaries/M66O2PS1A_5.phoneme.boundary.npy  
  inflating: ./ifadata/boundaries/F60E2VS68A_1.phoneme.boundary.npy  
  inflating: ./ifadata/boundaries/M66O1FPA2NUM_8.phoneme.boundary.npy  
  inflating: ./ifadata

In [3]:
#initial imports
import torch
from torch import nn
import numpy as np
import pandas as pd
import math
import os
import sys
import librosa
from pathlib import Path
from functools import partial
import unicodedata
import re
from tqdm.contrib.concurrent import process_map, thread_map
from copy import deepcopy
import subprocess
import torchaudio
from g2p.en_us import G2P
from tqdm.notebook import tqdm

## Datasets

### Buckeye

In [None]:
phoneme_map = {
        'a': 'AH',
        'e': 'EH',
        'h': 'HH',
        'i': 'IH',

        'tq': 'T',
        'q': 'TH',
        'id': 'D',

        'dx': ['D', 'Z'],
        'nx': ['N', 'Z'],
        'aan': ['AA', 'N'],
        'aen': ['AE', 'N'],
        'ahn': ['AH', 'N'],
        'aon': ['AO', 'N'],
        'awn': ['AW', 'N'],
        'ayn': ['AY', 'N'],
        'ehn': ['EH', 'N'],
        'ern': ['ER', 'N'],
        'eyn': ['EY', 'N'],
        'hhn': ['HH', 'N'],
        'ihn': ['IH', 'N'],
        'iyn': ['IY', 'N'],
        'own': ['OW', 'N'],
        'oyn': ['OY', 'N'],
        'uhn': ['UH', 'N'],
        'uwn': ['UW', 'N'],

        'ah l': ['AH', 'L'],
        'ah n': ['AH', 'N'],
        'ah r': ['AH', 'R'],
        'ih l': ['IH', 'L'],

        'en': ['AE', 'N'],
        'em': ['AH', 'M'],
        'el': ['AH', 'L'],
        'eng': ['IH', 'NG'],
}

class Word:

    def __init__(self, content=None, start=None, end=None):
        self.content = content
        self.start = start
        self.end = end

class Phoneme(Word):
    pass

class Sentence:

    def __init__(self, start=None, end=None):
        self.start = start
        self.end = end
        self.words = [Word(start=start)]

    def __str__(self):
        return str([word.content for word in self.words])

def clip_wav(source, start, end, target):
    duration = '%.6f' % (end - start)
    start = '%.6f' % start
    subprocess.run(['ffmpeg', '-i', source, '-ss', start, '-t', duration, target, '-v', 'quiet', '-y'])

def process_segmentation(segmented, g2p, path):
    lines = open(path).readlines()[9:]
    lines = [line.strip('\r\n') for line in lines]

    sentences = []
    for line in lines:
        line = line.split(';')[0].split(' ')
        line = [i for i in line if i != '']
        if len(line) == 0:
            continue
        if len(line) > 3:
            line[2] = ' '.join(line[2:])
            del line[3:]
            if not line[2].startswith('<') or not line[-1].endswith('>'):
                print('Invaild word:', path, line[2])
        word = line[2]
        time = float(line[0])
        if word in ['{B_TRANS}', '<IVER>', '{E_TRANS}', '<VOCNOISE>', '<SIL>']:
            if len(sentences) > 0 and sentences[-1].end is None:
                sentences[-1].end = sentences[-1].words[-1].end
            sentences.append(Sentence(start=time))
        else:
            if len(sentences) == 0:
                sentences.append(Sentence(start=0))
            if sentences[-1].words[-1].end is None:
                sentences[-1].words[-1].end = time
                sentences[-1].words[-1].content = word
            else:
                sentences[-1].words.append(Word(content=word, start=sentences[-1].words[-1].end, end=time))

    lines = open(path.parent/ (path.stem + '.phones')).readlines()[9:]
    lines = [line.strip('\r\n') for line in lines]

    phonemes = []
    start = 0
    for line in lines:
        line = line.split(';')[0].split(' ')
        line = [i for i in line if i != '']
        if len(line) == 0:
            continue
        if len(line) > 3:
            line[2] = ' '.join(line[2:])
            del line[3:]
            if not line[2].startswith('<') or not line[-1].endswith('>'):
                print('Invaild phoneme:', path.stem + '.phones', line[2])
        time = float(line[0])
        if len(line) < 3:
            if phonemes[-1].end is None:
                phonemes[-1].end = time
            continue
        phoneme = line[2].replace('+1', '')
        if len(phonemes) == 0:
            phonemes.append(Phoneme(content=phoneme, start=0, end=time))
        else:
            phonemes.append(Phoneme(content=phoneme, start=phonemes[-1].end, end=time))

    for i, sentence in enumerate(sentences):
        try:
            sentence.words = [word for word in sentence.words if not word.content.startswith('<')]
        except:
            continue

        if len(sentence.words) == 0:
            continue

        if sentence.start is None or sentence.end is None:
            print('Ignored invalid sentence:', sentence)
            continue

        if sentence.end - sentence.start <= 0.025:
            continue

        clip_wav(path.parent / (path.stem + '.wav'), sentence.start, sentence.end, segmented / ('%s-%03d.wav' % (path.stem, i)))

        lines = []

        for word in sentence.words:
            lines.append('%.6f\t%.6f\t%s' % (word.start - sentence.start, word.end - sentence.start, word.content))

        lines = [line + '\n' for line in lines]
        with open(segmented / ('%s-%03d.words' % (path.stem, i)), 'w') as f:
            f.writelines(lines)

        lines = []
        _phonemes = [j for j in phonemes if sentence.start <= j.start and j.end <= sentence.end]
        for phoneme in _phonemes:
            if phoneme.content.upper() in g2p.symbols:
                lines.append('%.6f\t%.6f\t%s' % (phoneme.start - sentence.start, phoneme.end - sentence.start, phoneme.content.upper()))
            elif phoneme.content in phoneme_map.keys():
                if type(phoneme_map[phoneme.content]) is list:
                    for j, p in enumerate(phoneme_map[phoneme.content]):
                        if j == 0:
                            lines.append('%.6f\t%.6f\t%s' % (phoneme.start - sentence.start, -1, p))
                        elif j == len(phoneme_map[phoneme.content]) - 1:
                            lines.append('%.6f\t%.6f\t%s' % (-1, phoneme.end - sentence.start, p))
                        else:
                            lines.append('%.6f\t%.6f\t%s' % (-1, -1, p))
                else:
                    lines.append('%.6f\t%.6f\t%s' % (phoneme.start - sentence.start, phoneme.end - sentence.start, phoneme_map[phoneme.content]))
            elif phoneme.content in ['{B_TRANS}', '{E_TRANS}', 'VOCNOISE', 'SIL', 'IVER', 'LAUGH', '<EXCLUDE-name>', 'UNKNOWN', 'NOISE', 'IVER-LAUGH', '<exclude-Name>']:
                continue
            else:
                print('Unknown phoneme', path.stem, i, phoneme.content)
                lines.append('%.6f\t%.6f\t%s' % (phoneme.start - sentence.start, phoneme.end - sentence.start, phoneme.content))

        with open(segmented / ('%s-%03d.phonemes' % (path.stem, i)), 'w') as f:
            f.write('\n'.join(lines))

def process_word(g2p, path):
    lines = open(path).readlines()
    lines = [i.strip('\r\n') for i in lines]
    lines = [i.split('\t') for i in lines]
    words = [i[2] for i in lines]

    phonemes = []
    for word in words:
        phonemes += g2p.convert(word)
    phonemes = [g2p.symbol2id[i] + 1 for i in phonemes if i in g2p.symbols]
    if len(phonemes) == 0:
        print('Removing sentence with no phoneme:', path)
        for i in path.parent.rglob(f'{path.stem}.*'):
            i.unlink()
        return

    phonemes = np.array(phonemes)
    np.save(path.parent / (path.stem + '.word.npy'), phonemes)

    result = []
    for i, word in enumerate(words):
        phonemes = g2p.convert(word)
        result.append(np.zeros((len(phonemes), 2)) - 1)
        result[-1][0, 0] = float(lines[i][0])
        result[-1][-1, 1] = float(lines[i][1])

    result = np.concatenate(result)
    np.save(path.parent / (path.stem + '.word.boundary.npy'), result)

def process_phoneme(g2p, path):
    lines = open(path).readlines()
    lines = [line.strip('\r\n') for line in lines]
    lines = [line.split('\t') for line in lines]

    phonemes = [i[2] for i in lines]
    if False in [phoneme in g2p.symbols for phoneme in phonemes]:
        print('Removing sentence with unknown phoneme:', path)
        for i in path.parent.rglob(f'{path.stem}.*'):
            i.unlink()
        return

    phonemes = [g2p.symbol2id[i] + 1 for i in phonemes if i in g2p.symbols]
    if len(phonemes) == 0:
        print('Removing sentence with no phoneme:', path)
        for i in path.parent.rglob(f'{path.stem}.*'):
            i.unlink()
        return

    phonemes = np.array(phonemes)
    np.save(path.parent / (path.stem + '.phoneme.npy'), phonemes)

    result = np.zeros((len(lines), 2))
    for i, line in enumerate(lines):
        result[i, 0] = float(line[0])
        result[i, 1] = float(line[1])

    np.save(path.parent / (path.stem + '.phoneme.boundary.npy'), result)

def process_wav(file: Path, sample_rate=16000):
    waveform, sample_rate = librosa.load(file, sr=sample_rate, mono=True)
    if len(waveform) == 0:
        print('Removing empty wav:', file)
        for i in file.parent.rglob(f'{file.stem}.*'):
            i.unlink()
        return
    mfcc = librosa.feature.mfcc(y=waveform, sr=sample_rate, n_mfcc=13, hop_length=int(sample_rate/100), n_fft=int(sample_rate/40), fmax=8000)
    delta = librosa.feature.delta(mfcc, width=3, order=1)
    delta2 = librosa.feature.delta(mfcc, width=3, order=2)
    np.save(file.parent / (file.name[:-3] + 'mfcc.npy'), np.concatenate([mfcc, delta, delta2]).T.astype(np.float32))

def get_mean_and_std(mfccs):
    mfccs = np.concatenate(mfccs, axis=0)
    mean = mfccs.mean(axis=0, keepdims=False)
    std = mfccs.std(axis=0, keepdims=False)
    return mean, std

def process_normalization(mfccs_per_speaker):
    mfccs = [np.load(i) for i in mfccs_per_speaker]
    mean, std = get_mean_and_std(mfccs)
    for i, mfcc in enumerate(mfccs):
        mfcc -= mean
        mfcc /= std
        np.save(mfccs_per_speaker[i].parent / (mfccs_per_speaker[i].name[:-8] + 'normalized.mfcc.npy'), mfcc.astype(np.float32))
import os
class Buckeye(torch.utils.data.Dataset):

    def __init__(self, path, reduction: int = 1):
        super().__init__()
        self.path = Path(path)
        self.segmented = self.path / 'segmented'
        self.wavs = [i for i in self.segmented.rglob('*.wav')]
        self.words = [Path(str(i)[:-3] + 'word.npy') for i in self.wavs]
        self.phonemes = [Path(str(i)[:-3] + 'phoneme.npy') for i in self.wavs]
        self.word_boundaries = [Path(str(i)[:-3] + 'word.boundary.npy') for i in self.wavs]
        self.phoneme_boundaries = [Path(str(i)[:-3] + 'phoneme.boundary.npy') for i in self.wavs]
        self.mfccs = [Path(str(i)[:-3] + 'mfcc.npy') for i in self.wavs]
        self.normalized_mfccs = [Path(str(i)[:-3] + 'normalized.mfcc.npy') for i in self.wavs]
        self.reduction = reduction

    def __len__(self):
        return len(self.wavs)

    def __getitem__(self, index):
        word = np.load(self.words[index])
        word_boundary = np.load(self.word_boundaries[index])
        mfcc = np.load(self.normalized_mfccs[index])
        print(mfcc.shape)

        if mfcc.shape[0] % self.reduction != 0:
            mfcc = np.concatenate([mfcc, np.zeros((self.reduction - mfcc.shape[0] % self.reduction, mfcc.shape[1]))])
        if self.reduction > 1:
            mfcc = mfcc.reshape(mfcc.shape[0] // self.reduction, mfcc.shape[1] * self.reduction)

        return word, mfcc.astype(np.float32), word_boundary.astype(np.float32)

class BuckeyePhoneme(Buckeye):

    def __getitem__(self, index):
        phoneme = np.load(self.phonemes[index])
        phoneme_boundary = np.load(self.phoneme_boundaries[index])
        mfcc = np.load(self.normalized_mfccs[index])

        if mfcc.shape[0] % self.reduction != 0:
            mfcc = np.concatenate([mfcc, np.zeros((self.reduction - mfcc.shape[0] % self.reduction, mfcc.shape[1]))])
        if self.reduction > 1:
            mfcc = mfcc.reshape(mfcc.shape[0] // self.reduction, mfcc.shape[1] * self.reduction)

        return phoneme, mfcc.astype(np.float32), phoneme_boundary.astype(np.float32)

In [None]:
!unzip ./drive/MyDrive/Buckeye.zip -d ./

In [None]:
!zip -r ./BuckeyeClean.zip ./Buckeye

In [None]:
!unzip ./drive/MyDrive/BuckeyeFinal.zip -d ./

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: ./Buckeye/segmented/s2303b-376.normalized.mfcc.npy  
  inflating: ./Buckeye/segmented/s0503b-065.normalized.mfcc.npy  
  inflating: ./Buckeye/segmented/s1402a-104.wav  
  inflating: ./Buckeye/segmented/s3902a-196.words  
  inflating: ./Buckeye/segmented/s2802a-211.mfcc.npy  
  inflating: ./Buckeye/segmented/s2402a-072.phoneme.boundary.npy  
  inflating: ./Buckeye/segmented/s1502b-280.words  
  inflating: ./Buckeye/segmented/s0205b-029.phoneme.npy  
  inflating: ./Buckeye/segmented/s3102b-156.phoneme.npy  
  inflating: ./Buckeye/segmented/s2201b-265.mfcc.npy  
  inflating: ./Buckeye/segmented/s0501b-223.phonemes  
  inflating: ./Buckeye/segmented/s0701b-003.phoneme.boundary.npy  
  inflating: ./Buckeye/segmented/s2102a-152.phoneme.boundary.npy  
  inflating: ./Buckeye/segmented/s3501b-188.phonemes  
  inflating: ./Buckeye/segmented/s1302a-159.phonemes  
  inflating: ./Buckeye/segmented/s1203b-117.wav  
  infla

In [None]:
# g2p = G2P()
# dataset = BuckeyePhoneme('./Buckeye', reduction=4)

# if not dataset.segmented.exists():
#     dataset.segmented.mkdir(exist_ok=True)
#     words = [i for i in dataset.path.rglob('*.words')]
#     thread_map(partial(process_segmentation, dataset.segmented, g2p), words)
#     dataset = Buckeye('./Buckeye', reduction=4)

# if not dataset.phonemes[0].exists():
#     phonemes = [i for i in dataset.segmented.rglob('*.phonemes')]
#     thread_map(partial(process_phoneme, g2p), phonemes)

# if not dataset.words[0].exists():
#     texts = [i for i in dataset.segmented.rglob('*.words')]
#     thread_map(partial(process_word, g2p), texts)
#     print('[Important] Now please run this script again to finish the preprocessing.')

# thread_map(process_wav, dataset.wavs)
# dataset = Buckeye('./Buckeye', reduction=4)

# if not dataset.normalized_mfccs[0].exists():
#     mfccs_per_speaker = {}
#     for i in dataset.mfccs:
#         name = i.name[:3]
#         if not name in mfccs_per_speaker:
#             mfccs_per_speaker[name] = [i]
#         else:
#             mfccs_per_speaker[name].append(i)

#     thread_map(process_normalization, mfccs_per_speaker.values())

In [None]:
!zip -r ./BuckeyeFinal.zip ./Buckeye

In [None]:
!cp ./drive/MyDrive/BuckeyeFinal.zip ./BuckeyeFinal.zip

In [None]:
print([i for i in os.listdir('./Buckeye/Buckeye/segmented/') if i[-8:]=='phonemes'])

['s1001b-082.phonemes', 's0803b-200.phonemes', 's2303b-003.phonemes', 's1701a-050.phonemes', 's2102b-259.phonemes', 's0503a-286.phonemes', 's1101a-199.phonemes', 's0702a-208.phonemes', 's3102a-111.phonemes', 's3101b-229.phonemes', 's0902b-040.phonemes', 's0202b-023.phonemes', 's1902a-042.phonemes', 's0601b-073.phonemes', 's2902a-094.phonemes', 's0601a-069.phonemes', 's1603b-246.phonemes', 's2301a-070.phonemes', 's1604a-123.phonemes', 's1901b-024.phonemes', 's1803a-012.phonemes', 's0701b-109.phonemes', 's2402a-241.phonemes', 's3802a-027.phonemes', 's0902a-064.phonemes', 's0901b-151.phonemes', 's1301a-140.phonemes', 's2101b-264.phonemes', 's3401a-148.phonemes', 's2801a-226.phonemes', 's1802a-117.phonemes', 's1103a-330.phonemes', 's2601b-026.phonemes', 's2302a-047.phonemes', 's2302a-054.phonemes', 's3801a-053.phonemes', 's1901a-065.phonemes', 's2301b-113.phonemes', 's3504a-238.phonemes', 's3302a-164.phonemes', 's0803b-289.phonemes', 's0502a-201.phonemes', 's3502b-015.phonemes', 's4001b-06

In [None]:
bpath = './Buckeye/s10/s1001b/'
with open(bpath + 's1001b.phones', 'r') as file:
    i = 0
    for line in file:
        print(line.strip())

In [None]:
bpath = './Buckeye/s10/s1001b/'
with open(bpath + 's1001b.words', 'r') as file:
    i = 0
    for line in file:
        print(line.strip())

signal s1001b.sd
type 0
comment created using xlabel Tue Jan 17 23:01:10 2006
comment M O
color 122
font -misc-*-bold-*-*-*-15-*-*-*-*-*-*-*
separator ;
nfields 3
#
0.072926 122 {B_TRANS}; B; B; null
9.121000 122 <IVER>; S; S; null
9.236000 122 i; ay; ay; PRP
9.380000 122 know; n ow; n ow; VBP
9.500000 122 i; ay; ay; PRP
9.657000 122 don't; d ow n t; d ow n; VBP_RB
9.841000 122 <VOCNOISE>; U; U; null
10.997000 122 <IVER>; S; S; null
11.063000 122 <SIL>; S; S; null
11.220000 122 i; ay; ay; PRP
11.325000 122 <SIL>; S; S; null
11.456000 122 i; ay; ay; PRP
11.675000 122 guess; g eh s; g eh s; NN
11.850000 122 it's; ih t s; ih s; PRP_VBZ
12.213000 122 just; jh ah s t; jh ah s t; RB
12.339000 122 <SIL>; S; S; null
12.598000 122 over; ow v er; ow v r ah; IN
12.691000 122 the; dh iy; dh eh; DT
13.087000 122 course; k ow r s; k ow r s; NN
13.124000 122 of; ah v; uh; IN
13.223000 122 the; dh iy; dh iy; DT
13.567000 122 year; y ih r; y ih r; NN
13.967000 122 everything; eh v r iy th ih ng; eh v r

In [None]:
bpath = './Buckeye/Buckeye/segmented/'
with open(bpath + 's1001b-082.phonemes', 'r') as file:
    i = 0
    for line in file:
        print(line.strip())
        i += 1
        if i == 20:
            break

0.032666	0.068733	N
0.068733	0.088313	DH
0.088313	0.159418	AH
0.159418	0.194375	G
0.194375	0.281018	OW
0.281018	0.324375	DH
0.324375	0.385099	EH
0.385099	0.425289	T
0.425289	0.496394	S
0.496394	0.617994	T
0.617994	0.666428	R
0.666428	0.774375	AE
0.774375	0.834375	N
0.834375	0.884375	Z
0.884375	0.959088	M
0.959088	1.027101	ER
1.027101	1.105420	R
1.105420	1.170342	IH
1.170342	-1.000000	D
-1.000000	1.201257	Z


In [None]:
!unzip ./BuckeyeFinal.zip -d ./

In [None]:
dataset = Buckeye('./Buckeye', reduction=1)

# if not dataset.mfccs[0].exists():
#     thread_map(process_wav, dataset.wavs)
#     dataset = Buckeye('drive/MyDrive/Buckeye', reduction=4)

# if not dataset.normalized_mfccs[0].exists():
#     mfccs_per_speaker = {}
#     for i in dataset.mfccs:
#         name = i.name[:3]
#         if not name in mfccs_per_speaker:
#             mfccs_per_speaker[name] = [i]
#         else:
#             mfccs_per_speaker[name].append(i)

#     thread_map(process_normalization, mfccs_per_speaker.values())

data_loader = torch.utils.data.DataLoader(dataset, batch_size=2, shuffle=True, collate_fn=Collate('cuda:0'), drop_last=True)
for batch in data_loader:
    for _list in batch:
        print([i.shape for i in _list])
        print([i.dtype for i in _list])
    break

(39, 39)
(257, 39)
[torch.Size([5]), torch.Size([34])]
[torch.int64, torch.int64]
[torch.Size([39, 39]), torch.Size([257, 39])]
[torch.float32, torch.float32]
[torch.Size([5, 2]), torch.Size([34, 2])]
[torch.float32, torch.float32]


In [None]:
dataset = Buckeye('./Buckeye', reduction=4)

46946


In [None]:
print(len(dataset.wavs))
print(len(dataset.phonemes))

46946
46946


In [None]:
stub = 's1001b-082'
for i in os.listdir('./Buckeye/Buckeye/segmented'):
    if i[:len(stub)] == stub:
        print(i)


s1001b-082.phonemes
s1001b-082.normalized.mfcc.npy
s1001b-082.phoneme.npy
s1001b-082.mfcc.npy
s1001b-082.word.npy
s1001b-082.word.boundary.npy
s1001b-082.wav
s1001b-082.words
s1001b-082.phoneme.boundary.npy


In [None]:
bepath = './Buckeye/Buckeye/segmented/'
with open(bepath + 's1001b-082.phonemes', 'r') as file:
    for line in file:
        print(line)

bepath = './Buckeye/Buckeye/segmented/'
with open(bepath + 's1001b-082.words', 'r') as file:
    for line in file:
        print(line)

In [None]:
import random, os
if not os.path.isdir('./dud'):
    os.mkdir('./dud')
def train_test_split(wav_list, path):
    random.shuffle(wav_list)
    train_wavs = wav_list[:len(wav_list)//4*3]
    test_wavs = wav_list[len(wav_list)//4*3:]

    wav_list = train_wavs
    words = [Path(str(i)[:-3] + 'word.npy') for i in wav_list]
    phonemes = [Path(str(i)[:-3] + 'phoneme.npy') for i in wav_list]
    word_boundaries = [Path(str(i)[:-3] + 'word.boundary.npy') for i in wav_list]
    phoneme_boundaries = [Path(str(i)[:-3] + 'phoneme.boundary.npy') for i in wav_list]
    mfccs = [Path(str(i)[:-3] + 'mfcc.npy') for i in wav_list]
    normalized_mfccs = [Path(str(i)[:-3] + 'normalized.mfcc.npy') for i in wav_list]
    train_set, train_setp = Buckeye('./dud', reduction=1), BuckeyePhoneme('./dud', reduction=1)
    train_set.path = train_setp.path = Path(path)
    train_set.segmented = train_setp.segmented = train_setp.path / 'segmented'
    train_set.wavs = train_setp.wavs = wav_list
    train_set.words = train_setp.words = words
    train_set.phonemes = train_setp.phonemes = phonemes
    train_set.word_boundaries = train_setp.word_boundaries = word_boundaries
    train_set.phoneme_boundaries = train_setp.phoneme_boundaries = phoneme_boundaries
    train_set.mfccs = train_setp.mfccs = mfccs
    train_set.normalized_mfccs = train_setp.normalized_mfccs = normalized_mfccs

    wav_list = test_wavs
    print(len(wav_list))
    words = [Path(str(i)[:-3] + 'word.npy') for i in wav_list]
    phonemes = [Path(str(i)[:-3] + 'phoneme.npy') for i in wav_list]
    word_boundaries = [Path(str(i)[:-3] + 'word.boundary.npy') for i in wav_list]
    phoneme_boundaries = [Path(str(i)[:-3] + 'phoneme.boundary.npy') for i in wav_list]
    mfccs = [Path(str(i)[:-3] + 'mfcc.npy') for i in wav_list]
    normalized_mfccs = [Path(str(i)[:-3] + 'normalized.mfcc.npy') for i in wav_list]
    test_set, test_setp = Buckeye('./dud', reduction=1), BuckeyePhoneme('./dud', reduction=1)
    test_set.path = test_setp.path = Path(path)
    test_set.segmented = test_setp.segmented = test_setp.path / 'segmented'
    test_set.wavs = test_setp.wavs = wav_list
    test_set.words = test_setp.words = words
    test_set.phonemes = test_setp.phonemes = phonemes
    test_set.word_boundaries = test_setp.word_boundaries = word_boundaries
    test_set.phoneme_boundaries = test_setp.phoneme_boundaries = phoneme_boundaries
    test_set.mfccs = test_setp.mfccs = mfccs
    test_set.normalized_mfccs = test_setp.normalized_mfccs = normalized_mfccs

    return train_set, train_setp, test_set, test_setp


In [None]:
other_set, _, test_set, test_setp = train_test_split(dataset.wavs, './Buckeye')
train_set, train_setp, valid_set, valid_setp = train_test_split(other_set.wavs, './Buckeye')

11738
8802


In [None]:
print(len(test_set.wavs))

11738


## IFA

In [None]:
#get ifa dataset
import requests
from bs4 import BeautifulSoup
import re
from tqdm.notebook import tqdm
from tqdm.contrib.concurrent import thread_map, process_map
import os
from multiprocessing import Pool

In [None]:
response = requests.get('https://www.fon.hum.uva.nl/IFA-SpokenLanguageCorpora/IFAcorpus/SLspeech/sentences/fm/')
soup = BeautifulSoup(response.text, 'html.parser')
links = soup.find_all('a')
pattern = re.compile(r'[FM]\d{2}[A-Z]/')
for link in links:
    if pattern.match(link.get('href')):
        print(f"Link: {link.get('href')}")

Link: F20N/
Link: F28G/
Link: F40L/
Link: F60E/
Link: M15R/
Link: M40K/
Link: M56H/
Link: M66O/


In [None]:
labels_url = 'https://www.fon.hum.uva.nl/IFA-SpokenLanguageCorpora/IFAcorpus/SLcorpus/Labels/sentences/'
response = requests.get(labels_url)
soup = BeautifulSoup(response.text, 'html.parser')
links = soup.find_all('a')

pattern = re.compile(r'[FM]\d{2}[A-Z]/')
subject_urls = []
for link in links:
    href = link.get('href')
    if not pattern.match(href):
        continue
    subject_urls.append(labels_url + href + 'phoneme/')

pattern = re.compile(r'[FM]\d{2}[A-Z].*phoneme')
download_links = []
file_ids = []
for subject_url in subject_urls:
    response = requests.get(subject_url)
    soup = BeautifulSoup(response.text, 'html.parser')
    links = soup.find_all('a')

    for link in links:
        href = link.get('href')
        if not pattern.match(href):
            continue
        download_links.append(subject_url + href)
        file_ids.append(href)

folder = './labels'
os.makedirs(folder, exist_ok = True)

AttributeError: ignored

In [None]:
for f in os.listdir('./wavs'):
    if f[-4:] == '.wav':
        os.remove('./wavs/' + f)

In [None]:
def download_file(info):
    file_id, link = info
    downloaded = requests.get(link)
    if downloaded.status_code != 200:
        return False
    with open('./labels/' + file_id, 'wb') as f:
        f.write(downloaded.content)
        return True

r = list(process_map(download_file, list(zip(file_ids, download_links)), chunksize = 1))

  0%|          | 0/4453 [00:00<?, ?it/s]

In [None]:
print([i for i in r if not i])

[]


In [None]:
folder = './wavs'
os.makedirs(folder, exist_ok = True)

wav_url = 'https://www.fon.hum.uva.nl/IFA-SpokenLanguageCorpora/IFAcorpus/SLspeech/sentences/fm/'

def download_wav(phon_file):
    recording_id = phon_file.split('_')[0]
    person = recording_id[:4]
    f_name = f'{recording_id}_fm.aifc'
    request = requests.get(wav_url + f'{person}/' + f'{recording_id}_fm.aifc', allow_redirects=True)
    if request.status_code != 200:
        return False
    with open(f'./wavs/{f_name}', 'wb') as f:
        f.write(request.content)
        return True

r = list(process_map(download_wav, os.listdir('./labels'), chunksize = 1))
print([i for i, x in enumerate(r) if not x])

  0%|          | 0/4453 [00:00<?, ?it/s]

[1375]


In [None]:
!git clone https://git.ffmpeg.org/ffmpeg.git ffmpeg

Cloning into 'ffmpeg'...
remote: Enumerating objects: 17968, done.[K
remote: Counting objects: 100% (17968/17968), done.[K
remote: Compressing objects: 100% (10615/10615), done.[K
remote: Total 706032 (delta 13209), reused 9668 (delta 7294)[K
Receiving objects: 100% (706032/706032), 174.22 MiB | 13.54 MiB/s, done.
Resolving deltas: 100% (570400/570400), done.
Updating files: 100% (8253/8253), done.


In [None]:
file_name = os.listdir('./ifadata/wavs')[1]
call(f'ffmpeg -i ./ifadata/wavs/{file_name} -acodec pcm_s16le -ac 1 -ar 16000 ./wavs/{file_name[:-4]}wav', shell=True)

0

In [None]:
from subprocess import call

os.makedirs('./wavs', exist_ok=True)
def convert(file_name):
    return call(f'ffmpeg -i ./ifadata/wavs/{file_name} -acodec pcm_s16le -ac 1 -ar 16000 ./wavs/{file_name[:-4]}wav', shell=True)

r = list(process_map(convert, os.listdir('./ifadata/wavs'), chunksize = 1))
print([i for i, x in enumerate(r) if not x])

  0%|          | 0/4452 [00:00<?, ?it/s]

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221,

In [None]:
from IPython.display import Audio, display

display(Audio(f'/content/ifadata/wavs/F20N1FS11A_fm.wav', autoplay=True))

In [None]:
import os
check = set([i[:-8] for i in os.listdir('./ifadata/labels')])
for filename in os.listdir('./ifadata/wavs'):
    if filename[:-4] not in check:
        print(filename)


In [None]:
labels = './ifadata/labels/'
wavs = './ifadata/wavs/'

In [None]:
sample = os.listdir(labels)[0]
print(sample)

M56H1FS86A.phoneme


In [None]:
import wave
import contextlib
ct = 0
for wav in os.listdir(wavs):
    fname = wavs + wav
    with contextlib.closing(wave.open(fname,'r')) as f:
        frames = f.getnframes()
        rate = f.getframerate()
        duration = frames / float(rate)
        if duration > 8:
            ct += 1

In [None]:
removing = ['F40L1FS81A', ]

In [None]:
with open(labels + 'F28G1VI6E.phoneme', 'r') as file:
    lines = file.readlines()
    for i, line in enumerate(lines):
        if line[1:2] == '0':
            line = '"O' + line[2:]
            lines[i] = line
            print(lines[i-1: i+1])
with open(labels + 'F28G1VI6E.phoneme', 'w') as file:
    file.writelines(lines)

['1.2277687\n', '"O=__F28G1VI6E4TC1"\n']
['1.2277687\n', '"O=__F28G1VI6E4TC"\n']


In [None]:
with open(labels + 'F28G1VI6E.phoneme', 'rb') as file:
    for line in file.readlines()[9:]:
        if line.decode('utf-8').strip() == '"IntervalTier"':
            break
        print(line.decode('utf-8').strip())

In [None]:
[i for i in os.listdir(wavs) if '_' in i]

[]

In [None]:
for j in [i for i in os.listdir(wavs) if '_' in i]:
    os.remove(wavs / j)

In [None]:
r = process_map(process_file, [i for i in os.listdir(labels) if i[-7:] == 'phoneme'], chunksize = 1)

  0%|          | 0/4433 [00:00<?, ?it/s]

In [None]:
phoneme_set = set()
for _, x in r:
    phoneme_set = phoneme_set.union(x)
print(phoneme_set)

{'J', 'r', 'w', 'i', 'O~', 'O', 'z', 's', 'f', 't', '*', 'Y', '@', 'Z', 'S', 'E', 'l', '2', 'j', 'A', 'O+', 'p', 'b', 'E~', 'o', 'G', 'd', 'a', 'I', 'v', '9+', 'k', 'g', 'm', 'y', 'n', 'h', 'N', 'E+', 'x', 'u', 'A~', 'e'}


In [None]:
print([i for i in phoneme_set if i not in CGN_CMU_map])

[]


In [6]:
g2p.symbols

['AA',
 'AA0',
 'AA1',
 'AA2',
 'AE',
 'AE0',
 'AE1',
 'AE2',
 'AH',
 'AH0',
 'AH1',
 'AH2',
 'AO',
 'AO0',
 'AO1',
 'AO2',
 'AW',
 'AW0',
 'AW1',
 'AW2',
 'AY',
 'AY0',
 'AY1',
 'AY2',
 'B',
 'CH',
 'D',
 'DH',
 'EH',
 'EH0',
 'EH1',
 'EH2',
 'ER',
 'ER0',
 'ER1',
 'ER2',
 'EY',
 'EY0',
 'EY1',
 'EY2',
 'F',
 'G',
 'HH',
 'IH',
 'IH0',
 'IH1',
 'IH2',
 'IY',
 'IY0',
 'IY1',
 'IY2',
 'JH',
 'K',
 'L',
 'M',
 'N',
 'NG',
 'OW',
 'OW0',
 'OW1',
 'OW2',
 'OY',
 'OY0',
 'OY1',
 'OY2',
 'P',
 'R',
 'S',
 'SH',
 'T',
 'TH',
 'UH',
 'UH0',
 'UH1',
 'UH2',
 'UW',
 'UW0',
 'UW1',
 'UW2',
 'V',
 'W',
 'Y',
 'Z',
 'ZH']

In [None]:
g2p = G2P()
symbolset = set(g2p.symbols)
for i in CGN_CMU_map.values():
    if type(i) == list:
        for j in i:
            if j not in symbolset:
                print(j)
    elif i not in symbolset:
        print(i)

In [16]:
CGN_CMU_map = {
    'p': 'P', #g
    'b': 'B', #g
    't': 'T', #g
    'd': 'D', #g
    'k': 'K', #g
    'g': 'G', #g
    'f': 'F', #g
    'v': 'V', #g
    's': 'S', #g
    'z': 'ZH', #g
    'S': 'SH', #g
    'Z': 'JH', #g
    'x': ['K', 'HH'], #g
    'G': ['G', 'HH'], #g
    'h': 'HH', #g
    'N': 'NG', #g
    'm': 'M', #g
    'n': 'N', #g
    'J': ['N', 'Y'], #g
    'l': 'L', #g
    'r': 'R', #g
    'w': 'W', #g possibly change to vw in future
    'j': 'Y', #g
    'I': 'IH', #g
    'E': 'EH', #g
    'A': 'AA', #g
    'O': 'AO', #g
    'Y': ['UH', 'R'], #g
    'i': 'IY', #g
    'y': ['IH', 'W'], #g
    'e': 'EY', #g
    '2': ['OW', 'UW'], #g
    'a': 'AH', #g
    'o': ['OW', 'AH'], #g
    'u': 'UH', #g
    '@': 'AH0', #g
    'E+': ['UH', 'EY'],
    '9+': 'OY', #g
    'O+': 'AW', #koud #g
    'E:': ['EH', 'EH'], #scene #g
    '9:': 'ER', #1 freule #g
    'O:': 'OW', #g
    'E~': ['EH', 'N'], #g
    'A~': ['AA', 'N'], #g
    'O~': ['OW', 'N'], #g
    'Y~': ['UH', 'M'], #g
}

silences = {
    '*',
    'SIL',
    ' '
}

In [None]:
import librosa

In [4]:
g2p = G2P()

In [15]:
for file in os.listdir(phoneme_folder):
    with open(phoneme_folder / file, 'r') as f:
        lines = f.readlines()
        found = False
        for i, line in enumerate(lines):
            if 'ZH' in line and i != len(lines) - 1:
                if 'ZH' in lines[i + 1]:
                    found = True
                    break
        if found:
            print(file)

F60E2VY19A_4.phoneme
F40L2VY8A_1.phoneme
F60E2VY5A_3.phoneme
M40K2VW2A_1.phoneme
F60E2VW13A_6.phoneme
F40L1FT2C_0.phoneme
F60E2VS19A_3.phoneme
M15R1FS12A_0.phoneme


In [18]:
labels = Path('./ifadata/labels/')
wavs = Path('./ifadata/wavs/')
segmented_audio = Path('./ifadata/segmented/')
phoneme_folder = Path('./ifadata/phonemes/')
boundaries = Path('./ifadata/boundaries/')
phoneme_data = Path('./ifadata/phoneme_data/')
mfcc_folder = Path('./ifadata/mfccs/')
nmfccs = Path('./ifadata/nmfccs/')

segmented_audio.mkdir(exist_ok=True)
phoneme_folder.mkdir(exist_ok=True)
boundaries.mkdir(exist_ok=True)
phoneme_data.mkdir(exist_ok=True)
mfcc_folder.mkdir(exist_ok=True)
nmfccs.mkdir(exist_ok=True)

def clip_wav(source, start, end, target):
    duration = '%.6f' % (end - start)
    start = '%.6f' % start
    subprocess.run(['ffmpeg', '-i', source, '-ss', start, '-t', duration, target, '-v', 'quiet', '-y'])

#filename is a .phoneme file
def to_unicode(string):
    try:
        return string.decode('utf-8')
    except UnicodeDecodeError:
        return ""

def process_file(filename):
    with open(labels / filename, 'rb') as file:
        start = False
        lines = [to_unicode(i).strip() for i in file.readlines()[12:]]
        phonemes = []
        phoneme_set = set()
        for i in range(0, len(lines), 3):
            if lines[i] == '"IntervalTier"':
                break
            start = float(lines[i].strip())
            end = float(lines[i + 1].strip())
            phoneme = lines[i + 2].split('__')[0][1:].replace('=', '')
            phonemes.append((start, end, phoneme))
            phoneme_set.add(phoneme)
    return phonemes

def get_chunks(id):
    lines = None
    phonemes = process_file(id + '.phoneme')

    chunk_start = 0
    chunks = []
    chunk = []
    for piece in phonemes:
        p_start, p_end, phoneme = piece
        if phoneme in silences:
            if chunk:
                chunks.append((chunk_start, p_start, chunk))
                chunk = []
            chunk_start = p_end
            continue

        p_start -= chunk_start
        p_end -= chunk_start
        arpa_phonemes = CGN_CMU_map[phoneme]

        if type(arpa_phonemes) == str:
            chunk.append((p_start, p_end, arpa_phonemes))
            continue

        chunk.append((p_start, -1, arpa_phonemes[0]))
        chunk.append((-1, p_end, arpa_phonemes[1]))

    for i, (_, _, chunk) in enumerate(chunks):
        with open(phoneme_folder / f'{id}_{i}.phoneme', 'w') as f:
            for p_start, p_end, phoneme in chunk:
                f.write(f'{p_start:.6f}\t{p_end:.6f}\t{phoneme}\n')
    return chunks

def chunk_audio(id, chunks):
    source = wavs / f'{id}.wav'
    for i, (start, end, _) in enumerate(chunks):
        clip_wav(str(source), start, end, str(segmented_audio / f'{id}_{i}.wav'))

def process_phoneme(id, chunks):

    chunked_phonemes = [[g2p.symbol2id[p] + 1 for _, _, p in chunk] for _, _, chunk in chunks]

    for i, ((_, _, chunk), phonemes) in enumerate(zip(chunks, chunked_phonemes)):
        if len(phonemes) == 0: continue
        phonemes = np.array(phonemes)
        np.save(phoneme_data / f'{id}_{i}.phoneme.npy', phonemes)

        result = np.zeros((len(chunk), 2))
        for j, (b1, b2, _) in enumerate(chunk):
            result[j, 0] = b1
            result[j, 1] = b2

        np.save(boundaries / f'{id}_{i}.phoneme.boundary.npy', result)

def process_wavs(id, num, sample_rate=16000):
    mfccs = []
    non_list = []
    for i, file in enumerate([segmented_audio / f'{id}_{i}.wav' for i in range(num)]):
        waveform, sample_rate = librosa.load(file, sr=sample_rate, mono=True)
        if len(waveform) == 0:
            print('Empty wav:', file)
            file.unlink()
        if len(waveform) < 400:
            non_list.append(f'{id}_{i}')
            continue
        mfcc = librosa.feature.mfcc(y=waveform, sr=sample_rate, n_mfcc=13, hop_length=int(sample_rate/100), n_fft=int(sample_rate/40), fmax=8000)
        delta = librosa.feature.delta(mfcc, width=3, order=1)
        delta2 = librosa.feature.delta(mfcc, width=3, order=2)
        mfcc = np.concatenate([mfcc, delta, delta2]).T.astype(np.float32)
        np.save(mfcc_folder / f'{id}_{i}.mfcc.npy', mfcc)
        mfccs.append(mfcc)
    return mfccs, non_list

def get_mean_and_std(mfccs):
    mfccs = np.concatenate(mfccs, axis=0)
    mean = mfccs.mean(axis=0, keepdims=False)
    std = mfccs.std(axis=0, keepdims=False)
    return mean, std

def process_normalization(id, mfccs):
    mean, std = get_mean_and_std(mfccs)
    for i, mfcc in enumerate(mfccs):
        mfcc -= mean
        mfcc /= std
        np.save(nmfccs / f'{id}_{i}.normalized.mfcc.npy', mfcc.astype(np.float32))

def process_subject(id):
    chunks = get_chunks(id)
    # chunk_audio(id, chunks)
    process_phoneme(id, chunks)
    # mfccs, non_list = process_wavs(id, len(chunks))
    # process_normalization(id, mfccs)
    # return non_list

class IFAPhoneme(torch.utils.data.Dataset):

    def __init__(self, path, reduction: int = 1):
        super().__init__()
        self.path = Path(path)
        self.ids = [file[:-7] for file in os.listdir(self.path / 'phonemes')]
        self.phonemes = [phoneme_data / f'{i}phoneme.npy' for i in self.ids]
        self.phoneme_boundaries = [boundaries / f'{i}phoneme.boundary.npy' for i in self.ids]
        self.normalized_mfccs = [nmfccs / f'{i}normalized.mfcc.npy' for i in self.ids]
        self.reduction = reduction

    def __len__(self):
        return len(self.ids)

    def __getitem__(self, index):
        phoneme = np.load(self.phonemes[index])
        phoneme_boundary = np.load(self.phoneme_boundaries[index])
        mfcc = np.load(self.normalized_mfccs[index])

        if mfcc.shape[0] % self.reduction != 0:
            mfcc = np.concatenate([mfcc, np.zeros((self.reduction - mfcc.shape[0] % self.reduction, mfcc.shape[1]))])
        if self.reduction > 1:
            mfcc = mfcc.reshape(mfcc.shape[0] // self.reduction, mfcc.shape[1] * self.reduction)

        return phoneme, mfcc.astype(np.float32), phoneme_boundary.astype(np.float32)

In [None]:
import wave
import contextlib
ct = 0
for file in os.listdir(segmented_audio):
    fname = segmented_audio / file
    with contextlib.closing(wave.open(str(fname),'r')) as f:
        frames = f.getnframes()
        rate = f.getframerate()
        duration = frames / float(rate)
print(ct)

3527


In [None]:
dataset = IFAPhoneme('./ifadata', reduction=4)

In [None]:
for _ in DataLoader(dataset):
    pass

FileNotFoundError: ignored

In [None]:
os.path.exists(phoneme_data / f'{dataset.ids[0]}phoneme.npy')

True

In [None]:
missing1 = []
missing2 = []
missing3 = []
for i in dataset.ids:
    if not os.path.exists(phoneme_data / f'{i}phoneme.npy'):
        missing1.append(i)
        continue
    if not os.path.exists(boundaries/ f'{i}phoneme.boundary.npy'):
        missing2.append(i)
        continue
    if not os.path.exists(phoneme_data / f'{i}normalized.mfccs.npy'):
        missing3.append(i)
print(len(missing2))

0


In [None]:
os.remove()

In [19]:
r = process_map(process_subject, [i[:-4] for i in os.listdir(wavs)], chunksize=1)

  0%|          | 0/4432 [00:00<?, ?it/s]

In [None]:
for i in r:
    if i:
        print(i)

['F28G1FT6I_F28G2VR1BZ_89.normalized.mfcc.npy']
['M15R1FT2F_F28G2VR1BZ_89.normalized.mfcc.npy']


In [None]:
!zip -r ifadataset.zip ./ifadata
!cp ./ifadataset.zip ./drive/MyDrive/ifadataset.zip

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  adding: ifadata/phoneme_data/F28G2FPB1VCVD_6.phoneme.npy (deflated 50%)
  adding: ifadata/phoneme_data/M66O1FPA2VCVE_6.phoneme.npy (deflated 50%)
  adding: ifadata/phoneme_data/M56H1FW10A_10.phoneme.npy (deflated 53%)
  adding: ifadata/phoneme_data/M56H2PS13A_0.phoneme.npy (deflated 59%)
  adding: ifadata/phoneme_data/F28G1FPA2HVDB_6.phoneme.npy (deflated 49%)
  adding: ifadata/phoneme_data/F28G2VT7E_1.phoneme.npy (deflated 58%)
  adding: ifadata/phoneme_data/F28G1FW9A_8.phoneme.npy (deflated 49%)
  adding: ifadata/phoneme_data/M15R1FR2AQ_2.phoneme.npy (deflated 62%)
  adding: ifadata/phoneme_data/F60E2VY11A_0.phoneme.npy (deflated 49%)
  adding: ifadata/phoneme_data/F20N1FW12A_4.phoneme.npy (deflated 49%)
  adding: ifadata/phoneme_data/F40L1VI5C_4.phoneme.npy (deflated 50%)
  adding: ifadata/phoneme_data/M66O1FPA1HVDB_0.phoneme.npy (deflated 49%)
  adding: ifadata/phoneme_data/F28G1FT9B_0.phoneme.npy (deflated 49%)
  a

In [None]:
g2p = G2P()


dataset = IFAPhoneme('./ifadata', reduction=4)

## Models

In [None]:
class BidirectionalAttention(nn.Module):

    def __init__(self, k1_dim, k2_dim, v1_dim, v2_dim, attention_dim):
        super().__init__()
        self.k1_layer = nn.Linear(k1_dim, attention_dim)
        self.k2_layer = nn.Linear(k2_dim, attention_dim)
        self.score_layer = nn.Linear(attention_dim, 1)
        self.softmax1 = nn.Softmax(dim=-1)
        self.softmax2 = nn.Softmax(dim=-1)

    def forward(self, k1, k2, v1, v2, k1_lengths=None, k2_lengths=None):
        k1 = self.k1_layer(k1)
        k2 = self.k2_layer(k2)
        score = torch.bmm(k1, k2.transpose(1, 2))

        if k1_lengths or k2_lengths:
            mask = torch.zeros(score.shape, dtype=torch.int).detach().to(score.device)
            for i, l in enumerate(k1_lengths):
                mask[i,l:,:] += 1
            for i, l in enumerate(k2_lengths):
                mask[i,:,l:] += 1
            mask = mask == 1
            score = score.clone().masked_fill_(mask, -float('inf'))

        w1 = self.softmax1(score.transpose(1, 2))
        w2 = self.softmax2(score)

        o1 = torch.bmm(w1, v1)
        o2 = torch.bmm(w2, v2)

        w1 = [i[:l2, :l1] for i, l1, l2 in zip(w1, k1_lengths, k2_lengths)]
        w2 = [i[:l1, :l2] for i, l1, l2 in zip(w2, k1_lengths, k2_lengths)]
        score = [i[:l1, :l2] for i, l1, l2 in zip(score, k1_lengths, k2_lengths)]

        return o1, o2, w1, w2, score

class BidirectionalAdditiveAttention(nn.Module):

    def __init__(self, k1_dim, k2_dim, v1_dim, v2_dim, attention_dim):
        super().__init__()
        self.k1_layer = nn.Linear(k1_dim, attention_dim)
        self.k2_layer = nn.Linear(k2_dim, attention_dim)
        self.score_layer = nn.Linear(attention_dim, 1)
        self.tanh = nn.Tanh()
        self.softmax1 = nn.Softmax(dim=-1)
        self.softmax2 = nn.Softmax(dim=-1)

    def forward(self, k1, k2, v1, v2, k1_lengths=None, k2_lengths=None):
        k1 = self.k1_layer(k1).repeat(k2.shape[1], 1, 1, 1).permute(1,2,0,3)
        k2 = self.k2_layer(k2).repeat(k1.shape[1], 1, 1, 1).permute(1,0,2,3)
        score = self.score_layer(self.tanh(k1 + k2)).squeeze(-1)

        if k1_lengths or k2_lengths:
            mask = torch.zeros(score.shape, dtype=torch.int).detach().to(score.device)
            for i, l in enumerate(k1_lengths):
                mask[i,l:,:] += 1
            for i, l in enumerate(k2_lengths):
                mask[i,:,l:] += 1
            mask = mask == 1
            score = score.masked_fill_(mask, -float('inf'))

        w1 = self.softmax1(score.transpose(1, 2))
        w2 = self.softmax2(score)

        o1 = torch.bmm(w1, v1)
        o2 = torch.bmm(w2, v2)

        w1 = [i[:l2, :l1] for i, l1, l2 in zip(w1, k1_lengths, k2_lengths)]
        w2 = [i[:l1, :l2] for i, l1, l2 in zip(w2, k1_lengths, k2_lengths)]
        score = [i[:l1, :l2] for i, l1, l2 in zip(score, k1_lengths, k2_lengths)]

        return o1, o2, w1, w2, score

In [None]:
class PositionalEncoding(torch.nn.Module):

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = torch.nn.Dropout(p=dropout)
        self.d_model = d_model

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('div_term', div_term)
        self.register_buffer('pe', pe)

    def forward(self, x, custom_position=None):
        if not custom_position is None:
            custom_position = custom_position.repeat(self.div_term.shape[0], 1, 1).permute(1, 2, 0)
            pe = torch.zeros(custom_position.shape[:-1] + (self.d_model, ))
            pe[:, :, 0::2] = torch.sin(custom_position * self.div_term)
            pe[:, :, 1::2] = torch.cos(custom_position * self.div_term)
            x = x + pe.to(x.device)
        else:
            x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

In [None]:
class GumbelSoftmax(nn.Module):

    def __init__(self, hard=True, **kwargs):
        super().__init__()
        self.hard = hard
        self.kwargs = kwargs

    def forward(self, inputs):
        return nn.functional.gumbel_softmax(inputs, hard=self.hard, **self.kwargs)

class MixedGumbelSoftmax(nn.Module):

    def __init__(self, hard_rate=0.5, **kwargs):
        super().__init__()
        if 'hard' in kwargs:
            del kwargs['hard']
        self.kwargs = kwargs
        self.hard_rate = hard_rate

    def forward(self, inputs):
        if self.hard_rate == 0:
            return nn.functional.gumbel_softmax(inputs, hard=False, **self.kwargs)
        if self.hard_rate == 1:
            return nn.functional.gumbel_softmax(inputs, hard=True, **self.kwargs)
        soft = nn.functional.gumbel_softmax(inputs, hard=False, **self.kwargs)
        hard = nn.functional.gumbel_softmax(inputs, hard=True, **self.kwargs)
        random = torch.rand(soft.shape[:-1], device=soft.device)
        random[random <= self.hard_rate] = 0
        random[random > self.hard_rate] = 1
        random = random.repeat(soft.shape[-1], 1, 1).permute(1, 2, 0)
        return hard * random + soft * (1 - random)

class LinearNorm(torch.nn.Module):
    def __init__(self, in_dim, out_dim, bias=True, w_init_gain='linear'):
        super(LinearNorm, self).__init__()
        self.linear_layer = torch.nn.Linear(in_dim, out_dim, bias=bias)

        torch.nn.init.xavier_uniform_(
            self.linear_layer.weight,
            gain=torch.nn.init.calculate_gain(w_init_gain))

    def forward(self, x):
        return self.linear_layer(x)

class Prenet(nn.Module):

    def __init__(self, in_dim, sizes=[256, 128]):
        super(Prenet, self).__init__()
        in_sizes = [in_dim] + sizes[:-1]
        self.layers = nn.ModuleList(
            [nn.Linear(in_size, out_size)
             for (in_size, out_size) in zip(in_sizes, sizes)])
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.5)

    def forward(self, inputs):
        for linear in self.layers:
            inputs = self.dropout(self.relu(linear(inputs)))
        return inputs


class BatchNormConv1d(nn.Module):
    def __init__(self, in_dim, out_dim, kernel_size, stride, padding, activation=None):
        super().__init__()
        self.conv1d = nn.Conv1d(in_dim, out_dim, kernel_size=kernel_size, stride=stride, padding=padding, bias=False)
        self.bn = nn.BatchNorm1d(out_dim)
        self.activation = activation

    def forward(self, x):
        x = self.conv1d(x)
        if self.activation is not None:
            x = self.activation(x)
        return self.bn(x)

class BatchNormConv2d(nn.Module):
    def __init__(self, in_dim, out_dim, kernel_size, stride, padding,
                 activation=None):
        super().__init__()
        self.conv2d = nn.Conv2d(in_dim, out_dim, kernel_size=kernel_size, stride=stride, padding=padding, bias=False)
        self.bn = nn.BatchNorm2d(out_dim)
        self.activation = activation

    def forward(self, x):
        x = self.conv2d(x)
        if self.activation is not None:
            x = self.activation(x)
        return self.bn(x)


class Highway(nn.Module):
    def __init__(self, in_size, out_size):
        super(Highway, self).__init__()
        self.H = nn.Linear(in_size, out_size)
        self.H.bias.data.zero_()
        self.T = nn.Linear(in_size, out_size)
        self.T.bias.data.fill_(-1)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, inputs):
        H = self.relu(self.H(inputs))
        T = self.sigmoid(self.T(inputs))
        return H * T + inputs * (1.0 - T)


class CBHG(nn.Module):
    """CBHG module: a recurrent neural network composed of:
        - 1-d convolution banks
        - Highway networks + residual connections
        - Bidirectional gated recurrent units
    """

    def __init__(self, in_dim, K=16, projections=[128, 128]):
        super(CBHG, self).__init__()
        self.in_dim = in_dim
        self.relu = nn.ReLU()
        self.conv1d_banks = nn.ModuleList([BatchNormConv1d(in_dim, in_dim, kernel_size=k, stride=1, padding=k // 2, activation=self.relu) for k in range(1, K + 1)])
        self.max_pool1d = nn.MaxPool1d(kernel_size=2, stride=1, padding=1)

        in_sizes = [K * in_dim] + projections[:-1]
        activations = [self.relu] * (len(projections) - 1) + [None]
        self.conv1d_projections = nn.ModuleList(
            [BatchNormConv1d(in_size, out_size, kernel_size=3, stride=1,
                             padding=1, activation=ac)
             for (in_size, out_size, ac) in zip(
                 in_sizes, projections, activations)])

        self.pre_highway = nn.Linear(projections[-1], in_dim, bias=False)
        self.highways = nn.ModuleList(
            [Highway(in_dim, in_dim) for _ in range(4)])

        self.gru = nn.GRU(
            in_dim, in_dim, 1, batch_first=True, bidirectional=True)

    def forward(self, inputs, input_lengths=None):
        # (B, T_in, in_dim)
        x = inputs

        # Needed to perform conv1d on time-axis
        # (B, in_dim, T_in)
        if x.size(-1) == self.in_dim:
            x = x.transpose(1, 2)

        T = x.size(-1)

        # (B, in_dim*K, T_in)
        # Concat conv1d bank outputs
        x = torch.cat([conv1d(x)[:, :, :T] for conv1d in self.conv1d_banks], dim=1)
        assert x.size(1) == self.in_dim * len(self.conv1d_banks)
        x = self.max_pool1d(x)[:, :, :T]

        for conv1d in self.conv1d_projections:
            x = conv1d(x)

        # (B, T_in, in_dim)
        # Back to the original shape
        x = x.transpose(1, 2)

        if x.size(-1) != self.in_dim:
            x = self.pre_highway(x)

        # Residual connection
        x += inputs
        for highway in self.highways:
            x = highway(x)

        if input_lengths is not None:
            x = nn.utils.rnn.pack_padded_sequence(
                x, input_lengths, batch_first=True, enforce_sorted=False)

        # (B, T_in, in_dim*2)
        outputs, _ = self.gru(x)

        if input_lengths is not None:
            outputs, _ = nn.utils.rnn.pad_packed_sequence(
                outputs, batch_first=True)

        return outputs

class TacotronEncoder(nn.Module):

    def __init__(self, hparams):
        super().__init__()
        self.embedding = nn.Embedding(hparams.num_symbols, hparams.embedding_dim)
        self.prenet = Prenet(hparams.embedding_dim, sizes=hparams.prenet.sizes)
        self.cbhg = CBHG(hparams.cbhg.dim, K=hparams.cbhg.K, projections=hparams.cbhg.projections)

    def forward(self, inputs, input_lengths=None):
        x = self.embedding(inputs)
        x = self.prenet(x)
        x = self.cbhg(x, input_lengths)
        return x

class ConvNorm(torch.nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size=1, stride=1,
                 padding=None, dilation=1, bias=True, w_init_gain='linear'):
        super(ConvNorm, self).__init__()
        if padding is None:
            assert(kernel_size % 2 == 1)
            padding = int(dilation * (kernel_size - 1) / 2)

        self.conv = torch.nn.Conv1d(in_channels, out_channels,
                                    kernel_size=kernel_size, stride=stride,
                                    padding=padding, dilation=dilation,
                                    bias=bias)

        torch.nn.init.xavier_uniform_(
            self.conv.weight, gain=torch.nn.init.calculate_gain(w_init_gain))

    def forward(self, signal):
        conv_signal = self.conv(signal)
        return conv_signal

class Tacotron2Encoder(nn.Module):

    def __init__(self, hparams):
        super().__init__()

        self.embedding = nn.Embedding(hparams.num_symbols, hparams.embedding_dim)
        convolutions = []
        for _ in range(hparams.cnn.num_layers):
            conv_layer = nn.Sequential(
                ConvNorm(hparams.embedding_dim,
                         hparams.embedding_dim,
                         kernel_size=hparams.cnn.kernel_size, stride=1,
                         padding=int((hparams.cnn.kernel_size - 1) / 2),
                         dilation=1, w_init_gain='relu'),
                nn.BatchNorm1d(hparams.embedding_dim))
            convolutions.append(conv_layer)
        self.convolutions = nn.ModuleList(convolutions)

        self.lstm = nn.LSTM(hparams.embedding_dim, int(hparams.embedding_dim / 2), 1, batch_first=True, bidirectional=True)

    def forward(self, x, input_lengths):
        x = self.embedding(x).transpose(1, 2)

        for conv in self.convolutions:
            x = torch.nn.functional.dropout(torch.nn.functional.relu(conv(x)), 0.5, self.training)

        x = x.transpose(1, 2)

        x = nn.utils.rnn.pack_padded_sequence(x, input_lengths, batch_first=True, enforce_sorted=False)
        self.lstm.flatten_parameters()
        x, _ = self.lstm(x)
        x, _ = torch.nn.utils.rnn.pad_packed_sequence(x, batch_first=True)

        return x

    def inference(self, x):
        for conv in self.convolutions:
            x = F.dropout(F.relu(conv(x)), 0.5, self.training)

        x = x.transpose(1, 2)

        self.lstm.flatten_parameters()
        outputs, _ = self.lstm(x)

        return outputs

class ContentEncoder(nn.Module):

    def __init__(self, hparams):
        super().__init__()

        filters = [hparams.input_dim] + hparams.cnn.filters
        convs = (BatchNormConv1d(filters[i], filters[i + 1], hparams.cnn.kernel_size, 1, hparams.cnn.kernel_size//2) for i in range(len(hparams.cnn.filters)))
        self.convs = nn.Sequential(*convs)

        self.gru = nn.GRU(input_size=hparams.cnn.filters[-1], hidden_size=hparams.gru_dim, bidirectional=True, batch_first=True)
        self.gru2 = nn.GRU(input_size=hparams.gru_dim * 2, hidden_size=hparams.gru_dim, bidirectional=True, batch_first=True)

    def forward(self, inputs, input_lengths):
        x = self.convs(inputs.transpose(1, 2)).transpose(1, 2)

        x = nn.utils.rnn.pack_padded_sequence(x, input_lengths, batch_first=True, enforce_sorted=False)
        self.gru.flatten_parameters()
        x, _ = self.gru(x)
        x, _ = self.gru2(x)
        x, _ = torch.nn.utils.rnn.pad_packed_sequence(x, batch_first=True)

        return x

class Decoder(nn.Module):

    def __init__(self, hparams):
        super().__init__()

        self.lstm = nn.LSTM(input_size=hparams.input_dim, hidden_size=hparams.lstm_dim, bidirectional=True, batch_first=True)
        self.lstm2 = nn.LSTM(input_size=hparams.lstm_dim * 2, hidden_size=hparams.lstm_dim, bidirectional=True, batch_first=True)
        self.linear = nn.Linear(hparams.lstm_dim * 2, hparams.output_dim)

    def forward(self, inputs, input_lengths):
        x = nn.utils.rnn.pack_padded_sequence(inputs, input_lengths, batch_first=True, enforce_sorted=False)
        x, _ = self.lstm(x)
        x, _ = self.lstm2(x)
        x, _ = nn.utils.rnn.pad_packed_sequence(x, batch_first=True)
        x = self.linear(x)
        x = [x[i][:l] for i, l in enumerate(input_lengths)]
        return x

class Aligner(nn.Module):

    def __init__(self, hparams):
        super().__init__()
        self.max_frames = hparams.max_frames
        #self.lstm = nn.GRU(input_size=6, hidden_size=hparams.lstm_dim, bidirectional=True, batch_first=True)
        #self.lstm2 = nn.GRU(input_size=hparams.lstm_dim * 2, hidden_size=hparams.lstm_dim, bidirectional=True, batch_first=True)

        #self.location_layer = LocationLayer(hparams.location_layer.attention_n_filters, hparams.location_layer.attention_kernel_size, hparams.location_layer.output_dim)
        filters = [hparams.input_dim] + hparams.cnn.filters
        #convs = (BatchNormConv1d(filters[i], filters[i + 1], hparams.cnn.kernel_size, 1, hparams.cnn.kernel_size//2) for i in range(len(hparams.cnn.filters)))
        #self.convs = nn.Sequential(*convs)
        convs = (BatchNormConv2d(filters[i], filters[i + 1], hparams.cnn.kernel_size, 1, tuple([i//2 for i in hparams.cnn.kernel_size])) for i in range(len(hparams.cnn.filters)))
        self.convs = nn.Sequential(*convs)

        self.linear = nn.Linear(filters[-1], 2)
        self.softmax = nn.Softmax(-1)

    def stack_attention(self, w1, w2):
        w1 = [i.T for i in w1]
        max_frames = max([i.shape[1] for i in w1])
        w1 = [torch.cat([i, torch.zeros(i.shape[0], max_frames - i.shape[1], device=i.device)], dim=-1) for i in w1]
        w2 = [torch.cat([i, torch.zeros(i.shape[0], max_frames - i.shape[1], device=i.device)], dim=-1) for i in w2]
        w1 = nn.utils.rnn.pad_sequence(w1, batch_first=True)
        w2 = nn.utils.rnn.pad_sequence(w2, batch_first=True)
        accumulated_w1 = torch.cumsum(w1, -1)
        accumulated_w2 = torch.cumsum(w2, -1)
        accumulated_w1_backward = torch.cumsum(w1.flip(-1), -1).flip(-1)
        accumulated_w2_backward = torch.cumsum(w2.flip(-1), -1).flip(-1)
        #x = torch.stack([w1, w2, accumulated_w1, accumulated_w2, accumulated_w1_backward, accumulated_w2_backward], dim=-1).permute(1, 0, 3, 2)
        #return torch.stack([self.convs(i) for i in x]).permute(1, 0, 3, 2)
        x = torch.stack([w1, w2, accumulated_w1, accumulated_w2, accumulated_w1_backward, accumulated_w2_backward], dim=-1).permute(0, 3, 1, 2)
        return self.convs(x).permute(0, 2, 3, 1)

    def forward(self, texts, w1, w2, text_lengths, mfcc_lengths):
        x = self.stack_attention(w1, w2)
        x = torch.sigmoid(self.linear(x).transpose(-1, -2))
        x = torch.cumsum(x, dim=-1)
        #x = torch.stack([torch.cumsum(x[:,:,0,:], dim=-1), torch.cumsum(x[:,:,1,:].flip(-1), dim=-1).flip(-1)], dim=-2)
        x = torch.tanh(x)
        x = [b[:l1, :, :l2] for b, l1, l2 in zip(x, text_lengths, mfcc_lengths)]
        return x

class Predictor(nn.Module):

    def __init__(self, hparams):
        super().__init__()
        self.max_frames = hparams.max_frames
        self.lstm = nn.GRU(input_size=hparams.input_dim, hidden_size=hparams.lstm_dim, bidirectional=True, batch_first=True, dropout=0.5)
        self.lstm2 = nn.GRU(input_size=hparams.lstm_dim * 2, hidden_size=hparams.lstm_dim, bidirectional=True, batch_first=True, dropout=0.5)
        self.linear = nn.Linear(hparams.lstm_dim * 2, 2)
        #self.location_layer = LocationLayer(hparams.location_layer.attention_n_filters, hparams.location_layer.attention_kernel_size, hparams.location_layer.output_dim)

    def clip_score(self, score, text_lengths, mfcc_lengths):
        middles = [torch.linspace(self.max_frames, self.max_frames + mfcc_lengths[i] - 1, text_length, dtype=torch.int) for i, text_length in enumerate(text_lengths)]
        tops = [i + self.max_frames for i in middles]
        bottoms = [i - self.max_frames for i in middles]

        clipped_score = []
        score = [torch.cat([torch.zeros(i.shape[0], self.max_frames, device=i.device), i, torch.zeros(i.shape[0], self.max_frames, device=i.device)], dim=-1) for i in score]
        clipped_score = []
        for i, top, bottom in zip(score, tops, bottoms):
            clipped_score.append([torch.cat([j[b:t], t.to(j.device).unsqueeze(-1), b.to(j.device).unsqueeze(-1)]) for j, t, b in zip(i, top, bottom)])
            clipped_score[-1] = torch.stack(clipped_score[-1], axis=0)
        return clipped_score

    def forward(self, texts, w1, w2, text_lengths, mfcc_lengths):
        w1 = [i.T for i in w1]
        clipped_w1 = self.clip_score(w1, text_lengths, mfcc_lengths)
        clipped_w2 = self.clip_score(w2, text_lengths, mfcc_lengths)
        clipped_score = [torch.cat([i, j], dim=-1) for i, j in zip(clipped_w1, clipped_w2)]
        #clipped_score = [torch.cat([i, j], dim=-1) for i, j in zip(clipped_w1, clipped_w1)]

        clipped_score = nn.utils.rnn.pad_sequence(clipped_score, batch_first=True)
        x = torch.cat([texts, clipped_score], dim=-1)
        x = nn.utils.rnn.pack_padded_sequence(x, text_lengths, batch_first=True, enforce_sorted=False)
        x, _ = self.lstm(x)
        x, _ = self.lstm2(x)
        x, _ = nn.utils.rnn.pad_packed_sequence(x, batch_first=True)
        boundries = torch.relu(self.linear(x))
        #boundries = torch.cumsum(boundries, dim=-1)
        boundries = [boundries[i, :l] for i, l in enumerate(text_lengths)]
        return boundries

In [None]:
class BidirectionalAttention(BidirectionalAttention):

    def __init__(self, input1_dim, input2_dim, attention_dim):
        super().__init__(input1_dim, input2_dim, input1_dim, input2_dim, attention_dim)
        #self.softmax2 = GumbelSoftmax(dim=1)

class BidirectionalAdditiveAttention(BidirectionalAdditiveAttention):

    def __init__(self, input1_dim, input2_dim, attention_dim):
        super().__init__(input1_dim, input2_dim, input1_dim, input2_dim, attention_dim)

class NeuFA_base(nn.Module):

    def __init__(self, hparams):
        super().__init__()

        self.text_encoder = Tacotron2Encoder(hparams.text_encoder)
        self.speech_encoder = ContentEncoder(hparams.speech_encoder)
        #self.attention = BidirectionalAdditiveAttention(hparams.text_encoder.output_dim, hparams.speech_encoder.output_dim, hparams.attention.dim)
        self.attention = BidirectionalAttention(hparams.attention.text_input_dim, hparams.attention.speech_input_dim, hparams.attention.dim)
        self.text_decoder = Decoder(hparams.text_decoder)
        self.speech_decoder = Decoder(hparams.speech_decoder)
        self.positional_encoding_text = PositionalEncoding(hparams.text_encoder.output_dim)
        self.positional_encoding_speech = PositionalEncoding(hparams.speech_encoder.output_dim)
        self.aligner = Aligner(hparams.aligner)
        #self.aligner = Predictor(hparams.predictor)

        self.mse = nn.MSELoss()
        self.mae = nn.L1Loss()
        self.cross_entrophy = torch.nn.CrossEntropyLoss()

    def encode(self, texts, mfccs):
        text_lengths = [i.shape[0] for i in texts]
        texts = pad_sequence(texts, batch_first=True)
        texts = self.text_encoder(texts, text_lengths)
        texts = torch.cat([texts, torch.zeros((texts.shape[0], 1, texts.shape[2]), device=texts.device)], axis=-2)

        mfcc_lengths = [i.shape[0] for i in mfccs]
        mfccs = pad_sequence(mfccs, batch_first=True)
        mfccs = self.speech_encoder(mfccs, mfcc_lengths)
        mfccs = torch.cat([mfccs, torch.zeros((mfccs.shape[0], 1, mfccs.shape[2]), device=mfccs.device)], axis=-2)

        return texts, mfccs, text_lengths, mfcc_lengths

    def positional_encoding(self, texts, mfccs):
        texts_pe = self.positional_encoding_text(texts)
        mfccs_pe = self.positional_encoding_speech(mfccs)
        return texts_pe, mfccs_pe

    def decode(self, texts, mfccs, texts_pe, mfccs_pe, text_lengths, mfcc_lengths):
        texts_at_frame, mfccs_at_text, w1, w2, score = self.attention(texts_pe, mfccs_pe, texts, mfccs, text_lengths, mfcc_lengths)

        p_texts = self.text_decoder(mfccs_at_text, text_lengths)
        p_mfccs = self.speech_decoder(texts_at_frame, mfcc_lengths)

        return p_texts, p_mfccs, w1, w2, score

    def forward(self, texts, mfccs):
        texts, mfccs, text_lengths, mfcc_lengths = self.encode(texts, mfccs)
        texts_pe, mfccs_pe = self.positional_encoding(texts, mfccs)
        p_texts, p_mfccs, w1, w2, score = self.decode(texts, mfccs, texts_pe, mfccs_pe, text_lengths, mfcc_lengths)
        boundaries = self.aligner(texts[:,:-1,:], w1, w2, text_lengths, mfcc_lengths)
        return p_texts, p_mfccs, w1, w2, boundaries

    def text_loss(self, p_texts, texts):
        p_texts = torch.cat(p_texts)
        texts = torch.cat(texts)
        return self.cross_entrophy(p_texts, texts)

    def mfcc_loss(self, p_mfccs, mfccs):
        p_mfccs = torch.cat(p_mfccs)
        mfccs = torch.cat(mfccs)
        return self.mse(p_mfccs, mfccs)

    def attention_loss(self, w1, w2, alpha=0.5):
        loss = []
        for _w1, _w2 in zip(w1, w2):
            w = torch.maximum(_w1.T, _w2)
            a = torch.linspace(1e-6, 1, w.shape[0], device=w.device).repeat(w.shape[1], 1).T
            b = torch.linspace(1e-6, 1, w.shape[1], device=w.device).repeat(w.shape[0], 1)
            r1 = torch.maximum((a / b), (b / a))
            r2 = torch.maximum(a.flip(1) / b.flip(0), b.flip(0)/ a.flip(1))
            r = torch.maximum(r1, r2) - 1
            r = torch.tanh(alpha * r)
            loss.append(torch.mean(w * r.detach()))
        loss = torch.stack(loss)
        return torch.mean(loss)

    def boundary_mae(self, p_boundaries, boundaries):
        boundaries = [i.reshape((-1,1)) for i in boundaries]
        p_boundaries = [i.reshape((-1,1)) for i in p_boundaries]
        p_boundaries = [p_boundary[boundaries[i]>-1] for i, p_boundary in enumerate(p_boundaries)]
        boundaries = [i[i>-1] for i in boundaries]
        boundaries = torch.cat(boundaries)
        p_boundaries = torch.cat(p_boundaries)
        #print(torch.median(torch.abs(p_boundaries - boundaries)))
        return self.mae(p_boundaries, boundaries)

    def extract_boundary(self, p_boundaries, threshold=0.5):
        result = []
        for p_boundary in p_boundaries:
            result.append([])
            result[-1].append(torch.FloatTensor([i[i<threshold].shape[0] / 100 for i in p_boundary[:,0,:]]))
            result[-1].append(torch.FloatTensor([i[i<threshold].shape[0] / 100 for i in p_boundary[:,1,:]]))
            #result[-1].append(torch.FloatTensor([i[i>threshold].shape[0] / 100 for i in p_boundary[:,1,:]]))
            result[-1] = torch.stack(result[-1], dim=-1).to(p_boundaries[0].device)
        return result

    def boundary_loss(self, p_boundaries, boundaries):
        boundaries = [i.reshape((-1, 1)) for i in boundaries]
        p_boundaries = [i.reshape((-1, 1, i.shape[2])) for i in p_boundaries]
        p_boundaries = [p_boundary[boundaries[i]>-1] for i, p_boundary in enumerate(p_boundaries)]
        boundaries = [i[i>-1] for i in boundaries]
        gated_boundaries = [torch.zeros(i.shape, device=p_boundaries[0].device) for i in p_boundaries]
        for i, boundary in enumerate(boundaries):
            for j, b in enumerate(boundary):
                #if j == 0:
                    gated_boundaries[i][j, int(100 * b):] = 1
                #else:
                #    gated_boundaries[i][j, :int(100 * b)] = 1
        boundaries = [i.reshape((-1,1)) for i in gated_boundaries]
        p_boundaries = [i.reshape((-1,1)) for i in p_boundaries]
        boundaries = torch.cat(boundaries)
        p_boundaries = torch.cat(p_boundaries)
        return self.mae(p_boundaries, boundaries)

class NeuFA_TeP(NeuFA_base):

    def __init__(self, hparams):
        super().__init__(hparams)
        self.tep = nn.Linear(hparams.speech_encoder.output_dim, 1)

    def positional_encoding(self, texts, mfccs):
        tep = torch.relu(self.tep(mfccs)).squeeze(-1)
        tep = torch.cumsum(tep, dim=-1)

        texts_pe = self.positional_encoding_text(texts)
        mfccs_pe = self.positional_encoding_speech(mfccs, tep)
        return texts_pe, mfccs_pe, tep

    def forward(self, texts, mfccs):
        texts, mfccs, text_lengths, mfcc_lengths = self.encode(texts, mfccs)
        texts_pe, mfccs_pe, tep = self.positional_encoding(texts, mfccs)
        p_texts, p_mfccs, w1, w2, score = self.decode(texts, mfccs, texts_pe, mfccs_pe, text_lengths, mfcc_lengths)
        boundaries = self.aligner(texts[:,:-1,:], w1, w2, text_lengths, mfcc_lengths)
        p_text_lengths = [tep[i][l-1] for i, l in enumerate(mfcc_lengths)]
        return p_texts, p_mfccs, w1, w2, text_lengths, p_text_lengths, boundaries

    def length_loss(self, lengths, p_lengths, normalize=True):
        p_lengths = torch.stack(p_lengths)
        lengths = torch.FloatTensor(lengths).to(p_lengths.device)
        if normalize:
            p_lengths = p_lengths / lengths.detach()
            lengths = lengths / lengths.detach()
            return self.mae(lengths, p_lengths)
        else:
            return self.mse(lengths, p_lengths)

class NeuFA_MeP(NeuFA_base):

    def __init__(self, hparams):
        super().__init__(hparams)
        self.mep = nn.Linear(hparams.text_encoder.output_dim, 1)

    def positional_encoding(self, texts, mfccs):
        mep = torch.relu(self.mep(texts)).squeeze(-1)
        mep = torch.cumsum(mep, dim=-1)

        texts_pe = self.positional_encoding_text(texts, mep)
        mfccs_pe = self.positional_encoding_speech(mfccs)
        return texts_pe, mfccs_pe, mep

    def forward(self, texts, mfccs):
        texts, mfccs, text_lengths, mfcc_lengths = self.encode(texts, mfccs)
        texts_pe, mfccs_pe, mep = self.positional_encoding(texts, mfccs)
        p_texts, p_mfccs, w1, w2, score = self.decode(texts, mfccs, texts_pe, mfccs_pe, text_lengths, mfcc_lengths)
        boundaries = self.aligner(texts[:,:-1,:], w1, w2, text_lengths, mfcc_lengths)
        p_mfcc_lengths = [mep[i][l-1] for i, l in enumerate(text_lengths)]
        return p_texts, p_mfccs, w1, w2, mfcc_lengths, p_mfcc_lengths, boundaries

    length_loss = NeuFA_TeP.length_loss

class NeuFA_TeMP(NeuFA_TeP, NeuFA_MeP):

    def positional_encoding(self, texts, mfccs):
        tep = torch.relu(self.tep(mfccs)).squeeze(-1)
        tep = torch.cumsum(tep, dim=-1)

        mep = 10 * torch.relu(self.mep(texts)).squeeze(-1)
        mep = torch.cumsum(mep, dim=-1)

        texts_pe1 = self.positional_encoding_text(texts)
        mfccs_pe1 = self.positional_encoding_speech(mfccs)
        texts_pe2 = self.positional_encoding_text(texts, mep)
        mfccs_pe2 = self.positional_encoding_speech(mfccs, tep)
        texts_pe = torch.cat([texts_pe1, texts_pe2], dim=-1)
        mfccs_pe = torch.cat([mfccs_pe2, mfccs_pe1], dim=-1)
        return texts_pe, mfccs_pe, tep, mep

    def forward(self, texts, mfccs):
        texts, mfccs, text_lengths, mfcc_lengths = self.encode(texts, mfccs)
        texts_pe, mfccs_pe, tep, mep = self.positional_encoding(texts, mfccs)
        p_texts, p_mfccs, w1, w2, score = self.decode(texts, mfccs, texts_pe, mfccs_pe, text_lengths, mfcc_lengths)
        boundaries = self.aligner(texts[:,:-1,:], w1, w2, text_lengths, mfcc_lengths)
        p_text_lengths = [tep[i][l-1] for i, l in enumerate(mfcc_lengths)]
        p_mfcc_lengths = [mep[i][l-1] for i, l in enumerate(text_lengths)]
        return p_texts, p_mfccs, w1, w2, text_lengths, p_text_lengths, mfcc_lengths, p_mfcc_lengths, boundaries

# if __name__ == '__main__':
#     from hparams import base, temp

#     device = 'cuda:5'
#     batch_size = 4

#     dataset = Buckeye(os.path.expanduser('~/BuckeyeTrain'), reduction=base.reduction_rate)
#     data_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=Collate(device), drop_last=True)

#     for batch in data_loader:
#         for Model in [NeuFA_TeMP, NeuFA_MeP, NeuFA_TeP, NeuFA_base]:
#             if Model == NeuFA_TeMP:
#                 model = Model(temp)
#             else:
#                 model = Model(base)
#             model.to(device)
#             output = model(*batch[:2])
#             #print([i.shape for i in output])
#             print(model.text_loss(output[0], batch[0]))
#             print(model.mfcc_loss(output[1], batch[1]))
#             print(model.boundary_loss(output[-1], batch[2]))
#             print(model.boundary_mae(model.extract_boundary(output[-1]), batch[2]))
#             print(model.attention_loss(output[2], output[3]))
#             if Model in [NeuFA_TeP, NeuFA_MeP, NeuFA_TeMP]:
#                 print(model.length_loss(output[4], output[5]))
#             if Model in [NeuFA_TeMP]:
#                 print(model.length_loss(output[6], output[7]))
#             break
#         break

## Training

In [None]:
def save_figure_to_numpy(fig):
    # save it to a numpy array.
    data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep='')
    data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,))
    return data

def plot_alignment_to_numpy(alignment, info=None):
    fig, ax = plt.subplots(figsize=(6, 4))
    im = ax.imshow(alignment, aspect='auto', origin='lower', interpolation='none')
    fig.colorbar(im, ax=ax)
    #xlabel = 'Decoder timestep'
    #if info is not None:
    #    xlabel += '\n\n' + info
    #plt.xlabel(xlabel)
    #plt.ylabel('Encoder timestep')
    plt.tight_layout()

    fig.canvas.draw()
    data = save_figure_to_numpy(fig)
    plt.close()
    return data

class Save:

    def __init__(self, name='noname'):

        self.name = name + datetime.now().strftime("-%Y%m%d-%H%M%S")
        self.path = Path('save') / self.name
        self.path.mkdir(parents=True, exist_ok=True)

        self.logger = logging.getLogger(self.name)
        options.logging = 'debug'
        enable_pretty_logging(options=options, logger=self.logger)

        self.writer = SummaryWriter(self.path)

    def save_log(self, stage, epoch, batch, step, loss):
        self.logger.info('[%s] %s epoch %d batch %d step %d loss %f', self.name, stage, epoch, batch, step, loss)
        self.writer.add_scalar(f"{stage}/loss", loss, step)

    def save_parameters(self, hparams):
        self.writer.add_text("hparams", json.dumps(hparams, indent=2))

    def save_model(self, model, filename):
        torch.save(model.state_dict(), os.path.join(self.path, filename))

    def save_boundary(self, stage, step, p_boundary, boundary, shape):
        figure = np.zeros(shape)

        for i in range(boundary.shape[0]):
            for j, k in [(p_boundary[i][0], 0.7), (p_boundary[i][1], 0.7), (boundary[i][0], 1), (boundary[i][1], 1)]:
                try:
                    if j >= 0:
                        #print(int(100 * j), i, k)
                        figure[int(100 * j), i] = k
                except:
                    pass

        self.writer.add_image(f"{stage}/boundary", plot_alignment_to_numpy(figure), step, dataformats='HWC')

    def save_attention(self, stage, step, w1, w2):
        self.writer.add_image(f"{stage}/w1", plot_alignment_to_numpy(w1.T.data.cpu().numpy()), step, dataformats='HWC')
        self.writer.add_image(f"{stage}/w2", plot_alignment_to_numpy(w2.T.data.cpu().numpy()), step, dataformats='HWC')

In [None]:
from tqdm import tqdm

args = {
    'model': 'temp',
    'train_path': './Librispeech',
    'load_model': None,
    'name': None,
    'train_path': './LibriSpeech',
    'dev_path': './Buckeye',
    'valid_path': './Buckeye',
}

temp.strategy = 'finetune'

device = "cuda:0"

if args['model'] == 'base':
    model = NeuFA_base(hparams)
elif args['model'] == 'tep':
    model = NeuFA_TeP(hparams)
elif args['model'] == 'mep':
    model = NeuFA_MeP(hparams)
elif args['model'] == 'temp':
    model = NeuFA_TeMP(hparams)

if hparams.strategy != 'finetune':
    if 'LJSpeech' in args['train_path']:
        train_dataset = LJSpeech(args['train_path'], reduction=hparams.reduction_rate)
    elif 'LibriSpeech' in args['train_path']:
        train_dataset = LibriSpeech(args['train_path'], reduction=hparams.reduction_rate)
    train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=hparams.batch_size, shuffle=True, collate_fn=Collate(device), drop_last=True)

if hparams.strategy != 'pretrain':
    other_set, _, test_set, test_setp = train_test_split(dataset.wavs, './Buckeye')
    train_set, train_setp, valid_set, valid_setp = train_test_split(other_set.wavs, './Buckeye')
    print(train_set.reduction, valid_set.reduction)
    dev_dataloader = torch.utils.data.DataLoader(train_set, batch_size=hparams.batch_size, shuffle=True, collate_fn=Collate(device), drop_last=True)
    valid_dataloader = torch.utils.data.DataLoader(valid_set, batch_size=hparams.batch_size, shuffle=True, collate_fn=Collate(device), drop_last=True)

if hparams.strategy == 'semi2':
    dev_dataset2 = BuckeyePhoneme(args['dev_path'], reduction=hparams.reduction_rate)
    dev_dataloader2 = torch.utils.data.DataLoader(dev_dataset, batch_size=hparams.batch_size, shuffle=True, collate_fn=Collate(device), drop_last=True)
    valid_dataset2 = BuckeyePhoneme(args['valid_path'], reduction=hparams.reduction_rate)
    valid_dataloader2 = torch.utils.data.DataLoader(valid_dataset, batch_size=hparams.batch_size, shuffle=True, collate_fn=Collate(device), drop_last=True)

if args['load_model']:
    #model_dict = model.state_dict()
    #state_dict = torch.load(args.load_model)
    #state_dict = {k: v for k, v in state_dict.items() if not k.startswith('aligner.')}
    #model_dict.update(state_dict)
    #model.load_state_dict(model_dict)
    past_state = torch.load(args['load_model'], map_location='cpu')
    model.load_state_dict(past_state['model_state_dict'])
    save = past_state['train_loss_history']
    past_epoch = past_state['epoch']

model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=hparams.learning_rate, betas=(0.9, 0.999), eps=1e-08, weight_decay=0, amsgrad=False)
past_epoch = 0

if args['load_model']:
    optimizer.load_state_dict(past_state['optimizer_state_dict'])

if args['name'] is None:
    args['name'] = args['model']
#else:
#    args.name = args.model + '_' + args.name
save = Save(args['name'])
save.save_parameters(hparams)

def process(model, stage, data, step, batch):
    if isinstance(model, NeuFA_base):
        predicted = model(*data[:2])
        text_loss = model.text_loss(predicted[0], data[0])
        save.writer.add_scalar(f'{stage}/text loss', text_loss, step)
        speech_loss = model.mfcc_loss(predicted[1], data[1])
        save.writer.add_scalar(f'{stage}/speech loss', speech_loss, step)
        loss = hparams.text_loss * text_loss + hparams.speech_loss * speech_loss
        if stage == 'training':
            attention_loss = model.attention_loss(*predicted[2:4], hparams.attention_loss_alpha)
            save.writer.add_scalar(f'{stage}/attention loss', attention_loss, step)
            loss += hparams.attention_loss * attention_loss
        else:
            boundary_loss = model.boundary_loss(predicted[-1], data[2])
            save.writer.add_scalar(f'{stage}/boundary loss', boundary_loss, step)
            boundaries = model.extract_boundary(predicted[-1])
            boundary_mae = model.boundary_mae(boundaries, data[2])
            save.writer.add_scalar(f'{stage}/boundary mae', boundary_mae, step)
            loss += hparams.boundary_loss * boundary_loss
    if isinstance(model, NeuFA_TeP):
        tep_loss = model.length_loss(*predicted[4:6])
        tep_mse = model.length_loss(*predicted[4:6], normalize=False)
        save.writer.add_scalar(f'{stage}/tep loss', tep_loss, step)
        save.writer.add_scalar(f'{stage}/tep rmse', torch.sqrt(tep_mse), step)
        loss += hparams.tep_loss * tep_loss
    if isinstance(model, NeuFA_MeP):
        if isinstance(model, NeuFA_TeMP):
            mep_loss = model.length_loss(*predicted[6:8])
            mep_mse = model.length_loss(*predicted[6:8], normalize=False)
        else:
            mep_loss = model.length_loss(*predicted[4:6])
            mep_mse = model.length_loss(*predicted[4:6], normalize=False)
        save.writer.add_scalar(f'{stage}/mep loss', mep_loss, step)
        save.writer.add_scalar(f'{stage}/mep rmse', torch.sqrt(mep_mse), step)
        loss += hparams.mep_loss * mep_loss
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    save.save_log(stage, epoch, batch, step, loss)
    if step % 100 == 0:
        save.save_attention(stage, step, predicted[2][0], predicted[3][0])
        if stage != 'training':
            save.save_boundary(stage, step, boundaries[0], data[2][0], predicted[2][0].shape)
    if step % 20000 == 0:
        save.save_model(model, f'{step // 1000}k')

step = 1
for epoch in range(past_epoch + 1, hparams.max_epochs):
    save.logger.info('Epoch %d', epoch)

    batch = 1

    if hparams.strategy == 'pretrain':
        for data in train_dataloader:
            process(model, 'training', data, step, batch)
            step += 1
            batch += 1
        continue

    for data in dev_dataloader:
        if hparams.strategy == 'finetune':
            process(model, 'dev', data, step, batch)
        if hparams.strategy == 'semi':
            training_data = next(iter(train_dataloader))
            process(model, 'training', training_data, step, batch)
            process(model, 'dev', data, step, batch)
        if hparams.strategy == 'semi2':
            training_data = next(iter(train_dataloader))
            dev_data2 = next(iter(dev_dataloader2))
            process(model, 'training', training_data, step, batch)
            process(model, 'dev', data, step, batch)
            process(model, 'dev2', dev_data2, step, batch)
        batch += 1
        step += 1

    if step % 20000 == 0:
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'train_loss_history': save,
        }, f'drive/MyDrive/models/neufa{epoch}.pt')

    with torch.no_grad():
        predicted = []
        all_data = []
        for data in tqdm(valid_dataloader):
            all_data.append(data)
            predicted.append(model(*data[:2]))

        data = [i for i in zip(*all_data)]
        predicted = [i for i in zip(*predicted)]

        for i in range(len(data)):
            data[i] = [k for j in data[i] for k in j]

        for i in range(len(predicted)):
            predicted[i] = [k for j in predicted[i] for k in j]

        if isinstance(model, NeuFA_base):
            text_loss = model.text_loss(predicted[0], data[0])
            save.writer.add_scalar('test/text loss', text_loss, epoch)
            speech_loss = model.mfcc_loss(predicted[1], data[1])
            save.writer.add_scalar('test/speech loss', speech_loss, epoch)
            attention_loss = model.attention_loss(*predicted[2:4], 1)
            save.writer.add_scalar('test/attention loss', attention_loss, epoch)
            loss = hparams.text_loss * text_loss + hparams.speech_loss * speech_loss + hparams.attention_loss * attention_loss
            boundary_loss = model.boundary_loss(predicted[-1], data[2])
            save.writer.add_scalar('test/boundary loss', boundary_loss, epoch)
            boundaries = model.extract_boundary(predicted[-1])
            boundary_mae = model.boundary_mae(boundaries, data[2])
            save.writer.add_scalar(f'test/boundary mae', boundary_mae, epoch)
            loss += hparams.boundary_loss * boundary_loss
        if isinstance(model, NeuFA_TeP):
            tep_loss = model.length_loss(*predicted[4:6])
            tep_mse = model.length_loss(*predicted[4:6], normalize=False)
            save.writer.add_scalar('test/tep loss', tep_loss, epoch)
            save.writer.add_scalar('test/tep rmse', torch.sqrt(tep_mse), epoch)
            loss += hparams.tep_loss * tep_loss
        if isinstance(model, NeuFA_MeP):
            if isinstance(model, NeuFA_TeMP):
                mep_loss = model.length_loss(*predicted[6:8])
                mep_mse = model.length_loss(*predicted[6:8], normalize=False)
            else:
                mep_loss = model.length_loss(*predicted[4:6])
                mep_mse = model.length_loss(*predicted[4:6], normalize=False)
            save.writer.add_scalar('test/mep loss', mep_loss, epoch)
            save.writer.add_scalar('test/mep rmse', torch.sqrt(mep_mse), epoch)
            loss += hparams.mep_loss * mep_loss
        save.save_log('test', epoch, batch, epoch, loss)
        save.save_attention('test', epoch, predicted[2][0], predicted[3][0])
        save.save_boundary('test', epoch, boundaries[0], data[2][0], predicted[2][0].shape)

11738
8802
1 1


[I 231207 10:27:34 <ipython-input-20-98a99e4fe854>:119] Epoch 1
INFO:temp-20231207-102734:Epoch 1


(201, 39)
(354, 39)
(90, 39)
(217, 39)
(88, 39)
(31, 39)
(156, 39)
(24, 39)
(207, 39)
(295, 39)
(580, 39)
(222, 39)
(131, 39)
(441, 39)
(42, 39)
(114, 39)


[I 231207 10:27:38 <ipython-input-19-862c3eeb7ff6>:38] [temp-20231207-102734] dev epoch 1 batch 1 step 1 loss 60.742783
INFO:temp-20231207-102734:[temp-20231207-102734] dev epoch 1 batch 1 step 1 loss 60.742783


(81, 39)
(98, 39)
(166, 39)
(34, 39)
(21, 39)
(244, 39)
(281, 39)
(47, 39)
(147, 39)
(169, 39)
(776, 39)


KeyboardInterrupt: ignored

### Testing/Inference

In [None]:
g2p = G2P()

args = {
    'model': 'temp',
    'train_path': './Librispeech',
    'load_model': None,
    'name': None,
    'train_path': './LibriSpeech',
    'dev_path': './drive/MyDrive/BuckeyeTrain',
    'valid_path': './drive/MyDrive/BuckeyeTrain',
}

device = "cuda:0"

if args['model'] == 'base':
    model = NeuFA_base(hparams)
elif args['model'] == 'tep':
    model = NeuFA_TeP(hparams)
elif args['model'] == 'mep':
    model = NeuFA_MeP(hparams)
elif args['model'] == 'temp':
    model = NeuFA_TeMP(hparams)

test_dataset = Buckeye(args['test_path'], reduction=hparams.reduction_rate)
test_dataset = BuckeyePhoneme(args['test_path'], reduction=hparams.reduction_rate)
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=hparams.batch_size, shuffle=False, collate_fn=Collate(device))

past_state = torch.load(args['load_model'], map_location='cpu')
model.load_state_dict(past_state['model_state_dict'])

model = model.to(device)
#optimizer = torch.optim.Adam(model.parameters(), lr=1e-4, betas=(0.9, 0.999), eps=1e-08, weight_decay=0, amsgrad=False)

with torch.no_grad():
    predicted = []
    all_data = []
    for data in tqdm(test_dataloader):
        all_data.append(data)
        predicted.append(model(*data[:2]))

    data = [i for i in zip(*all_data)]
    predicted = [i for i in zip(*predicted)]

    for i in range(len(data)):
        data[i] = [k for j in data[i] for k in j]

    for i in range(len(predicted)):
        predicted[i] = [k for j in predicted[i] for k in j]

    predicted[-1] = model.extract_boundary(predicted[-1])

    print(model.boundary_mae(predicted[-1], data[2]))
    sys.exit()

    p = [predicted[-1][i] for i in range(len(data[0])) if data[1][i].shape[0] < 1500]
    q = [data[-1][i] for i in range(len(data[0])) if data[1][i].shape[0] < 1500]
    print(1500, len(p), model.boundary_mae(p, q))
    p = [predicted[-1][i] for i in range(len(data[0])) if data[1][i].shape[0] < 1000]
    q = [data[-1][i] for i in range(len(data[0])) if data[1][i].shape[0] < 1000]
    print(1000, len(p), model.boundary_mae(p, q))
    p = [predicted[-1][i] for i in range(len(data[0])) if data[1][i].shape[0] < 500]
    q = [data[-1][i] for i in range(len(data[0])) if data[1][i].shape[0] < 500]
    print(500, len(p), model.boundary_mae(p, q))
    p = [predicted[-1][i] for i in range(len(data[0])) if data[1][i].shape[0] < 250]
    q = [data[-1][i] for i in range(len(data[0])) if data[1][i].shape[0] < 250]
    print(250, len(p), model.boundary_mae(p, q))
    p = [predicted[-1][i] for i in range(len(data[0])) if data[1][i].shape[0] < 125]
    q = [data[-1][i] for i in range(len(data[0])) if data[1][i].shape[0] < 125]
    print(125, len(p), model.boundary_mae(p, q))

    count = [0 for i in range(1001)]
    for i in tqdm(range(len(data[0]))):
        file = test_dataset.wavs[i]
        texts, boundaries, p_boundaries = data[0][i], data[2][i], predicted[-1][i]
        texts = [g2p.id2symbol[j-1] for j in texts]
        output=[]

        for j in range(len(texts)):
            output.append([])
            output[-1].append(texts[j])
            for k in range(2):
                if boundaries[j][k] == -1:
                    output[-1].append('-')
                else:
                    output[-1].append('%.4f' % boundaries[j][k])
            output[-1].append('%.4f' % p_boundaries[j][0])
            output[-1].append('%.4f' % p_boundaries[j][1])
            for k in range(2):
                if boundaries[j][k] == -1:
                    output[-1].append('-')
                else:
                    output[-1].append('%.4f' % (p_boundaries[j][k] - boundaries[j][k]))
                    try:
                        count[int(abs(p_boundaries[j][k] - boundaries[j][k]) * 1000)] += 1
                    except:
                        count[-1] += 1
            output[-1] = '\t'.join(output[-1])

        with open(file.parent / f'{file.stem}.output.txt', 'w') as f:
            f.write('\n'.join(output))

    print(10,   sum(count[:10]), sum(count[:10])/sum(count))
    print(25,   sum(count[10:25]), sum(count[:25])/sum(count))
    print(50,   sum(count[25:50]), sum(count[:50])/sum(count))
    print(100,  sum(count[50:100]), sum(count[:100])/sum(count))
    print(200,  sum(count[100:200]), sum(count[:200])/sum(count))
    print(500,  sum(count[200:500]), sum(count[:500])/sum(count))
    print(1000, sum(count[500:1000]), sum(count[:1000])/sum(count))

In [None]:
#inference class
class NeuFA:

    def __init__(self, model_path='neufa.pt', device='cpu'):
        self.device = device
        past_state = torch.load(model_path, map_location='cpu')
        self.model = NeuFA_TeMP(hparams)
        self.model.load_state_dict(past_state['model_state_dict'])
        model.to(device)
        self.model.eval()
        self.g2p = G2P()

    def fit_to_words(self, matrix, words):
        phonemes = self.get_phonemes(words)

        result = []
        start = 0
        for word, phoneme in zip(words, phonemes):
            if len(phoneme) > 0:
                result.append(np.mean(matrix[start:start+len(phoneme)], axis=0, keepdims=True))
                start += len(phoneme)
            else:
                result.append(np.zeros((1, matrix.shape[-1])))
        result = np.concatenate(result)
        return result

    def get_words(self, text):
        if os.path.exists(text):
            with open(text) as f:
                text = f.readline().strip('\r\n').lower()
        text = ''.join([i for i in text if i in "abcedfghijklmnopqrstuvwxyz' "])
        words = text.split(' ')
        words = [i for i in words if i != '']

        return words

    def get_phonemes(self, words):
        phonemes = []
        for word in words:
            phonemes += [self.g2p.convert(word)]
        for j, phoneme in enumerate(phonemes):
            phonemes[j] = [i[:-1] if i.endswith(('0', '1', '2')) else i for i in phoneme]
            phonemes[j] = [self.g2p.symbol2id[i] + 1 for i in phoneme if i in self.g2p.symbols]
        return phonemes

    def load_text(self, text):
        words = self.get_words(text)
        phonemes = self.get_phonemes(words)
        phonemes = [j for i in phonemes for j in i]
        phonemes = np.array(phonemes)
        return torch.IntTensor(phonemes).to(self.device)

    def load_wav(self, wav):
        if os.path.exists(wav):
            wav, sample_rate = librosa.load(wav, mono=True)
        mfcc = librosa.feature.mfcc(y=wav, sr=sample_rate, n_mfcc=13, hop_length=int(sample_rate/100), n_fft=int(sample_rate/40), fmax=8000)
        delta = librosa.feature.delta(mfcc, width=3, order=1)
        delta2 = librosa.feature.delta(mfcc, width=3, order=2)
        mfcc = np.concatenate([mfcc, delta, delta2]).T.astype(np.float32)
        mean = mfcc.mean(axis=0, keepdims=False)
        std = mfcc.std(axis=0, keepdims=False)
        mfcc -= mean
        mfcc /= std
        return torch.FloatTensor(mfcc).to(self.device)

    def extract_boundary(self, p_boundaries, threshold=0.5):
        result = []
        for p_boundary in p_boundaries:
            result.append([])
            result[-1].append(np.array([i[i<threshold].shape[0] / 100 for i in p_boundary[:,0,:]]))
            result[-1].append(np.array([i[i<threshold].shape[0] / 100 for i in p_boundary[:,1,:]]))
            result[-1] = np.stack(result[-1], axis=-1)
        return result

    def align(self, text, wav):
        text = [self.load_text(text)]
        wav = [self.load_wav(wav)]
        with torch.no_grad():
            _, _, w1, w2, _, _, _, _, boundaries = self.model(text, wav)
            boundaries = self.extract_boundary(boundaries)
        return boundaries[0], w1[0].cpu().numpy(), w2[0].cpu().numpy()

In [None]:
args = {
    'model': 'temp',
    'train_path': './Librispeech',
    'load_model': None,
    'name': None,
    'train_path': './LibriSpeech',
    'dev_path': './drive/MyDrive/BuckeyeTrain',
    'valid_path': './drive/MyDrive/BuckeyeTrain',
    'gpu': -1,
    'input_text': None,
    'input_wav': None,
    'input_folder': None,
}

if args['gpu'] < 0:
    neufa = NeuFA()
else:
    neufa = NeuFA(device=f'cuda:{args["gpu"]}')

if args['input_folder']:
    texts = [i for i in Path(args['input_folder']).rglob('*.txt')]
    for text in tqdm(texts):
        wav = text.parent / f'{text.stem}.wav'
        words = neufa.get_words(text)
        boundaries, w_tts, w_asr = neufa.align(text, wav)
        #np.save(text.parent / f'{text.stem}.boundary.npy', boundaries)
        np.save(text.parent / f'{text.stem}.wasr.npy', neufa.fit_to_words(w_asr, words))
        #np.save(text.parent / f'{text.stem}.wtts.npy', w_tts)
else:
    boundaries, w1, w2 = neufa.align(args['input_text'], args['input_wav'])
    words = neufa.get_words(args['input_text'])
    phonemes = neufa.get_phonemes(words)
    start = 0
    for word, phoneme in zip(words, phonemes):
        if len(phoneme) > 0:
            #l = np.min(boundaries[start:start+len(phoneme)])
            #r = np.max(boundaries[start:start+len(phoneme)])
            l = boundaries[start, 0]
            r = boundaries[start+len(phoneme) - 1, 1]
            t = r - l
            print(word, l, r, '%.2f' % t)
        else:
            print(word, '-', '-')

        for p, boundary in zip(phoneme, boundaries[start:start+len(phoneme)]):
            print(neufa.g2p.id2symbol[p-1], boundary)

        start += len(phoneme)