In [None]:
## Github[preprocessor.py]: https://github.com/ming024/FastSpeech2/blob/master/preprocessor/preprocessor.py
## Github[config][preprocess]: https://github.com/ming024/FastSpeech2/blob/master/config/LibriTTS/preprocess.yaml

In [None]:
# dataset: "LibriTTS"

# path:
#   corpus_path: "/home/ming/Data/LibriTTS/train-clean-360"
#   lexicon_path: "lexicon/librispeech-lexicon.txt"
#   raw_path: "./raw_data/LibriTTS"
#   preprocessed_path: "./preprocessed_data/LibriTTS"

# preprocessing:
#   val_size: 512
#   text:
#     text_cleaners: ["english_cleaners"]
#     language: "en"
#   audio:
#     sampling_rate: 22050
#     max_wav_value: 32768.0
#   stft:
#     filter_length: 1024
#     hop_length: 256
#     win_length: 1024
#   mel:
#     n_mel_channels: 80
#     mel_fmin: 0
#     mel_fmax: 8000 # please set to 8000 for HiFi-GAN vocoder, set to null for MelGAN vocoder
#   pitch:
#     feature: "phoneme_level" # support 'phoneme_level' or 'frame_level'
#     normalization: True
#   energy:
#     feature: "phoneme_level" # support 'phoneme_level' or 'frame_level'
#     normalization: True

## import

In [1]:
import os
from pathlib import Path

import tgt
import numpy as np
import pandas as pd
import pyworld as pw
# import matplotlib.pyplot as plt

import librosa
import librosa.display
import IPython.display as ipd

from scipy.interpolate import interp1d
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

# import audio as Audio
from stft import *


## Load Data

In [2]:
df = pd.read_csv('./train_part_20k_german.csv')
print(df.shape)
df.head()

(200000, 6)


Unnamed: 0,file_id,speaker_id,chapter_id,utterance_id,sentence,audio_path
0,10087_10388_000000,10087,10388,0,in deutschland protestierten friedrich wilhelm...,/data/speech-data/mls/mls_german_opus/train/au...
1,10087_10388_000001,10087,10388,1,bald feuerwerk bald florett bald knotenstock u...,/data/speech-data/mls/mls_german_opus/train/au...
2,10087_10388_000002,10087,10388,2,oder von stefan zweigs kultiviertem kosmopolit...,/data/speech-data/mls/mls_german_opus/train/au...
3,10087_10388_000003,10087,10388,3,und nun rührt es sich überall in deutschland s...,/data/speech-data/mls/mls_german_opus/train/au...
4,10087_10388_000004,10087,10388,4,ich weine weil man so allein ist man kann nich...,/data/speech-data/mls/mls_german_opus/train/au...


In [3]:
df2 = pd.read_csv("temp_train_german_tg.csv")
print(df2.shape)
df2.head()

(468823, 5)


Unnamed: 0,file_id,speaker_id,chapter_id,utterance_id,tg_path
0,10087_10388_000000,10087,10388,0,/data/speech-data/mls-align/mls_german_opus/tr...
1,10087_10388_000001,10087,10388,1,/data/speech-data/mls-align/mls_german_opus/tr...
2,10087_10388_000002,10087,10388,2,/data/speech-data/mls-align/mls_german_opus/tr...
3,10087_10388_000003,10087,10388,3,/data/speech-data/mls-align/mls_german_opus/tr...
4,10087_10388_000004,10087,10388,4,/data/speech-data/mls-align/mls_german_opus/tr...


In [None]:
# df = pd.concat([df, df2.tg_path], axis =1)
# print(df.shape)
# df.head()

## Sample

In [4]:
idx = 0
audio_path = df.audio_path.values[idx]
sentence = df.sentence.values[idx]
sample_tg_path = df2.tg_path.values[idx]

print(f"Audio Path: {audio_path}")
print(f"SENTENCE: {sentence}")
print()
sample_rate = 22050

audio, sr  = librosa.load(audio_path, sr= None)
## returned sr is 'orig_sr' of sound
print("ORIGINAL SAMPLE RATE: ", sr)
print("TARGET SAMPLE RATE", sample_rate)
print()
print(f"TG_PATH: {sample_tg_path}")
print(f"SENTENCE: {sentence}")

## SAMPLE
audio = librosa.resample(audio, orig_sr= sr, target_sr= sample_rate )
ipd.display(ipd.Audio(audio, rate = sample_rate))

Audio Path: /data/speech-data/mls/mls_german_opus/train/audio/10087/10388/10087_10388_000000.opus
SENTENCE: in deutschland protestierten friedrich wilhelm foerster der pädagoge und der arzt nicolai in frankreich erhebt seine stimme vom gebrüll der kriegspropaganda umtobt

ORIGINAL SAMPLE RATE:  16000
TARGET SAMPLE RATE 22050

TG_PATH: /data/speech-data/mls-align/mls_german_opus/train/10087/10087_10388_000000.TextGrid
SENTENCE: in deutschland protestierten friedrich wilhelm foerster der pädagoge und der arzt nicolai in frankreich erhebt seine stimme vom gebrüll der kriegspropaganda umtobt


In [5]:
sample_tg_path, audio_path, sentence

('/data/speech-data/mls-align/mls_german_opus/train/10087/10087_10388_000000.TextGrid',
 '/data/speech-data/mls/mls_german_opus/train/audio/10087/10388/10087_10388_000000.opus',
 'in deutschland protestierten friedrich wilhelm foerster der pädagoge und der arzt nicolai in frankreich erhebt seine stimme vom gebrüll der kriegspropaganda umtobt')

# Preprocessor Class
## `def __init__(self, config):`

In [213]:
out_dir

'/home/heiscold/prac/'

In [6]:
# config = config
# in_dir = # "./raw_data/LibriTTS" # config["path"]["raw_path"]
# out_dir = # "./preprocessed_data/LibriTTS" # config["path"]["preprocessed_path"]
val_size = 512 #config["preprocessing"]["val_size"]
sampling_rate = 22050 #config["preprocessing"]["audio"]["sampling_rate"]
hop_length = 1024 #config["preprocessing"]["stft"]["hop_length"]

In [7]:
# assert config["preprocessing"]["pitch"]["feature"] in [
#     "phoneme_level",
#     "frame_level",
# ]
# assert config["preprocessing"]["energy"]["feature"] in [
#     "phoneme_level",
#     "frame_level",
# ]

pitch_phoneme_averaging = (
   True # config["preprocessing"]["pitch"]["feature"] == "phoneme_level"
)
energy_phoneme_averaging = (
   True, # config["preprocessing"]["energy"]["feature"] == "phoneme_level"
)

pitch_normalization = True  #config["preprocessing"]["pitch"]["normalization"]
energy_normalization = True #config["preprocessing"]["energy"]["normalization"]

pitch_phoneme_averaging, energy_phoneme_averaging, pitch_normalization, energy_normalization

(True, (True,), True, True)

In [8]:
### STFT 실험
STFT = TacotronSTFT(
    1024, #config["preprocessing"]["stft"]["filter_length"],
    1024, #config["preprocessing"]["stft"]["hop_length"],
    1024, #config["preprocessing"]["stft"]["win_length"],
    80, # config["preprocessing"]["mel"]["n_mel_channels"],
    22050, #config["preprocessing"]["audio"]["sampling_rate"],
    0, #config["preprocessing"]["mel"]["mel_fmin"],
    8000, #config["preprocessing"]["mel"]["mel_fmax"],
)

audio = torch.tensor(audio /(np.max(audio)+1), dtype = torch.float).view(1, -1)
out = STFT.stft_fn(audio)
out ## torch.Size([1, 1, 253952])

(80, 513)


tensor([[[ 2.3345e-06,  2.1004e-06, -5.8281e-06,  ...,  1.8575e-02,
           2.5053e-02,  3.7650e-02]]])

## `def build_from_path(self)`

In [9]:
!pwd

/home/heiscold/prac


In [10]:
import os

out_dir ='/home/heiscold/prac/'

# os.makedirs((os.path.join(self.out_dir, "mel")), exist_ok=True)
# os.makedirs((os.path.join(self.out_dir, "pitch")), exist_ok=True)
# os.makedirs((os.path.join(self.out_dir, "energy")), exist_ok=True)
# os.makedirs((os.path.join(self.out_dir, "duration")), exist_ok=True)

In [11]:
print("Processing Data ...")
out = list() ## LIST
n_frames = 0
pitch_scaler = StandardScaler()
energy_scaler = StandardScaler()

Processing Data ...


In [12]:
sample_tg_path, audio_path

('/data/speech-data/mls-align/mls_german_opus/train/10087/10087_10388_000000.TextGrid',
 '/data/speech-data/mls/mls_german_opus/train/audio/10087/10388/10087_10388_000000.opus')

In [149]:
lang = 'german'
tg_base_path = f"/data/speech-data/mls-align/mls_{lang}_opus/train/"
speaker = '10087'
basename ='10087_10388_000000'
tg_path = os.path.join(
    tg_base_path,  speaker, "{}.TextGrid".format(basename)
)


tg_path

'/data/speech-data/mls-align/mls_german_opus/train/10087/10087_10388_000000.TextGrid'

In [150]:
os.path.exists(tg_path)

True

In [157]:
df.head()

Unnamed: 0,file_id,speaker_id,chapter_id,utterance_id,sentence,audio_path
0,10087_10388_000000,10087,10388,0,in deutschland protestierten friedrich wilhelm...,/data/speech-data/mls/mls_german_opus/train/au...
1,10087_10388_000001,10087,10388,1,bald feuerwerk bald florett bald knotenstock u...,/data/speech-data/mls/mls_german_opus/train/au...
2,10087_10388_000002,10087,10388,2,oder von stefan zweigs kultiviertem kosmopolit...,/data/speech-data/mls/mls_german_opus/train/au...
3,10087_10388_000003,10087,10388,3,und nun rührt es sich überall in deutschland s...,/data/speech-data/mls/mls_german_opus/train/au...
4,10087_10388_000004,10087,10388,4,ich weine weil man so allein ist man kann nich...,/data/speech-data/mls/mls_german_opus/train/au...


In [165]:
for i, row in df.iterrows():
    print(f"i :{i}")
    # print(f"ROW: {row}")
        
    speaker = str(row['speaker_id'])
    print(f"SPEAKER: {speaker}")
    speakers[speaker] = i ## SPEAKER_ID: STR # i = index number
    wav_name = row['audio_path']
    print(f"WAV NAME {wav_name}")
    # basename = wav_name.split(".")[0] ## This looks like file_id
    basename = row['file_id']
    print(f"BASENAME {basename}")
    print()
    break

i :0
SPEAKER: 10087
WAV NAME /data/speech-data/mls/mls_german_opus/train/audio/10087/10388/10087_10388_000000.opus
BASENAME 10087_10388_000000



In [166]:
tg_path = os.path.join(
    tg_base_path, speaker, "{}.TextGrid".format(basename)
)

tg_path 

'/data/speech-data/mls-align/mls_german_opus/train/10087/10087_10388_000000.TextGrid'

In [197]:
# Compute pitch, energy, duration, and mel-spectrogram
speakers = {}
lang = 'german'
tg_base_path = f"/data/speech-data/mls-align/mls_{lang}_opus/train/"

# for i, speaker in enumerate(tqdm(os.listdir(in_dir))):
for i, row in df.iterrows():
    
    speaker = str(row['speaker_id'])
    speakers[speaker] = i ## SPEAKER_ID: STR # i = index number
    wav_name = row['audio_path']
    # basename = wav_name.split(".")[0] ## This looks like file_id
    basename = row['file_id']
    # tg_path = os.path.join(
    #     self.out_dir, "TextGrid", speaker, "{}.TextGrid".format(basename)
    # )
    tg_path = os.path.join(
        tg_base_path, speaker, "{}.TextGrid".format(basename)
    )
    # print(basename, speaker) 
    # 10087_10388_000000 10087

    # print(tg_path, wav_name) 
    # /data/speech-data/mls-align/mls_german_opus/train/10087/10087_10388_000000.TextGrid /
    # # data/speech-data/mls/mls_german_opus/train/audio/10087/10388/10087_10388_000000.opus
    # break

    if os.path.exists(tg_path): ## True
        ret = process_utterance(speaker, basename, wav_name, tg_path)
        if ret is None:
            continue
        else:
            info, pitch, energy, n = ret
        out.append(info)

    if len(pitch) > 0:
        pitch_scaler.partial_fit(pitch.reshape((-1, 1)))
    if len(energy) > 0:
        energy_scaler.partial_fit(energy.reshape((-1, 1)))

    n_frames += n
    if i == 2:
        break
   

In [198]:
n_frames

901

In [199]:
out[0]

'10087_10388_000000|10087|{ɪ n d ɔʏ tʃ l a n t p ʁ ɔ tʰ ɛ s t iː ɐ tʰ n̩ f ʁ iː d ʁ ɪ ç v ɪ l h ɛ l m f ɔ ɐ s t ɐ d eː ʁ pʰ ɛ d a ɡ oː ɡ ə ʊ n d eː ʁ aː ɐ ts t n ɪ kʰ ɔ l aj ɪ n f ʁ a ŋ k ʁ aj ç ɛ ɐ h eː p t z aj n ə ʃ t ɪ m ə f oː m ɡ ə b ʁ ʏ l d eː ʁ k ʁ iː k s p ʁ ɔ pʰ a ɡ a n d a ʊ m tʰ oː p t}|in deutschland protestierten friedrich wilhelm foerster der pädagoge und der arzt nicolai in frankreich erhebt seine stimme vom gebrüll der kriegspropaganda umtobt'

In [168]:
tg_path
## /data/speech-data/mls-align/mls_german_opus/train/10087/10087_10388_000000.TextGrid

'/data/speech-data/mls-align/mls_german_opus/train/10087/10087_10388_000000.TextGrid'

In [169]:
os.path.exists(tg_path)

True

In [170]:
out

[]

In [181]:
speaker, basename, wav_name, tg_path

('10087',
 '10087_10388_000000',
 '/data/speech-data/mls/mls_german_opus/train/audio/10087/10388/10087_10388_000000.opus',
 '/data/speech-data/mls-align/mls_german_opus/train/10087/10087_10388_000000.TextGrid')

In [189]:
ret1 = process_utterance(speaker, basename, wav_name, tg_path)

In [194]:
# ret1 ## WOrks

In [200]:
print("Computing statistic quantities ...")
# Perform normalization if necessary
if pitch_normalization:
    pitch_mean = pitch_scaler.mean_[0]
    pitch_std = pitch_scaler.scale_[0]
else:
    # A numerical trick to avoid normalization...
    pitch_mean = 0
    pitch_std = 1
if energy_normalization:
    energy_mean = energy_scaler.mean_[0]
    energy_std = energy_scaler.scale_[0]
else:
    energy_mean = 0
    energy_std = 1

Computing statistic quantities ...


In [203]:
pitch_min, pitch_max = normalize(
    os.path.join(out_dir, "pitch"), pitch_mean, pitch_std
)
print(pitch_min, pitch_max )
energy_min, energy_max = normalize(
    os.path.join(out_dir, "energy"), energy_mean, energy_std
)
print(energy_min, energy_max)

-3.8276065196789077 5.509731732399673
-1.1317514 4.9153857


In [206]:
import os
import random
import json


In [207]:

# Save files
with open(os.path.join(out_dir, "speakers.json"), "w") as f:
    f.write(json.dumps(speakers))

with open(os.path.join(out_dir, "stats.json"), "w") as f:
    stats = {
        "pitch": [
            float(pitch_min),
            float(pitch_max),
            float(pitch_mean),
            float(pitch_std),
        ],
        "energy": [
            float(energy_min),
            float(energy_max),
            float(energy_mean),
            float(energy_std),
        ],
    }
    f.write(json.dumps(stats))


In [208]:
print(
    "Total time: {} hours".format(
        n_frames * hop_length / sampling_rate / 3600
    )
)

random.shuffle(out)
out = [r for r in out if r is not None]

Total time: 0.011622877299067776 hours


In [209]:

# Write metadata
with open(os.path.join(out_dir, "train.txt"), "w", encoding="utf-8") as f:
    for m in out[val_size :]:
        f.write(m + "\n")
with open(os.path.join(out_dir, "val.txt"), "w", encoding="utf-8") as f:
    for m in out[: val_size]:
        f.write(m + "\n")


In [211]:
# val_size

In [None]:
def build_from_path(self):
    os.makedirs((os.path.join(self.out_dir, "mel")), exist_ok=True)
    os.makedirs((os.path.join(self.out_dir, "pitch")), exist_ok=True)
    os.makedirs((os.path.join(self.out_dir, "energy")), exist_ok=True)
    os.makedirs((os.path.join(self.out_dir, "duration")), exist_ok=True)

    print("Processing Data ...")
    out = list()
    n_frames = 0
    pitch_scaler = StandardScaler()
    energy_scaler = StandardScaler()

    # Compute pitch, energy, duration, and mel-spectrogram
    speakers = {}
    lang = 'german'
    tg_base_path = f"/data/speech-data/mls-align/mls_{lang}_opus/train/"

    # for i, speaker in enumerate(tqdm(os.listdir(in_dir))):
    for i, row in self.df.iterrows():
        
        speaker = str(row['speaker_id'])
        speakers[speaker] = i ## SPEAKER_ID: STR # i = index number
        wav_name = row['audio_path']
        # basename = wav_name.split(".")[0] ## This looks like file_id
        basename = row['file_id']
        # tg_path = os.path.join(
        #     self.out_dir, "TextGrid", speaker, "{}.TextGrid".format(basename)
        # )
        tg_path = os.path.join(
            tg_base_path, speaker, "{}.TextGrid".format(basename)
        )
        # print(basename, speaker) 
        # 10087_10388_000000 10087

        # print(tg_path, wav_name) 
        # /data/speech-data/mls-align/mls_german_opus/train/10087/10087_10388_000000.TextGrid /
        # # data/speech-data/mls/mls_german_opus/train/audio/10087/10388/10087_10388_000000.opus
        # break

    if os.path.exists(tg_path): ## True
        ret = self.process_utterance(speaker, basename, wav_name, tg_path)
        if ret is None:
            continue
        else:
            info, pitch, energy, n = ret
        out.append(info)

    if len(pitch) > 0:
        pitch_scaler.partial_fit(pitch.reshape((-1, 1)))
    if len(energy) > 0:
        energy_scaler.partial_fit(energy.reshape((-1, 1)))

    n_frames += n
    # if i == 2:
    #     break
   
    print("Computing statistic quantities ...")
    # Perform normalization if necessary
    if self.pitch_normalization:
        pitch_mean = pitch_scaler.mean_[0]
        pitch_std = pitch_scaler.scale_[0]
    else:
        # A numerical trick to avoid normalization...
        pitch_mean = 0
        pitch_std = 1
    if self.energy_normalization:
        energy_mean = energy_scaler.mean_[0]
        energy_std = energy_scaler.scale_[0]
    else:
        energy_mean = 0
        energy_std = 1

    pitch_min, pitch_max = self.normalize(
        os.path.join(self.out_dir, "pitch"), pitch_mean, pitch_std
    )
    energy_min, energy_max = self.normalize(
        os.path.join(self.out_dir, "energy"), energy_mean, energy_std
    )

    # Save files
    with open(os.path.join(self.out_dir, "speakers.json"), "w") as f:
        f.write(json.dumps(speakers))

    with open(os.path.join(self.out_dir, "stats.json"), "w") as f:
        stats = {
            "pitch": [
                float(pitch_min),
                float(pitch_max),
                float(pitch_mean),
                float(pitch_std),
            ],
            "energy": [
                float(energy_min),
                float(energy_max),
                float(energy_mean),
                float(energy_std),
            ],
        }
        f.write(json.dumps(stats))

    print(
        "Total time: {} hours".format(
            n_frames * self.hop_length / self.sampling_rate / 3600
        )
    )

    random.shuffle(out)
    out = [r for r in out if r is not None]

    # Write metadata
    with open(os.path.join(self.out_dir, "train.txt"), "w", encoding="utf-8") as f:
        for m in out[self.val_size :]:
            f.write(m + "\n")
    with open(os.path.join(self.out_dir, "val.txt"), "w", encoding="utf-8") as f:
        for m in out[: self.val_size]:
            f.write(m + "\n")

    return out

## `   def process_utterance(self, speaker, basename):`

In [34]:
# def process_utterance(self, speaker, basename):

# wav_path = os.path.join(self.in_dir, speaker, "{}.wav".format(basename))
# text_path = os.path.join(self.in_dir, speaker, "{}.lab".format(basename))
# tg_path = os.path.join(self.out_dir, "TextGrid", speaker, "{}.TextGrid".format(basename) )

print(tg_path)
print(wav_name)
basename, speaker    # 10087_10388_000000 10087

/data/speech-data/mls-align/mls_german_opus/train/10087/10087_10388_000000.TextGrid
/data/speech-data/mls/mls_german_opus/train/audio/10087/10388/10087_10388_000000.opus


('10087_10388_000000', '10087')

In [35]:
df2.tg_path.values[0] == tg_path

True

In [36]:
# Get alignments
textgrid = tgt.io.read_textgrid(tg_path)
textgrid

<tgt.core.TextGrid at 0x7fe968057370>

In [59]:
phone, duration, start, end = get_alignment(
    textgrid.get_tier_by_name("phones")
)
# phone, duration, start, end 
start , end
# #  (124(=길이), 124(=길이), 0.49, 10.58)

(0.49, 10.58)

In [61]:
text = "{" + " ".join(phone) + "}"
# print(text)
# {ɪ n d ɔʏ tʃ l a n t p ʁ ɔ tʰ ɛ s t iː ɐ tʰ n̩ f ʁ iː d ʁ ɪ ç v ɪ l h ɛ l m f ɔ ɐ s t ɐ d eː ʁ pʰ ɛ d a ɡ oː ɡ ə ʊ n d eː ʁ aː ɐ ts t n ɪ kʰ ɔ l aj ɪ n f ʁ a ŋ k ʁ aj ç ɛ ɐ h eː p t z aj n ə ʃ t ɪ m ə f oː m ɡ ə b ʁ ʏ l d eː ʁ k ʁ iː k s p ʁ ɔ pʰ a ɡ a n d a ʊ m tʰ oː p t}
# if start >= end:
#     return None

{ɪ n d ɔʏ tʃ l a n t p ʁ ɔ tʰ ɛ s t iː ɐ tʰ n̩ f ʁ iː d ʁ ɪ ç v ɪ l h ɛ l m f ɔ ɐ s t ɐ d eː ʁ pʰ ɛ d a ɡ oː ɡ ə ʊ n d eː ʁ aː ɐ ts t n ɪ kʰ ɔ l aj ɪ n f ʁ a ŋ k ʁ aj ç ɛ ɐ h eː p t z aj n ə ʃ t ɪ m ə f oː m ɡ ə b ʁ ʏ l d eː ʁ k ʁ iː k s p ʁ ɔ pʰ a ɡ a n d a ʊ m tʰ oː p t}


In [63]:
# Read and trim wav files
wav_path = wav_name
wav, _ = librosa.load(wav_path, sr = None) # (184800,)
print(wav.shape) 
wav = wav[
    int(sampling_rate * start) : int(sampling_rate * end)
].astype(np.float32) # (173996,)
print(wav.shape)

(184800,)
(173996,)


In [65]:
##Read raw text --> We got sentence
# with open(text_path, "r") as f:
#     raw_text = f.readline().strip("\n")
raw_text = sentence.strip("\n")
raw_text

'in deutschland protestierten friedrich wilhelm foerster der pädagoge und der arzt nicolai in frankreich erhebt seine stimme vom gebrüll der kriegspropaganda umtobt'

In [68]:
# Compute fundamental frequency
## pw.dio: raw pitch extractor
pitch, t = pw.dio(
    wav.astype(np.float64),
    sampling_rate,
    frame_period=hop_length / sampling_rate * 1000, ## ms?
)
print(len(t), pitch.shape) # 170, (170,)
# pw.stonemask : # pitch refinement
pitch = pw.stonemask(wav.astype(np.float64), pitch, t, sampling_rate)
print(pitch.shape) # (170,)
pitch = pitch[: sum(duration)]
print( pitch.shape) # (170,)

# if np.sum(pitch != 0) <= 1:
#     return None

170 (170,)
(170,)
(170,)


In [70]:
import numpy as np
from scipy.io.wavfile import write


def get_mel_from_wav(audio, _stft):
    audio = torch.clip(torch.FloatTensor(audio).unsqueeze(0), -1, 1)
    audio = torch.autograd.Variable(audio, requires_grad=False)
    melspec, energy = _stft.mel_spectrogram(audio)
    melspec = torch.squeeze(melspec, 0).numpy().astype(np.float32)
    energy = torch.squeeze(energy, 0).numpy().astype(np.float32)

    return melspec, energy

def griffin_lim(magnitudes, stft_fn, n_iters=30):
    """
    PARAMS
    ------
    magnitudes: spectrogram magnitudes
    stft_fn: STFT class with transform (STFT) and inverse (ISTFT) methods
    """

    angles = np.angle(np.exp(2j * np.pi * np.random.rand(*magnitudes.size())))
    angles = angles.astype(np.float32)
    angles = torch.autograd.Variable(torch.from_numpy(angles))
    signal = stft_fn.inverse(magnitudes, angles).squeeze(1)

    for i in range(n_iters):
        _, angles = stft_fn.transform(signal)
        signal = stft_fn.inverse(magnitudes, angles).squeeze(1)
    return signal

In [72]:
# Compute mel-scale spectrogram and energy
# mel_spectrogram, energy = Audio.tools.get_mel_from_wav(wav, self.STFT)
mel_spectrogram, energy = get_mel_from_wav(wav, STFT)
print(mel_spectrogram.shape, energy.shape) # Shape/Length: (80, 170) (170,)

mel_spectrogram = mel_spectrogram[:, : sum(duration)]
energy = energy[: sum(duration)]
print(mel_spectrogram.shape, energy.shape, len(duration)) # Shape/Length: (80, 170) (170,), 124

(80, 170) (170,)
(80, 170) (170,) 124


In [95]:
sum(duration)

188

In [78]:
print(pitch.shape) # 170
np.where(pitch!=0)[0].shape # 67

(170,)


(67,)

In [80]:
pitch[np.where(pitch != 0)[0]].shape
pitch[np.where(pitch != 0)[0][0]], pitch[np.where(pitch != 0)[0][-1]]

(68.63053143749191, 278.22930664321774)

In [81]:
# if self.pitch_phoneme_averaging:
if pitch_phoneme_averaging: # True
    # perform linear interpolation
    nonzero_ids = np.where(pitch != 0)[0] # 170 -> 67
    # interp1d : https://docs.scipy.org/doc/scipy/reference/generated/scipy.interpolate.interp1d.html
    interp_fn = interp1d(
        nonzero_ids, # 67
        pitch[nonzero_ids], # 67
        fill_value=(pitch[nonzero_ids[0]], pitch[nonzero_ids[-1]]), # (68.63053143749191, 278.22930664321774)
        bounds_error=False, 
        # kind = 'linear' # Default is ‘linear’.
    )
    pitch = interp_fn(np.arange(0, len(pitch)))

    # Phoneme-level average
    pos = 0
    for i, d in enumerate(duration):
        if d > 0:
            pitch[i] = np.mean(pitch[pos : pos + d])
        else:
            pitch[i] = 0
        pos += d
    pitch = pitch[: len(duration)]

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


In [82]:
# if self.energy_phoneme_averaging:
if energy_phoneme_averaging: # True
    # Phoneme-level average
    pos = 0
    for i, d in enumerate(duration):
        if d > 0:
            energy[i] = np.mean(energy[pos : pos + d])
        else:
            energy[i] = 0
        pos += d
    energy = energy[: len(duration)]

  ret = ret.dtype.type(ret / rcount)


In [84]:
# print(out_dir)

# Save files
dur_filename = "{}-duration-{}.npy".format(speaker, basename)
np.save(os.path.join(out_dir, "duration", dur_filename), duration)

# pitch_filename = "{}-pitch-{}.npy".format(speaker, basename)
# np.save(os.path.join(self.out_dir, "pitch", pitch_filename), pitch)

# energy_filename = "{}-energy-{}.npy".format(speaker, basename)
# np.save(os.path.join(self.out_dir, "energy", energy_filename), energy)

# mel_filename = "{}-mel-{}.npy".format(speaker, basename)
# np.save(
#     os.path.join(self.out_dir, "mel", mel_filename),
#     mel_spectrogram.T,
# )


In [104]:
values = pitch
if np.isnan(values).any():
    p25 = np.nanpercentile(values, 25)
    p75 = np.nanpercentile(values, 75)
# p25 = np.percentile(values, 25)
# p75 = np.percentile(values, 75)
print(p25, p75)
lower = p25 - 1.5 * (p75 - p25)
upper = p75 + 1.5 * (p75 - p25)
normal_indices = np.logical_and(values > lower, values < upper)
# values[normal_indices]

259.8190095397955 321.36566791364186


In [115]:

# "|".join([basename, speaker, text, raw_text]),
# # return (
#     "|".join([basename, speaker, text, raw_text]),
#     self.remove_outlier(pitch),
#     self.remove_outlier(energy),
#     mel_spectrogram.shape[1],
# )

remove_outlier(pitch).shape, remove_outlier(energy).shape, mel_spectrogram.shape[1],

((94,), (112,), 170)

In [188]:
def process_utterance(speaker, basename, wav_name, tg_path):
    wav_path = wav_name
    # wav_path = os.path.join(self.in_dir, speaker, "{}.wav".format(basename))
    # text_path = os.path.join(self.in_dir, speaker, "{}.lab".format(basename))
    # tg_path = os.path.join(
    #     self.out_dir, "TextGrid", speaker, "{}.TextGrid".format(basename)
    # )

    # Get alignments
    textgrid = tgt.io.read_textgrid(tg_path)
    phone, duration, start, end = get_alignment(
        textgrid.get_tier_by_name("phones")
    ) ## (124(=길이), 124(=길이), 0.49, 10.58)

    ## TEXT = {ɪ n d ɔʏ tʃ l a n t p ʁ ɔ tʰ ɛ s t iː ɐ tʰ n̩ ...
    text = "{" + " ".join(phone) + "}"
    if start >= end:
        return None

    # Read and trim wav files
    wav, _ = librosa.load(wav_path) ## numpy # (184800,)
    wav = wav[ ## sampling_rate = 22050
        int(sampling_rate * start) : int(sampling_rate * end)
    ].astype(np.float32) # (173996,)

    # Read raw text
    # with open(text_path, "r") as f:
    #     raw_text = f.readline().strip("\n")
    raw_text = sentence.strip("\n")

    # Compute fundamental frequency
    # pw: https://github.com/JeremyCCHsu/Python-Wrapper-for-World-Vocoder
    pitch, t = pw.dio( ## raw pitch extractor
        wav.astype(np.float64),
        sampling_rate, ## 22050
        frame_period=hop_length / sampling_rate * 1000, ## frame_perid = 1024/ 22050 * 1000 # ms
    ) 
    ## pitch, t Shape = # 170, (170,)
    ## sum(duration) = 188
    ## pitch refinement
    pitch = pw.stonemask(wav.astype(np.float64), pitch, t, sampling_rate)

    ## pitch
    pitch = pitch[: sum(duration)] # (170,)
    if np.sum(pitch != 0) <= 1:
        return None

    # Compute mel-scale spectrogram and energy
    # mel_spectrogram, energy = Audio.tools.get_mel_from_wav(wav, self.STFT)
    mel_spectrogram, energy = get_mel_from_wav(wav, STFT)
    # Shape/Length: (80, 170) (170,)
    mel_spectrogram = mel_spectrogram[:, : sum(duration)]
    energy = energy[: sum(duration)]
    # Shape/Length: (80, 170) (170,), 124


    if pitch_phoneme_averaging: # True
        # perform linear interpolation
        nonzero_ids = np.where(pitch != 0)[0] # 170 -> 67
        # interp1d : https://docs.scipy.org/doc/scipy/reference/generated/scipy.interpolate.interp1d.html
        interp_fn = interp1d(
            nonzero_ids, # 67
            pitch[nonzero_ids], # 67
            fill_value=(pitch[nonzero_ids[0]], pitch[nonzero_ids[-1]]), # (68.63053143749191, 278.22930664321774)
            bounds_error=False, 
            # kind = 'linear' # Default is ‘linear’.
        )
        pitch = interp_fn(np.arange(0, len(pitch)))

        # Phoneme-level average
        pos = 0
        for i, d in enumerate(duration):
            if d > 0:
                pitch[i] = np.mean(pitch[pos : pos + d])
            else:
                pitch[i] = 0
            pos += d
        pitch = pitch[: len(duration)]

    # if self.energy_phoneme_averaging:
    if energy_phoneme_averaging: # True
        # Phoneme-level average
        pos = 0
        for i, d in enumerate(duration):
            if d > 0:
                energy[i] = np.mean(energy[pos : pos + d])
            else:
                energy[i] = 0
            pos += d
        energy = energy[: len(duration)]

    # Save files
    dur_filename = "{}-duration-{}.npy".format(speaker, basename)
    np.save(os.path.join(out_dir, "duration", dur_filename), duration)

    pitch_filename = "{}-pitch-{}.npy".format(speaker, basename)
    np.save(os.path.join(out_dir, "pitch", pitch_filename), pitch)

    energy_filename = "{}-energy-{}.npy".format(speaker, basename)
    np.save(os.path.join(out_dir, "energy", energy_filename), energy)

    mel_filename = "{}-mel-{}.npy".format(speaker, basename)
    np.save(
        os.path.join(out_dir, "mel", mel_filename),
        mel_spectrogram.T,
        )

    return (
        "|".join([basename, speaker, text, raw_text]),
        remove_outlier(pitch),
        remove_outlier(energy),
        mel_spectrogram.shape[1],
    )

In [None]:
 
 ### 복붙용!
def process_utterance(self, speaker, basename, wav_name, tg_path):
    
    wav_path = wav_name
    # wav_path = os.path.join(self.in_dir, speaker, "{}.wav".format(basename))
    # text_path = os.path.join(self.in_dir, speaker, "{}.lab".format(basename))
    # tg_path = os.path.join(
    #     self.out_dir, "TextGrid", speaker, "{}.TextGrid".format(basename)
    # )

    # Get alignments
    textgrid = tgt.io.read_textgrid(tg_path)
    phone, duration, start, end = self.get_alignment(
        textgrid.get_tier_by_name("phones")
    ) ## (124(=길이), 124(=길이), 0.49, 10.58)

    ## TEXT = {ɪ n d ɔʏ tʃ l a n t p ʁ ɔ tʰ ɛ s t iː ɐ tʰ n̩ ...
    text = "{" + " ".join(phone) + "}"
    if start >= end:
        return None

    # Read and trim wav files
    wav, _ = librosa.load(wav_path) ## numpy # (184800,)
    wav = wav[ ## sampling_rate = 22050
        int(self.sampling_rate * start) : int(self.sampling_rate * end)
    ].astype(np.float32) # (173996,)

    # Read raw text
    # with open(text_path, "r") as f:
    #     raw_text = f.readline().strip("\n")
    raw_text = sentence.strip("\n")

    # Compute fundamental frequency
    # pw: https://github.com/JeremyCCHsu/Python-Wrapper-for-World-Vocoder
    pitch, t = pw.dio( ## raw pitch extractor
    wav.astype(np.float64),
    self.sampling_rate, ## 22050
        frame_period=self.hop_length / self.sampling_rate * 1000, ## frame_perid = 1024/ 22050 * 1000 # ms
    ) 
    ## pitch, t Shape = # 170, (170,)
    ## sum(duration) = 188
    ## pitch refinement
    pitch = pw.stonemask(wav.astype(np.float64), pitch, t, self.sampling_rate)

    ## pitch
    pitch = pitch[: sum(duration)] # (170,)
    if np.sum(pitch != 0) <= 1:
        return None

    # Compute mel-scale spectrogram and energy
    # mel_spectrogram, energy = Audio.tools.get_mel_from_wav(wav, self.STFT)
    mel_spectrogram, energy = get_mel_from_wav(wav, self.STFT)
    # Shape/Length: (80, 170) (170,)
    mel_spectrogram = mel_spectrogram[:, : sum(duration)]
    energy = energy[: sum(duration)]
    # Shape/Length: (80, 170) (170,), 124


    if self.pitch_phoneme_averaging: # True
        # perform linear interpolation
        nonzero_ids = np.where(pitch != 0)[0] # 170 -> 67
        # interp1d : https://docs.scipy.org/doc/scipy/reference/generated/scipy.interpolate.interp1d.html
        interp_fn = interp1d(
            nonzero_ids, # 67
            pitch[nonzero_ids], # 67
            fill_value=(pitch[nonzero_ids[0]], pitch[nonzero_ids[-1]]), # (68.63053143749191, 278.22930664321774)
            bounds_error=False, 
            # kind = 'linear' # Default is ‘linear’.
        )
        pitch = interp_fn(np.arange(0, len(pitch)))

        # Phoneme-level average
        pos = 0
        for i, d in enumerate(duration):
            if d > 0:
                pitch[i] = np.mean(pitch[pos : pos + d])
            else:
                pitch[i] = 0
            pos += d
        pitch = pitch[: len(duration)]

    # if self.energy_phoneme_averaging:
    if self.energy_phoneme_averaging: # True
        # Phoneme-level average
        pos = 0
        for i, d in enumerate(duration):
            if d > 0:
                energy[i] = np.mean(energy[pos : pos + d])
            else:
                energy[i] = 0
            pos += d
        energy = energy[: len(duration)]

    # Save files
    dur_filename = "{}-duration-{}.npy".format(speaker, basename)
    np.save(os.path.join(self.out_dir, "duration", dur_filename), duration)

    pitch_filename = "{}-pitch-{}.npy".format(speaker, basename)
    np.save(os.path.join(self.out_dir, "pitch", pitch_filename), pitch)

    energy_filename = "{}-energy-{}.npy".format(speaker, basename)
    np.save(os.path.join(self.out_dir, "energy", energy_filename), energy)

    mel_filename = "{}-mel-{}.npy".format(speaker, basename)
    np.save(
        os.path.join(self.out_dir, "mel", mel_filename),
        mel_spectrogram.T,
    )

    return (
        "|".join([basename, speaker, text, raw_text]),
        self.remove_outlier(pitch),
        self.remove_outlier(energy),
        mel_spectrogram.shape[1],
    )

## `def remove_outlier(self, values):`

In [113]:
### Robust Scaler Thing?

def remove_outlier(values):
    values = np.array(values)

    if np.isnan(values).any():
        p25 = np.nanpercentile(values, 25)
        p75 = np.nanpercentile(values, 75)
    else:   
        p25 = np.percentile(values, 25)
        p75 = np.percentile(values, 75)
        
    lower = p25 - 1.5 * (p75 - p25)
    upper = p75 + 1.5 * (p75 - p25)
    normal_indices = np.logical_and(values > lower, values < upper)

    return values[normal_indices]

In [None]:
### 복붙용
### Robust Scaler Thing?
def remove_outlier(self, values):
    values = np.array(values)

    if np.isnan(values).any():
        p25 = np.nanpercentile(values, 25)
        p75 = np.nanpercentile(values, 75)
    else:   
        p25 = np.percentile(values, 25)
        p75 = np.percentile(values, 75)
        
    p25 = np.percentile(values, 25)
    p75 = np.percentile(values, 75)
    lower = p25 - 1.5 * (p75 - p25)
    upper = p75 + 1.5 * (p75 - p25)
    normal_indices = np.logical_and(values > lower, values < upper)

    return values[normal_indices]

### `def get_alignment(tier = textgrid.get_tier_by_name("phones")):`

In [37]:
# Get alignments
print(tg_path)
textgrid = tgt.io.read_textgrid(tg_path)
textgrid.get_tier_by_name("phones")

/data/speech-data/mls-align/mls_german_opus/train/10087/10087_10388_000000.TextGrid


IntervalTier(start_time=0.0, end_time=11.55, name="phones", objects=[Interval(0.49, 0.57, "ɪ"), Interval(0.57, 0.62, "n"), Interval(0.62, 0.67, "d"), Interval(0.67, 0.79, "ɔʏ"), Interval(0.79, 0.92, "tʃ"), Interval(0.92, 0.96, "l"), Interval(0.96, 1.04, "a"), Interval(1.04, 1.07, "n"), Interval(1.07, 1.13, "t"), Interval(1.13, 1.17, "p"), Interval(1.17, 1.21, "ʁ"), Interval(1.21, 1.25, "ɔ"), Interval(1.25, 1.31, "tʰ"), Interval(1.31, 1.37, "ɛ"), Interval(1.37, 1.46, "s"), Interval(1.46, 1.51, "t"), Interval(1.51, 1.57, "iː"), Interval(1.57, 1.62, "ɐ"), Interval(1.62, 1.68, "tʰ"), Interval(1.68, 1.76, "n̩"), Interval(1.76, 1.84, "f"), Interval(1.84, 1.91, "ʁ"), Interval(1.91, 1.98, "iː"), Interval(1.98, 2.03, "d"), Interval(2.03, 2.09, "ʁ"), Interval(2.09, 2.15, "ɪ"), Interval(2.15, 2.22, "ç"), Interval(2.22, 2.26, "v"), Interval(2.26, 2.31, "ɪ"), Interval(2.31, 2.35, "l"), Interval(2.35, 2.39, "h"), Interval(2.39, 2.42, "ɛ"), Interval(2.42, 2.48, "l"), Interval(2.48, 2.53, "m"), Interv

In [39]:
 textgrid.get_tier_by_name("phones")[0]

Interval(0.49, 0.57, "ɪ")

In [40]:
t = textgrid.get_tier_by_name("phones")[0]
t.start_time, t.end_time, t.text

(0.49, 0.57, 'ɪ')

In [44]:
textgrid.get_tier_by_name("phones")._objects[:3]

[Interval(0.49, 0.57, "ɪ"),
 Interval(0.57, 0.62, "n"),
 Interval(0.62, 0.67, "d")]

In [46]:
# phone, duration, start, end = self.get_alignment(textgrid.get_tier_by_name("phones"))
# def get_alignment(tier = textgrid.get_tier_by_name("phones")):
sil_phones = ["sil", "sp", "spn"]

phones = []
durations = []
start_time = 0
end_time = 0
end_idx = 0

In [47]:
tier = textgrid.get_tier_by_name("phones")

for t in tier._objects:
    s, e, p = t.start_time, t.end_time, t.text
    # s, e, p  = Interval(0.49, 0.57, "ɪ")

    # Trim leading silences
    if phones == []:
        if p in sil_phones:
            # p = "ɪ"
            # sil_phones = ["sil", "sp", "spn"]
            continue
        else:
            # s = 0.49
            start_time = s

    if p not in sil_phones:
        # For ordinary phones
        # p = "ɪ"
        phones.append(p)
        end_time = e # 0.57
        end_idx = len(phones)
    else:
        # For silent phones
        # p = "ɪ"
        phones.append(p)

    durations.append(
        int(
            np.round(e * sampling_rate / hop_length) # np.roun(e * 22050 / 1024)  # e = 0.57 ## self.sampling_rate, self.hop_length
            - np.round(s * sampling_rate / hop_length) # -np.roun(s * 22050 / 1024)  # s = 0.49
        )
    )

In [49]:
len(phones), len(durations)

(124, 124)

In [52]:
# Trim tailing silences
phones = phones[:end_idx]
durations = durations[:end_idx]

len(phones), len(durations), start_time, end_time


(124, 124, 0.49, 10.58)

In [54]:
def get_alignment(tier):
    # tier = textgrid.get_tier_by_name("phones")
    sil_phones = ["sil", "sp", "spn"]

    phones = []
    durations = []
    start_time = 0
    end_time = 0
    end_idx = 0
    
    for t in tier._objects:
        s, e, p = t.start_time, t.end_time, t.text
        # s, e, p  = Interval(0.49, 0.57, "ɪ")

        # Trim leading silences
        if phones == []:
            if p in sil_phones:
                # p = "ɪ"
                # sil_phones = ["sil", "sp", "spn"]
                continue
            else:
                # s = 0.49
                start_time = s

        if p not in sil_phones:
            # For ordinary phones
            # p = "ɪ"
            phones.append(p)
            end_time = e # 0.57
            end_idx = len(phones)
        else:
            # For silent phones
            # p = "ɪ"
            phones.append(p)

        durations.append(
            int(
                np.round(e * sampling_rate / hop_length) # np.roun(e * 22050 / 1024)  # e = 0.57 ## self.sampling_rate, self.hop_length
                - np.round(s * sampling_rate / hop_length)  # -np.roun(s * 22050 / 1024)  # s = 0.49
            )
        )

    # Trim tailing silences
    phones = phones[:end_idx]
    durations = durations[:end_idx]

    return phones, durations, start_time, end_time #  (124(=길이), 124(=길이), 0.49, 10.58)


### 복붙용

In [None]:
## 복붙용

def get_alignment(self, tier):
    # tier = textgrid.get_tier_by_name("phones")
    sil_phones = ["sil", "sp", "spn"]

    phones = []
    durations = []
    start_time = 0
    end_time = 0
    end_idx = 0
    
    for t in tier._objects:
        s, e, p = t.start_time, t.end_time, t.text
        # s, e, p  = Interval(0.49, 0.57, "ɪ")

        # Trim leading silences
        if phones == []:
            if p in sil_phones:
                # p = "ɪ"
                # sil_phones = ["sil", "sp", "spn"]
                continue
            else:
                # s = 0.49
                start_time = s

        if p not in sil_phones:
            # For ordinary phones
            # p = "ɪ"
            phones.append(p)
            end_time = e # 0.57
            end_idx = len(phones)
        else:
            # For silent phones
            # p = "ɪ"
            phones.append(p)

        durations.append(
            int(
                np.round(e * self.sampling_rate / self.hop_length) # np.roun(e * 22050 / 1024)  # e = 0.57 ## self.sampling_rate, self.hop_length
                - np.round(s * self.sampling_rate / self.hop_length)  # -np.roun(s * 22050 / 1024)  # s = 0.49
            )
        )

    # Trim tailing silences
    phones = phones[:end_idx]
    durations = durations[:end_idx]

    return phones, durations, start_time, end_time #  (124(=길이), 124(=길이), 0.49, 10.58)


## `def normalize(self, in_dir, mean, std):`

In [202]:
def normalize(in_dir, mean, std):
    max_value = np.finfo(np.float64).min
    min_value = np.finfo(np.float64).max
    for filename in os.listdir(in_dir):
        filename = os.path.join(in_dir, filename)
        values = (np.load(filename) - mean) / std
        np.save(filename, values)

        max_value = max(max_value, max(values))
        min_value = min(min_value, min(values))

    return min_value, max_value

In [None]:
def normalize(self, in_dir, mean, std):
    max_value = np.finfo(np.float64).min
    min_value = np.finfo(np.float64).max
    for filename in os.listdir(in_dir):
        filename = os.path.join(in_dir, filename)
        values = (np.load(filename) - mean) / std
        np.save(filename, values)

        max_value = max(max_value, max(values))
        min_value = min(min_value, min(values))

    return min_value, max_value