In [None]:
### tgt library: https://github.com/hbuschme/TextGridTools
### pyworld library: https://github.com/JeremyCCHsu/Python-Wrapper-for-World-Vocoder

## import

In [1]:
import os
from pathlib import Path

import tgt
import numpy as np
import pandas as pd
import pyworld as pw
# import matplotlib.pyplot as plt

import librosa
import librosa.display
import IPython.display as ipd


from scipy.interpolate import interp1d
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

# import audio as Audio
from stft import *


In [2]:
# Print the kernel's current working directory; the relative CSV file names used in this notebook resolve against it.
!pwd

/home/heiscold/prac


## Load Data

In [3]:
# Read the training metadata table (file ids, transcripts and audio paths, as
# shown in the preview) and report its size before peeking at the first rows.
df = pd.read_csv("train_mls_german.csv")
print((len(df.index), len(df.columns)))
df.head(n=5)

(469942, 6)


Unnamed: 0,file_id,speaker_id,chapter_id,utterance_id,sentence,audio_path
0,10087_10388_000000,10087,10388,0,in deutschland protestierten friedrich wilhelm...,/data/speech-data/mls/mls_german_opus/train/au...
1,10087_10388_000001,10087,10388,1,bald feuerwerk bald florett bald knotenstock u...,/data/speech-data/mls/mls_german_opus/train/au...
2,10087_10388_000002,10087,10388,2,oder von stefan zweigs kultiviertem kosmopolit...,/data/speech-data/mls/mls_german_opus/train/au...
3,10087_10388_000003,10087,10388,3,und nun rührt es sich überall in deutschland s...,/data/speech-data/mls/mls_german_opus/train/au...
4,10087_10388_000004,10087,10388,4,ich weine weil man so allein ist man kann nich...,/data/speech-data/mls/mls_german_opus/train/au...


In [4]:
# Pick one utterance from the metadata table, report its sampling rates and
# play it back at the target rate.
idx = 11
sample_rate = 22050  # target sampling rate in Hz

audio_path = df["audio_path"].iloc[idx]
sentence = df["sentence"].iloc[idx]

print(f"Audio Path: {audio_path}")
print(f"SENTENCE: {sentence}")

# sr=None makes librosa keep the file's native rate, so `sr` is the original rate.
audio, sr = librosa.load(audio_path, sr=None)
print("ORIGINAL SAMPLE RATE: ", sr)
print("TARGET SAMPLE RATE", sample_rate)
print(f"SENTENCE: {sentence}")

# Resample to the target rate, then embed an audio player in the output.
audio = librosa.resample(audio, orig_sr=sr, target_sr=sample_rate)
player = ipd.Audio(audio, rate=sample_rate)
ipd.display(player)

Audio Path: /data/speech-data/mls/mls_german_opus/train/audio/10087/10388/10087_10388_000011.opus
SENTENCE: es ist unglücklicherweise noch alles realität wer damals als ludendorff hüben clemenceau drüben regierte sich zornig aufbäumte ist auch heute noch nicht offizier in der armee der vernünftigen leute sondern erst dabei ein paar korporale zusammenzutrommeln
ORIGINAL SAMPLE RATE:  16000
TARGET SAMPLE RATE 22050
SENTENCE: es ist unglücklicherweise noch alles realität wer damals als ludendorff hüben clemenceau drüben regierte sich zornig aufbäumte ist auch heute noch nicht offizier in der armee der vernünftigen leute sondern erst dabei ein paar korporale zusammenzutrommeln


In [6]:
#### STFT experiment (disabled) #####
# NOTE(review): `torch` is not imported by name in the visible import cell; presumably it
# would have to come from `from stft import *` — confirm before re-enabling this code.
# stft_fn = STFT(1024, 1024, 1024)
# s = audio / (np.max(audio) +1)
# out = stft_fn(torch.tensor(s.reshape(1, -1), dtype = torch.float))
# out

## TextGrid Path: Sample(German)

In [5]:
# Language tag that selects the alignment corpus, and the root folder holding
# that corpus' training TextGrid files.
lang = 'german'
tg_train_base_path = f'/data/speech-data/mls-align/mls_{lang}_opus/train/'

# Echo both values as the cell output.
(lang, tg_train_base_path)

('german', '/data/speech-data/mls-align/mls_german_opus/train/')

In [6]:
# Collect every TextGrid one directory level below the base folder as a
# (file stem, base folder) pair; later cells rebuild the full path from the pair.
train_tg_path = [
    (str(tg_file.stem), tg_train_base_path)
    for tg_file in Path(tg_train_base_path).glob("*/*.TextGrid")
]

# Sorted copy, so the ordering does not depend on directory traversal order.
all_train_tgt_paths = sorted(train_tg_path)
len(all_train_tgt_paths)

468823

In [7]:
# 469942 metadata rows - 468823 TextGrid files = 1119
# (presumably utterances that have no alignment file — TODO confirm)
# Peek at the first (file stem, base folder) pair.
all_train_tgt_paths[0]

('10087_10388_000000', '/data/speech-data/mls-align/mls_german_opus/train/')

In [24]:
# Every entry of all_train_tgt_paths is a (file_id, base_dir) pair. A file_id is
# split on "_" into speaker, chapter and utterance fields, and the TextGrid path
# is rebuilt as <base_dir><speaker>/<file_id>.TextGrid.
file_ids = [file_id for file_id, _ in all_train_tgt_paths]
id_fields = [file_id.split("_") for file_id in file_ids]

df2 = pd.DataFrame({
    'file_id': file_ids,
    'tg_path': [
        base_dir + fields[0] + "/" + file_id + ".TextGrid"
        for (file_id, base_dir), fields in zip(all_train_tgt_paths, id_fields)
    ],
    # The id columns stay as strings, exactly as they appear in the file name.
    'speaker_id': [fields[0] for fields in id_fields],
    'chapter_id': [fields[1] for fields in id_fields],
    'utterance_id': [fields[2] for fields in id_fields],
})

print(df2.shape)
df2.head()

(468823, 5)


Unnamed: 0,file_id,tg_path,speaker_id,chapter_id,utterance_id
0,10087_10388_000000,/data/speech-data/mls-align/mls_german_opus/tr...,10087,10388,0
1,10087_10388_000001,/data/speech-data/mls-align/mls_german_opus/tr...,10087,10388,1
2,10087_10388_000002,/data/speech-data/mls-align/mls_german_opus/tr...,10087,10388,2
3,10087_10388_000003,/data/speech-data/mls-align/mls_german_opus/tr...,10087,10388,3
4,10087_10388_000004,/data/speech-data/mls-align/mls_german_opus/tr...,10087,10388,4


In [25]:
# Inspect the current column order of the TextGrid index frame.
df2.columns

Index(['file_id', 'tg_path', 'speaker_id', 'chapter_id', 'utterance_id'], dtype='object')

In [26]:
# Put the id columns first, then order the rows by speaker, chapter and
# utterance and renumber the index from zero.
sort_keys = ['speaker_id', 'chapter_id', 'utterance_id']
df2 = (
    df2.loc[:, ['file_id', *sort_keys, 'tg_path']]
    .sort_values(by=sort_keys, ascending=True)
    .reset_index(drop=True)
)
print(df2.shape)
df2.head()

(468823, 5)


Unnamed: 0,file_id,speaker_id,chapter_id,utterance_id,tg_path
0,10087_10388_000000,10087,10388,0,/data/speech-data/mls-align/mls_german_opus/tr...
1,10087_10388_000001,10087,10388,1,/data/speech-data/mls-align/mls_german_opus/tr...
2,10087_10388_000002,10087,10388,2,/data/speech-data/mls-align/mls_german_opus/tr...
3,10087_10388_000003,10087,10388,3,/data/speech-data/mls-align/mls_german_opus/tr...
4,10087_10388_000004,10087,10388,4,/data/speech-data/mls-align/mls_german_opus/tr...


In [27]:
# Spot-check the first reconstructed TextGrid path.
df2["tg_path"].iloc[0]

'/data/speech-data/mls-align/mls_german_opus/train/10087/10087_10388_000000.TextGrid'

In [31]:
## Save
# Write the TextGrid index to a CSV in the working directory, leaving out the row index.
# NOTE(review): the id columns are strings here; a plain pd.read_csv would presumably
# parse them back as integers — confirm if the zero padding of utterance_id matters.
tg_csv_path = "temp_train_german_tg.csv"
df2.to_csv(tg_csv_path, index=False)

## Sample

In [28]:
# Load the first utterance of the metadata table, report its sampling rates
# and play it back at the target rate.
idx = 0
sample_rate = 22050  # target sampling rate in Hz

# Pull both fields of the selected row in one step.
audio_path, sentence = df[["audio_path", "sentence"]].iloc[idx]

print(f"Audio Path: {audio_path}")
print(f"SENTENCE: {sentence}")

# With sr=None librosa does not resample on load; `sr` is the file's own rate.
audio, sr = librosa.load(audio_path, sr=None)

print("ORIGINAL SAMPLE RATE: ", sr)
print("TARGET SAMPLE RATE", sample_rate)
print(f"SENTENCE: {sentence}")

# Convert to the target rate and render an inline audio player.
audio = librosa.resample(audio, orig_sr=sr, target_sr=sample_rate)
ipd.display(ipd.Audio(audio, rate=sample_rate))

Audio Path: /data/speech-data/mls/mls_german_opus/train/audio/10087/10388/10087_10388_000000.opus
SENTENCE: in deutschland protestierten friedrich wilhelm foerster der pädagoge und der arzt nicolai in frankreich erhebt seine stimme vom gebrüll der kriegspropaganda umtobt
ORIGINAL SAMPLE RATE:  16000
TARGET SAMPLE RATE 22050
SENTENCE: in deutschland protestierten friedrich wilhelm foerster der pädagoge und der arzt nicolai in frankreich erhebt seine stimme vom gebrüll der kriegspropaganda umtobt


In [29]:
# Look the TextGrid up by file_id rather than by row position: df (metadata) has
# 469942 rows while df2 (TextGrid index) has only 468823, so the same positional
# idx does not, in general, point at the same utterance in both frames.
sample_file_id = df.file_id.values[idx]
tg_matches = df2.loc[df2.file_id == sample_file_id, "tg_path"]
if tg_matches.empty:
    # Some utterances have no alignment file; fail loudly instead of silently
    # pairing the audio with another utterance's TextGrid.
    raise LookupError(f"No TextGrid found for file_id {sample_file_id!r}")
sample_tg_path = tg_matches.iloc[0]
sample_tg_path

'/data/speech-data/mls-align/mls_german_opus/train/10087/10087_10388_000000.TextGrid'

In [30]:
# Parse the sample's TextGrid file and show its "phones" interval tier.
textgrid = tgt.io.read_textgrid(sample_tg_path)
phones_tier = textgrid.get_tier_by_name("phones")
phones_tier

IntervalTier(start_time=0.0, end_time=11.55, name="phones", objects=[Interval(0.49, 0.57, "ɪ"), Interval(0.57, 0.62, "n"), Interval(0.62, 0.67, "d"), Interval(0.67, 0.79, "ɔʏ"), Interval(0.79, 0.92, "tʃ"), Interval(0.92, 0.96, "l"), Interval(0.96, 1.04, "a"), Interval(1.04, 1.07, "n"), Interval(1.07, 1.13, "t"), Interval(1.13, 1.17, "p"), Interval(1.17, 1.21, "ʁ"), Interval(1.21, 1.25, "ɔ"), Interval(1.25, 1.31, "tʰ"), Interval(1.31, 1.37, "ɛ"), Interval(1.37, 1.46, "s"), Interval(1.46, 1.51, "t"), Interval(1.51, 1.57, "iː"), Interval(1.57, 1.62, "ɐ"), Interval(1.62, 1.68, "tʰ"), Interval(1.68, 1.76, "n̩"), Interval(1.76, 1.84, "f"), Interval(1.84, 1.91, "ʁ"), Interval(1.91, 1.98, "iː"), Interval(1.98, 2.03, "d"), Interval(2.03, 2.09, "ʁ"), Interval(2.09, 2.15, "ɪ"), Interval(2.15, 2.22, "ç"), Interval(2.22, 2.26, "v"), Interval(2.26, 2.31, "ɪ"), Interval(2.31, 2.35, "l"), Interval(2.35, 2.39, "h"), Interval(2.39, 2.42, "ɛ"), Interval(2.42, 2.48, "l"), Interval(2.48, 2.53, "m"), Interv