In [None]:
def create_meta_file(dataset_name, dataset_root_dir, output_metafile_name, audio_sample_rate, num_fft_freqs, spectrograms=True, phonemes=False):
        """Create the meta-file and spectrograms (mel and linear, optionally) or phonemized utterances (optionally).
        
        Format details:
            Every line of the metadata file contains info about one dataset item.
            The line has following format 
                'id|speaker|language|audio_file_path|mel_spectrogram_path|linear_spectrogram_path|text|phonemized_text'
            And the following must hold
                'audio_file_path' can be empty if loading just spectrograms
                'text' should be carefully normalized and should contain interpunction
                'phonemized_text' can be empty if loading just raw text  
        
        Arguments:
            dataset_name (string): Name of the dataset, loaders.py should contain a function for loading with a corresponding name.
            dataset_root_dir (string): Root directory from which is the dataset build and to which are spectrograms and the meta-file saved..
            output_metafile_name (string): Name of the output meta-file.
            audio_sample_rate (int): Sample rate of audios, used if spectrograms is set True.
            num_fft_freqs (int): Number of frequency bands used during spectrogram computation, used if spectrograms is set True.
        Keyword arguments:
            spectrograms (boolean, default True): If true, spetrograms (both mel and linear) are computed and saved.
            phonemes (boolean, default True): If true, phonemized variants of utterances are computed and saved.
        """

        path = '/content/drive/MyDrive/speech_project'
        # save current sample rate and fft freqs hyperparameters, as we may process dataset with different sample rate
        if spectrograms:
            old_sample_rate = hp.sample_rate
            hp.sample_rate = audio_sample_rate
            old_fft_freqs = hp.num_fft
            hp.num_fft = num_fft_freqs

        # load metafiles, an item is a list like: [text, audiopath, speaker_id, language_code]
        items = dataset.loaders.get_loader_by_name(dataset_name)(path)
        # build dictionaries for translation to IPA from source languages, see utils.text for details
        if phonemes:
            text_lang_pairs = [(i[0], hp.languages[0] if i[3] == "" else i[3]) for i in items]
            phoneme_dicts = text.build_phoneme_dicts(text_lang_pairs)

        # prepare directories which will store spectrograms
        if spectrograms:
            spectrogram_dirs = [os.path.join(path, 'spectrograms'), 
                                os.path.join(path, 'linear_spectrograms')]
            for x in spectrogram_dirs:
                if not os.path.exists(x): os.makedirs(x)

        # iterate through items and build the meta-data_root = '/content/drive/MyDrive/speech_project/Multilingual_Text_to_Speech/data/wav_data'file
        metafile_path = os.path.join(path, output_metafile_name)
        #print(metafile_path)
        with open(metafile_path, 'w', encoding='utf-8') as f:
            Logger.progress(0, prefix='Building metafile:')
            for i in range(len(items)):
                raw_text, audio_path, speaker, language = items[i]
                if language == "": language = hp.languages[0]
                phonemized_text = text.to_phoneme(raw_text, False, language, phoneme_dicts[language]) if phonemes else ""     
                spectrogram_paths = "|"
                if spectrograms:
                    spec_name = f'{str(i).zfill(6)}.npy'
                    #print(os.path.join(dataset_root_dir, audio_path))
                    try:                 
                      audio_data = audio.load(os.path.join(dataset_root_dir, audio_path))
                    except:
                      print(audio_path)
                      continue
                    np.save(os.path.join(spectrogram_dirs[0], spec_name), audio.spectrogram(audio_data, True))
                    np.save(os.path.join(spectrogram_dirs[1], spec_name), audio.spectrogram(audio_data, False))
                    spectrogram_paths = os.path.join('spectrograms', spec_name) + '|' + os.path.join('linear_spectrograms', spec_name)
                print(f'{str(i).zfill(6)}|{speaker}|{language}|{audio_path}|{spectrogram_paths}|{raw_text}|{phonemized_text}', file=f)
                Logger.progress((i + 1) / len(items), prefix='Building metafile:')
        
        # restore the original sample rate and fft freq values
        if spectrograms:
            hp.sample_rate = old_sample_rate
            hp.num_fft = old_fft_freqs

In [None]:
#data_root = '/content/drive/MyDrive/speech_project/Multilingual_Text_to_Speech/data/wav_data'
#create_meta_file('wav_data', data_root, "train.txt", 16000, 1102, spectrograms=True, phonemes=False)

In [None]:
f = open('/content/drive/MyDrive/speech_project/metafile_all.txt','r')
lines = f.readlines()
count = 0
id_prev =-10
with open('/content/drive/MyDrive/speech_project/train.txt','w') as f_train:
  with open('/content/drive/MyDrive/speech_project/metafile_updated.txt','w') as out:
    for line in lines:
      id = line.split('|')[1]
      if id!= id_prev:
        #print("id {0} ,, id_prev {1} ".format(id,id_prev))
        f_train.write(line)
        id_prev= id
      else:
        out.write(line)
f.close()

In [None]:
import random

f = open('/content/drive/MyDrive/speech_project/metafile_updated.txt','r')
lines = f.readlines()
print(lines[0:5000])
random.shuffle(lines)
print(lines[0:5000])
train = lines[0:5869]
val = lines[5869:7127]
test= lines[7127::]
with open('/content/drive/MyDrive/speech_project/val.txt','w') as f:
  for i in val:
    f.write(i)
with open('/content/drive/MyDrive/speech_project/test.txt','w') as f:
  for i in test:
    f.write(i)
with open('/content/drive/MyDrive/speech_project/train.txt','a') as f:
  for i in train:
    f.write(i)

['000001|1|ar|/content/drive/MyDrive/speech_project/Multilingual_Text_to_Speech/data/wav_data/1/1-2.wav |spectrograms/000001.npy|linear_spectrograms/000001.npy|أول أيام التداول بعد عطلة استمرت خمسة أيام|\n', '000002|1|ar|/content/drive/MyDrive/speech_project/Multilingual_Text_to_Speech/data/wav_data/1/1-3.wav |spectrograms/000002.npy|linear_spectrograms/000002.npy|مدعومة بأنباء طيبة من شركات كبرى|\n', '000004|2|ar|/content/drive/MyDrive/speech_project/Multilingual_Text_to_Speech/data/wav_data/2/2-02.wav |spectrograms/000004.npy|linear_spectrograms/000004.npy|لمتابعة المفاوضات مع مصر|\n', '000005|2|ar|/content/drive/MyDrive/speech_project/Multilingual_Text_to_Speech/data/wav_data/2/2-03.wav |spectrograms/000005.npy|linear_spectrograms/000005.npy|حول اتفاق للتبادل الحر|\n', '000006|2|ar|/content/drive/MyDrive/speech_project/Multilingual_Text_to_Speech/data/wav_data/2/2-04.wav |spectrograms/000006.npy|linear_spectrograms/000006.npy|يذكر أن الصادرات المصرية إلى الولايات المتحدة|\n', '00000

In [None]:
len(lines)

8383

In [1]:
import sys
import os
import IPython
from IPython.display import Audio

In [2]:
os.chdir(os.path.expanduser("~"))
    
tacotron_dir = "Multilingual_Text_to_Speech"
if not os.path.exists(tacotron_dir):
  ! git clone https://github.com/dina-adel/Multilingual_Text_to_Speech

wavernn_dir = "WaveRNN"
if not os.path.exists(wavernn_dir):
  ! git clone https://github.com/Tomiinek/$wavernn_dir

Cloning into 'Multilingual_Text_to_Speech'...
remote: Enumerating objects: 211, done.[K
remote: Counting objects: 100% (211/211), done.[K
remote: Compressing objects: 100% (147/147), done.[K
remote: Total 1428 (delta 131), reused 138 (delta 64), pack-reused 1217[K
Receiving objects: 100% (1428/1428), 44.26 MiB | 35.08 MiB/s, done.
Resolving deltas: 100% (884/884), done.
Cloning into 'WaveRNN'...
remote: Enumerating objects: 981, done.[K
remote: Total 981 (delta 0), reused 0 (delta 0), pack-reused 981[K
Receiving objects: 100% (981/981), 242.14 MiB | 47.77 MiB/s, done.
Resolving deltas: 100% (564/564), done.


In [3]:
! mkdir -p checkpoints
os.chdir(os.path.join(os.path.expanduser("~"), "checkpoints"))

tacotron_chpt = "generated_switching.pyt"
if not os.path.exists(os.path.join(os.path.expanduser("~"), "checkpoints", tacotron_chpt)):
  ! curl -O -L "https://github.com/Tomiinek/Multilingual_Text_to_Speech/releases/download/v1.0/$tacotron_chpt" 

wavernn_chpt = "wavernn_weight.pyt"
if not os.path.exists(os.path.join(os.path.expanduser("~"), "checkpoints", wavernn_chpt)):
  ! curl -O -L "https://github.com/Tomiinek/Multilingual_Text_to_Speech/releases/download/v1.0/$wavernn_chpt"     

os.chdir(os.path.expanduser("~"))

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   631  100   631    0     0   7602      0 --:--:-- --:--:-- --:--:--  7602
100  163M  100  163M    0     0  51.5M      0  0:00:03  0:00:03 --:--:-- 55.6M
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   626  100   626    0     0   6955      0 --:--:-- --:--:-- --:--:--  6879
100 18.1M  100 18.1M    0     0  35.6M      0 --:--:-- --:--:-- --:--:-- 35.6M


In [4]:
! pip install -q -U soundfile
! pip install -q -U phonemizer
! pip install -q -U epitran
! apt-get install festival espeak-ng mbrola

[K     |████████████████████████████████| 51kB 7.3MB/s 
[K     |████████████████████████████████| 194kB 13.7MB/s 
[K     |████████████████████████████████| 51kB 9.2MB/s 
[K     |████████████████████████████████| 143kB 21.7MB/s 
[K     |████████████████████████████████| 71kB 10.5MB/s 
[K     |████████████████████████████████| 276kB 55.1MB/s 
[?25h  Building wheel for unicodecsv (setup.py) ... [?25l[?25hdone
  Building wheel for marisa-trie (setup.py) ... [?25l[?25hdone
Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following additional packages will be installed:
  alsa-utils espeak-ng-data festlex-cmu festlex-poslex festvox-kallpc16k
  libespeak-ng1 libestools2.5 libfftw3-single3 libnewt0.52 libpcaudio0
  libsonic0 sgml-base whiptail
Suggested packages:
  pidgin-festival festival-freebsoft-utils libfftw3-bin libfftw3-dev
  mbrola-voice espeak cicero sgml-base-doc
The following NEW packages will be installed:
  alsa-utils e

In [5]:
os.chdir(os.path.join(os.path.expanduser("~"), tacotron_dir))
if "utils" in sys.modules: del sys.modules["utils"]
import os
import time
import datetime
import math
import numpy as np
import torch
from torch.utils.data import DataLoader

import dataset.loaders
from dataset.dataset import TextToSpeechDatasetCollection, TextToSpeechCollate, TextToSpeechDataset
from params.params import Params as hp
from utils import audio, text
from modules.tacotron2 import Tacotron, TacotronLoss
from utils.logging import Logger
from utils.samplers import RandomImbalancedSampler, PerfectBatchSampler
from utils import lengths_to_mask, to_gpu

In [6]:
#!ls /content/drive/MyDrive/speech_project/Multilingual_Text_to_Speech/data/wav_data/1

In [None]:
!python3 /root/Multilingual_Text_to_Speech/train.py --base_directory /root/Multilingual_Text_to_Speech --hyper_parameters ar --data_root /root/Multilingual_Text_to_Speech/data

In [None]:
!ls 