In [1]:
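# The following pip packages need to be installed:
# !pip install git+https://github.com/huggingface/transformers sentencepiece datasets
# This cell also imports torch, soundfile, numpy, librosa, PyPDF2 and nltk,
# which must be available in the kernel's environment as well.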

from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from datasets import load_dataset
import torch
import soundfile as sf
from datasets import load_dataset
import numpy as np 
import librosa
from PyPDF2 import PdfReader

# using nltk to break text in proper sentences
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

[nltk_data] Downloading package punkt to /home/andre/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Pretrained SpeechT5 components: the text processor, the text-to-speech model
# and the vocoder. The model and the vocoder are moved to the GPU.
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
model = model.to(device='cuda')
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
vocoder = vocoder.to(device='cuda')

# The speaker's voice characteristics come from an x-vector stored in a dataset.
# Entry 7306 is a female voice; entry 6000 is a male one.
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embeddings = (
    torch.tensor(embeddings_dataset[7306]["xvector"])
    .unsqueeze(0)          # add a leading batch dimension
    .to(device='cuda')
)

Found cached dataset cmu-arctic-xvectors (/home/andre/.cache/huggingface/datasets/Matthijs___cmu-arctic-xvectors/default/0.0.1/a62fea1f9415e240301ea0042ffad2a3aadf4d1caa7f9a8d9512d631723e781f)


In [3]:
from IPython.display import clear_output  

def pretty_print(long_string, nwords=14, clear=True):
    """Re-wrap a string to at most `nwords` words per line, print and return it.

    Words are split on any run of whitespace. Newlines in the input therefore
    act as word breaks (they used to be deleted, which glued the neighbouring
    words together), and repeated spaces no longer count as empty words.

    Parameters
    ----------
    long_string : str
        Text to re-wrap.
    nwords : int, default 14
        Maximum number of words per output line; must be at least 1.
    clear : bool, default True
        When true, the current cell output is cleared before printing.

    Returns
    -------
    str
        The wrapped text, lines joined with a newline character. An input
        without any words yields the empty string.

    Raises
    ------
    ValueError
        If `nwords` is smaller than 1.
    """
    if nwords < 1:
        raise ValueError(f"nwords must be at least 1, got {nwords}")
    words = long_string.split()
    # One output line per consecutive group of `nwords` words.
    lines = [' '.join(words[i:i + nwords]) for i in range(0, len(words), nwords)]
    if clear:
        clear_output()
    out = "\n".join(lines)
    print(out)
    return out

def write_speech_phrases(verses, slower=True, file='speech',
                         slowdown_factor=0.7, pause_samples=4500):
    """Synthesize `verses` into `<file>.wav` and save the text to `<file>sub.txt`.

    Every verse is synthesized separately and followed by a short silence.
    Callers are expected to pass short lines (for instance text wrapped at
    about 80 columns); according to the original note this keeps each request
    below the model's 600-token input maximum. Nothing here enforces that.

    Relies on the notebook-level `processor`, `model`, `vocoder` and
    `speaker_embeddings` objects.

    Parameters
    ----------
    verses : iterable of str
        Lines of text to narrate, in order. Verses that are empty after
        stripping are not sent to the model; they only add the pause.
    slower : bool, default True
        When true, the audio is time-stretched by `slowdown_factor`.
    file : str, default 'speech'
        Output path prefix; '.wav' and 'sub.txt' are appended to it.
    slowdown_factor : float, default 0.7
        Playback-rate factor passed to `librosa.effects.time_stretch`; values
        below 1 make the speech slower.
    pause_samples : int, default 4500
        Number of silent samples appended after every verse.

    Returns
    -------
    None
        If `verses` is empty nothing is synthesized and no file is written.
    """
    verses = list(verses)
    if not verses:
        # np.concatenate raises on an empty sequence; there is nothing to narrate.
        return

    srate = 16000
    # Follow the model's device instead of hard-coding 'cuda'.
    device = model.device
    voice = speaker_embeddings.to(device)
    pause = np.zeros(pause_samples)  # end of line pause
    # Collect pieces in a list and concatenate once at the end.
    pieces = []
    text = ''
    for verse in verses:
        phrase = verse.strip()  # surrounding spaces ' ' cause weird stuff
        if phrase:
            inputs = processor(text=phrase, return_tensors="pt").to(device)
            speech = model.generate_speech(inputs["input_ids"], voice, vocoder=vocoder)
            pieces.append(speech.cpu().numpy())
        pieces.append(pause)
        # '[' marks the start of every verse in the saved text.
        text += '[' + verse
    output = np.concatenate(pieces)

    if slower and output.size:
        # Resampling to a lower rate and then writing at that same lower rate
        # leaves the duration unchanged, so it never slowed anything down.
        # time_stretch changes the duration and the file keeps its 16 kHz rate.
        output = librosa.effects.time_stretch(output, rate=slowdown_factor)

    sf.write(file + '.wav', output, samplerate=srate)
    with open(file + 'sub.txt', 'wt', encoding='utf-8') as f:
        # pretty_print also clears the cell output and prints the wrapped text.
        f.write(pretty_print(text))

In [4]:
def make_audio_book_pdf(file, bookname, startpage=0):
    """Narrate a PDF page by page, producing one audio/text file pair per page.

    The text of every page from `startpage` onwards is extracted, split into
    lines and handed to `write_speech_phrases`.

    Parameters
    ----------
    file : str
        Path of the PDF to read.
    bookname : str
        Prefix of the output files; each page is written under
        `<bookname>_<page index, zero padded to 3 digits>`.
    startpage : int, default 0
        Zero-based index of the first page to narrate.
    """
    reader = PdfReader(file)
    # Number the outputs by the real page index (as make_audio_book_text does
    # with sentence indices) so that resuming from a later `startpage` neither
    # renumbers nor overwrites pages that were already produced.
    for page_index, page in enumerate(reader.pages[startpage:], start=startpage):
        text = page.extract_text()
        # The function defined in this notebook is write_speech_phrases; the
        # previous call to write_speech_verses raised NameError.
        write_speech_phrases(text.split('\n'), True, f"{bookname}_{page_index:03d}")

def make_audio_book_text(file, bookname, start=0, max_words=8000):
    """Narrate a plain-text book in chunks, one audio/text file pair per chunk.

    Written for Richard Baxter's books in Project Gutenberg's UTF-8 plain-text
    format. The text is split into sentences; sentences are accumulated until
    adding the next one would reach the size limit, and each accumulated chunk
    is passed to `write_speech_phrases`.

    Parameters
    ----------
    file : str
        Path of the UTF-8 text file.
    bookname : str
        Prefix of the output files. A chunk is written under
        `<bookname>_<index>`, where `index` is the index (zero padded to
        3 digits) of the first sentence that did not fit into it; the last
        chunk uses the total number of sentences.
    start : int, default 0
        Index of the sentence to start with.
    max_words : int, default 8000
        Size limit of a chunk. Despite its name, the limit is compared against
        the number of characters in the chunk, not the number of words.
    """
    with open(file, 'r', encoding='utf-8') as f:
        text = f.read()
    sentences = sent_tokenize(text)

    def flush(chunk, index):
        # Skip empty chunks (e.g. when the very first sentence is already over
        # the limit). The newlines Gutenberg uses for layout are also good
        # places to break the speech, so they define the verses.
        if chunk.strip():
            write_speech_phrases(chunk.split('\n'), True, f"{bookname}_{index:03d}")

    input_text = ''
    for i, sentence in enumerate(sentences[start:], start=start):  # sentence to start with
        candidate = input_text + ' ' + sentence
        if len(candidate) < max_words:
            input_text = candidate
        else:
            flush(input_text, i)
            input_text = sentence
    # Write what is left over; without this the end of the book was dropped.
    flush(input_text, len(sentences))

In [5]:
# PDF variant, kept for reference (disabled):
# make_audio_book_pdf("/home/andre/Downloads/philip_e._tetlock_-_superforecasting_the_art_and_science_of_prediction.pdf",
#     "superforecasting", 7)

# Narrate the plain-text book, starting at sentence index 63.
make_audio_book_text(
    file='richard_baxter_a_christian_directory_vol3_4.txt',
    bookname='richard_baxter_a_christian_directory_vol3_4',
    start=63,
)

[[11] Psal. xlii. ; lxxxiv. [12] 2 Tim. iii. 5; 1 Tim. iv. 7.
[13] Isa. xxix. 13; Matt. xv. 8; xi. 23, 24; 2 Sam. xv. 25,
28, 29. [14] Luke x. 42. [15] 2 Chron. i. 10-12. [16] Eph. vi.
18; Luke xxi. 36; Rev. iii. 3; Col. iv. 2; Matt. xiii. 33-37. [17]
Jude 19. CHAPTER II. DIRECTIONS ABOUT THE MANNER OF WORSHIP, TO AVOID ALL CORRUPTIONS,
AND[FALSE, UNACCEPTABLE WORSHIPPING OF GOD. The lamentable contentions that have arisen about the manner
of God's[worship, and the cruelty, and blood, and divisions, and uncharitable[revilings which have thence
followed, and also the necessary regard[that every christian must have to worship God according
to his will,[do make it needful that I give you some directions in this
case. _Direct._ I. Be sure that you seriously and faithfully practise that[inward worship of
God, in which the life of religion doth consist: as[to love him above all,
to fear him, believe him, trust him, delight in[him, be zealous for him; and
that your hearts be sanctified unto G

KeyboardInterrupt: 