### Import packages

In [None]:
import IPython.display as ipd
import librosa
import os
import numpy as np
import pandas as pd

from libsoni.util.utils import mix_sonification_and_original

# Sampling rate in Hz: used when loading the recording, when converting
# seconds to sample positions, and as the playback rate.
Fs = 16_000
# Default pause, in seconds, inserted between two synthesized sentences.
SILENCE_SEC = 0.5

### Install additional packages required for TTS

In [None]:
!pip install -q torch torchaudio omegaconf

### Load model

In [None]:
import torch

# Load the English TTS model (speaker 'lj_16khz') from the
# 'snakers4/silero-models' repository via torch.hub. The call returns five
# objects, unpacked here in order.
(model,
 symbols,
 sample_rate,
 example_text,
 apply_tts) = torch.hub.load(
    repo_or_dir='snakers4/silero-models',
    model='silero_tts',
    language='en',
    speaker='lj_16khz',
)

In [None]:
# The TTS call in this notebook is given a list of sentences; its result is
# handed to this helper, which joins the individual signals into one array.
def merge_utterances(utterances, silence_sec=SILENCE_SEC):
    """Concatenate audio signals, separated by a fixed stretch of silence.

    Args:
        utterances: Sized sequence of 1-D audio signals; each item must
            support ``len()`` and assignment into a NumPy slice.
        silence_sec: Pause between consecutive signals, in seconds. It is
            converted to samples with the module-level ``Fs``.

    Returns:
        A NumPy array (default ``np.zeros`` dtype) containing all signals in
        order, with ``int(Fs * silence_sec)`` zero samples between neighbours
        and no trailing silence after the last signal.
    """
    gap = int(Fs * silence_sec)
    n_gaps = max(len(utterances) - 1, 0)
    total = sum(len(signal) for signal in utterances) + gap * n_gaps

    merged = np.zeros(total)

    pos = 0
    for signal in utterances:
        stop = pos + len(signal)
        merged[pos:stop] = signal
        # Skip over the silent gap; after the last signal this offset is unused.
        pos = stop + gap

    return merged

## Scenario: Beethoven's Piano Sonata in G Major, Op.14 No.2, 1st Movement

In [None]:
# Load the recording (sampling rate Fs, at most 90 seconds) and the
# ';'-separated CSV file holding the annotations for the TTS sonification.
audio, _ = librosa.load(
    'data_audio/demo_tts/Beethoven_Op014No2-01_Kempff.wav',
    sr=Fs,
    duration=90,
)
df_tts = pd.read_csv(
    'data_csv/demo_tts/Beethoven_Op014No2-01_Kempff.csv',
    sep=';',
)

In [None]:
# Synthesize the text of every CSV row and remember its start time (seconds).
# NOTE(review): positions below are computed with Fs while the TTS model is
# asked for `sample_rate`; this assumes both are equal -- confirm.
cpu_device = torch.device('cpu')  # loop-invariant, so build it once
list_sonifications = []
for start, text in zip(df_tts['start'], df_tts['utterance']):
    # Split into sentences and restore the '.' removed by split(); fragments
    # that are empty or whitespace-only (e.g. after a trailing '. ') are dropped.
    sentences = [sentence + '.' for sentence in text.split('.') if sentence.strip()]
    if not sentences:
        # Nothing to speak for this row.
        continue
    utterances = apply_tts(texts=sentences,
                           model=model,
                           sample_rate=sample_rate,
                           symbols=symbols,
                           device=cpu_device)
    utterance = merge_utterances(utterances)
    list_sonifications.append((start, utterance))

# Size the buffer by the latest end position over ALL utterances, not just the
# last row: an earlier, longer utterance may end after the last one starts,
# and the list may be empty (then the buffer has length 0).
total_len = max((int(start * Fs) + len(sonification)
                 for start, sonification in list_sonifications),
                default=0)
sonification_utterance = np.zeros(total_len)
for start, sonification in list_sonifications:
    first = int(start * Fs)
    # Overlapping utterances overwrite each other (later rows win).
    sonification_utterance[first:first + len(sonification)] = sonification

In [None]:
# Combine the recording and the spoken annotations into one signal for playback.
# NOTE(review): both signals are passed positionally -- confirm their order
# against the signature of libsoni's mix_sonification_and_original; the effect
# of panning=1.0 is defined there as well.
stereo_sonification = mix_sonification_and_original(audio, sonification_utterance, panning=1.0)

# Last expression of the cell: builds an audio player object at rate Fs.
ipd.Audio(stereo_sonification, rate=Fs)