# Requirements

In [None]:
#!apt update

In [None]:
#!pip install git+https://github.com/coqui-ai/TTS --no-deps

In [None]:
#!pip install -r voicebox_requirements.txt

In [None]:
#!pip install espeakng
#!pip install speechbrain

In [None]:
#!pip install -r requirements_1.txt

In [None]:
#import nltk
#nltk.download('punkt')
#nltk.download('averaged_perceptron_tagger')

# Imports

In [20]:
# Style overrides: inject the project's stylesheet into the notebook page.
from IPython.core.display import HTML

# Read the stylesheet inside a context manager so the file handle is closed
# as soon as the text has been read (the previous one-liner never closed it).
with open("../../QA/overrides.css", "r") as css_file:
    css = css_file.read()
css = f"<style>{css}</style>"
# Last expression of the cell: displaying this HTML object applies the styles.
HTML(css)

In [21]:
# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [22]:
# Auto reload imports
%load_ext autoreload
%autoreload 2

In [23]:
import sys

# Make the sibling `reader` package directory importable. The membership
# check keeps the cell idempotent: re-running it no longer appends a
# duplicate entry to sys.path each time.
if '../reader' not in sys.path:
    sys.path.append('../reader')

In [24]:
# TODO: Make a tool that prints all current dependencies and their versions

In [25]:
import IPython

In [26]:
import os
import requests
import json
import numpy as np
from time import time
import scipy.io.wavfile
import nltk
from pydub import AudioSegment

In [27]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# Plot Config
#plt.style.use(["seaborn-darkgrid"])
plt.rcParams["figure.figsize"] = (20, 6)
plt.rcParams["figure.facecolor"] = "white"
plt.rcParams["axes.edgecolor"] = "black"
plt.rcParams["xtick.color"] = "black"
font = {'family' : 'sans-serif',
        'weight' : 'bold',
        'size'   : 22}
plt.rc('font', **font)
# Fonts
SMALL_FONT = 22
MEDIUM_FONT = 32
LARGE_FONT = 44
plt.rc('font', size=MEDIUM_FONT)         
plt.rc('axes', titlesize=MEDIUM_FONT)
plt.rc('axes', labelsize=MEDIUM_FONT)
plt.rc('xtick', labelsize=SMALL_FONT)
plt.rc('ytick', labelsize=SMALL_FONT)
plt.rc('legend', fontsize=MEDIUM_FONT)
plt.rc('figure', titlesize=LARGE_FONT) 

pd.set_option("display.max_columns", 200)
pd.set_option("display.max_rows", 1000)

In [28]:
# NOTE(review): `!` hands the command to a shell, so this export presumably does not
# change the environment of the running kernel — confirm whether `%env` / os.environ was intended.
# NOTE(review): the appended entry points at a .so file; LD_LIBRARY_PATH entries are
# normally directories — verify the intended path.
!export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/conda/lib/libcudart.so

In [29]:
from voicebox import VoiceBox

In [30]:
import logging

# Root configuration: DEBUG and above, each record stamped with the time,
# the emitting line number and the level name.
logging.basicConfig(
    #filename='DockProc.log',
    level=logging.DEBUG,
    format='[%(asctime)s] {%(lineno)d} %(levelname)s - %(message)s',
    datefmt='%H:%M:%S',
)

# Raise the threshold of these third-party loggers to WARNING so their
# DEBUG/INFO records do not flood the notebook output.
for _noisy_name in ("requests", "urllib3", "speechbrain", "espeakng", "ffmpeg"):
    logging.getLogger(_noisy_name).setLevel(logging.WARNING)

# Logger handed to VoiceBox; kept at DEBUG.
logger = logging.getLogger("VoiceBox")
logger.setLevel("DEBUG")


In [31]:
# Device name for model execution.
# NOTE(review): no visible cell reads this variable (the Whisper cells use "cuda" directly) — confirm where it is consumed.
device = "cpu"

In [32]:
import platform
print(platform.python_version_tuple())

('3', '10', '11')


In [33]:
import torch
torch.__version__

'2.1.1+cu121'

In [34]:
import transformers
transformers.__version__

'4.37.1'

In [35]:
import nltk
nltk.__version__

'3.8.1'

In [36]:
#import speechbrain
#speechbrain.__version__

In [37]:
import TTS
TTS.__version__

'0.19.1'

In [38]:
import datasets
datasets.__version__

'2.14.5'

In [39]:
#!pip install deepspeed

# Define inputs

In [40]:
# Sample utterances used as inputs by the cells further down.
texts = [
    "Hello how's it going?",
    "I am doing well just hanging out",
]

# Functions

In [41]:
def get_read_speed(text, wav, sample_rate=24050):
    """Estimate how fast `text` is spoken in the audio `wav`.

    Args:
        text: The sentence that was synthesised.
        wav: Sequence of audio samples; only its length is used.
        sample_rate: Samples per second of `wav`. Defaults to 24050, the value
            that used to be hard-coded here.
            NOTE(review): other cells in this notebook use 24000 — confirm
            which rate the audio really has.

    Returns:
        A `(words_per_second, audio_length)` tuple, where `audio_length` is
        the duration in seconds. Both values are 0.0 for an empty `wav`.

    Raises:
        ValueError: If `sample_rate` is not positive.
    """
    if sample_rate <= 0:
        raise ValueError(f"sample_rate must be positive, got {sample_rate}")

    tokens = nltk.tokenize.word_tokenize(text)
    # Count only tokens that contain at least one letter or digit, so that
    # punctuation tokens ("," "?" "." ...) are not counted as spoken words.
    # Previously only "," was skipped.
    word_count = sum(1 for token in tokens if any(ch.isalnum() for ch in token))

    audio_length = len(wav) / sample_rate
    if audio_length == 0:
        # No audio: avoid a ZeroDivisionError and report a zero rate.
        return 0.0, 0.0
    words_per_second = word_count / audio_length
    return words_per_second, audio_length

# SST (speech-to-text)

## Init model

In [None]:
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from datasets import load_dataset

# The processor and the model are loaded from the same checkpoint.
_WHISPER_CHECKPOINT = "openai/whisper-large-v2"
sst_processor = WhisperProcessor.from_pretrained(_WHISPER_CHECKPOINT)
sst_model = WhisperForConditionalGeneration.from_pretrained(_WHISPER_CHECKPOINT)
# Clear any forced decoder ids stored in the model config.
sst_model.config.forced_decoder_ids = None

In [None]:
# Move Whisper to the GPU when one is available; otherwise keep it on the CPU
# so this cell does not raise on a machine without CUDA.
sst_model.to("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
def sst(wav, source_sample_rate=24050, target_sample_rate = 16000):
    """Transcribe an audio signal with the Whisper model loaded in this notebook.

    When the source and target sample rates differ, the signal is written to
    `cache.wav` in the working directory, resampled with the `ffmpeg`
    command-line tool into a second cache file, and read back.

    Args:
        wav: Audio samples at `source_sample_rate`; converted with `np.array`
            before being written to disk.
        source_sample_rate: Sample rate of `wav` in Hz.
        target_sample_rate: Sample rate passed to `sst_processor` in Hz.

    Returns:
        The list of strings returned by `sst_processor.batch_decode`.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
        FileNotFoundError: If the `ffmpeg` executable cannot be found.
    """
    # Local import so the function brings its own dependency with it.
    import subprocess

    if source_sample_rate != target_sample_rate:
        filename = "cache.wav"
        scipy.io.wavfile.write(filename, source_sample_rate, np.array(wav))
        target_filename = filename.replace(".wav", f"_{target_sample_rate/1000}.wav")

        # Argument list (no shell) with check=True: a failed conversion now
        # raises instead of silently leaving a stale or missing cache file to
        # be read below.
        subprocess.run(
            [
                "ffmpeg", "-y", "-hide_banner", "-loglevel", "error",
                "-i", filename,
                "-ar", str(target_sample_rate),
                target_filename,
            ],
            check=True,
        )
        _, wav = scipy.io.wavfile.read(target_filename)

    input_features = sst_processor(
        wav, sampling_rate=target_sample_rate, return_tensors="pt"
    ).input_features
    # Follow the model's current device rather than assuming "cuda".
    input_features = input_features.to(sst_model.device)
    predicted_ids = sst_model.generate(input_features)
    transcription = sst_processor.batch_decode(predicted_ids, skip_special_tokens=True)
    return transcription

# Init VoiceBox model

In [None]:
# Default voicebox
#voice = VoiceBox(
#            logger=logger, 
#            config_filename="voicebox_config.json",
#            speaker_wav="data/gits_3.wav"
#)

In [None]:
import os
print(os.environ.get('CUDA_PATH'))

In [None]:
#voice.config["synth_params"]["temperature"] = 0.01

# Run model

In [None]:

#wav, rate, wavs = voice.read_text(texts[1] + " " + texts[2]) 

In [None]:
#wav = wav.astype(int).tolist()

In [None]:
#IPython.display.display(IPython.display.Audio(wav, rate=rate, autoplay=False))

# Microservice test

In [None]:
# Health check
# The timeout (seconds) keeps the cell from hanging the kernel indefinitely
# when the service is unreachable.
r = requests.get("http://192.168.1.120:8100/test", timeout=5)
r.status_code

## 3090 TI

In [47]:
# Define input
#text = "pew pew"
text = texts[1]
# Form payload to api
payload = {'text': text, 'time': 'time', 'priority' : "100.0"}

start_time = time()
# (connect, read) timeouts in seconds: fail fast when the host is unreachable,
# but leave generous room for the synthesis itself.
r = requests.post(
    "http://192.168.1.120:8100/tts",
    data=json.dumps(payload),
    timeout=(5, 300),
)
# Surface HTTP errors here instead of as a confusing JSON/KeyError on the next line.
r.raise_for_status()
wav = r.json()["wav"]
end_time = time()
# Wall-clock time of the round trip, including decoding the JSON response.
run_time = end_time - start_time
words_per_sec = len(text.split(" ")) / run_time
print (f"{run_time = :.2f}s")
print (f"{words_per_sec = :.2f}")
# Seconds of audio produced per second of wall-clock time (24000 samples/s assumed).
print (f"Run_time ratio = {(len(wav) / 24000) / run_time :.2f}")

run_time = 1.32s
words_per_sec = 5.31
Run_time ratio = 1.63


In [48]:
r

<Response [200]>

In [49]:
IPython.display.display(IPython.display.Audio(wav, rate=24000, autoplay=False))

In [50]:
# Waveform of the returned audio; the x axis is the sample index.
fig, ax = plt.subplots()
ax.plot(wav)
# Labels make the figure readable on its own; the trailing ";" suppresses the repr output.
ax.set(title="Synthesised waveform", xlabel="Sample", ylabel="Amplitude");

[<matplotlib.lines.Line2D at 0x7f7f8706ebf0>]

## Live update config

In [12]:
from time import time
import json
import requests

# Load the VoiceBox configuration. The context manager closes the file handle
# once parsing is done (the previous one-liner never closed it).
with open("voicebox_config.json", "r") as config_file:
    config = json.load(config_file)


In [13]:
# Synthesis parameter overrides
config["synth_params"].update({
    "repetition_penalty": 2.2,
    "length_penalty": -1.5,
    "temperature": 0.00000005,
    "gpt_cond_len": 2,
})
#config["synth_params"]["gpt_cond_len"] = 3

# Vocoder overrides (earlier alternatives kept commented out)
#config["vocoder"]["speed_up"] = 1.18
#config["vocoder"]["speaker_wav"] = "data/trey.wav"
#config["vocoder"]["speaker_wav"] = None
config["vocoder"].update({
    "speed_up": 1.27,
    "speaker_wav": "data/gits_3.wav",
})

In [14]:
payload = config
start_time = time()
r = requests.post("http://192.168.1.120:8100/set-config", data=json.dumps(payload))