# Running MMS-TTS inference in Colab
In this notebook, we give an example of how to run text-to-speech inference using MMS TTS models.

By default, we run inference on a GPU. If you want to perform CPU inference, go to the "Runtime" menu -> "Change runtime type" and set "Hardware accelerator" to "None" before running.

## 1. Preliminaries
This section installs the Python packages required by the other sections. Run it first.

In [None]:
%pwd
!git clone https://github.com/jaywalnut310/vits.git
!python --version
%cd vits/

!pip install Cython==0.29.21
!pip install librosa==0.8.0
!pip install phonemizer==2.2.1
!pip install scipy
!pip install numpy
!pip install torch
!pip install torchvision
!pip install matplotlib
!pip install Unidecode==1.1.1
!pip install scipy
!pip install fastapi
!pip install colabcode
!pip install pydub

%cd monotonic_align/
%mkdir monotonic_align
!python3 setup.py build_ext --inplace
%cd ../
%pwd

Cloning into 'vits'...
remote: Enumerating objects: 81, done.[K
remote: Total 81 (delta 0), reused 0 (delta 0), pack-reused 81[K
Receiving objects: 100% (81/81), 3.33 MiB | 6.79 MiB/s, done.
Resolving deltas: 100% (22/22), done.
Python 3.10.12
/content/vits
Collecting Cython==0.29.21
  Downloading Cython-0.29.21-py2.py3-none-any.whl (974 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m974.2/974.2 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Cython
  Attempting uninstall: Cython
    Found existing installation: Cython 0.29.36
    Uninstalling Cython-0.29.36:
      Successfully uninstalled Cython-0.29.36
Successfully installed Cython-0.29.21
Collecting librosa==0.8.0
  Downloading librosa-0.8.0.tar.gz (183 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.9/183.9 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting resampy>=0.2.2 (from librosa==0.

## 2. Choose a language and download its checkpoint
Find the ISO code for your target language [here](https://dl.fbaipublicfiles.com/mms/tts/all-tts-languages.html). You can find more details about the languages we currently support for TTS in this [table](https://dl.fbaipublicfiles.com/mms/misc/language_coverage_mms.html).

In [None]:
import os
import subprocess
import locale
# Make locale.getpreferredencoding() always report UTF-8 for the rest of the session.
# NOTE(review): presumably a workaround for Colab shell commands (`!...`) refusing to
# run under a non-UTF-8 locale — confirm it is still needed.
locale.getpreferredencoding = lambda: "UTF-8"

def download(lang, tgt_dir="./"):
  """Download and unpack the MMS-TTS checkpoint archive for one language.

  Args:
    lang: language code used to build the archive URL
      (``https://dl.fbaipublicfiles.com/mms/tts/{lang}.tar.gz``).
    tgt_dir: directory that receives both the archive and its extracted
      contents. Created if it does not exist.

  Returns:
    Path of the extracted checkpoint directory, ``<tgt_dir>/<lang>``.
    This assumes the archive unpacks into a top-level folder named after
    ``lang``, as the original code did.

  Raises:
    subprocess.CalledProcessError: if ``wget`` or ``tar`` exits non-zero.
  """
  lang_fn = os.path.join(tgt_dir, lang + '.tar.gz')
  lang_dir = os.path.join(tgt_dir, lang)
  # An empty tgt_dir means "current directory" for os.path.join; mirror that for tar/makedirs.
  extract_dir = tgt_dir or "."
  os.makedirs(extract_dir, exist_ok=True)

  print(f"Download model for language: {lang}")
  # Argument lists (no shell) so `lang`/`tgt_dir` cannot be interpreted by a shell,
  # and a failed download raises instead of being masked by the following tar call.
  subprocess.check_output(
      ["wget", f"https://dl.fbaipublicfiles.com/mms/tts/{lang}.tar.gz", "-O", lang_fn])
  # Extract next to the archive (-C); without it tar unpacks into the current
  # working directory and lang_dir would be wrong for any non-default tgt_dir.
  subprocess.check_output(["tar", "zxvf", lang_fn, "-C", extract_dir])
  print(f"Model checkpoints in {lang_dir}: {os.listdir(lang_dir)}")
  return lang_dir

# Language code of the model to use; it selects which checkpoint archive download() fetches.
LANG = "eng"
# Path of the extracted checkpoint directory, as returned by download().
ckpt_dir = download(LANG)

## 3. Load the checkpoint

In [None]:
from IPython.display import Audio
import os
import re
import glob
import json
import tempfile
import math
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader
import numpy as np
import commons
import utils
import argparse
import subprocess
from data_utils import TextAudioLoader, TextAudioCollate, TextAudioSpeakerLoader, TextAudioSpeakerCollate
from models import SynthesizerTrn
from scipy.io.wavfile import write

def preprocess_char(text, lang=None):
    """
    Special treatment of characters in certain languages.

    Args:
        text: input string.
        lang: language code; only 'ron' triggers a substitution.

    Returns:
        ``text`` with every "ț" replaced by "ţ" when ``lang == 'ron'``,
        otherwise ``text`` unchanged.
    """
    # The stray debug `print(lang)` was removed: it emitted an unlabeled line on every call.
    if lang == 'ron':
        text = text.replace("ț", "ţ")
    return text

class TextMapper(object):
    """Maps text to integer symbol IDs using a vocabulary file.

    The vocabulary file holds one symbol per line; a symbol's ID is its
    zero-based line index.
    """

    def __init__(self, vocab_file):
        """Load the symbol table from ``vocab_file`` (read as UTF-8).

        Raises:
            ValueError: if the vocabulary contains no single-space symbol.
        """
        # Context manager so the file handle is closed instead of leaked.
        with open(vocab_file, encoding="utf-8") as vocab:
            self.symbols = [line.replace("\n", "") for line in vocab]
        self.SPACE_ID = self.symbols.index(" ")
        self._symbol_to_id = {s: i for i, s in enumerate(self.symbols)}
        self._id_to_symbol = {i: s for i, s in enumerate(self.symbols)}

    def text_to_sequence(self, text, cleaner_names):
        '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
        Args:
        text: string to convert to a sequence (leading/trailing whitespace is stripped)
        cleaner_names: accepted for interface compatibility; not used by this implementation
        Returns:
        List of integers corresponding to the symbols in the text
        Raises:
        KeyError: if the text contains a symbol missing from the vocabulary
        '''
        return [self._symbol_to_id[symbol] for symbol in text.strip()]

    def uromanize(self, text, uroman_pl):
        """Pipe ``text`` through the uroman Perl script and return its first output line.

        Args:
            text: input string, sent to the script on stdin as UTF-8.
            uroman_pl: path to ``uroman.pl``.

        Returns:
            The first line of the script's stdout with every whitespace run
            collapsed to a single space and the ends stripped; an empty
            string if the script produced no output.

        Raises:
            subprocess.CalledProcessError: if the script exits non-zero.
        """
        iso = "xxx"
        # Argument list + stdin/stdout pipes instead of a shell string with temp
        # files: paths containing spaces work, the encoding is explicit, and a
        # failing script raises instead of being silently ignored.
        result = subprocess.run(
            ["perl", uroman_pl, "-l", iso],
            input=text,
            stdout=subprocess.PIPE,
            encoding="utf-8",
            check=True,
        )
        # Only the first output line is used, as before.
        first_line = result.stdout.split("\n", 1)[0]
        return re.sub(r"\s+", " ", first_line).strip()

    def get_text(self, text, hps):
        """Convert ``text`` to a ``torch.LongTensor`` of symbol IDs.

        When ``hps.data.add_blank`` is truthy the ID sequence is passed through
        ``commons.intersperse(..., 0)`` before being turned into a tensor.
        """
        text_norm = self.text_to_sequence(text, hps.data.text_cleaners)
        if hps.data.add_blank:
            text_norm = commons.intersperse(text_norm, 0)
        text_norm = torch.LongTensor(text_norm)
        return text_norm

    def filter_oov(self, text):
        """Drop every character of ``text`` that is not in the vocabulary.

        Prints the filtered text and returns it.
        """
        val_chars = self._symbol_to_id
        txt_filt = "".join(ch for ch in text if ch in val_chars)
        print(f"text after filtering OOV: {txt_filt}")
        return txt_filt

def preprocess_text(txt, text_mapper, hps, uroman_dir=None, lang=None):
    """Normalize raw input text so it can be mapped to symbol IDs.

    Steps, in order: language-specific character substitution
    (``preprocess_char``), optional uroman processing, lower-casing, and
    removal of characters that are not in ``text_mapper``'s vocabulary.

    Args:
        txt: raw input text.
        text_mapper: object providing ``uromanize`` and ``filter_oov``.
        hps: hyper-parameters; uroman is applied when
            ``hps.data.training_files`` ends with the extension ``uroman``.
        uroman_dir: existing uroman checkout. When ``None`` and uroman is
            needed, the repository is cloned into a temporary directory.
        lang: language code forwarded to ``preprocess_char``.

    Returns:
        The processed text.

    Raises:
        subprocess.CalledProcessError: if cloning uroman fails.
    """
    txt = preprocess_char(txt, lang=lang)
    is_uroman = hps.data.training_files.split('.')[-1] == 'uroman'
    if is_uroman:
        with tempfile.TemporaryDirectory() as tmp_dir:
            if uroman_dir is None:
                # Clone over HTTPS: the SSH form (git@github.com:...) needs SSH keys
                # registered with GitHub, which a fresh Colab runtime does not have.
                clone_cmd = ["git", "clone", "https://github.com/isi-nlp/uroman.git", tmp_dir]
                print(" ".join(clone_cmd))
                subprocess.check_output(clone_cmd)
                uroman_dir = tmp_dir
            uroman_pl = os.path.join(uroman_dir, "bin", "uroman.pl")
            print(f"uromanize")
            # Must run inside the `with` block: a temporary checkout is deleted on exit.
            txt = text_mapper.uromanize(txt, uroman_pl)
            print(f"uroman text: {txt}")
    txt = txt.lower()
    txt = text_mapper.filter_oov(txt)
    return txt

# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Run inference with {device}")

# Files expected inside the extracted checkpoint directory.
vocab_file = f"{ckpt_dir}/vocab.txt"
config_file = f"{ckpt_dir}/config.json"
assert os.path.isfile(config_file), f"{config_file} doesn't exist"

hps = utils.get_hparams_from_file(config_file)
print(hps)

text_mapper = TextMapper(vocab_file)

# Positional constructor arguments, derived from the vocabulary size and the config.
n_symbols = len(text_mapper.symbols)
filter_half_plus_one = hps.data.filter_length // 2 + 1
segment_per_hop = hps.train.segment_size // hps.data.hop_length
net_g = SynthesizerTrn(
    n_symbols,
    filter_half_plus_one,
    segment_per_hop,
    **hps.model)
net_g.to(device)
_ = net_g.eval()  # switch to eval mode; the return value is discarded

# Generator weights stored in the checkpoint directory.
g_pth = f"{ckpt_dir}/G_100000.pth"
print(f"load {g_pth}")

_ = utils.load_checkpoint(g_pth, net_g, None)

## 4. Generate audio from text
Specify the sentence you want to synthesize, then generate the audio.

In [None]:
def getAudio(txt=None):
  """Synthesize speech for ``txt`` with the loaded model.

  Relies on the notebook-level ``text_mapper``, ``hps``, ``net_g``,
  ``device`` and ``LANG`` created by the earlier cells.

  Args:
    txt: text to synthesize. When ``None`` (the default) a built-in sample
      commentary text is used, so existing ``getAudio()`` callers behave as before.

  Returns:
    dict with
      "hyp": the generated audio as a NumPy float array, and
      "hps": the hyper-parameter object (callers read ``hps.data.sampling_rate``).
  """
  if txt is None:
    txt = """Wow, folks, you won't believe the action we're witnessing in this Dota 2 match! The teams are locked in an epic showdown, and the intensity is off the charts. The strategies, the plays, it's all happening right here!

We've got some incredible heroes in this game, and they're showcasing their skills like never before. The clashes are explosive, and the team fights are absolutely jaw-dropping!
"""
  print(f"text: {txt}")
  txt = preprocess_text(txt, text_mapper, hps, lang=LANG)
  stn_tst = text_mapper.get_text(txt, hps)
  # Inference only: no gradients needed.
  with torch.no_grad():
      x_tst = stn_tst.unsqueeze(0).to(device)
      x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(device)
      hyp = net_g.infer(
          x_tst, x_tst_lengths, noise_scale=.667,
          noise_scale_w=0.8, length_scale=1.0
      )[0][0,0].cpu().float().numpy()

  print(f"Generated audio")
  return {"hyp": hyp,"hps":hps}

In [None]:
# NOTE(review): fastapi and colabcode are already installed by the setup cell in
# section 1, so this cell looks redundant.
!pip install fastapi colabcode

In [None]:
from typing import Union

from fastapi import FastAPI
from colabcode import ColabCode
from fastapi.staticfiles import StaticFiles
from fastapi.responses import FileResponse
# Application object on which the route decorators below register their handlers.
app = FastAPI()
from scipy.io import wavfile
from pydub import AudioSegment


# NOTE(review): not referenced anywhere in the visible code — the "/" handler
# writes the WAV at hps.data.sampling_rate instead.
sample_rate = 44100  # Adjust this to your desired sample rate


@app.get("/")
async def read_root():
    """Synthesize the default sample text and return it as a WAV download.

    The waveform returned by ``getAudio()`` is written to ``output.wav`` in the
    current working directory at ``hps.data.sampling_rate`` and that file is
    sent back with media type ``audio/wav``.
    """
    # NOTE: getAudio() is a blocking call, so while it runs this coroutine does
    # not yield to the event loop.
    val = getAudio()
    hyp = val["hyp"]
    hps = val["hps"]
    wavfile.write("output.wav", hps.data.sampling_rate, hyp)
    # The previous debug print and the unused AudioSegment load were removed;
    # the file is returned exactly as written above.
    return FileResponse("output.wav", media_type="audio/wav")



@app.get("/items/{item_id}")
def read_item(item_id: int, q: Union[str, None] = None):
    """Echo the path parameter ``item_id`` and the optional ``q`` argument back as a dict."""
    return {"item_id": item_id, "q": q}
# Create a ColabCode instance on port 5000 with code=False and hand it the FastAPI app to run.
# NOTE(review): run_app presumably blocks this cell while the server is up — confirm.
cc = ColabCode(port =5000,code =False)
cc.run_app(app=app)

In [None]:
# Debug helper: show the kernel's current working directory.
import os
print(os.getcwd())


In [None]:
# NOTE(review): scipy, fastapi and colabcode are already installed by the setup cell
# in section 1, so this trailing cell looks redundant.
!pip install scipy
!pip install fastapi
!pip install colabcode