In [None]:
# Clone the VITS sources into ./vits (relative to the current working directory).
!git clone https://github.com/CjangCjengh/vits.git

In [None]:
!pip install Cython==0.29.21 librosa==0.8.0 unidecode==1.3.4 jamo==0.4.1 pypinyin==0.44.0 jieba==0.42.1
!pip install cn2an==0.5.17 inflect==6.0.0 eng_to_ipa==0.0.2 ko_pron==1.3 indic_transliteration==2.3.37 num_thai==0.0.5 opencc==1.1.1

!pip uninstall cmake -y
!pip install pyopenjtalk==0.2.0
!pip install cmake

!pip install transformers==4.27.4
!pip install icetk cpm_kernels gradio
!pip install accelerate==0.17.1
!pip install protobuf==3.20.0


In [None]:
import torch
import transformers
# Show the CUDA version reported by torch and the installed transformers version.
print(f"{torch.version.cuda} {transformers.__version__}")

In [None]:
import os
# Statement order matters in this section: the working directory is switched
# before each group of imports, presumably so that the modules of the cloned
# repository resolve by bare name from the current directory.
os.chdir('/kaggle/working/vits')
# Build the extension in monotonic_align in place, then return to the repo root.
%cd monotonic_align
!python setup.py build_ext --inplace
os.chdir('/kaggle/working/vits')
from os.path import join, exists
import torch.nn.functional as F
import commons
import utils
from models import SynthesizerTrn
# The text helpers are imported from inside the text/ directory.
os.chdir('/kaggle/working/vits/text')
from japanese import japanese_to_romaji_with_accent, japanese_to_ipa, japanese_to_ipa2, japanese_to_ipa3
from symbols import symbols

from scipy.io.wavfile import write
import re
# Go back to the notebook's working directory for the rest of the run.
os.chdir('/kaggle/working')

# Symbol inventory used with japanese_cleaners.
# The list built here rebinds the module-level name `symbols`.
_pad = '_'
_punctuation = ',.!?-'
_letters = 'AEINOQUabdefghijkmnoprstuvwyzʃʧ↓↑ '

# Full symbol list: padding symbol first, then punctuation, then letters.
symbols = [_pad, *_punctuation, *_letters]

# Index of the space character within the symbol list.
SPACE_ID = symbols.index(" ")

# Lookup tables in both directions between symbols and their integer ids.
_symbol_to_id = dict(zip(symbols, range(len(symbols))))
_id_to_symbol = dict(enumerate(symbols))

def text_to_sequence(text):
    """Convert raw text into a list of symbol ids.

    The text is first passed through ``japanese_cleaners``; every character of
    the cleaned string that has an entry in ``_symbol_to_id`` is mapped to its
    id, and characters without an entry are dropped silently.

    Args:
        text: Input string to convert.

    Returns:
        list of int ids, in the order the characters appear in the cleaned text.
    """
    cleaned = japanese_cleaners(text)
    return [_symbol_to_id[ch] for ch in cleaned if ch in _symbol_to_id]

def japanese_cleaners(text):
    """Run ``japanese_to_romaji_with_accent`` on ``text`` and terminate the result.

    If the converted string ends with an ASCII letter, a '.' is appended
    directly after that letter; otherwise the string is returned as converted.

    Args:
        text: Input string.

    Returns:
        The converted string.
    """
    romaji = japanese_to_romaji_with_accent(text)
    return re.sub(r'([A-Za-z])$', r'\1.', romaji)
def japanese_cleaners2(text):
    """Variant of ``japanese_cleaners`` with two extra substitutions.

    After the base cleaning, every 'ts' becomes the single character 'ʦ' and
    then every run of three dots '...' becomes the ellipsis character '…'.

    Args:
        text: Input string.

    Returns:
        The cleaned string with both substitutions applied.
    """
    cleaned = japanese_cleaners(text)
    cleaned = cleaned.replace('ts', 'ʦ')
    return cleaned.replace('...', '…')

def get_text(text, hps):
    """Turn text into a ``torch.LongTensor`` of symbol ids.

    Args:
        text: Input string, converted with ``text_to_sequence``.
        hps: Hyper-parameter object; only ``hps.data.add_blank`` is read here.

    Returns:
        A 1-D ``torch.LongTensor`` of ids. When ``hps.data.add_blank`` is
        truthy the id list is first passed through
        ``commons.intersperse(ids, 0)``.
    """
    ids = text_to_sequence(text)
    if hps.data.add_blank:
        ids = commons.intersperse(ids, 0)
    return torch.LongTensor(ids)

In [None]:
# Translation pipeline
import torch
from transformers import (
  T5Tokenizer,
  MT5ForConditionalGeneration,
  Text2TextGenerationPipeline,
)

path = "engmatic-earth/mt5-zh-ja-en-trimmed-fine-tuned-v1"

# Pick the device for the translator instead of hard-coding GPU 1, which fails
# when fewer than two GPUs are visible:
#   * two or more GPUs -> GPU 1 (same as before)
#   * exactly one GPU  -> GPU 0
#   * no GPU           -> -1, which the pipeline treats as CPU
_gpu_count = torch.cuda.device_count()
if _gpu_count > 1:
    translation_device = 1
elif _gpu_count == 1:
    translation_device = 0
else:
    translation_device = -1

pipe = Text2TextGenerationPipeline(
    model=MT5ForConditionalGeneration.from_pretrained(path),
    tokenizer=T5Tokenizer.from_pretrained(path),
    device=translation_device,
)

In [None]:
from huggingface_hub import snapshot_download
# Fetch the THUDM/chatglm-6b repository with huggingface_hub.snapshot_download;
# filePath holds the value it returns and is printed for reference.
filePath = snapshot_download(repo_id="THUDM/chatglm-6b")
print(filePath)

In [None]:
# Load the ChatGLM model
from transformers import AutoTokenizer, AutoModel, AutoConfig
from accelerate import load_checkpoint_and_dispatch, init_empty_weights

# A tokenizer has no weights to place on a device, so no device_map is passed.
tokenizer = AutoTokenizer.from_pretrained(filePath, trust_remote_code=True)
config = AutoConfig.from_pretrained(filePath, trust_remote_code=True)
# The model is large: instantiate it with empty weights first, then let
# accelerate load the checkpoint and spread the modules over the available
# devices (GLMBlock modules are never split across devices).
with init_empty_weights():
    chat_model = AutoModel.from_config(config, trust_remote_code=True)
chat_model = load_checkpoint_and_dispatch(
    chat_model, filePath, device_map="auto", no_split_module_classes=["GLMBlock"]
).half()
chat_model = chat_model.eval()

In [None]:
# Total number of elements across all parameters of the chat model.
num_parameters = sum(p.numel() for p in chat_model.parameters())
num_parameters

In [None]:
# Install and initialise Git LFS, then clone the skytnt/moe-tts Space into
# ./moe-tts (relative to the current working directory).
!apt install git-lfs
!git lfs install
!git clone https://huggingface.co/spaces/skytnt/moe-tts

In [None]:
# Use the first CUDA device when one is available, otherwise the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

# Hyper-parameters are read from the config file of saved model 0.
hps = utils.get_hparams_from_file('/kaggle/working/moe-tts/saved_model/0/config.json')

# Build the synthesizer from the symbol count and values derived from hps,
# move it to the chosen device and switch it to eval mode.
net_g = SynthesizerTrn(
    len(symbols),
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    n_speakers=hps.data.n_speakers,
    **hps.model,
)
net_g = net_g.to(device)
net_g.eval()

# Load the checkpoint into net_g (no optimizer is passed); the return value is discarded.
_ = utils.load_checkpoint('/kaggle/working/moe-tts/saved_model/0/model.pth', net_g, None)

In [None]:
import IPython

# Interactive chat loop.
#
# Each line typed by the user is either one of three command strings (switch
# conversation mode on, switch it off, choose a speaker) or a message that is
# sent to chat_model. While conversation mode is on, the reply is additionally
# translated with `pipe`, synthesised with `net_g`, written to output.wav and
# played inline. The loop ends on KeyboardInterrupt.

speaker_id = 0
history = []
# Generation parameters forwarded to chat_model.chat()
max_length, top_p, temperature = 2048, 0.7, 0.95

# enable_conversation: whether conversation mode (translation + speech
# generation) is active. It starts off, so replies are text-only until the
# user turns it on.
enable_conversation = False

while True:
    try:
        sen = input('你：')

        # Command: turn conversation mode on.
        if not enable_conversation and sen == '对话':
            print('启动对话模式')
            enable_conversation = True
            continue

        # Command: turn conversation mode off.
        if enable_conversation and sen == '取消对话':
            print('取消对话模式')
            enable_conversation = False
            continue

        # Command: pick the speaker; keep asking until a valid ID is entered.
        if sen == '角色':
            while True:
                print('SpeakerID: ')
                print('0: Nene\n1: Meguru\n2: Yoshino\n3: Mako\n4: Murasame\n5: Koharu\n6: Nanami')
                try:
                    input_id = int(input('Your speaker ID: '))
                except ValueError:
                    # Non-numeric input used to escape as an uncaught ValueError
                    # and terminate the whole chat loop; treat it as an invalid
                    # ID and ask again instead.
                    input_id = None
                if input_id is not None and 0 <= input_id < 7:
                    speaker_id = input_id
                    print(f'Your speaker ID is {speaker_id}')
                    break
                else:
                    print("无效角色ID，SpeakerID必须在0-6之间")
            continue

        # Regular message: query the chat model, carrying the history forward.
        response, history = chat_model.chat(tokenizer, sen, max_length=max_length,
                            top_p=top_p, temperature=temperature, history=history)

        if enable_conversation:
            with torch.no_grad():
                # NOTE(review): the task prefix is concatenated with no
                # separator -- confirm against the translation model's card
                # that this is the expected prompt format.
                sentence = "zh2ja" + response
                res = pipe(sentence, max_length=200, num_beams=4)
                translated = res[0]['generated_text']
                translated = translated.replace(' ','、')
                # Synthesise speech from the translated text
                stn_tst = get_text(translated, hps)
                x_tst = stn_tst.to(device).unsqueeze(0)
                x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(device)
                # The number in this LongTensor selects the speaker voice
                sid = torch.LongTensor([speaker_id]).to(device)
                audio = net_g.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=.667, noise_scale_w=0.8, length_scale=1)[0][0,0].data.cpu().float().numpy()
            write("output.wav", hps.data.sampling_rate, audio)

        print('你老婆：', response)
        if enable_conversation:
            print('翻译：', translated)
            audio = IPython.display.Audio("output.wav", rate=hps.data.sampling_rate, autoplay=True)
            IPython.display.display(audio)
    except KeyboardInterrupt:
        break