## Installs

In [None]:
!pip install pydub
!pip install faster-whisper
!pip install translate
!pip install gTTS
!pip install TTS
!pip install -U TTS
!pip install transformers==4.33
!pip install --upgrade deepspeed
!pip install moviepy
!pip install -U deep-translator
!apt install ffmpeg
!pip install spleeter
!pip install pypinyin

In [None]:
from moviepy.editor import VideoFileClip
from faster_whisper import WhisperModel
from deep_translator import GoogleTranslator
from IPython.display import Audio

import os
import torch
import torchaudio
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts
from pydub import AudioSegment
from pydub.silence import split_on_silence
import pandas as pd
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Model initialization

In [None]:
# Language the video is dubbed into. The value is looked up in `language_mapping`
# further down, so it must be one of that dictionary's keys.
target_language = "English" #@param ["English", "Spanish", "French", "German", "Italian", "Portuguese", "Polish", "Turkish", "Russian", "Dutch", "Czech", "Arabic", "Chinese (Simplified)",'Japanese','Hungarian','Korean']

In [None]:
# Speech-to-text model (faster-whisper) used below to transcribe the source audio.
# It is loaded on the GPU ("cuda") with float16 computation.
model_size = "large-v2"
model_whisper = WhisperModel(model_size, device="cuda", compute_type="float16")

Downloading (…)37e8b/tokenizer.json:   0%|          | 0.00/2.20M [00:00<?, ?B/s]

Downloading (…)08837e8b/config.json:   0%|          | 0.00/2.80k [00:00<?, ?B/s]

Downloading (…)37e8b/vocabulary.txt:   0%|          | 0.00/460k [00:00<?, ?B/s]

Downloading model.bin:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

In [None]:
print("Loading model...")
# Build the XTTS model from the config file stored on the mounted Google Drive.
config = XttsConfig()
config.load_json("/content/drive/MyDrive/data_rut/config.json")
model = Xtts.init_from_config(config)
# Load the checkpoint from the same Drive folder, with DeepSpeed enabled.
model.load_checkpoint(config, checkpoint_dir="/content/drive/MyDrive/data_rut", use_deepspeed=True)

# Move the model to the GPU. As the last expression of the cell this also
# displays whatever `cuda()` returns.
model.cuda()

## Audio preparation

In [None]:
# taking audio from video

def extract_audio(video_file, output_audio_file):
    """Write the audio track of a video to its own file.

    Args:
        video_file: Path of the video to read.
        output_audio_file: Path of the audio file to create; passed unchanged
            to MoviePy's ``write_audiofile``.

    Raises:
        ValueError: If the video has no audio track.
    """
    video_clip = VideoFileClip(video_file)
    try:
        audio_clip = video_clip.audio
        if audio_clip is None:
            raise ValueError(f"No audio track found in {video_file!r}")
        audio_clip.write_audiofile(output_audio_file)
    finally:
        # Close the clip even when writing fails so its readers are released.
        # NOTE(review): VideoFileClip.close() is expected to close the attached
        # audio reader as well - confirm for the installed MoviePy version.
        video_clip.close()

# Source video on the mounted Drive and the file its audio track is extracted to.
audio_file = 'audio.mp3'
video_file = '/content/drive/MyDrive/vids/13.mp4'
extract_audio(video_file, audio_file)

MoviePy - Writing audio in audio.mp3


                                                                   

MoviePy - Done.




In [None]:
# Transcribe the extracted audio. `segments` is iterated exactly once below.
segments, info = model_whisper.transcribe("audio.mp3", beam_size=5)
print("Detected language '%s' with probability %f" % (info.language, info.language_probability))

# Segment boundaries are multiplied by 1000 so that they can be used directly
# as millisecond positions by the pydub slicing/overlay code further down.
start, end, text = [], [], []
for segment in segments:
    start.append(segment.start * 1000)
    end.append(segment.end * 1000)
    text.append(segment.text)

# The *_train columns start out as plain copies of the phrase boundaries.
start_train, end_train = list(start), list(end)

d = {'start': start, 'end': end, 'start_train': start_train, 'end_train': end_train, 'text': text}
df = pd.DataFrame(data=d)

# Drop rows whose text is written entirely in capital letters.
# `.copy()` makes the filtered frame independent of the unfiltered one, so
# adding columns to it later does not trigger pandas' SettingWithCopyWarning.
# The original row labels are kept on purpose: later cells use them in file names.
df = df[~df['text'].str.isupper()].copy()

Detected language 'ru' with probability 0.999023


In [None]:
# Mapping between full language names (the choices offered for
# `target_language`) and ISO 639-1 codes.
language_mapping = {
    'English': 'en',
    'Spanish': 'es',
    'French': 'fr',
    'German': 'de',
    'Italian': 'it',
    'Portuguese': 'pt',
    'Polish': 'pl',
    'Turkish': 'tr',
    'Russian': 'ru',
    'Dutch': 'nl',
    'Czech': 'cs',
    'Arabic': 'ar',
    'Chinese (Simplified)': 'zh',
    'Japanese': 'ja',
    'Hungarian': 'hu',
    'Korean': 'ko'
}

# Code passed to the speech-synthesis cells below.
target_language_code = language_mapping[target_language]

# The translator is given 'zh-CN' for Simplified Chinese; every other code is
# passed through unchanged.
translator_target = 'zh-CN' if target_language_code == 'zh' else target_language_code
translator = GoogleTranslator(source='auto', target=translator_target)

# Translate every phrase. `assign` returns a new frame instead of writing into
# `df` in place, which avoids SettingWithCopyWarning when `df` is a filtered slice.
df = df.assign(translated=df['text'].apply(translator.translate))
df

Unnamed: 0,start,end,start_train,end_train,text,translated
0,0.0,8000.0,0.0,8000.0,"Ещё один день, у команд позади",Another day behind the teams
1,8000.0,10500.0,8000.0,10500.0,"Вы давай поскорей, результат не тяни","Come on quickly, don’t delay the result"
2,10500.0,13000.0,10500.0,13000.0,"Эксперты на связи, трекеры тоже","Experts are in touch, trackers too"
3,13000.0,15500.0,13000.0,15500.0,Наш хокотон на гонку похожий,Our hocoton is like a race
4,15500.0,17500.0,15500.0,17500.0,"Вы не спали уже сутки, мозг как мотор","You haven't slept for 24 hours, your brain is ..."
5,17500.0,20000.0,17500.0,20000.0,"Гудит поток мыслей, что выше был скор",The stream of thoughts is buzzing that the hig...
6,20000.0,22500.0,20000.0,22500.0,"Движение вперёд, главная задача",Moving forward is the main task
7,22500.0,25000.0,22500.0,25000.0,Здесь важны скиллы и чуть-чуть удачи,Skills and a little luck are important here
8,25000.0,27500.0,25000.0,27500.0,Выиграешь ли ты или победит сосед,Will you win or will your neighbor win?
9,27500.0,29500.0,27500.0,29500.0,"Это не важно, здесь проигравших нет","It doesn't matter, there are no losers here"


## Multiple speakers + music

In [None]:
# Split audio.mp3 into separate stems with spleeter. According to the log
# below, this writes output/audio/vocals.wav and output/audio/accompaniment.wav.
!spleeter separate -o output/ audio.mp3

INFO:spleeter:Downloading model archive https://github.com/deezer/spleeter/releases/download/v1.4.0/2stems.tar.gz
INFO:spleeter:Validating archive checksum
INFO:spleeter:Extracting downloaded 2stems archive
INFO:spleeter:2stems model file(s) extracted
INFO:spleeter:File output/audio/vocals.wav written succesfully
INFO:spleeter:File output/audio/accompaniment.wav written succesfully


In [None]:
def convert_wav_to_mp3(input_wav, output_mp3):
    """Re-encode the WAV file at ``input_wav`` as an MP3 written to ``output_mp3``."""
    AudioSegment.from_wav(input_wav).export(output_mp3, format="mp3")

# Usage: convert the separated vocal track to MP3 for the per-phrase slicing below.
input_wav = "/content/output/audio/vocals.wav"
output_mp3 = "vocals.mp3"

convert_wav_to_mp3(input_wav, output_mp3)

In [None]:
os.makedirs('audio_files', exist_ok=True)

# Smallest playback factor handed to pydub's `speedup`. The effect works by
# discarding a few milliseconds from every chunk; for factors closer to 1 there
# is nothing to discard and the result is not usable.
# NOTE(review): threshold derived from pydub's effects.speedup implementation -
# confirm against the installed version.
MIN_SPEEDUP = 1.01


def text_to_speech_and_save(text, filename, gpt_cond_latent, speaker_embedding, index):
    """Synthesize one translated phrase and save it as an MP3 that fills its time slot.

    The phrase is generated with the global XTTS ``model`` in
    ``target_language_code``, long pauses are removed, and the result is padded
    with trailing silence up to the length of the phrase's slot
    (``df.end[index] - df.start[index]`` milliseconds). Phrases that are longer
    than the slot are left as they are; the caller speeds them up.

    Args:
        text: Translated phrase to speak.
        filename: Path of the MP3 file to write.
        gpt_cond_latent: Conditioning latents returned by ``get_conditioning_latents``.
        speaker_embedding: Speaker embedding returned by ``get_conditioning_latents``.
        index: Row label of the phrase in the global ``df``.
    """
    out = model.inference(
        text.replace('.', '').capitalize(),
        target_language_code,
        gpt_cond_latent,
        speaker_embedding,
        temperature=0.8, # Add custom parameters here
    )

    # Write and read the intermediate WAV through one path, and use it directly
    # rather than passing it through an extra lossy MP3 encode/decode.
    wav_path = "out1.wav"
    torchaudio.save(wav_path, torch.tensor(out["wav"]).unsqueeze(0), 24000)
    speech = AudioSegment.from_wav(wav_path)

    slot_ms = df.end[index] - df.start[index]

    # Split on pauses of at least 500 ms below -50 dBFS and glue the pieces together.
    chunks = split_on_silence(speech, min_silence_len=500, silence_thresh=-50)
    if chunks:
        output = chunks[0]
        for chunk in chunks[1:]:
            output += chunk
    else:
        # Nothing audible was produced: fill the slot with silence.
        output = AudioSegment.silent(duration=slot_ms)

    # Pad only by the missing amount. Appending a full slot of silence would
    # make every short phrase longer than its slot and force a needless speed-up.
    missing_ms = slot_ms - len(output)
    if missing_ms > 0:
        output += AudioSegment.silent(duration=missing_ms)

    print(len(output))
    # `export` returns the open output file; close it so the data is flushed.
    output.export(filename, format="mp3").close()


def get_conditioning_latents(audio_path):
    """Return ``(gpt_cond_latent, speaker_embedding)`` computed by the XTTS model from ``audio_path``."""
    print("Computing speaker latents...")
    gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=audio_path)
    return gpt_cond_latent, speaker_embedding


# Total length of the dubbed track: the end of the last phrase (milliseconds).
total_duration = df['end'].max()

# Start from a silent track of that length.
combined_audio = AudioSegment.silent(duration=total_duration)

# Decode the separated vocal track once; every phrase below is sliced from it.
input_audio_path = '/content/vocals.mp3'
original_audio = AudioSegment.from_mp3(input_audio_path)

# Synthesize each phrase in the voice heard in its own slot and place it on the
# track at the phrase's start time.
for index, row in df.iterrows():
    text = row['translated']
    start_time = row['start']
    end_time = row['end']
    duration = end_time - start_time
    output_audio_path = f'audio_files/audio_{index}.mp3'

    # A phrase without a positive duration has no reference audio and no slot.
    if duration <= 0:
        print(f"Пропускаем файл {output_audio_path}, так как произошла ошибка деления на ноль")
        continue

    # Cut the original speech of this phrase; it serves as the voice reference.
    # The file is overwritten by the synthesized phrase right below.
    original_audio[start_time:end_time].export(output_audio_path, format="mp3").close()

    # Compute the speaker latents from the reference clip.
    gpt_cond_latent, speaker_embedding = get_conditioning_latents([output_audio_path])

    # Convert the translated text to speech and save it.
    text_to_speech_and_save(text, output_audio_path, gpt_cond_latent, speaker_embedding, index)

    print(f'Фраза {index}: "{text}" сохранена в {output_audio_path}')

    audio = AudioSegment.from_mp3(output_audio_path)

    # Speed the phrase up only when it is noticeably longer than its slot.
    speedup_factor = len(audio) / duration
    if speedup_factor >= MIN_SPEEDUP:
        audio = audio.speedup(playback_speed=speedup_factor)
        print(speedup_factor)

    # Place the phrase on the track at its start time.
    combined_audio = combined_audio.overlay(audio, position=start_time)
    print(output_audio_path)

# Save the new audio track.
combined_audio.export('combined_audio.mp3', format='mp3').close()


Computing speaker latents...
9941
Фраза 0: "Another day behind the teams" сохранена в audio_files/audio_0.mp3
1.242625
audio_files/audio_0.mp3
Computing speaker latents...
4827
Фраза 1: "Come on quickly, don’t delay the result" сохранена в audio_files/audio_1.mp3
1.9308
audio_files/audio_1.mp3
Computing speaker latents...
3296
Фраза 2: "Experts are in touch, trackers too" сохранена в audio_files/audio_2.mp3
1.3184
audio_files/audio_2.mp3
Computing speaker latents...
2923
Фраза 3: "Our hocoton is like a race" сохранена в audio_files/audio_3.mp3
1.1692
audio_files/audio_3.mp3
Computing speaker latents...
3893
Фраза 4: "You haven't slept for 24 hours, your brain is like a motor" сохранена в audio_files/audio_4.mp3
1.9465
audio_files/audio_4.mp3
Computing speaker latents...
3381
Фраза 5: "The stream of thoughts is buzzing that the higher was fast" сохранена в audio_files/audio_5.mp3
1.3524
audio_files/audio_5.mp3
Computing speaker latents...
3008
Фраза 6: "Moving forward is the main task" 

<_io.BufferedRandom name='combined_audio.mp3'>

## Merging audio and video

In [None]:
import locale

# Force the reported preferred encoding to UTF-8.
# NOTE(review): presumably the usual Colab workaround for `!` shell commands
# failing with "A UTF-8 locale is required" - confirm.
# The replacement accepts the optional `do_setlocale` argument of the real
# function, so callers that pass it (e.g. getpreferredencoding(False)) keep working.
locale.getpreferredencoding = lambda do_setlocale=True: "UTF-8"

In [None]:
# Mix the separated accompaniment with the dubbed speech track into output.mp3.
# duration=first: the mix is as long as the first input (the accompaniment).
# -y: overwrite output.mp3 without asking.
!ffmpeg -i "/content/output/audio/accompaniment.wav" -i "/content/combined_audio.mp3" -filter_complex amix=inputs=2:duration=first:dropout_transition=3 "output.mp3" -y

In [None]:
!ffmpeg -i "/content/drive/MyDrive/vids/13.mp4" -i "output.mp3" -filter_complex "[0:a]volume=0.0[v0];[1:a]volume=2.0[v1];[v0][v1]amix=inputs=2:duration=first" -c:v copy -c:a aac -strict experimental "/content/translated_output.mp4"

In [None]:
# Keep only the phrase timing and translated-text columns.
df = df[['start', 'end','translated']]

In [None]:
# Save the phrase timings and translations to data.csv.
df.to_csv('data.csv')

## One voice

In [None]:
# Split audio.mp3 into stems with spleeter; the log below reports
# output/audio/vocals.wav and output/audio/accompaniment.wav being written.
!spleeter separate -o output/ audio.mp3

INFO:spleeter:File output/audio/vocals.wav written succesfully
INFO:spleeter:File output/audio/accompaniment.wav written succesfully


In [None]:
# Compute one set of speaker latents from the whole separated vocal track;
# the synthesis loop below reuses them for every phrase.
print("Computing speaker latents...")
gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=['/content/output/audio/vocals.wav'])

Computing speaker latents...


In [None]:
from gtts import gTTS
import pandas as pd
import os

# Folder for the per-phrase audio files; creating it is a no-op when it already exists.
os.makedirs('audio_files', exist_ok=True)

def text_to_speech_and_save(text, filename, gpt_cond_latent, speaker_embedding):
  """Synthesize ``text`` with the global XTTS ``model`` and save it as an MP3.

  The phrase is spoken in ``target_language_code`` (the language selected at
  the top of the notebook) rather than a fixed language.

  Args:
    text: Phrase to speak; commas are replaced by full stops before synthesis.
    filename: Path of the MP3 file to write.
    gpt_cond_latent: Conditioning latents from ``model.get_conditioning_latents``.
    speaker_embedding: Speaker embedding from ``model.get_conditioning_latents``.
  """
  out = model.inference(
    text.replace(',', '.'),
    target_language_code,
    gpt_cond_latent,
    speaker_embedding,
    temperature=0.8,
  )
  torchaudio.save("/content/out.wav", torch.tensor(out["wav"]).unsqueeze(0), 24000)

  # `export` returns the open output file; close it so the data is flushed.
  AudioSegment.from_wav("/content/out.wav").export(filename, format="mp3").close()

# Synthesize every translated phrase with the shared speaker latents; the file
# name carries the phrase's row label so the assembly cell can find it.
for index, text in df['translated'].items():
    filename = f'audio_files/audio_{index}.mp3'
    text_to_speech_and_save(text, filename, gpt_cond_latent, speaker_embedding)
    print(f'Фраза {index}: "{text}" сохранена в {filename}')


In [None]:
from pydub import AudioSegment
import pandas as pd


# Total length of the dubbed track: the end of the last phrase (milliseconds).
total_duration = df['end'].max()

# Start from a silent track of that length.
combined_audio = AudioSegment.silent(duration=total_duration)

# Smallest playback factor handed to pydub's `speedup`; it is a speed-up effect
# only and is not usable for factors at or below (or barely above) 1.
# NOTE(review): threshold derived from pydub's effects.speedup implementation - confirm.
MIN_SPEEDUP = 1.01

# Place every synthesized phrase on the track at its start time.
for index, row in df.iterrows():
    start_time = row['start']
    duration = row['end'] - start_time

    audio_filename = f'audio_files/audio_{index}.mp3'
    audio = AudioSegment.from_mp3(audio_filename)

    # Speed the phrase up only when it is noticeably longer than its slot;
    # shorter phrases (and phrases without a positive duration) are used as is.
    if duration > 0:
        speedup_factor = len(audio) / duration
        if speedup_factor >= MIN_SPEEDUP:
            audio = audio.speedup(playback_speed=speedup_factor)

    combined_audio = combined_audio.overlay(audio, position=start_time)
    print(audio_filename)

# Save the new audio track; `export` returns the open file, so close it.
combined_audio.export('combined_audio.mp3', format='mp3').close()

In [None]:
import locale

# Force the reported preferred encoding to UTF-8.
# NOTE(review): presumably the usual Colab workaround for `!` shell commands
# failing with "A UTF-8 locale is required" - confirm.
# The replacement accepts the optional `do_setlocale` argument of the real
# function, so callers that pass it (e.g. getpreferredencoding(False)) keep working.
locale.getpreferredencoding = lambda do_setlocale=True: "UTF-8"

In [None]:
# Mix the separated accompaniment with the dubbed speech track into output.mp3.
# duration=first: the mix is as long as the first input (the accompaniment).
# -y: overwrite output.mp3 without asking.
!ffmpeg -i "/content/output/audio/accompaniment.wav" -i "/content/combined_audio.mp3" -filter_complex amix=inputs=2:duration=first:dropout_transition=3 "output.mp3" -y

In [None]:
!ffmpeg -i "/content/drive/MyDrive/vids/16.mp4" -i "output.mp3" -filter_complex "[0:a]volume=0.0[v0];[1:a]volume=2.0[v1];[v0][v1]amix=inputs=2:duration=first" -c:v copy -c:a aac -strict experimental "/content/output16.mp4"

In [None]:
# Keep only the phrase timing and translated-text columns.
df = df[['start', 'end','translated']]

In [None]:
# Save the phrase timings and translations to data.csv.
df.to_csv('data.csv')

## Google Speech

In [None]:
from gtts import gTTS
import pandas as pd
import os

# Folder for the per-phrase audio files; creating it is a no-op when it already exists.
os.makedirs('audio_files', exist_ok=True)

def text_to_speech_and_save(text, filename):
    """Speak ``text`` with Google Text-to-Speech and save the result to ``filename``.

    The phrase is spoken in the language selected at the top of the notebook
    (``target_language_code``) rather than a fixed language. Simplified Chinese
    is requested as 'zh-CN', the same code the translator is given.
    """
    lang = 'zh-CN' if target_language_code == 'zh' else target_language_code
    tts = gTTS(text, lang=lang, tld='com', slow=False)
    tts.save(filename)

# Synthesize every translated phrase; the file name carries the phrase's row
# label so the assembly cell can find it.
for index, text in df['translated'].items():
    filename = f'audio_files/audio_{index}.mp3'
    text_to_speech_and_save(text, filename)
    print(f'Фраза {index}: "{text}" сохранена в {filename}')


In [None]:
from pydub import AudioSegment
import pandas as pd


# Total length of the dubbed track: the end of the last phrase (milliseconds).
total_duration = df['end'].max()

# Start from a silent track of that length.
combined_audio = AudioSegment.silent(duration=total_duration)

# Smallest playback factor handed to pydub's `speedup`; factors barely above 1
# leave the effect nothing to discard and do not give a usable result.
# NOTE(review): threshold derived from pydub's effects.speedup implementation - confirm.
MIN_SPEEDUP = 1.01

# Place every synthesized phrase on the track at its start time.
for index, row in df.iterrows():
    start_time = row['start']
    duration = row['end'] - start_time

    audio_filename = f'audio_files/audio_{index}.mp3'
    audio = AudioSegment.from_mp3(audio_filename)

    # If the phrase is noticeably longer than its slot, speed it up to fit.
    # Phrases without a positive duration are overlaid unchanged.
    if duration > 0:
        speedup_factor = len(audio) / duration
        if speedup_factor >= MIN_SPEEDUP:
            audio = audio.speedup(playback_speed=speedup_factor)

    combined_audio = combined_audio.overlay(audio, position=start_time)

# Save the new audio track; `export` returns the open file, so close it.
combined_audio.export('combined_audio.mp3', format='mp3').close()


In [None]:
import locale

# Force the reported preferred encoding to UTF-8.
# NOTE(review): presumably the usual Colab workaround for `!` shell commands
# failing with "A UTF-8 locale is required" - confirm.
# The replacement accepts the optional `do_setlocale` argument of the real
# function, so callers that pass it (e.g. getpreferredencoding(False)) keep working.
locale.getpreferredencoding = lambda do_setlocale=True: "UTF-8"

In [None]:
!ffmpeg -i "/content/drive/MyDrive/Hak_Rutube/51.mp4" -i "combined_audio.mp3" -filter_complex "[0:a]volume=0.0[v0];[1:a]volume=1.0[v1];[v0][v1]amix=inputs=2:duration=first" -c:v copy -c:a aac -strict experimental "/content/final_video_combined2.mp4"

## Lip sync

In [None]:
import gc

import torch

# Release the models loaded earlier in the notebook before the lip-sync stage:
# `model` is the XTTS voice model and `model_whisper` is the Whisper model.
try:
    del model
except NameError:
    print("Voice model already deleted")

try:
    del model_whisper
except NameError:
    print("Whisper model already deleted")

# Collect the now-unreferenced objects first so their GPU memory can actually
# be returned by empty_cache().
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Clone the Wav2Lip-GFPGAN repository and make it the working directory;
# the relative paths in the following cells are resolved from there.
!git clone https://github.com/ajay-sainy/Wav2Lip-GFPGAN.git
basePath = "/content/Wav2Lip-GFPGAN"
%cd {basePath}

In [None]:
# Locations of the two sub-projects inside the cloned repository.
wav2lipFolderName = 'Wav2Lip-master'
gfpganFolderName = 'GFPGAN-master'
wav2lipPath = basePath + '/' + wav2lipFolderName
gfpganPath = basePath + '/' + gfpganFolderName

# Download the s3fd face-detection weights to the path inside Wav2Lip given by -O.
!wget 'https://www.adrianbulat.com/downloads/python-fan/s3fd-619a316812.pth' -O {wav2lipPath}'/face_detection/detection/sfd/s3fd.pth'
# Download a Google Drive file into Wav2Lip's checkpoints folder.
# NOTE(review): presumably the wav2lip.pth checkpoint used by the inference cell - confirm.
!gdown https://drive.google.com/uc?id=1fQtBSYEyuai9MjBOF8j7zZ4oQ9W2N64q --output {wav2lipPath}'/checkpoints/'

In [None]:
# NOTE(review): this cell repeats the preceding cell verbatim; running it only
# redefines the same paths and downloads the same two files again.
wav2lipFolderName = 'Wav2Lip-master'
gfpganFolderName = 'GFPGAN-master'
wav2lipPath = basePath + '/' + wav2lipFolderName
gfpganPath = basePath + '/' + gfpganFolderName

!wget 'https://www.adrianbulat.com/downloads/python-fan/s3fd-619a316812.pth' -O {wav2lipPath}'/face_detection/detection/sfd/s3fd.pth'
!gdown https://drive.google.com/uc?id=1fQtBSYEyuai9MjBOF8j7zZ4oQ9W2N64q --output {wav2lipPath}'/checkpoints/'

In [None]:
# Install the cloned repository's requirements (the working directory is
# `basePath` after the %cd above).
!pip install -r requirements.txt

In [None]:
import os
# Output folder for the lip-sync stage and the audio/video pair given to Wav2Lip.
# NOTE(review): the inputs are hard-coded Drive paths, not the files produced by
# the dubbing cells above - confirm they are the intended ones.
outputPath = basePath+'/outputs'
inputAudioPath = '/content/drive/MyDrive/data/audios/audio1.mp3'
inputVideoPath = '/content/drive/MyDrive/data/videos/test_video.mp4'
lipSyncedOutputPath = basePath + '/outputs/result.mp4'

if not os.path.exists(outputPath):
  os.makedirs(outputPath)

# Run Wav2Lip's inference script from inside its folder; the result is written
# to `lipSyncedOutputPath`.
!cd $wav2lipFolderName && python inference.py \
--checkpoint_path checkpoints/wav2lip.pth \
--face {inputVideoPath} \
--audio {inputAudioPath} \
--outfile {lipSyncedOutputPath}

In [None]:
# Install GFPGAN in development mode and download the GFPGANv1.3 weights into
# its experiments/pretrained_models folder.
!cd $gfpganFolderName && python setup.py develop
!wget https://github.com/TencentARC/GFPGAN/releases/download/v1.3.0/GFPGANv1.3.pth -P {gfpganFolderName}'/experiments/pretrained_models'

In [None]:
import cv2
from tqdm import tqdm
from os import path

import os

# Split the lip-synced video into individual JPEG frames for face restoration.
inputVideoPath = outputPath+'/result.mp4'
unProcessedFramesFolderPath = outputPath+'/frames'

os.makedirs(unProcessedFramesFolderPath, exist_ok=True)

vidcap = cv2.VideoCapture(inputVideoPath)
numberOfFrames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
fps = vidcap.get(cv2.CAP_PROP_FPS)
print("FPS: ", fps, "Frames: ", numberOfFrames)

# Zero-pad frame numbers to at least 4 digits, and wider for longer videos, so
# that sorting the file names alphabetically keeps the frames in order.
frameNameWidth = max(4, len(str(numberOfFrames)))

try:
    for frameNumber in tqdm(range(numberOfFrames)):
        success, image = vidcap.read()
        # The reported frame count can exceed the number of decodable frames;
        # stop at the first failed read instead of trying to write an empty image.
        if not success:
            print("Stopped at frame", frameNumber, "- no more frames could be read")
            break
        cv2.imwrite(path.join(unProcessedFramesFolderPath, str(frameNumber).zfill(frameNameWidth)+'.jpg'), image)
finally:
    # Release the video file even if writing a frame fails.
    vidcap.release()


In [None]:
# Run GFPGAN (version 1.3, upscale factor 2, center face only, no background
# upsampler) on the extracted frames; results are written under `outputPath`.
!cd $gfpganFolderName && \
  python inference_gfpgan.py -i $unProcessedFramesFolderPath -o $outputPath -v 1.3 -s 2 --only_center_face --bg_upsampler None

In [None]:
import os
restoredFramesPath = outputPath + '/restored_imgs/'
processedVideoOutputPath = outputPath

# Order frames by name length first, then alphabetically, so that zero-padded
# numeric names stay in frame order even when they differ in digit count.
dir_list = sorted(os.listdir(restoredFramesPath), key=lambda name: (len(name), name))

import cv2
import numpy as np
from tqdm import tqdm

# Write the batches at the frame rate measured from the lip-synced clip (`fps`,
# set in the frame-extraction cell) so the video stays in sync with the audio;
# fall back to 30 only when no usable rate was reported.
outputFps = fps if fps and fps > 0 else 30

# Frames are written in batches of `batchSize` so that only one batch of images
# is held in memory at a time; `batch` counts the files written and is used by
# the concatenation cell.
batch = 0
batchSize = 300
for batchStart in tqdm(range(0, len(dir_list), batchSize)):
  batchEnd = batchStart + batchSize
  print("processing ", batchStart, batchEnd)

  img_array = []
  for filename in tqdm(dir_list[batchStart:batchEnd]):
      img = cv2.imread(restoredFramesPath + filename)
      if img is None:
        # Not a readable image; skip it.
        continue
      img_array.append(img)

  # Nothing readable in this batch: do not create an empty video file.
  if not img_array:
    continue

  height, width = img_array[0].shape[:2]
  size = (width, height)

  out = cv2.VideoWriter(processedVideoOutputPath+'/batch_'+str(batch).zfill(4)+'.avi',cv2.VideoWriter_fourcc(*'DIVX'), outputFps, size)
  batch = batch + 1

  for img in img_array:
    out.write(img)
  out.release()


In [None]:
concatTextFilePath = outputPath + "/concat.txt"
concatTextFile=open(concatTextFilePath,"w")
for ips in range(batch):
  concatTextFile.write("file batch_" + str(ips).zfill(4) + ".avi\n")
concatTextFile.close()

concatedVideoOutputPath = outputPath + "/concated_output.avi"
!ffmpeg -y -f concat -i {concatTextFilePath} -c copy {concatedVideoOutputPath}

finalProcessedOuputVideo = processedVideoOutputPath+'/final_with_audio.avi'
!ffmpeg -y -i {concatedVideoOutputPath} -i {inputAudioPath} -map 0 -map 1:a -c:v copy -shortest {finalProcessedOuputVideo}

from google.colab import files
files.download(finalProcessedOuputVideo)