### VoxLingua

## Dependencies

In [None]:
!pip install accelerate
!pip install sentencepiece
!pip install transformers
!pip install spacy-transformers

## Extracting Audio
This cell is used to extract audio from a video.

In [None]:
#Converting .mp4 to wav file
from moviepy.editor import VideoFileClip, AudioFileClip, CompositeAudioClip

def convert_video_to_audio(video_path, audio_path):
    """Extract the audio track of a video file and save it as 16-bit PCM WAV.

    Args:
        video_path: Path of the video file to read.
        audio_path: Path of the WAV file to write.

    Returns:
        None. Success or failure is reported on stdout; errors are printed
        rather than raised (best-effort behaviour kept from the original).
    """
    video = None
    try:
        video = VideoFileClip(video_path)
        audio = video.audio
        if audio is None:
            # A clip without an audio track exposes `audio` as None; report it
            # clearly instead of failing with an opaque AttributeError.
            print("Error: the video has no audio track to extract.")
            return
        audio.write_audiofile(audio_path, codec='pcm_s16le')  # Saving as WAV format
        print("Audio file successfully created!")
    except Exception as e:
        print(f"Error: {e}")
    finally:
        # Always release the reader held by the clip, even on failure.
        if video is not None:
            video.close()


import os

video_path = "/content/video.mp4"  # TODO:Upgrade to support other codecs
# Derive the names with os.path instead of fixed-width slicing so that any
# directory and any extension length (e.g. ".webm") is handled correctly.
video_stem = os.path.splitext(video_path)[0]
file_name = os.path.basename(video_stem)
audio_path = video_stem + ".wav"
convert_video_to_audio(video_path, audio_path)

## Whisper Model
This model recognizes the language spoken in the video and transcribes it.
The transcript is later split into one-sentence chunks that are stored in a list.

In [None]:
#Speech To Text
from transformers import pipeline
import torch

# Pick the device/dtype first so they can actually be handed to the pipeline;
# previously they were computed after the pipeline was built and never used.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

pipe = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-large-v3",
    torch_dtype=torch_dtype,
    device=device,
    # Process the audio in 30-second windows so recordings longer than a
    # single Whisper window are transcribed in full.
    chunk_length_s=30,
)

result = pipe(audio_path)
Transcript = result['text']
print(result)
print(result.keys())

## Translation Model
This model translates the transcript on a per-sentence basis and outputs it back in the same format.

In [None]:
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer

# Run the translation model on the same device as the rest of the pipeline.
model_t = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M").to(device)
tokenizer_t = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M")

# Loop-invariant settings: source language and the target-language BOS token.
tokenizer_t.src_lang = "en"
target_lang_id = tokenizer_t.get_lang_id("hi")

# Split on "." and drop blank fragments (e.g. the empty string after the final
# period) so that no empty text is sent to the translator.
splited_transcript = [
    fragment.strip() for fragment in Transcript.split(".") if fragment.strip()
]

# Each entry is the list returned by batch_decode (one string per input), so
# downstream code reads the translated sentence as entry[0].
Translated_Transcript = []

for sentence_text in splited_transcript:
    # Inputs must be on the same device as the model.
    encoded_sentence = tokenizer_t(sentence_text, return_tensors="pt").to(device)
    generated_tokens = model_t.generate(**encoded_sentence, forced_bos_token_id=target_lang_id)
    decoded = tokenizer_t.batch_decode(generated_tokens, skip_special_tokens=True)
    Translated_Transcript.append(decoded)

Miscellaneous cell: forces the preferred locale encoding to UTF-8, which is needed before handling the TTS model.

In [5]:
import locale


def _utf8_preferred_encoding(do_setlocale=True):
    """Report UTF-8 as the preferred encoding regardless of the real locale.

    Accepts the same optional ``do_setlocale`` argument as the standard
    ``locale.getpreferredencoding`` so callers that pass it (for example
    ``locale.getpreferredencoding(False)``) keep working after the patch.
    """
    return "UTF-8"


# Override the stdlib lookup for the rest of the session.
locale.getpreferredencoding = _utf8_preferred_encoding

## Restarting Session
These cells store the notebook's variables in files so that, after the session is restarted, the variables can be reloaded.
This is a workaround for limited resources.

In [None]:
!pip install joblib

Saving Cells

In [None]:
import joblib
from google.colab import files

# Checkpoint every variable the later cells need, one pickle file per variable,
# so the session can be restarted without redoing transcription/translation.
_checkpoint_items = (
    (result, 'result.pkl'),
    (Transcript, 'Transcript.pkl'),
    (Translated_Transcript, 'Translated_Transcript.pkl'),
    (splited_transcript, 'splited_transcript.pkl'),
    (video_path, 'video_path.pkl'),
)
for _value, _target in _checkpoint_items:
    joblib.dump(_value, _target)

Reloading Cells

In [2]:
import joblib

# Restore the checkpointed variables in the same order they were saved.
# NOTE: joblib.load unpickles data; only load files written by this notebook.
(
    result,
    Transcript,
    Translated_Transcript,
    splited_transcript,
    video_path,
) = [
    joblib.load(_source)
    for _source in (
        'result.pkl',
        'Transcript.pkl',
        'Translated_Transcript.pkl',
        'splited_transcript.pkl',
        'video_path.pkl',
    )
]

In [3]:
import torch

# Use the first GPU with half precision when CUDA is available; otherwise fall
# back to the CPU with full precision.
if torch.cuda.is_available():
    device, torch_dtype = "cuda:0", torch.float16
else:
    device, torch_dtype = "cpu", torch.float32

## TTS Model
This model converts the translated transcript into speech in the target language, using voice cloning to imitate the original speaker.


In [None]:
!pip install TTS

In [None]:
!pip install pydub

In [None]:
from TTS.api import TTS

# Load the multilingual XTTS v2 model, then move it to the selected device.
tts = TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2")
tts.to(device)

In addition to voice cloning, this cell merges the audio of all the separate sentences into a single WAV file.

In [None]:
import os
from pydub import AudioSegment

output_directory = "/content/sentence_audio/"
final_output_path = "/content/output.wav"
# Reference voice for cloning: the WAV extracted from the source video. It is
# derived from video_path (restored after a session restart) rather than
# hard-coded, so a differently named input video keeps working.
speaker_wav_path = os.path.splitext(video_path)[0] + ".wav"

os.makedirs(output_directory, exist_ok=True)

combined_audio = AudioSegment.empty()

for idx, sentence in enumerate(Translated_Transcript):
    # Each entry is a list holding one translated string; skip blank entries
    # so no empty text is sent to the synthesizer.
    text_translated = sentence[0].strip() if sentence else ""
    if not text_translated:
        continue

    temp_file_path = os.path.join(output_directory, f"sentence_{idx}.wav")
    try:
        tts.tts_to_file(
            text=text_translated,
            file_path=temp_file_path,
            speaker_wav=speaker_wav_path,
            language="hi"
        )
        # Append this sentence's audio to the running output track.
        combined_audio += AudioSegment.from_wav(temp_file_path)
    finally:
        # Remove the per-sentence file whether or not synthesis succeeded.
        if os.path.exists(temp_file_path):
            os.remove(temp_file_path)

combined_audio.export(final_output_path, format="wav")

## Merging Video and Audio
The original video and the newly generated audio file are merged into a single video file.

In [None]:
from moviepy.editor import VideoFileClip, AudioFileClip

original_video = VideoFileClip(video_path)
# Read the merged audio from the exact path the synthesis cell exported to,
# instead of a relative name that depends on the current working directory.
new_audio = AudioFileClip(final_output_path)

try:
    # Replace the video's audio track with the synthesized one and render it.
    final_clip = original_video.set_audio(new_audio)
    final_clip.write_videofile("final_video.mp4")
finally:
    # Release the readers held by both clips.
    new_audio.close()
    original_video.close()