## Transformer model training for Malayalam to English TTS

In [4]:
import os

import pandas as pd
import torch
import torchaudio
from transformers import Wav2Vec2Processor, AutoTokenizer

### Code for initialising the data paths and lists

In [9]:
# Dataset root that the preprocessing cell walks with os.walk().
# The default is the original local checkout; export NISP_DATA_PATH to point
# the notebook at another copy without editing this cell.
data_path = os.environ.get(
    'NISP_DATA_PATH',
    '/home/dev/synth/projects/tranformers-text-text-translation/transformer/NISP_MALAYALAM_CLEANED',
)
# Sample rate (Hz) that audio is resampled to before it reaches the processor.
sampling_rate = 16000


In [10]:

# Paths
# `data_path` and `sampling_rate` come from the configuration cell above.
# This cell used to re-assign `data_path = 'path/to/data'`; that placeholder
# silently replaced the real dataset location, so the os.walk() below found
# no files. The duplicate assignments have been removed.

# Ensure no local directory conflicts with the model name
assert not os.path.isdir('facebook/wav2vec2-large-xlsr-53'), "Local directory conflict with model name"


def _load_with_fallback(loader, label, primary, fallback, **kwargs):
    """Load a pretrained component, retrying with a fallback checkpoint.

    Args:
        loader: Callable such as ``SomeClass.from_pretrained``.
        label: Human-readable component name used in the printed messages.
        primary: Checkpoint name tried first.
        fallback: Checkpoint name tried when the primary one fails.
        **kwargs: Extra keyword arguments forwarded to ``loader``.

    Returns:
        Whatever ``loader`` returns for the first checkpoint that loads.

    Raises:
        Exception: The fallback's error is re-raised if both attempts fail.
    """
    try:
        return loader(primary, **kwargs)
    except Exception as primary_error:
        print(f"Error loading {label}: {primary_error}")
    try:
        return loader(fallback, **kwargs)  # Try a different checkpoint
    except Exception as fallback_error:
        print(f"Error loading fallback {label}: {fallback_error}")
        raise


# Initialize processor and tokenizer
# The captured output of this cell shows both primary checkpoints failing
# (no tokenizer files for the xlsr-53 processor, sentencepiece missing for
# indic-bert), so in that run the fallbacks were the ones actually used.
processor = _load_with_fallback(
    Wav2Vec2Processor.from_pretrained,
    "Wav2Vec2Processor",
    "facebook/wav2vec2-large-xlsr-53",
    "facebook/wav2vec2-base-960h",
)
tokenizer = _load_with_fallback(
    AutoTokenizer.from_pretrained,
    "AutoTokenizer",
    "ai4bharat/indic-bert",
    "bert-base-multilingual-cased",
    use_fast=True,
)

# Initialize lists for storing data
speech_inputs = []
text_inputs = []

# Function to preprocess audio
def preprocess_speech(file_path):
    """Load an audio file and turn it into processor input values.

    The waveform is down-mixed to a single channel, resampled to the
    notebook-level ``sampling_rate`` when the file uses a different rate, and
    passed through the notebook-level ``processor``.

    Args:
        file_path: Path to an audio file readable by ``torchaudio.load``.

    Returns:
        The first entry of ``processor(...).input_values`` for this clip.
    """
    speech, sr = torchaudio.load(file_path)
    # torchaudio.load returns (channels, frames). Averaging over the channel
    # axis always yields a 1-D signal. The previous squeeze() left stereo
    # files 2-D, which the processor treats as a batch of clips, so the
    # trailing [0] silently kept only the first channel.
    if speech.dim() > 1:
        speech = speech.mean(dim=0)
    if sr != sampling_rate:
        resampler = torchaudio.transforms.Resample(sr, sampling_rate)
        speech = resampler(speech)
    return processor(speech.numpy(), sampling_rate=sampling_rate).input_values[0]

# Traverse directories and process files
for root, _dirs, files in os.walk(data_path):
    # Only folders that contain both the recording and its transcript are used.
    if 'audio.wav' not in files or 'audio.txt' not in files:
        continue

    audio_path = os.path.join(root, 'audio.wav')
    text_path = os.path.join(root, 'audio.txt')

    # Read and tokenize text
    with open(text_path, 'r', encoding='utf-8') as file:
        text = file.read().strip()
    tokenized_text = tokenizer(text, padding='max_length', truncation=True, return_tensors='pt')
    # Positions added by padding='max_length' are replaced with -100 so the
    # CTC loss in Wav2Vec2ForCTC ignores them. Left as pad-token ids they
    # count as real targets and make every label sequence max_length long.
    label_ids = tokenized_text['input_ids'][0].masked_fill(
        tokenized_text['attention_mask'][0] == 0, -100
    )

    # Append both halves together so the two lists cannot drift out of step.
    speech_inputs.append(preprocess_speech(audio_path))
    text_inputs.append(label_ids)

# Fail with a clear message instead of the opaque torch.stack([]) error.
if not speech_inputs:
    raise RuntimeError(f"No 'audio.wav'/'audio.txt' pairs found under {data_path!r}")

# Convert lists to tensors
# Clips have different lengths, so torch.tensor(list_of_arrays) cannot build a
# rectangular tensor. Right-pad every clip with zeros to the longest one.
# NOTE(review): no attention mask is stored for the padded audio — confirm
# that is acceptable for the checkpoint being fine-tuned.
speech_inputs = torch.nn.utils.rnn.pad_sequence(
    [torch.as_tensor(values, dtype=torch.float32) for values in speech_inputs],
    batch_first=True,
)
text_inputs = torch.stack(text_inputs)

# Save preprocessed data to file
torch.save((speech_inputs, text_inputs), 'preprocessed_data.pt')

Error loading Wav2Vec2Processor: Can't load tokenizer for 'facebook/wav2vec2-large-xlsr-53'. If you were trying to load it from 'https://huggingface.co/models', make sure you don't have a local directory with the same name. Otherwise, make sure 'facebook/wav2vec2-large-xlsr-53' is the correct path to a directory containing all relevant files for a Wav2Vec2CTCTokenizer tokenizer.
Error loading AutoTokenizer: Couldn't instantiate the backend tokenizer from one of: 
(1) a `tokenizers` library serialization file, 
(2) a slow tokenizer instance to convert or 
(3) an equivalent slow tokenizer class to instantiate and convert. 
You need to have sentencepiece installed to convert a slow tokenizer to a fast one.


### Training script


In [None]:
import torch
from transformers import Wav2Vec2ForCTC, Trainer, TrainingArguments

# Load preprocessed data
# Reads back the (speech_inputs, text_inputs) tuple that the preprocessing
# cell wrote with torch.save(..., 'preprocessed_data.pt').
# NOTE(review): torch.load unpickles the file, so only load a file you
# produced yourself.
speech_inputs, text_inputs = torch.load('preprocessed_data.pt')

# Prepare the training dataset
class SpeechDataset(torch.utils.data.Dataset):
    """Index-aligned pairs of speech inputs and label sequences.

    Each item is a dict with the keys ``'input_values'`` and ``'labels'``,
    taken from the same position of the two sequences given at construction.
    """

    def __init__(self, speech_inputs, text_inputs):
        """Store the two parallel sequences.

        Args:
            speech_inputs: Indexable collection of speech examples.
            text_inputs: Indexable collection of label sequences, one per
                speech example.

        Raises:
            ValueError: If the two collections have different lengths.
        """
        # Without this check a mismatch is silent: __len__ only reflects
        # speech_inputs, so surplus labels are ignored and missing ones only
        # surface later as an IndexError.
        if len(speech_inputs) != len(text_inputs):
            raise ValueError(
                f"speech_inputs and text_inputs must have the same length, "
                f"got {len(speech_inputs)} and {len(text_inputs)}"
            )
        self.speech_inputs = speech_inputs
        self.text_inputs = text_inputs

    def __len__(self):
        """Return the number of (speech, label) pairs."""
        return len(self.speech_inputs)

    def __getitem__(self, idx):
        """Return the pair at position ``idx`` as a dict."""
        return {
            'input_values': self.speech_inputs[idx],
            'labels': self.text_inputs[idx]
        }

# Wrap the loaded data so the Trainer can index it example by example.
train_dataset = SpeechDataset(speech_inputs=speech_inputs, text_inputs=text_inputs)

# Model definition: the output vocabulary is sized to the tokenizer created
# in the preprocessing cell, which therefore has to run before this one.
model = Wav2Vec2ForCTC.from_pretrained(
    "facebook/wav2vec2-large-xlsr-53",
    vocab_size=len(tokenizer),
)

# Training arguments
training_args = TrainingArguments(
    logging_dir='./logs',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    output_dir='./results',
)

# Trainer
trainer = Trainer(train_dataset=train_dataset, args=training_args, model=model)

# Train
trainer.train()

### Google TTS and STT integration

In [None]:
import sounddevice as sd
import wavio
import wave
import soundfile as sf
from google.cloud import texttospeech, speech_v1p1beta1 as speech
from googletrans import Translator

# Initialize Google TTS and STT
# These module-level clients are shared by text_to_speech(), speech_to_text()
# and translate_text() below.
# NOTE(review): the Google Cloud clients presumably need application
# credentials configured in the environment — confirm before running.
tts_client = texttospeech.TextToSpeechClient()
stt_client = speech.SpeechClient()
translator = Translator()

def text_to_speech(text, language_code='ml-IN'):
    """Synthesize ``text`` with the shared Google TTS client.

    Args:
        text: Plain text to speak.
        language_code: Voice language code; defaults to ``'ml-IN'``.

    Returns:
        The ``audio_content`` of the synthesis response, requested as MP3
        with a neutral-gender voice.
    """
    request = dict(
        input=texttospeech.SynthesisInput(text=text),
        voice=texttospeech.VoiceSelectionParams(
            language_code=language_code,
            ssml_gender=texttospeech.SsmlVoiceGender.NEUTRAL,
        ),
        audio_config=texttospeech.AudioConfig(
            audio_encoding=texttospeech.AudioEncoding.MP3,
        ),
    )
    return tts_client.synthesize_speech(**request).audio_content

def speech_to_text(audio_content, language_code='ml-IN'):
    """Transcribe audio with the shared Google STT client.

    Args:
        audio_content: Raw contents of the audio file to recognize.
        language_code: Language of the speech; defaults to ``'ml-IN'``.

    Returns:
        The top alternative of every result in the response, joined with
        single spaces. An empty string is returned when the service reports
        no results, for example when the recording is silent.
    """
    audio = speech.RecognitionAudio(content=audio_content)
    config = speech.RecognitionConfig(language_code=language_code)
    response = stt_client.recognize(config=config, audio=audio)
    # Indexing results[0] raised IndexError when nothing was recognized and
    # dropped everything after the first result. Collect every result that
    # has at least one alternative instead.
    transcripts = [
        result.alternatives[0].transcript.strip()
        for result in response.results
        if result.alternatives
    ]
    return ' '.join(transcripts)

def translate_text(text, src='ml', dest='en'):
    """Translate ``text`` with the shared googletrans ``translator``.

    Args:
        text: Text to translate.
        src: Source language code; defaults to ``'ml'``.
        dest: Target language code; defaults to ``'en'``.

    Returns:
        The translated text, or an empty string when ``text`` is empty or
        only whitespace (no request is sent in that case).
    """
    # Nothing to translate: skip the remote call instead of sending a blank
    # request, e.g. when speech recognition produced no transcript.
    if not text or not text.strip():
        return ''
    return translator.translate(text, src=src, dest=dest).text

def record_audio(filename, duration, fs):
    """Record from the default input device and save the result as a WAV file.

    Args:
        filename: Destination path of the WAV file.
        duration: Recording length in seconds.
        fs: Sample rate in Hz.
    """
    print("Recording...")
    frame_count = int(duration * fs)
    # Single channel, 16-bit integer samples.
    samples = sd.rec(frame_count, samplerate=fs, channels=1, dtype='int16')
    sd.wait()  # return only once the recording has finished
    # sampwidth=2 bytes matches the int16 samples captured above.
    wavio.write(filename, samples, fs, sampwidth=2)
    print("Recording complete.")

# Parameters for recording
duration = 10  # seconds
fs = 16000  # Sample rate
filename = 'input.wav'

# Record audio from user
# Blocks for `duration` seconds and writes the capture to `filename`.
record_audio(filename, duration, fs)

# Convert recorded audio to text
# The whole WAV file (header included) is read as bytes and passed to
# speech_to_text, which uses its default language code 'ml-IN'.
with open(filename, 'rb') as audio_file:
    audio_content = audio_file.read()
transcribed_text = speech_to_text(audio_content)

# Translate transcribed text to English
# Uses translate_text's defaults: src='ml', dest='en'.
translated_text = translate_text(transcribed_text)

print(f"Transcribed Text: {transcribed_text}")
print(f"Translated Text: {translated_text}")
