In [1]:
import os
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models
import json
# Describe a local TensorFlow cluster through the TF_CONFIG environment variable:
# eight "worker" addresses on localhost (ports 0-7), with this process registered
# as worker 0.
# NOTE(review): this is a cluster specification, not a CPU-core/thread limit. If the
# goal was to cap CPU usage, this setting is presumably not the right knob — confirm.
_worker_addresses = [f"localhost:{port}" for port in range(8)]
_tf_config = {
    "task": {"type": "worker", "index": 0},
    "cluster": {"worker": _worker_addresses},
}
os.environ["TF_CONFIG"] = json.dumps(_tf_config)

In [2]:

# Function to load audio samples from directory
def load_audio_samples(directory, max_length=None, num_samples=200):
    """Load a (random) subset of the ``.npy`` clips stored in ``directory``.

    Parameters
    ----------
    directory : str
        Folder to scan. Only entries whose name ends with ``.npy`` are considered.
    max_length : int or None
        When given, every clip is passed through ``pad_or_truncate`` so that all
        clips share this length. When ``None`` clips are stacked as loaded, so
        they must already have a common shape for ``np.array`` to stack them.
    num_samples : int or None
        Number of clips to draw without replacement. ``None`` (or a value that
        is not smaller than the number of available clips) loads every clip.

    Returns
    -------
    numpy.ndarray
        The loaded clips stacked along a new first axis.

    Notes
    -----
    The subset is drawn with NumPy's global RNG (``np.random.choice``); call
    ``np.random.seed`` beforehand for a reproducible selection. Two separate
    calls draw two independent subsets unless the RNG is re-seeded in between.
    """
    # Filter *before* sampling so non-.npy entries cannot use up the sample budget,
    # and sort because os.listdir makes no ordering promise (keeps seeded runs stable).
    filenames = sorted(name for name in os.listdir(directory) if name.endswith(".npy"))

    # Only sample when there is something to drop; this also avoids the ValueError
    # np.random.choice raises when asked for more items than exist.
    if num_samples is not None and num_samples < len(filenames):
        filenames = np.random.choice(filenames, num_samples, replace=False)

    audio_samples = []
    for filename in filenames:
        audio = np.load(os.path.join(directory, filename))
        if max_length is not None:
            # Pad or truncate audio samples to a fixed length
            audio = pad_or_truncate(audio, max_length)
        audio_samples.append(audio)
    return np.array(audio_samples)


In [3]:

# Function to pad or truncate audio samples to a fixed length
def pad_or_truncate(audio, max_length):
    """Force ``audio`` to exactly ``max_length`` entries along its first axis.

    Shorter inputs are zero-padded at the end, longer inputs are cut off at
    ``max_length``, and inputs that already have the right length are returned
    unchanged (same object).

    Parameters
    ----------
    audio : array-like
        Clip whose first axis is the one being resized.
    max_length : int
        Target length of the first axis.

    Returns
    -------
    The resized clip (a new array when padded, a slice when truncated).
    """
    current_length = len(audio)
    if current_length < max_length:
        # Pad only the first axis. A bare (0, k) pad width would be applied to
        # every axis and silently widen any trailing dimensions as well.
        pad_width = [(0, max_length - current_length)] + [(0, 0)] * (np.ndim(audio) - 1)
        audio = np.pad(audio, pad_width, mode='constant')
    elif current_length > max_length:
        audio = audio[:max_length]
    return audio

# Locations (relative to the working directory) of the English and Hindi clip folders
DATASET_ROOT = "NumpyDataset"
eng_audio_directory = f"{DATASET_ROOT}/eng_audios"
hindi_audio_directory = f"{DATASET_ROOT}/hin_audios"


In [4]:

# Find the maximum length of audio samples
print(os.getcwd())


def _longest_clip(directory):
    """Return the largest first-axis length among the .npy files in ``directory``."""
    # Same ".npy" filter as load_audio_samples, so stray files cannot break np.load.
    # mmap_mode="r" maps the file instead of reading it, so the length is obtained
    # without loading every clip into memory a second time.
    return max(
        len(np.load(os.path.join(directory, filename), mmap_mode="r"))
        for filename in os.listdir(directory)
        if filename.endswith(".npy")
    )


max_length_eng = _longest_clip(eng_audio_directory)
max_length_hindi = _longest_clip(hindi_audio_directory)
# Every clip (both languages) is padded to this single common length.
max_length = max(max_length_eng, max_length_hindi)

# load_audio_samples draws its subset with NumPy's global RNG. Re-seeding with the
# same value before each call makes the selection reproducible and makes both calls
# pick the same positions in their directory listing.
# NOTE(review): that only pairs each English clip with its Hindi translation if both
# directories contain the same number of files, listed in corresponding order —
# confirm against the dataset layout.
SAMPLE_SEED = 42

np.random.seed(SAMPLE_SEED)
X_audio = load_audio_samples(eng_audio_directory, max_length=max_length)
print(X_audio.shape)

np.random.seed(SAMPLE_SEED)
y_audio = load_audio_samples(hindi_audio_directory, max_length=max_length)
print(y_audio.shape)


c:\Users\mehul\Downloads\epoch\speech-to-speech-EPOCH\translation
(200, 529344)
(200, 529344)


In [5]:

# Define the encoder: an LSTM that consumes the source sequence and exposes only
# its final hidden and cell states (the per-step output is not used).
encoder_input = layers.Input(shape=(None, 1))  # (timesteps, features); timesteps left variable
encoder_lstm = layers.LSTM(units=256, return_state=True)
encoder_outputs = encoder_lstm(encoder_input)
# With return_state=True the layer yields (output, state_h, state_c); keep the two states.
encoder_states = list(encoder_outputs[1:])
encoder_state_h, encoder_state_c = encoder_states


In [6]:

# Define the decoder: an LSTM started from the encoder's final states that emits
# one value per timestep through a linear Dense projection.
decoder_input = layers.Input(shape=(None, 1))  # (timesteps, features); timesteps left variable
decoder_lstm = layers.LSTM(units=256, return_sequences=True, return_state=True)
# Index 0 is the full output sequence; the returned states are not needed here.
decoder_sequence = decoder_lstm(decoder_input, initial_state=encoder_states)[0]
decoder_dense = layers.Dense(units=1, activation='linear')
decoder_output = decoder_dense(decoder_sequence)


In [7]:

# Assemble the seq2seq model: two inputs (encoder sequence, decoder sequence),
# one output (the decoder's per-timestep prediction).
model = models.Model(
    inputs=[encoder_input, decoder_input],
    outputs=decoder_output,
)

# Compile with the Adam optimizer and a mean-squared-error loss
model.compile(loss='mse', optimizer='adam')


In [8]:

# Train the model with teacher forcing.
# The decoder must predict sample t+1 of the Hindi clip while being fed samples up
# to t, so the target is the decoder input shifted one step ahead. Feeding the very
# same array as both decoder input and target would let the network minimise the
# loss by copying its input, without using the encoder at all.
encoder_in = X_audio[:, :, np.newaxis]         # add the trailing feature axis
decoder_in = y_audio[:, :-1, np.newaxis]       # steps 0 .. T-2
decoder_target = y_audio[:, 1:, np.newaxis]    # steps 1 .. T-1
model.fit([encoder_in, decoder_in], decoder_target, batch_size=32, epochs=10, validation_split=0.2)

# Save the model
model.save('speech_translation_model.h5')


Epoch 1/10
