In [2]:
import os
import librosa
import pandas as pd
import numpy as np
import tensorflow as tf
from keras.preprocessing.sequence import pad_sequences

In [3]:
# Report how many GPUs TensorFlow can see; 0 means everything runs on the CPU.
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

# Directory holding the audio clips and the CSV that maps each clip's file
# name ('Video Matching' column) to its transcript ('Text' column).
audio_dir = "./AUDIO"
csv_file = "./TEXT/AUDIO.csv"

# os.listdir returns entries in arbitrary order; sort them so the order of
# the training samples (and therefore batching) is identical on every run.
audio_files = sorted(os.listdir(audio_dir))

# Accumulators filled by the feature-extraction loop: one MFCC matrix and one
# transcript string per audio file, kept index-aligned.
x_train = []
y_train = []

# Load the CSV file
df = pd.read_csv(csv_file)

Num GPUs Available:  0


In [4]:
# Start from empty lists so that re-running this cell never appends duplicate
# samples (and still works after x_train has been replaced by a padded array).
x_train = []
y_train = []

for file in audio_files:
    # Only .mp3 files are treated as training clips; skip everything else.
    if not file.endswith(".mp3"):
        continue

    # Find the matching text in the CSV file *before* decoding the audio:
    # a clip without a transcript is reported immediately, and x_train is
    # never extended without its y_train counterpart.
    matches = df.loc[df['Video Matching'] == file, 'Text'].values
    if len(matches) == 0:
        raise IndexError(
            f"No row in {csv_file!r} has 'Video Matching' == {file!r}"
        )
    matched_text = matches[0]

    file_path = os.path.join(audio_dir, file)

    # sr=None keeps the file's native sampling rate; mono=True downmixes.
    signal, sr = librosa.load(file_path, sr=None, mono=True)

    # 13 MFCCs per frame. librosa returns (n_mfcc, frames); transpose to
    # (frames, n_mfcc) so that time is the leading axis of each sample.
    mfcc = librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=13)
    mfcc = np.transpose(mfcc, (1, 0))

    x_train.append(mfcc)
    y_train.append(matched_text)

In [5]:
from keras.utils import to_categorical

# Fail early with a readable message instead of max() raising on an empty
# sequence further down.
if len(x_train) == 0:
    raise ValueError("No training samples were loaded; check audio_dir and the CSV file")

# Vocabulary of every distinct character in the transcripts. It is sorted
# because iteration order of a set of strings changes between Python
# processes (hash randomisation); an unsorted vocabulary would assign
# different IDs after every kernel restart, making a saved model undecodable.
characters = sorted(set(char for label in y_train for char in label))

# Create a dictionary to map characters to unique IDs
char_to_id = {char: idx for idx, char in enumerate(characters)}

# Convert the labels to integer IDs
y_train_ids = [[char_to_id[char] for char in label] for label in y_train]

# Convert the labels to one-hot encoded format: (label_length, vocab_size)
y_train_onehot = [to_categorical(label, num_classes=len(characters)) for label in y_train_ids]

# Length (in frames) of the longest MFCC array; both the features and the
# labels are padded to this length so they line up step by step.
max_len = max(mfcc.shape[0] for mfcc in x_train)

# Pad the label sequences at the end with all-zero rows and convert to a
# numpy array of shape (samples, max_len, vocab_size).
# NOTE(review): a label longer than max_len would be truncated by
# pad_sequences rather than rejected - confirm this cannot happen in the data.
y_train_padded = pad_sequences(y_train_onehot, maxlen=max_len, padding='post')

# Zero-pad the MFCC arrays along the time axis and stack them into a numpy
# array of shape (samples, max_len, n_mfcc).
x_train = np.array([np.pad(mfcc, ((0, max_len - mfcc.shape[0]), (0, 0))) for mfcc in x_train])

In [6]:
# Per-frame character classifier: an LSTM reads a variable-length sequence of
# 13-dimensional MFCC frames and emits one hidden state per frame, which a
# shared softmax layer turns into a distribution over the character set.
model = tf.keras.models.Sequential([
    tf.keras.layers.LSTM(128, input_shape=(None, 13), return_sequences=True),
    tf.keras.layers.TimeDistributed(
        tf.keras.layers.Dense(len(characters), activation='softmax')
    ),
])

In [7]:
# Multi-class objective over one-hot targets, optimised with Adam; accuracy
# is tracked as an additional metric.
model.compile(
    loss="categorical_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)


In [10]:
# Train on the first GPU when TensorFlow can see one, otherwise on the CPU,
# instead of pinning the work to a GPU that may not exist on this machine.
device_name = '/device:GPU:0' if tf.config.list_physical_devices('GPU') else '/device:CPU:0'

# epochs is only an upper bound: stop once the training loss has not improved
# for EARLY_STOPPING_PATIENCE consecutive epochs and keep the best weights,
# so the run does not have to be interrupted by hand.
MAX_EPOCHS = 10000
EARLY_STOPPING_PATIENCE = 25
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='loss',
    patience=EARLY_STOPPING_PATIENCE,
    restore_best_weights=True,
)

with tf.device(device_name):
    # Keep the History object so the loss/accuracy curves can be inspected.
    history = model.fit(
        x_train,
        y_train_padded,
        epochs=MAX_EPOCHS,
        batch_size=16,
        callbacks=[early_stopping],
    )

Epoch 1/10000
Epoch 2/10000
Epoch 3/10000
Epoch 4/10000
Epoch 5/10000
Epoch 6/10000
Epoch 7/10000
Epoch 8/10000
Epoch 9/10000
Epoch 10/10000
Epoch 11/10000
Epoch 12/10000
Epoch 13/10000
Epoch 14/10000
Epoch 15/10000
Epoch 16/10000
Epoch 17/10000
Epoch 18/10000
Epoch 19/10000
Epoch 20/10000
Epoch 21/10000
Epoch 22/10000
Epoch 23/10000
Epoch 24/10000
Epoch 25/10000
Epoch 26/10000
Epoch 27/10000
Epoch 28/10000
Epoch 29/10000
Epoch 30/10000
Epoch 31/10000
Epoch 32/10000
Epoch 33/10000
Epoch 34/10000
Epoch 35/10000
Epoch 36/10000
Epoch 37/10000
Epoch 38/10000
Epoch 39/10000
Epoch 40/10000
Epoch 41/10000
Epoch 42/10000
Epoch 43/10000
Epoch 44/10000
Epoch 45/10000
Epoch 46/10000
Epoch 47/10000
Epoch 48/10000
Epoch 49/10000
Epoch 50/10000
Epoch 51/10000
Epoch 52/10000
Epoch 53/10000
Epoch 54/10000
Epoch 55/10000
Epoch 56/10000
Epoch 57/10000
Epoch 58/10000
Epoch 59/10000
Epoch 60/10000
Epoch 61/10000
Epoch 62/10000
Epoch 63/10000
Epoch 64/10000
Epoch 65/10000
Epoch 66/10000
Epoch 67/10000
Epoc

KeyboardInterrupt: 