In [89]:
import os
import pandas as pd
import numpy as np
import librosa
import tensorflow as tf
from tensorflow.keras.utils import Sequence
from sklearn.model_selection import train_test_split
import tensorflow.keras.backend as K
from tensorflow.keras.layers import Input, Dense, LSTM, TimeDistributed, Lambda
from tensorflow.keras.models import Model



In [90]:
# Path to transcripts
transcript_path = 'C:/Users/acker/PycharmProjects/FBVRS/Data Training/data/transcripts.csv'

# Check if the file exists
if os.path.exists(transcript_path):
    print(f"Found transcripts.csv at: {transcript_path}")
    # Load transcripts
    transcripts_df = pd.read_csv(transcript_path)
else:
    print(f"transcripts.csv not found at: {transcript_path}")
    # Handle the error as needed
    raise FileNotFoundError(f"transcripts.csv not found at: {transcript_path}")

# Ensure all text is in uppercase
transcripts_df['text'] = transcripts_df['text'].str.upper()



Found transcripts.csv at: C:/Users/acker/PycharmProjects/FBVRS/Data Training/data/transcripts.csv


In [91]:
# Split into training and validation sets
train_df, val_df = train_test_split(transcripts_df, test_size=0.1, random_state=42)



In [96]:
class AudioDataGenerator(Sequence):
    def __init__(self, df, batch_size=32, sample_rate=16000, shuffle=True):
        self.df = df
        self.batch_size = batch_size
        self.sample_rate = sample_rate
        self.shuffle = shuffle
        self.indices = np.arange(len(self.df))
        self.on_epoch_end()

    def __len__(self):
        return int(np.floor(len(self.df) / self.batch_size))

    def __getitem__(self, index):
        batch_indices = self.indices[index * self.batch_size:(index + 1) * self.batch_size]
        batch_df = self.df.iloc[batch_indices]

        X, y = self.__data_generation(batch_df)

        # Print batch information
        print(f"Batch {index}:")
        print(f"X shape: {X.shape}")
        print(f"y shape: {y.shape}")

        return X, y

    def on_epoch_end(self):
        if self.shuffle:
            np.random.shuffle(self.indices)

    def __data_generation(self, batch_df):
        X = np.empty((self.batch_size, self.sample_rate, 1))
        y = np.empty(self.batch_size, dtype=object)

        for i, (file_name, text) in enumerate(zip(batch_df['filename'], batch_df['text'])):
            audio_path = os.path.join('data/audio', file_name)
            print(f"Loading file: {audio_path}")  # Print the file path
            if os.path.exists(audio_path):
                try:
                    audio, _ = librosa.load(audio_path, sr=self.sample_rate)
                    X[i,] = np.expand_dims(audio, axis=-1)
                    y[i] = text
                except Exception as e:
                    print(f"Error loading {audio_path}: {e}")
                    X[i,] = np.zeros((self.sample_rate, 1))  # Fill with zeros or handle appropriately
                    y[i] = ''  # Handle empty label
            else:
                print(f"File not found: {audio_path}")
                X[i,] = np.zeros((self.sample_rate, 1))  # Fill with zeros or handle appropriately
                y[i] = ''  # Handle empty label

        return X, y


In [95]:
# Create data generators
train_generator = AudioDataGenerator(train_df)
val_generator = AudioDataGenerator(val_df)


In [66]:
def ctc_lambda_func(args):
    y_pred, labels, input_length, label_length = args
    return K.ctc_batch_cost(labels, y_pred, input_length, label_length)


In [73]:
input_data = Input(name='the_input', shape=(None, 1), dtype='float32')
lstm_1 = LSTM(128, return_sequences=True, name='lstm_1')(input_data)
lstm_2 = LSTM(128, return_sequences=True, name='lstm_2')(lstm_1)
y_pred = TimeDistributed(Dense(len(characters) + 2, activation='softmax'), name='y_pred')(lstm_2)  # +2 for space and blank

labels = Input(name='the_labels', shape=[None], dtype='float32')
input_length = Input(name='input_length', shape=[1], dtype='int64')
label_length = Input(name='label_length', shape=[1], dtype='int64')

loss_out = Lambda(ctc_lambda_func, output_shape=(1,), name='ctc')([y_pred, labels, input_length, label_length])

model = Model(inputs=[input_data, labels, input_length, label_length], outputs=loss_out)
model.compile(loss={'ctc': lambda y_true, y_pred: y_pred}, optimizer='adam')

In [74]:
# Model summary
model.summary()


In [75]:
# Define CTC loss
def ctc_loss(y_true, y_pred):
    input_length = tf.fill([tf.shape(y_pred)[0]], tf.shape(y_pred)[1])
    label_length = tf.shape(y_true)[1]
    return tf.nn.ctc_loss(y_true, y_pred, input_length, label_length, logits_time_major=False, blank_index=-1)


In [76]:
# Compile the model
model.compile(optimizer='adam', loss=ctc_loss)


In [25]:
import os
print(os.getcwd())  # Print the current working directory


C:\Users\acker\PycharmProjects\FBVRS\Data Training


In [77]:
# Train the model
history = model.fit(
    train_generator,
    validation_data=val_generator,
    epochs=50,
    callbacks=[tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)]
)


Loading file: C:/Users/acker/PycharmProjects/FBVRS/Data Training/data/audio\Ba_148
File not found: C:/Users/acker/PycharmProjects/FBVRS/Data Training/data/audio\Ba_148
Loading file: C:/Users/acker/PycharmProjects/FBVRS/Data Training/data/audio\Ha_10
File not found: C:/Users/acker/PycharmProjects/FBVRS/Data Training/data/audio\Ha_10
Loading file: C:/Users/acker/PycharmProjects/FBVRS/Data Training/data/audio\A_7
File not found: C:/Users/acker/PycharmProjects/FBVRS/Data Training/data/audio\A_7
Loading file: C:/Users/acker/PycharmProjects/FBVRS/Data Training/data/audio\Ba_107
File not found: C:/Users/acker/PycharmProjects/FBVRS/Data Training/data/audio\Ba_107
Loading file: C:/Users/acker/PycharmProjects/FBVRS/Data Training/data/audio\Da_81
File not found: C:/Users/acker/PycharmProjects/FBVRS/Data Training/data/audio\Da_81
Loading file: C:/Users/acker/PycharmProjects/FBVRS/Data Training/data/audio\Ga_31
File not found: C:/Users/acker/PycharmProjects/FBVRS/Data Training/data/audio\Ga_31
Load

ValueError: Layer "functional_6" expects 4 input(s), but it received 1 input tensors. Inputs received: [<tf.Tensor 'data:0' shape=(None, 16000, 1) dtype=float64>]

In [None]:
# Save the trained model
model.save('fine_tuned_model.h5')
