In [1]:
# Import necessary libraries
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Toy corpus: swap in a full text dataset (e.g., Shakespeare's works) for real use
data = """Shall I compare thee to a summer's day?
Thou art more lovely and more temperate:
Rough winds do shake the darling buds of May,
And summer's lease hath all too short a date."""

# Fit a word-level tokenizer on the corpus.
# The vocabulary size gets +1 because Keras word indices start at 1
# (0 is the value used for padding below).
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])
total_words = 1 + len(tokenizer.word_index)

# Every line contributes each of its prefixes of length >= 2, so the model
# sees (context -> next word) pairs of growing context size.
input_sequences = []
for verse in data.split("\n"):
    tokens = tokenizer.texts_to_sequences([verse])[0]
    input_sequences.extend(tokens[:end] for end in range(2, len(tokens) + 1))

# Left-pad all prefixes to the longest one, then split off the last token
# of each row as the label and keep the rest as the predictor.
max_sequence_len = max(len(seq) for seq in input_sequences)
input_sequences = np.array(
    pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')
)
X = input_sequences[:, :-1]
y = input_sequences[:, -1]

# One-hot encode the labels for categorical cross-entropy
y = tf.keras.utils.to_categorical(y, num_classes=total_words)

# Build the model
# The input length is declared with an explicit Input layer: the
# `input_length` argument of Embedding is deprecated in Keras 3 and only
# triggers a warning there.
model = Sequential([
    tf.keras.Input(shape=(max_sequence_len - 1,)),
    Embedding(total_words, 100),
    # First LSTM returns the full sequence so the second LSTM can consume it
    LSTM(150, return_sequences=True),
    LSTM(100),
    # One probability per vocabulary entry (index 0 is the padding class)
    Dense(total_words, activation='softmax'),
])

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
history = model.fit(X, y, epochs=100, verbose=1)

# Generate text
def generate_text(seed_text, next_words, max_sequence_len):
    """Extend ``seed_text`` by greedily predicting one word at a time.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        seed_text: Text the generation starts from.
        next_words: Number of words to append.
        max_sequence_len: Padded sequence length used at training time; the
            model input is ``max_sequence_len - 1`` tokens long.

    Returns:
        ``seed_text`` followed by ``next_words`` space-separated predicted words.
    """
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        probabilities = model.predict(token_list, verbose=0)[0]
        # Index 0 is the padding class and has no entry in
        # tokenizer.index_word, so picking it would raise KeyError.
        # Choose the most likely real word instead.
        next_index = int(np.argmax(probabilities[1:])) + 1
        seed_text += " " + tokenizer.index_word[next_index]
    return seed_text

# Generate new text
print(generate_text("Shall I compare", 10, max_sequence_len))


Epoch 1/100




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step - accuracy: 0.0000e+00 - loss: 3.4025
Epoch 2/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step - accuracy: 0.1379 - loss: 3.3965
Epoch 3/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step - accuracy: 0.1379 - loss: 3.3904
Epoch 4/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step - accuracy: 0.1379 - loss: 3.3836
Epoch 5/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step - accuracy: 0.1724 - loss: 3.3756
Epoch 6/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step - accuracy: 0.1724 - loss: 3.3658
Epoch 7/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step - accuracy: 0.1379 - loss: 3.3534
Epoch 8/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step - accuracy: 0.1034 - loss: 3.3374
Epoch 9/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1

In [None]:
# Install necessary libraries before running this cell (it imports numpy, tensorflow, datasets and librosa)

# Import libraries
import numpy as np
import tensorflow as tf
from datasets import load_dataset
import librosa
from tensorflow.keras.layers import Dense, LSTM, TimeDistributed, Input
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

# Step 1: Load the train-clean-100 subset of the LibriSpeech dataset.
# NOTE(review): trust_remote_code=True allows the dataset's loading script to
# run on this machine; keep it only if that source is trusted.
librispeech = load_dataset(
    "librispeech_asr",
    "clean",
    split="train.100",
    trust_remote_code=True,
)

# Preprocess audio: Convert to MFCCs (Mel-frequency cepstral coefficients)
def preprocess_audio(audio_array, sampling_rate, n_mfcc=13):
    """Compute the MFCC features of one audio signal, time steps as rows.

    Args:
        audio_array: Audio samples, passed to librosa as ``y``.
        sampling_rate: Sampling rate of ``audio_array``, passed as ``sr``.
        n_mfcc: Number of MFCC coefficients to compute.

    Returns:
        The transposed result of ``librosa.feature.mfcc``, so that each row
        is one time step and each column one coefficient.
    """
    return librosa.feature.mfcc(
        y=audio_array,
        sr=sampling_rate,
        n_mfcc=n_mfcc,
    ).T

# Step 2: Prepare the input features (MFCCs) and target text labels
def prepare_data(dataset, num_samples=100):
    """Build MFCC features and transcriptions for the first ``num_samples`` rows.

    Args:
        dataset: Indexable dataset whose rows provide ``['audio']['array']``,
            ``['audio']['sampling_rate']`` and ``['text']``.
        num_samples: Number of leading rows to convert.

    Returns:
        A tuple ``(X, y)``: ``X`` is a list of per-utterance MFCC matrices
        (as returned by ``preprocess_audio``), ``y`` the matching list of
        transcription strings.
    """
    X, y = [], []
    for i in range(num_samples):
        # Fetch the row once. Indexing the dataset separately for every
        # field repeats the row lookup (and, for Hugging Face audio
        # datasets, presumably the audio decoding) each time.
        sample = dataset[i]
        audio = sample['audio']

        # Convert audio to MFCC features
        X.append(preprocess_audio(audio['array'], audio['sampling_rate']))
        y.append(sample['text'])

    return X, y

# Prepare a small subset for training
X_train, y_train = prepare_data(librispeech, num_samples=100)

# Step 3: Padding sequences to ensure equal input sizes for the model
def pad_sequences(sequences, maxlen):
    """Zero-pad each sequence at the end of its first axis up to ``maxlen``.

    Works for sequences of any rank >= 1: 2-D feature matrices
    (time steps x features) as well as 1-D label sequences. Only the first
    axis is padded; trailing axes are left untouched.

    NOTE: this function has the same name as the Keras ``pad_sequences``
    imported at the top of the file and replaces it once this definition runs.

    Args:
        sequences: Iterable of array-likes that agree on all axes but the first.
        maxlen: Target length of the first axis.

    Returns:
        A NumPy array stacking the padded sequences along a new leading axis.

    Raises:
        ValueError: If a sequence is longer than ``maxlen``.
    """
    padded = []
    for seq in sequences:
        arr = np.asarray(seq)
        missing = maxlen - arr.shape[0]
        if missing < 0:
            raise ValueError(
                f"sequence of length {arr.shape[0]} exceeds maxlen={maxlen}"
            )
        # Pad only the first axis; add a (0, 0) entry per remaining axis so
        # the pad widths match the rank of the input.
        pad_width = [(0, missing)] + [(0, 0)] * (arr.ndim - 1)
        padded.append(np.pad(arr, pad_width, mode='constant'))
    return np.array(padded)

# The longest feature matrix (by number of rows) sets the common length
max_len = max(features.shape[0] for features in X_train)

# Bring every feature matrix to that length
X_train_padded = pad_sequences(X_train, maxlen=max_len)

# Step 4: Encode the target text labels into integer sequences
# Character-level vocabulary: every distinct character seen in the
# transcriptions, in sorted order.
chars = sorted(set().union(*y_train))
# Ids start at 1 so that 0 stays reserved for padding
char_to_index = {char: idx for idx, char in enumerate(chars, start=1)}
index_to_char = dict(enumerate(chars, start=1))

def text_to_int_sequence(text):
    """Return the list of integer ids for the characters of ``text``.

    Looks each character up in the module-level ``char_to_index``; a
    character missing from that mapping raises ``KeyError``.
    """
    return list(map(char_to_index.__getitem__, text))

y_train_encoded = [text_to_int_sequence(txt) for txt in y_train]

# Pad the encoded labels
# Labels are 1-D integer lists, so they are padded directly with NumPy along
# their only axis (0 is the padding id). A label longer than max_len is
# clipped so that it fits the model's output length.
y_train_padded = np.zeros((len(y_train_encoded), max_len), dtype=np.int64)
for row, seq in enumerate(y_train_encoded):
    clipped = seq[:max_len]
    y_train_padded[row, :len(clipped)] = clipped

# Step 5: Define the RNN model
def create_rnn_model(input_shape, output_dim):
    """Build the (uncompiled) sequence-to-sequence LSTM model.

    The network is two stacked 128-unit LSTMs that return full sequences,
    followed by a time-distributed 128-unit ReLU layer and a time-distributed
    softmax over ``output_dim`` classes, so one distribution is produced per
    input time step.

    Args:
        input_shape: Shape of one input sample, passed to ``Input``.
        output_dim: Number of output classes per time step.

    Returns:
        A Keras ``Model`` mapping the input to the per-step softmax output.
    """
    inputs = Input(name='input', shape=input_shape)

    hidden = inputs
    for _ in range(2):
        hidden = LSTM(128, return_sequences=True)(hidden)

    hidden = TimeDistributed(Dense(128, activation='relu'))(hidden)
    outputs = TimeDistributed(Dense(output_dim, activation='softmax'))(hidden)

    return Model(inputs=inputs, outputs=outputs)

# Define input and output shapes
n_features = X_train_padded.shape[2]
input_shape = (max_len, n_features)
output_dim = 1 + len(char_to_index)  # one class per character plus the padding id 0

# Create the model
model = create_rnn_model(input_shape, output_dim)

# Step 6: Compile the model
model.compile(
    optimizer=Adam(),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Print model summary
model.summary()

# Step 7: Train the model
# Give the integer labels a trailing axis of size 1 before fitting with
# sparse_categorical_crossentropy
y_train_padded = y_train_padded[..., np.newaxis]

# Fit on the padded features and labels
history = model.fit(
    X_train_padded,
    y_train_padded,
    epochs=5,
    batch_size=16,
)

# Step 8: Generate predictions for a test sample
def decode_sequence(int_sequence):
    """Turn a sequence of integer ids back into text.

    Ids equal to 0 (padding) are skipped; every other id is looked up in the
    module-level ``index_to_char`` and the characters are concatenated.
    """
    characters = (index_to_char[idx] for idx in int_sequence if idx != 0)
    return ''.join(characters)

# Predict on a sample (the first training sample, kept as a batch of one)
test_sample = X_train_padded[:1]
predicted_sequence = model.predict(test_sample)

# Take the most likely class at every time step and decode it to text
best_ids = predicted_sequence[0].argmax(axis=-1)
predicted_text = decode_sequence(best_ids)
print("Predicted transcription:", predicted_text)

# Reference transcription of the same sample
print("Original transcription:", y_train[0])
