In [1]:
import numpy as np
import os
import librosa

### Convert training data to mel spectrograms ###

def wav_to_mel_spectrogram(y, sr=48000, hop_length=512, n_fft=2048, n_mels=128):
    # Compute the mel spectrogram
    mel_spectrogram = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=n_fft, hop_length=hop_length, n_mels=n_mels)
    
    # Convert to log scale
    log_mel_spectrogram = librosa.power_to_db(mel_spectrogram)
    
    return log_mel_spectrogram

file_path = './trainingdata_v2.wav'
sr = 48000

# Load the lengthy audio file in chunks
frame_length = sr * 10  # 10 seconds per frame
hop_size = sr * 5       # 5 seconds overlap

# Store mel spectrograms and corresponding audio slices
X_train = []
y_train = []

stream = librosa.stream(file_path, block_length=10, frame_length=frame_length, hop_length=hop_size, mono=True)

for y in stream:
    mel_spec = wav_to_mel_spectrogram(y)
    
    # Using the entire frame for audio data
    X_train.append(mel_spec)
    y_train.append(y[:frame_length])

# Padding the last array in X_train
expected_columns = (frame_length // 512)  # 512 is the hop_length
# Check the shape of the last element in X_train
if X_train[-1].shape[1] != expected_columns:
    padded_shape = (128, max(expected_columns, X_train[-1].shape[1]))
    padded = np.zeros(padded_shape)
    padded[:, :X_train[-1].shape[1]] = X_train[-1]
    X_train[-1] = padded[:, :expected_columns]

# Padding the last array in y_train
if y_train[-1].shape[0] != frame_length:
    padded = np.zeros(frame_length)
    padded[:y_train[-1].shape[0]] = y_train[-1]
    y_train[-1] = padded

# Step 1: Identify Inconsistent Shapes

# Find the most common shape
shapes = [x.shape for x in X_train]
most_common_shape = max(set(shapes), key=shapes.count)

print("Most common shape:", most_common_shape)

# Print the shapes that are different from the most common shape
for i, shape in enumerate(shapes):
    if shape != most_common_shape:
        print(f"Array at index {i} has shape: {shape}")

# Step 2: Correct the Shapes by Padding

# Define the expected number of columns based on the most common shape
expected_columns = most_common_shape[1]

# Pad the last mel spectrogram in X_train
if X_train[-1].shape[1] != expected_columns:
    padded = np.zeros((128, expected_columns))
    padded[:, :X_train[-1].shape[1]] = X_train[-1]
    X_train[-1] = padded

# Once again, try converting the list to a numpy array
X_train = np.array(X_train)

# Sanity check
assert len(X_train) == len(y_train), "Mismatched lengths between X_train and y_train!"

# Convert lists to numpy arrays
X_train = np.array(X_train)
y_train = np.array(y_train)

print("Shapes after preprocessing:")
print("X_train:", X_train.shape)
print("y_train:", y_train.shape)


Most common shape: (128, 5157)
Array at index 199 has shape: (128, 937)
Shapes after preprocessing:
X_train: (200, 128, 5157)
y_train: (200, 480000)


In [2]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Conv1D, Add, Activation, Multiply, Flatten, Dense
from tensorflow.keras.models import Model

2023-11-01 13:10:26.385609: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
def chunk_data(data, chunk_size, overlap):
    """
    Chunk 3D data along the last axis.
    
    :param data: 3D numpy array with shape (num_samples, num_features, time_steps)
    :param chunk_size: Size of each chunk along the time_steps axis
    :param overlap: Overlap between consecutive chunks
    :return: Chunked data as a 3D numpy array
    """
    num_samples, num_features, time_steps = data.shape
    step_size = chunk_size - overlap
    num_chunks = (time_steps - overlap) // step_size
    
    # Prepare an array to hold the chunked data
    chunks = np.zeros((num_samples * num_chunks, num_features, chunk_size))
    
    chunk_idx = 0
    for i in range(num_samples):
        for j in range(0, time_steps - chunk_size + 1, step_size):
            chunks[chunk_idx] = data[i, :, j:j+chunk_size]
            chunk_idx += 1
            
    return chunks

# Define parameters
chunk_size = 128  # This can be adjusted based on the model's receptive field and other considerations.
overlap = 64  # Half the chunk size for a 50% overlap.

# Chunk the mel spectrogram data
X_chunked = chunk_data(X_train, chunk_size, overlap)

print("Chunked data shape:", X_chunked.shape)


Chunked data shape: (15800, 128, 128)


In [4]:
def chunk_data_2d(data, chunk_size, overlap):
    """
    Chunk 2D data along the last axis.
    
    :param data: 2D numpy array with shape (num_samples, time_steps)
    :param chunk_size: Size of each chunk along the time_steps axis
    :param overlap: Overlap between consecutive chunks
    :return: Chunked data as a 2D numpy array
    """
    num_samples, time_steps = data.shape
    step_size = chunk_size - overlap
    num_chunks = (time_steps - overlap) // step_size
    
    # Prepare an array to hold the chunked data
    chunks = np.zeros((num_samples * num_chunks, chunk_size))
    
    chunk_idx = 0
    for i in range(num_samples):
        for j in range(0, time_steps - chunk_size + 1, step_size):
            chunks[chunk_idx] = data[i, j:j+chunk_size]
            chunk_idx += 1
            
    return chunks

# Chunk the target data
y_chunked = chunk_data_2d(y_train, chunk_size, overlap)

print("Chunked y data shape:", y_chunked.shape)


Chunked y data shape: (1499800, 128)


In [None]:
print("Shape of the generated mel spectrogram:", generated_mel_spectrogram.shape)
generated_mel_spectrogram = np.array(generated_mel_spectrogram)

# Step 2: Convert each chunk of the mel spectrogram sequence back to waveform
def mel_to_audio(mel_spectrogram, sr=48000, hop_length=256, n_iter=50):
    # Inverse mel scale
    mel_to_linear = librosa.feature.inverse.mel_to_stft(mel_spectrogram, sr=sr)
    # Spectrogram to waveform using Griffin-Lim
    waveform = librosa.griffinlim(mel_to_linear, hop_length=hop_length, n_iter=n_iter)
    return waveform

audio_outputs = []
for idx, chunk in enumerate(generated_mel_spectrogram):
    audio_outputs.append(mel_to_audio(chunk, sr=48000, hop_length=256))
    if (idx + 1) % 100 == 0:  # Print progress every 100 chunks
        print(f"Processed {idx + 1} chunks out of {len(generated_mel_spectrogram)}")

audio_output = np.concatenate(audio_outputs)
print("All chunks processed and concatenated!")

# Step 3: Save the waveform as a .wav file
sf.write('generated_output.wav', audio_output, 48000)
print("'generated_output.wav' saved successfully!")


In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Conv1D, Multiply, Add, Activation, Flatten, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import MeanSquaredError
from tensorflow.keras.metrics import MeanAbsoluteError
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from tensorflow import keras
from tensorflow.keras import layers
import matplotlib.pyplot as plt

# Model Definition
def generate_model(input_shape):
    inp = Input(shape=input_shape)

    # Gate mechanism 1
    conv_a = Conv1D(40, 3, padding='same', activation='sigmoid')(inp)
    conv_b = Conv1D(40, 3, padding='same', activation='tanh')(inp)
    mult_1 = Multiply()([conv_a, conv_b])
    skip_1 = Conv1D(40, 1, padding='same')(mult_1)
    skip_1_connect = Conv1D(40, 1, padding='same')(inp)
    res_1 = Add()([skip_1, skip_1_connect])

    # Repeated blocks
    skips = []
    for _ in range(5):
        conv_a = Conv1D(40, 3, padding='same', activation='sigmoid')(res_1)
        conv_b = Conv1D(40, 3, padding='same', activation='tanh')(res_1)
        mult = Multiply()([conv_a, conv_b])
        skip = Conv1D(40, 1, padding='same')(mult)
        skips.append(skip)
        skip_connect = Conv1D(40, 1, padding='same')(res_1)
        res_1 = Add()([skip, skip_connect])

    sum_skips = Add()(skips)
    act = Activation('relu')(sum_skips)
    out_conv1 = Conv1D(40, 1, padding='same', activation='relu')(act)
    
    # Adjusting the output to match target shape
    out_conv2 = Conv1D(1, 1, padding='same')(out_conv1)  # One filter for audio wave sequence
    
    model = Model(inputs=inp, outputs=out_conv2)
    
    return model

# Assuming y_chunked is already prepared, with shape (1499800, 128)
y_input = y_chunked[:-1]   # All except the last one
y_target = y_chunked[1:]   # All except the first one

# Reshape the data to fit the model's expected input shape
y_input = y_input.reshape(-1, 128, 1)
y_target = y_target.reshape(-1, 128, 1)

# Create TensorFlow datasets for training
train_dataset = tf.data.Dataset.from_tensor_slices((y_input, y_target))
train_dataset = train_dataset.batch(32).shuffle(buffer_size=1024).cache().prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

# Wavenet-like architecture
model = generate_model(input_shape=(128, 1))

# Compile the model
model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.001), loss='mse', metrics=['mae'])

# Callbacks
callbacks = [
    keras.callbacks.ModelCheckpoint('best_weights.h5', save_best_only=True, monitor='loss', mode='min'),
    keras.callbacks.EarlyStopping(monitor='loss', patience=10, verbose=1, mode='min'),
    keras.callbacks.ReduceLROnPlateau(monitor='loss', factor=0.1, patience=5, verbose=1, mode='min')
]

# Train the model
history = model.fit(train_dataset, epochs=100, callbacks=callbacks)

# Save the entire model to a HDF5 file.
model.save('ai_music_wavenet_wavfile_1_0_0.h5')

loaded_model = keras.models.load_model('ai_music_wavenet_wavfile_1_0_0.h5')

# Plot training history
plt.figure(figsize=(12, 4))

plt.subplot(1, 2, 1)
plt.plot(history.history['loss'], label='Train Loss')
plt.legend()
plt.title('Loss Evolution')

plt.subplot(1, 2, 2)
plt.plot(history.history['mae'], label='Train MAE')   # changed 'mean_absolute_error' to 'mae'
plt.legend()
plt.title('Metric Evolution')

plt.tight_layout()
plt.show()


Epoch 1/100
Epoch 2/100


  saving_api.save_model(


Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 7: ReduceLROnPlateau reducing learning rate to 0.00010000000474974513.
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100

In [None]:
import soundfile as sf

# Assuming the model is already trained and loaded
# loaded_model = keras.models.load_model('ai_music_wavenet_wavfile_1_0_0.h5')

def generate_audio_sequence(model, seed_sequence, target_length):
    """
    Generate a sequence of audio samples.
    
    :param model: The trained WaveNet model
    :param seed_sequence: A seed audio sequence to start the generation
    :param target_length: The target number of samples to generate
    :return: The generated audio sequence
    """
    generated_audio = seed_sequence
    while generated_audio.shape[0] < target_length:
        # Predict the next sample
        next_sample = model.predict(generated_audio[-128:].reshape(1, 128, 1))
        # Append the sample
        generated_audio = np.append(generated_audio, next_sample[0, -1, 0])  # Append the last value

        # Print progress
        if generated_audio.shape[0] % 48000 == 0:
            print(f"Generated {generated_audio.shape[0] / 48000} seconds of audio...")

    # If we have exceeded the target length, trim the sequence
    if generated_audio.shape[0] > target_length:
        generated_audio = generated_audio[:target_length]
    return generated_audio

# Generate 30 seconds of audio
seed_sequence = y_input[0]  # Use the first sequence from y_chunked as a seed
generated_sequence = generate_audio_sequence(loaded_model, seed_sequence, 720000)

# Save the generated audio sequence to a WAV file
sf.write('generated_music.wav', generated_sequence, 48000)
print("Audio generation complete and saved to 'generated_music.wav'")
