In [6]:
pip install soundfile

Note: you may need to restart the kernel to use updated packages.


In [88]:
import librosa
import numpy as np
import os
import soundfile as sf  # Import soundfile library

def segment_song(file_path, segment_length=40, save_segments=False, output_dir='segments'):
    """
    Segment a song into fixed-length windows.
    
    Parameters:
    - file_path: Path to the audio file (e.g., 'path/to/song.mp3').
    - segment_length: Length of each segment in seconds (e.g., 40).
    - save_segments: Boolean, whether to save segments as separate audio files.
    - output_dir: Directory where segmented audio files will be saved.
    
    Returns:
    - segments: A list of audio arrays, each representing a segment.
    """
    # Load the audio file
    y, sr = librosa.load(file_path, sr=None)
    
    # Calculate the number of samples per segment
    samples_per_segment = segment_length * sr
    
    # Number of segments
    num_segments = int(np.floor(len(y) / samples_per_segment))
    
    segments = []
    
    for i in range(num_segments):
        # Calculate start and end sample for the current segment
        start_sample = i * samples_per_segment
        end_sample = start_sample + samples_per_segment
        
        # Extract the segment
        segment = y[start_sample:end_sample]
        segments.append(segment)
        
        # Optionally save the segment to disk
        if save_segments:
            if not os.path.isdir(output_dir):
                os.makedirs(output_dir)
            segment_filename = os.path.join(output_dir, f'segment_{i}.wav')
            sf.write(segment_filename, segment, sr)  # Use soundfile to save the segment
    
    return segments

# Example usage
file_path = '/Users/chamudi/Desktop/songs/train_data/1.mp3'
segments = segment_song(file_path, segment_length=40, save_segments=True, output_dir='output_segments')

In [89]:
import librosa
import numpy as np

def extract_features(segments, sr=22050, n_fft=2048, hop_length=512):
    """
    Extract STFT-based features from audio segments.
    
    Parameters:
    - segments: List of audio segments.
    - sr: Sampling rate of the audio segments.
    - n_fft: Number of FFT components.
    - hop_length: Number of samples between successive frames.
    
    Returns:
    - fingerprints: A list of magnitude spectrograms (audio fingerprints) for each segment.
    """
    fingerprints = []
    for segment in segments:
        # Compute the STFT
        stft = librosa.stft(segment, n_fft=n_fft, hop_length=hop_length)
        
        # Compute the magnitude spectrogram from the STFT
        spectrogram = np.abs(stft)
        
        # Convert to decibel units for a more dynamic range
        db_spectrogram = librosa.amplitude_to_db(spectrogram, ref=np.max)
        
        fingerprints.append(db_spectrogram)
    
    return fingerprints

# Example usage assuming 'segments' is a list of audio segments from the previous step
fingerprints = extract_features(segments)

In [90]:
# Dummy data initialization for demonstration purposes
fingerprints = np.random.rand(100, 128, 44, 1)  # Example shape, adjust as necessary
song_ids = np.random.randint(0, 10, size=100)   # Assuming 10 unique songs for 100 fingerprints

In [91]:
def generate_pairs(fingerprints, song_ids):
    positive_pairs = []
    negative_pairs = []
    labels = []

    # Generate positive pairs
    for i in range(len(fingerprints)):
        for j in range(i+1, len(fingerprints)):
            if song_ids[i] == song_ids[j]:
                positive_pairs.append([fingerprints[i], fingerprints[j]])
                labels.append(1)
    
    # Generate negative pairs (simplified approach)
    for i in range(len(positive_pairs)):  # Generating as many negative pairs as positive
        while True:
            idx1, idx2 = np.random.randint(0, len(fingerprints), size=2)
            if song_ids[idx1] != song_ids[idx2]:
                negative_pairs.append([fingerprints[idx1], fingerprints[idx2]])
                labels.append(0)
                break
    
    # Combine positive and negative pairs
    pairs = positive_pairs + negative_pairs
    return np.array(pairs), np.array(labels)

In [92]:
pairs, labels = generate_pairs(fingerprints, song_ids)

In [93]:
# Example of splitting (adjust indices according to your dataset size and needs)
split_point = int(len(pairs) * 0.8)
pairs_train, labels_train = pairs[:split_point], labels[:split_point]
pairs_val, labels_val = pairs[split_point:], labels[split_point:]

In [94]:
import numpy as np

# Dummy initialization of fingerprints and their corresponding song IDs
# Replace this with actual loading or generation of your dataset
fingerprints = np.random.rand(100, 128, 44, 1)  # 100 random fingerprints
song_ids = np.random.randint(0, 10, 100)       # 100 random song IDs ranging from 0 to 9

In [95]:
def generate_pairs(fingerprints, song_ids):
    positive_pairs = []
    negative_pairs = []
    labels = []

    # Example logic to generate positive and negative pairs
    for i in range(len(fingerprints)):
        for j in range(i + 1, len(fingerprints)):
            if song_ids[i] == song_ids[j]:
                positive_pairs.append([fingerprints[i], fingerprints[j]])
                labels.append(1)  # Similar
            else:
                if len(negative_pairs) < len(positive_pairs):  # To balance the dataset
                    negative_pairs.append([fingerprints[i], fingerprints[j]])
                    labels.append(0)  # Dissimilar

    pairs = np.array(positive_pairs + negative_pairs)
    labels = np.array(labels)
    return pairs, labels

In [96]:
pairs, labels = generate_pairs(fingerprints, song_ids)  # Use the actual function to generate pairs

# Shuffle and split the dataset into training and validation sets
indices = np.arange(len(pairs))
np.random.shuffle(indices)

split_point = int(len(pairs) * 0.8)  # 80% for training, 20% for validation
train_indices = indices[:split_point]
val_indices = indices[split_point:]

pairs_train, labels_train = pairs[train_indices], labels[train_indices]
pairs_val, labels_val = pairs[val_indices], labels[val_indices]

In [195]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, Flatten, Dense, Lambda
from tensorflow.keras import backend as K

def create_base_network(input_shape):
    """
    Base network to be shared (eq. to feature extraction).
    """
    input = Input(shape=input_shape)
    x = Conv2D(64, (3, 3), activation='relu')(input)
    x = MaxPooling2D((2, 2))(x)
    x = Conv2D(128, (3, 3), activation='relu')(x)
    x = MaxPooling2D((2, 2))(x)
    x = Flatten()(x)
    x = Dense(128, activation='relu')(x)
    return Model(input, x)

def euclidean_distance(vects):
    """
    Compute Euclidean Distance between two vectors.
    """
    x, y = vects
    sum_square = tf.math.reduce_sum(tf.math.square(x - y), axis=1, keepdims=True)
    return tf.math.sqrt(tf.maximum(sum_square, tf.keras.backend.epsilon()))

def eucl_dist_output_shape(shapes):
    """
    Shape of the output of the Euclidean distance layer.
    """
    shape1, shape2 = shapes
    return (shape1[0], 1)

# Define the tensors for the two input images
input_shape = (128, 44, 1)  # Example input shape, adjust based on your fingerprint shape
base_network = create_base_network(input_shape)

input_a = Input(shape=input_shape)
input_b = Input(shape=input_shape)

# Because we re-use the same instance `base_network`,
# the weights of the network will be shared across the two branches
processed_a = base_network(input_a)
processed_b = base_network(input_b)

# Use a Lambda layer to compute the absolute difference between the feature vectors
distance = Lambda(euclidean_distance, output_shape=eucl_dist_output_shape)([processed_a, processed_b])

# Add a dense layer with a sigmoid unit to generate the similarity score
prediction = Dense(1, activation='sigmoid')(distance)

# Connect the inputs with the outputs
model = Model(inputs=[input_a, input_b], outputs=prediction)

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Print the model summary
model.summary()

In [196]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, Flatten, Dense, Lambda
from tensorflow.keras import backend as K

def create_base_network(input_shape):
    """
    Base network to be shared (eq. to feature extraction).
    """
    input = Input(shape=input_shape)
    x = Conv2D(64, (3, 3), activation='relu')(input)
    x = MaxPooling2D((2, 2))(x)
    x = Conv2D(128, (3, 3), activation='relu')(x)
    x = MaxPooling2D((2, 2))(x)
    x = Flatten()(x)
    x = Dense(128, activation='relu')(x)
    return Model(input, x)

def euclidean_distance(vects):
    """
    Compute Euclidean Distance between two vectors.
    """
    x, y = vects
    sum_square = tf.reduce_sum(tf.square(x - y), axis=1, keepdims=True)
    return tf.sqrt(tf.maximum(sum_square, tf.keras.backend.epsilon()))

def eucl_dist_output_shape(shapes):
    """
    Shape of the output of the Euclidean distance layer.
    """
    shape1, shape2 = shapes
    return (shape1[0], 1)

# Define the tensors for the two input images
input_shape = (128, 44, 1)  # Example input shape, adjust based on your fingerprint shape
base_network = create_base_network(input_shape)

input_a = Input(shape=input_shape)
input_b = Input(shape=input_shape)

# Because we re-use the same instance `base_network`,
# the weights of the network will be shared across the two branches
processed_a = base_network(input_a)
processed_b = base_network(input_b)

# Use a Lambda layer to compute the absolute difference between the feature vectors
distance = Lambda(euclidean_distance, output_shape=eucl_dist_output_shape)([processed_a, processed_b])

# Add a dense layer with a sigmoid unit to generate the similarity score
prediction = Dense(1, activation='sigmoid')(distance)

# Connect the inputs with the outputs
model = Model(inputs=[input_a, input_b], outputs=prediction)

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Print the model summary
model.summary()

In [197]:
# Compile the model
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [198]:
model.save('/Users/chamudi/Desktop/my_model.keras')

In [199]:
checkpoint_filepath = 'User/chamudi/Desktop/weights.weights.h5'  # Adjusted extension
model_checkpoint_callback = ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=True,
    monitor='val_loss',
    mode='min',
    save_best_only=True)

In [200]:
checkpoint_filepath = 'User/chamudi/Desktop/model.keras'  # Updated extension for full model saving
model_checkpoint_callback = ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=False,  # Saving the entire model
    monitor='val_loss',
    mode='min',
    save_best_only=True)

In [201]:
from tensorflow.keras.callbacks import EarlyStopping

early_stopping_callback = EarlyStopping(
    monitor='val_loss',
    mode='min',
    patience=5,  # Number of epochs with no improvement after which training will be stopped
    verbose=1)

In [202]:
history = model.fit(
    [pairs_train[:, 0], pairs_train[:, 1]],  # Assuming pairs_train is structured to allow this indexing
    labels_train,
    validation_data=([pairs_val[:, 0], pairs_val[:, 1]], labels_val),
    epochs=20,  # Adjust based on your needs
    batch_size=64,  # Adjust based on your needs
    callbacks=[model_checkpoint_callback, early_stopping_callback],
    verbose=1)

Epoch 1/20
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 206ms/step - accuracy: 0.4811 - loss: 0.7223 - val_accuracy: 0.5196 - val_loss: 0.7117
Epoch 2/20
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 202ms/step - accuracy: 0.5047 - loss: 0.6879 - val_accuracy: 0.5196 - val_loss: 0.6977
Epoch 3/20
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 201ms/step - accuracy: 0.5170 - loss: 0.6701 - val_accuracy: 0.5196 - val_loss: 0.6918
Epoch 4/20
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 198ms/step - accuracy: 0.4922 - loss: 0.6641 - val_accuracy: 0.5196 - val_loss: 0.6935
Epoch 5/20
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 201ms/step - accuracy: 0.4939 - loss: 0.6527 - val_accuracy: 0.5196 - val_loss: 0.6936
Epoch 6/20
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 199ms/step - accuracy: 0.4934 - loss: 0.6450 - val_accuracy: 0.5245 - val_loss: 0.6917
Epoch 7/20
[1m13/13[0m [3

In [203]:
# Generate dummy test data (assuming your inputs are images of shape 128x44x1)
import numpy as np

num_test_samples = 100
pairs_test = np.random.rand(num_test_samples, 2, 128, 44, 1)  # 100 pairs of test fingerprints
labels_test = np.random.randint(0, 2, num_test_samples)       # 100 random binary labels

# Convert the dummy test data into the correct format
left_input_test = np.array([pair[0] for pair in pairs_test])
right_input_test = np.array([pair[1] for pair in pairs_test])

# Evaluate the model on the dummy test data
test_loss, test_accuracy = model.evaluate([left_input_test, right_input_test], labels_test, verbose=1)

print(f"Test Loss: {test_loss}")
print(f"Test Accuracy: {test_accuracy}")

[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step - accuracy: 0.4890 - loss: 0.6929
Test Loss: 0.6920572519302368
Test Accuracy: 0.4699999988079071


In [204]:
# Assuming pairs_test and labels_test are your test dataset prepared similarly to training data
test_loss, test_accuracy = model.evaluate([pairs_test[:, 0], pairs_test[:, 1]], labels_test, verbose=1)

print(f"Test Loss: {test_loss}")
print(f"Test Accuracy: {test_accuracy}")

[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step - accuracy: 0.4890 - loss: 0.6929
Test Loss: 0.6920572519302368
Test Accuracy: 0.4699999988079071


In [205]:
# Assuming left_input_test and right_input_test are your test inputs
predictions = model.predict([left_input_test, right_input_test])

# Binarize predictions based on a 0.5 threshold
binary_predictions = (predictions > 0.5).astype("int32")

[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step


In [206]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Assuming labels_test are your true binary labels
precision = precision_score(labels_test, binary_predictions)
recall = recall_score(labels_test, binary_predictions)
f1 = f1_score(labels_test, binary_predictions)

print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

Precision: 0.4792
Recall: 0.9388
F1 Score: 0.6345


In [207]:
import numpy as np

def format_features(fingerprints):
    # Check if fingerprints is a list and has at least one array
    if isinstance(fingerprints, list) and all(isinstance(fp, np.ndarray) for fp in fingerprints) and len(fingerprints) > 0:
        fingerprints = np.concatenate(fingerprints)
    elif not fingerprints:  # fingerprints is empty
        # Handle empty fingerprints list: return an empty array or raise an error
        return np.array([])  # or raise ValueError("No data to format")
    
    # Assuming fingerprints now contains data, proceed with normalization and reshaping
    normalized_fingerprints = (fingerprints - fingerprints.mean()) / fingerprints.std()
    reshaped_fingerprints = normalized_fingerprints.reshape(-1, 1)  # Example reshaping
    return reshaped_fingerprints

In [208]:
import librosa

# Load the input song as a NumPy array
input_song, sr = librosa.load('/Users/chamudi/Desktop/songs/unknown_data/10.mp3', sr=None, mono=False)  # Load as stereo

# Ensure the input song is a NumPy array
if not isinstance(input_song, np.ndarray):
    input_song = np.array(input_song)

# Now, you can pass the input song to the preprocessing function
segments, fingerprints = preprocess_input_song(input_song)


[src/libmpg123/id3.c:process_comment():584] error: No comment text / valid description?


In [209]:
import numpy as np

def preprocess_input_song(input_song):
    # Split the input song into segments
    segment_length = 40  # Assuming each segment is 40 seconds long
    num_segments = len(input_song) // segment_length
    segments = []
    for i in range(num_segments):
        start_idx = i * segment_length
        end_idx = (i + 1) * segment_length
        segment = input_song[start_idx:end_idx]
        segments.append(segment)
    segments = np.array(segments)

    # Extract features from each segment (e.g., audio fingerprints using STFT)
    fingerprints = extract_features(segments)

    # Format the data appropriately (e.g., reshape features, normalize)
    fingerprints = format_features(fingerprints)

    return segments, fingerprints

In [214]:
# Assuming you have already loaded your trained model and preprocessed the input song

# Step 1: Preprocess the Input Song
# This could involve splitting the song into segments, extracting features, and formatting the data
segments, fingerprints = preprocess_input_song(input_song)

# Step 2: Load the Trained Model
from tensorflow.keras.models import load_model

#model = tf.keras.models.load_model(
   # '/Users/chamudi/Desktop/model_version_7.h5',
    #custom_objects={'euclidean_distance': euclidean_distance}
#)

#model = load_model('/Users/chamudi/Desktop/model_version_7.h5')

# Step 3: Predict the Similarity
# Assuming your model takes pairs of fingerprints as input
predictions = model.predict([fingerprints[:, 0], fingerprints[:, 1]])

# Step 4: Assess Accuracy (if ground truth labels are available)
# Assuming you have ground truth labels for similarity between segments
# Calculate accuracy, precision, recall, and F1 score

# Step 5: Output Predictions and Accuracy
print("Predictions:")
print(predictions)
print("Accuracy:")
print(accuracy)
print("Precision:")
print(precision)
print("Recall:")
print(recall)
print("F1 Score:")
print(f1_score)

IndexError: too many indices for array: array is 1-dimensional, but 2 were indexed