In [6]:
pip install soundfile

Note: you may need to restart the kernel to use updated packages.


In [34]:
import librosa
import numpy as np
import os
import soundfile as sf  # Import soundfile library

def segment_song(file_path, segment_length=40, save_segments=False, output_dir='segments'):
    """
    Segment a song into fixed-length, non-overlapping windows.

    Any trailing audio shorter than one full segment is dropped.

    Parameters:
    - file_path: Path to the audio file (e.g., 'path/to/song.mp3').
    - segment_length: Length of each segment in seconds (e.g., 40). May be
      fractional (e.g., 0.5); must be greater than zero.
    - save_segments: Boolean, whether to save segments as separate audio files.
    - output_dir: Directory where segmented audio files will be saved
      (created if it does not exist when save_segments is True).

    Returns:
    - segments: A list of audio arrays, each representing a segment.

    Raises:
    - ValueError: If segment_length is not positive, or is shorter than one
      sample at the file's sampling rate.
    """
    # Validate before touching the file so bad arguments fail fast.
    if segment_length <= 0:
        raise ValueError(f"segment_length must be positive, got {segment_length!r}")

    # Load the audio file at its native sampling rate (sr=None)
    y, sr = librosa.load(file_path, sr=None)

    # Slice bounds must be integers; a fractional segment_length would
    # otherwise produce float indices and fail at slicing time.
    samples_per_segment = int(round(segment_length * sr))
    if samples_per_segment < 1:
        raise ValueError(
            f"segment_length {segment_length!r} is shorter than one sample at {sr} Hz"
        )

    # Number of complete segments that fit in the signal
    num_segments = len(y) // samples_per_segment

    # Create the output directory once, up front; exist_ok avoids the
    # check-then-create race of isdir() followed by makedirs().
    if save_segments:
        os.makedirs(output_dir, exist_ok=True)

    segments = []
    for i in range(num_segments):
        start_sample = i * samples_per_segment
        segment = y[start_sample:start_sample + samples_per_segment]
        segments.append(segment)

        # Optionally save the segment to disk as segment_<i>.wav
        if save_segments:
            segment_filename = os.path.join(output_dir, f'segment_{i}.wav')
            sf.write(segment_filename, segment, sr)

    return segments

# Example usage: split one training song into 40-second clips and write them to disk
file_path = '/Users/chamudi/Desktop/songs/train_data/1.mp3'
segments = segment_song(
    file_path,
    segment_length=40,
    save_segments=True,
    output_dir='output_segments',
)

In [35]:
import librosa
import numpy as np

def extract_features(segments, sr=22050, n_fft=2048, hop_length=512):
    """
    Turn each audio segment into a dB-scaled STFT magnitude spectrogram.

    Parameters:
    - segments: Iterable of audio segments.
    - sr: Sampling rate of the audio segments (accepted but not used by the body).
    - n_fft: FFT window size passed to librosa.stft.
    - hop_length: Number of samples between successive frames.

    Returns:
    - fingerprints: A list with one dB spectrogram per input segment, in order.
    """
    def _db_spectrogram(audio):
        # |STFT| gives the magnitude; dB values are referenced to the segment's own peak
        magnitude = np.abs(librosa.stft(audio, n_fft=n_fft, hop_length=hop_length))
        return librosa.amplitude_to_db(magnitude, ref=np.max)

    return [_db_spectrogram(audio) for audio in segments]

# Example usage: fingerprint every segment produced by the segmentation cell
fingerprints = extract_features(segments=segments)

In [36]:
# Placeholder dataset for demonstration purposes (example shape, adjust as necessary):
# 100 random fingerprints of shape (128, 44, 1) ...
fingerprints = np.random.rand(*(100, 128, 44, 1))
# ... each tagged with one of 10 song IDs (0-9)
song_ids = np.random.randint(low=0, high=10, size=100)

In [37]:
def generate_pairs(fingerprints, song_ids):
    """
    Build a labelled set of fingerprint pairs.

    Positive pairs (label 1) are all index pairs i < j with equal song IDs.
    Negative pairs (label 0) are sampled at random, with replacement, from
    index pairs with different song IDs, until there are as many negatives
    as positives.

    Parameters:
    - fingerprints: Indexable collection of fingerprints.
    - song_ids: Indexable collection of song IDs, aligned with fingerprints.

    Returns:
    - pairs: np.array of [fingerprint_a, fingerprint_b] pairs, positives first.
    - labels: np.array of 1/0 labels aligned with pairs.

    Note: if fewer than two distinct song IDs are present, no negative pair
    exists; in that case only the positive pairs are returned, because the
    random search for a negative pair could never succeed.
    """
    n = len(fingerprints)

    # Generate positive pairs: every unordered pair sharing a song ID
    positive_pairs = [
        [fingerprints[i], fingerprints[j]]
        for i in range(n)
        for j in range(i + 1, n)
        if song_ids[i] == song_ids[j]
    ]

    # Rejection sampling below can only terminate if some pair differs in ID.
    negatives_possible = np.unique(np.asarray(song_ids)[:n]).size > 1

    # Generate as many negative pairs as positive pairs (simplified approach)
    negative_pairs = []
    if negatives_possible:
        while len(negative_pairs) < len(positive_pairs):
            idx1, idx2 = np.random.randint(0, n, size=2)
            if song_ids[idx1] != song_ids[idx2]:
                negative_pairs.append([fingerprints[idx1], fingerprints[idx2]])

    # Labels are derived from the final ordering: positives, then negatives
    pairs = positive_pairs + negative_pairs
    labels = [1] * len(positive_pairs) + [0] * len(negative_pairs)
    return np.array(pairs), np.array(labels)

In [38]:
# Build the labelled pair dataset from the current fingerprints and song IDs
pairs, labels = generate_pairs(fingerprints=fingerprints, song_ids=song_ids)

In [39]:
# Example of splitting (adjust the ratio according to your dataset size and needs).
# generate_pairs returns every positive pair followed by every negative pair, so
# slicing in the original order would take the validation set from the tail of
# the negatives only. Shuffle the indices first so both splits mix the classes.
indices = np.random.permutation(len(pairs))
split_point = int(len(pairs) * 0.8)  # 80% for training, 20% for validation
train_indices, val_indices = indices[:split_point], indices[split_point:]

pairs_train, labels_train = pairs[train_indices], labels[train_indices]
pairs_val, labels_val = pairs[val_indices], labels[val_indices]

In [40]:
import numpy as np

# Stand-in dataset: random fingerprints plus random song IDs.
# Replace this with actual loading or generation of your dataset.
fingerprints = np.random.random_sample((100, 128, 44, 1))  # 100 fingerprints of shape (128, 44, 1)
song_ids = np.random.randint(0, 10, size=100)              # 100 song IDs drawn from 0..9

In [41]:
def generate_pairs(fingerprints, song_ids):
    """
    Build a labelled set of fingerprint pairs in a single pass over i < j.

    A pair with equal song IDs is kept as a positive (label 1). A pair with
    different song IDs is kept as a negative (label 0) only while there are
    fewer negatives than positives collected so far, which keeps the classes
    roughly balanced (there may end up being fewer negatives than positives).

    Parameters:
    - fingerprints: Indexable collection of fingerprints.
    - song_ids: Indexable collection of song IDs, aligned with fingerprints.

    Returns:
    - pairs: np.array of [fingerprint_a, fingerprint_b], all positives first,
      then all negatives.
    - labels: np.array of 1/0 labels in the same order as pairs.
    """
    positive_pairs = []
    negative_pairs = []

    n = len(fingerprints)
    for i in range(n):
        for j in range(i + 1, n):
            pair = [fingerprints[i], fingerprints[j]]
            if song_ids[i] == song_ids[j]:
                positive_pairs.append(pair)  # Similar
            elif len(negative_pairs) < len(positive_pairs):  # To balance the dataset
                negative_pairs.append(pair)  # Dissimilar

    # Build labels from the final ordering. Appending labels inside the loop
    # would interleave 1s and 0s in discovery order and misalign them with
    # the positives-then-negatives concatenation below.
    pairs = np.array(positive_pairs + negative_pairs)
    labels = np.array([1] * len(positive_pairs) + [0] * len(negative_pairs))
    return pairs, labels

In [42]:
# Build the labelled pair dataset with the generate_pairs function defined above
pairs, labels = generate_pairs(fingerprints, song_ids)

# Randomly order the pair indices, then cut them 80% / 20% into train / validation
indices = np.arange(len(pairs))
np.random.shuffle(indices)
split_point = int(len(pairs) * 0.8)
train_indices, val_indices = indices[:split_point], indices[split_point:]

# Index pairs and labels with the same index sets so they stay aligned
pairs_train, pairs_val = pairs[train_indices], pairs[val_indices]
labels_train, labels_val = labels[train_indices], labels[val_indices]

In [43]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, Flatten, Dense, Lambda
from tensorflow.keras import backend as K

def create_base_network(input_shape):
    """
    Build the feature-extraction network that both branches share.

    Two Conv2D(3x3, relu) + MaxPooling2D(2x2) stages with 64 then 128 filters,
    followed by Flatten and a Dense(128, relu) layer.
    """
    inputs = Input(shape=input_shape)

    features = inputs
    for filters in (64, 128):
        features = Conv2D(filters, (3, 3), activation='relu')(features)
        features = MaxPooling2D((2, 2))(features)

    flat = Flatten()(features)
    embedding = Dense(128, activation='relu')(flat)
    return Model(inputs, embedding)

def euclidean_distance(vects):
    """
    Row-wise Euclidean distance between a pair of tensors.

    `vects` is a two-element sequence; squared differences are summed over
    axis 1 with keepdims=True.
    """
    left, right = vects
    squared_norm = K.sum(K.square(left - right), axis=1, keepdims=True)
    # Clamp to epsilon before the square root so the result is never sqrt(0)
    clamped = K.maximum(squared_norm, K.epsilon())
    return K.sqrt(clamped)

def eucl_dist_output_shape(shapes):
    """
    Output shape of the Euclidean distance layer.

    Takes the two input shapes and returns (first dimension of the first shape, 1).
    """
    first_shape, _ = shapes
    leading_dim = first_shape[0]
    return (leading_dim, 1)

# Shape of one fingerprint fed to each branch (example value, adjust to your fingerprint shape)
input_shape = (128, 44, 1)
base_network = create_base_network(input_shape)

# One input tensor for each side of a pair
input_a = Input(shape=input_shape)
input_b = Input(shape=input_shape)

# Both inputs go through the same `base_network` instance,
# so the two branches share one set of weights
processed_a = base_network(input_a)
processed_b = base_network(input_b)

# Lambda layer computing the Euclidean distance between the two feature vectors
distance = Lambda(
    euclidean_distance,
    output_shape=eucl_dist_output_shape,
)([processed_a, processed_b])

# Single sigmoid unit on top of the distance produces the similarity score
prediction = Dense(1, activation='sigmoid')(distance)

# Two-input, one-output model
model = Model(inputs=[input_a, input_b], outputs=prediction)

# Compile with Adam, binary cross-entropy and accuracy
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Show the layer-by-layer summary
model.summary()

In [44]:
# Configure training: Adam optimizer, binary cross-entropy loss, accuracy metric
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [45]:
# ModelCheckpoint is not imported by any earlier cell shown in this notebook;
# import it here so the cell runs on a fresh kernel instead of raising NameError.
from tensorflow.keras.callbacks import ModelCheckpoint

# Save only the weights, keeping the epoch with the lowest validation loss.
# NOTE(review): this path is relative ('User/...', no leading slash), whereas the
# audio path used elsewhere starts with '/Users/...' — confirm the intended location.
checkpoint_filepath = 'User/chamudi/Desktop/weights.weights.h5'  # Adjusted extension
model_checkpoint_callback = ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=True,
    monitor='val_loss',
    mode='min',
    save_best_only=True)

In [46]:
# ModelCheckpoint is not imported by any earlier cell shown in this notebook;
# import it here so the cell runs on a fresh kernel instead of raising NameError.
from tensorflow.keras.callbacks import ModelCheckpoint

# Save the entire model, keeping the epoch with the lowest validation loss.
# NOTE(review): this path is relative ('User/...', no leading slash), whereas the
# audio path used elsewhere starts with '/Users/...' — confirm the intended location.
checkpoint_filepath = 'User/chamudi/Desktop/model.keras'  # Updated extension for full model saving
model_checkpoint_callback = ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=False,  # Saving the entire model
    monitor='val_loss',
    mode='min',
    save_best_only=True)

In [47]:
from tensorflow.keras.callbacks import EarlyStopping

# Stop training after 5 epochs with no improvement in validation loss
early_stopping_callback = EarlyStopping(
    patience=5,
    monitor='val_loss',
    mode='min',
    verbose=1,
)

In [48]:
# Train the two-input model: index 0 / index 1 along axis 1 of the pair arrays
# feed the first / second input respectively. Epochs and batch size are tunable.
history = model.fit(
    x=[pairs_train[:, 0], pairs_train[:, 1]],
    y=labels_train,
    batch_size=64,
    epochs=20,
    validation_data=([pairs_val[:, 0], pairs_val[:, 1]], labels_val),
    callbacks=[model_checkpoint_callback, early_stopping_callback],
    verbose=1,
)

Epoch 1/20
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 221ms/step - accuracy: 0.4818 - loss: 0.6952 - val_accuracy: 0.4787 - val_loss: 0.7029
Epoch 2/20
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 202ms/step - accuracy: 0.5143 - loss: 0.6797 - val_accuracy: 0.4787 - val_loss: 0.7183
Epoch 3/20
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 201ms/step - accuracy: 0.5209 - loss: 0.6638 - val_accuracy: 0.4787 - val_loss: 0.7293
Epoch 4/20
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 200ms/step - accuracy: 0.4919 - loss: 0.6604 - val_accuracy: 0.4787 - val_loss: 0.7316
Epoch 5/20
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 201ms/step - accuracy: 0.5232 - loss: 0.6427 - val_accuracy: 0.4787 - val_loss: 0.7365
Epoch 6/20
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 203ms/step - accuracy: 0.5137 - loss: 0.6388 - val_accuracy: 0.4787 - val_loss: 0.7390
Epoch 6: early stopping


In [50]:
# Random stand-in test set (inputs assumed to have shape 128x44x1)
import numpy as np

num_test_samples = 100
pairs_test = np.random.rand(*(num_test_samples, 2, 128, 44, 1))         # 100 pairs of test fingerprints
labels_test = np.random.randint(low=0, high=2, size=num_test_samples)  # 100 random binary labels

# Separate each pair into the two arrays the model takes as inputs
left_input_test = np.array(pairs_test[:, 0])
right_input_test = np.array(pairs_test[:, 1])

# Score the model on the stand-in test set
test_loss, test_accuracy = model.evaluate(
    [left_input_test, right_input_test],
    labels_test,
    verbose=1,
)

print(f"Test Loss: {test_loss}")
print(f"Test Accuracy: {test_accuracy}")

[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step - accuracy: 0.5639 - loss: 0.6918
Test Loss: 0.6956056356430054
Test Accuracy: 0.5400000214576721


In [51]:
# Evaluate directly from pairs_test, slicing the pair axis instead of using pre-split arrays
test_loss, test_accuracy = model.evaluate(
    x=[pairs_test[:, 0], pairs_test[:, 1]],
    y=labels_test,
    verbose=1,
)

print(f"Test Loss: {test_loss}")
print(f"Test Accuracy: {test_accuracy}")

[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step - accuracy: 0.5639 - loss: 0.6918
Test Loss: 0.6956056356430054
Test Accuracy: 0.5400000214576721


In [52]:
# Predicted scores for each test pair
predictions = model.predict([left_input_test, right_input_test])

# Scores strictly above 0.5 become 1, everything else 0
binary_predictions = np.greater(predictions, 0.5).astype("int32")

[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step


In [53]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Compare the thresholded predictions against the true binary labels
precision, recall, f1 = (
    precision_score(labels_test, binary_predictions),
    recall_score(labels_test, binary_predictions),
    f1_score(labels_test, binary_predictions),
)

print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000


  _warn_prf(average, modifier, msg_start, len(result))
