In [None]:
import os
import random
import string

import cv2
import keras.backend as K
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from keras.callbacks import ModelCheckpoint,EarlyStopping
from keras.layers import Conv2D,MaxPool2D,GlobalMaxPool2D,Flatten,Dense,Dropout,Input,Lambda
from keras.models import Sequential,Model
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

In [None]:
def create_spectrogram(clip,sample_rate,save_path):
  """Render the mel spectrogram of an audio clip and save it as an image file.

  The figure is drawn without axes or frame so the saved image contains only
  the spectrogram itself.

  Args:
    clip: audio samples, passed to ``librosa.feature.melspectrogram`` as ``y``.
    sample_rate: sampling rate of ``clip``, passed as ``sr``.
    save_path: destination handed to ``Figure.savefig``.
  """
  plt.interactive(False)
  fig=plt.figure(figsize=[0.72,0.72])
  try:
    ax=fig.add_subplot(111)
    ax.axes.get_xaxis().set_visible(False)
    ax.axes.get_yaxis().set_visible(False)
    ax.set_frame_on(False)
    S=librosa.feature.melspectrogram(y=clip,sr=sample_rate)
    # Draw on this figure's own axes instead of relying on pyplot's implicit
    # "current axes" state.
    librosa.display.specshow(librosa.power_to_db(S,ref=np.max),ax=ax)
    fig.savefig(save_path,dpi=400,bbox_inches='tight',pad_inches=0)
  finally:
    # This function is called once per audio fragment; always release the
    # figure, even when librosa or savefig raises, so figures do not pile up.
    plt.close(fig)

In [None]:
def get_encoder(input_size):
  """Build the convolutional encoder used for both branches of the siamese network.

  Architecture: two blocks of (Conv2D, Dropout, Conv2D, MaxPool2D, Dropout)
  followed by GlobalMaxPool2D, which reduces each of the 64 channels of the
  last convolution to a single value.

  Args:
    input_size: shape of one input image without the batch dimension,
      e.g. ``(150,150,3)``.

  Returns:
    The (uncompiled) ``Sequential`` encoder model.
  """
  model=Sequential()
  # Honour the caller's input shape (it used to be hard-coded) and declare it
  # with an explicit Input, rather than the `input_shape=` layer argument that
  # recent Keras versions warn about.
  model.add(Input(shape=input_size))
  model.add(Conv2D(32,(3,3),activation='relu'))
  model.add(Dropout(0.5))
  model.add(Conv2D(64,(3,3),activation='relu'))
  model.add(MaxPool2D(2,2))
  model.add(Dropout(0.5))

  model.add(Conv2D(64,(3,3),activation='relu'))
  model.add(Dropout(0.5))
  model.add(Conv2D(64,(3,3),activation='relu'))
  model.add(MaxPool2D(2,2))
  model.add(Dropout(0.5))

  # Global max pooling yields one value per channel: output shape (None, 64).
  model.add(GlobalMaxPool2D())

  return model

In [None]:
def get_siamese_network(encoder,input_size):
  """Wrap ``encoder`` into a two-input siamese model that scores pair similarity.

  Both inputs go through the same ``encoder`` instance (shared weights). The
  element-wise absolute difference of the two embeddings is fed to a single
  sigmoid unit.

  Args:
    encoder: model mapping an image batch to a 2-D embedding batch.
    input_size: shape of one input image without the batch dimension.

  Returns:
    An uncompiled ``Model`` with inputs ``[input1, input2]`` and one sigmoid output.
  """
  input1=Input(input_size)
  input2=Input(input_size)

  encoder_l=encoder(input1)
  encoder_r=encoder(input2)

  # Lambda needs the output shape without the batch dimension. Derive it from
  # the encoder output instead of hard-coding (64,), so changing the encoder's
  # embedding width cannot silently desynchronise this layer.
  embedding_shape=tuple(encoder_l.shape[1:])
  # NOTE(review): a Lambda wrapping a Python lambda is serialised as code; loading
  # a saved model containing it may require disabling Keras safe mode - confirm
  # before relying on reloading checkpoints.
  L1_layer = Lambda(lambda tensors:tf.math.abs(tensors[0] - tensors[1]), output_shape=embedding_shape)
  L1_distance = L1_layer([encoder_l, encoder_r])

  output=Dense(1,activation='sigmoid')(L1_distance)
  siam_model=Model(inputs=[input1,input2],outputs=output)
  return siam_model

# Build the encoder and the siamese model for 150x150 three-channel images,
# then compile for binary classification of image pairs.
IMAGE_SHAPE=(150,150,3)
encoder=get_encoder(IMAGE_SHAPE)
siamese_net=get_siamese_network(encoder,IMAGE_SHAPE)
siamese_net.compile(optimizer='adam',loss='binary_crossentropy')

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [None]:
def different_label_index(X):
    """Draw two distinct random indices into ``X``.

    Args:
        X: any sized collection; only ``len(X)`` is used.

    Returns:
        Tuple ``(idx1, idx2)`` with ``idx1 != idx2`` and both in ``[0, len(X))``.

    Raises:
        ValueError: if ``X`` has fewer than two elements. Without this check a
            single-element ``X`` would make the rejection loop spin forever.
    """
    n=len(X)
    if n<2:
        raise ValueError(f"need at least 2 items to draw distinct indices, got {n}")
    idx1=0
    idx2=0
    # Rejection sampling: redraw both indices until they differ.
    while idx1==idx2:
        idx1=np.random.randint(0,n)
        idx2=np.random.randint(0,n)
    return idx1,idx2

def batch_generator(X_data,batch_size):
  """Endlessly yield batches of image pairs with same/different targets.

  The first ``batch_size//2`` pairs of each batch are "same" pairs (target 1),
  the remaining pairs are "different" pairs (target 0).

  NOTE(review): a "same" pair is the identical image twice, and "different"
  only means two distinct positions in ``X_data``; no other notion of label is
  available here - confirm this is the intended training signal.

  Args:
    X_data: indexable collection of images, each assignable into a
      ``(150,150,3)`` slot.
    batch_size: number of pairs per batch.

  Yields:
    ``((left, right), targets)`` where ``left`` and ``right`` have shape
    ``(batch_size,150,150,3)`` and ``targets`` has shape ``(batch_size,)``.

  Raises:
    ValueError: on first iteration, if ``X_data`` holds fewer than two images.
  """
  if len(X_data)<2:
    raise ValueError("batch_generator needs at least two images to build 'different' pairs")

  half=batch_size//2
  while True:
    data=[np.zeros((batch_size,150,150,3)) for _ in range(2)]
    tar=[np.zeros(batch_size,)]

    #Generating same pairs.
    for i in range(0,half):
      idx1=np.random.randint(0,len(X_data))
      img1 = X_data[idx1]

      data[0][i,:,:,:]=img1
      data[1][i,:,:,:]=img1
      tar[0][i]=1

    #Generating different pairs.
    for k in range(half,batch_size):
      # Take BOTH images from the distinct index pair. Previously the first image
      # came from an unrelated random index, so the two images could coincide
      # while still being labelled 0.
      idx1_diff,idx2_diff=different_label_index(X_data)

      data[0][k,:,:,:]=X_data[idx1_diff]
      data[1][k,:,:,:]=X_data[idx2_diff]
      tar[0][k]=0

    yield (data[0], data[1]),tar[0]

In [None]:
# Output folder for the generated spectrogram images.
os.makedirs('./Spectrograms/', exist_ok=True)

# Source folder with the audio files; only regular files are kept,
# sub-directories are skipped.
songs_path = '/content/Music/'
songs_list = []
for entry in os.listdir(songs_path):
    if os.path.isfile(os.path.join(songs_path, entry)):
        songs_list.append(entry)

# Read the songs, divide them into 10 s segments and create a spectrogram of each.

# Alphabet used when generating random spectrogram file names.
charsets = string.ascii_letters

def get_random_name():
    """Return 20 random characters from ``charsets`` followed by a random integer in [0, 1000)."""
    letters=[]
    for _ in range(20):
        letters.append(random.choice(charsets))
    suffix=np.random.randint(0,1000)
    return ''.join(letters)+str(suffix)

# Length of each fragment in seconds.
FRAGMENT_SECONDS=10

for song in songs_list:
    print(song)
    songfile,sr=librosa.load(os.path.join(songs_path,song))
    # Work in samples throughout. The previous code subtracted seconds from
    # sample offsets and used `end=st+10`, which made the last fragment
    # 10 samples long instead of 10 seconds.
    frag_len=sr*FRAGMENT_SECONDS
    n_fragments=len(songfile)//frag_len  # songs shorter than one fragment yield nothing
    for i in range(n_fragments):
        if i==n_fragments-1:
            # A few seconds usually remain after the last full fragment. Shift
            # the final window so it ends at the end of the song: it then holds
            # the leftover tail plus the missing part from the previous segment.
            end=len(songfile)
            st=end-frag_len
        else:
            st=i*frag_len
            end=st+frag_len
        songfrag=np.copy(songfile[st:end])
        specname=get_random_name()
        create_spectrogram(songfrag,sr,'./Spectrograms/'+specname+'.png')

Donell Jones - This Luv [cQUYe18YmSw].mp3




Maxwell - Fortunate [TRfzaBJhTto].mp3
Slow Dancing In A Burning Room (Live in L.A.) [32GZ3suxRn4].mp3
Alicia Keys - Un-thinkable (I'm Ready) (Official Video).mp3
January 28th [d15cxI5yx5c].mp3


In [None]:
# Sort the listing: os.listdir order is arbitrary, and the seeded split further
# down is only reproducible if its input order is.
raw_spec_files = sorted(os.listdir('./Spectrograms/'))
specfilelist = []
for filename in raw_spec_files:
    full_path = os.path.join('./Spectrograms/', filename)
    # Keep non-empty PNG files only. An exists()/getsize() check alone also lets
    # directories and unrelated files through.
    if not filename.lower().endswith('.png'):
        continue
    if os.path.isfile(full_path) and os.path.getsize(full_path) > 0:
        specfilelist.append(full_path)

# Fixed seed so that repeated runs see the same file order.
specfilelist = shuffle(specfilelist, random_state=42)

print(f"Found {len(specfilelist)} valid spectrogram files.")

Found 549 valid spectrogram files.


In [None]:
def load_and_preprocess_image(path):
    """Read an image file and return it as a 150x150 RGB array scaled by 1/255.

    Args:
        path: location of the image file, passed to ``cv2.imread``.

    Returns:
        The resized, channel-reordered image divided by 255.0, or ``None``
        (after printing a warning) when ``cv2.imread`` could not load the file.
    """
    bgr = cv2.imread(path)
    if bgr is not None:
        # OpenCV loads images in BGR channel order; convert to RGB before resizing.
        rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
        resized = cv2.resize(rgb, (150, 150))
        return resized / 255.0  # Normalize pixel values
    print(f"Warning: Could not load image at {path}")
    return None

# Load and preprocess all spectrograms once; files that fail to load are skipped.
loaded_images = []
for spec_path in specfilelist:
    img = load_and_preprocess_image(spec_path)
    if img is not None:
        loaded_images.append(img)

# Fail early with a clear message instead of an opaque error from the split.
if not loaded_images:
    raise ValueError("No spectrogram images could be loaded from specfilelist.")

# Stack into a single array of shape (n_images, 150, 150, 3).
preprocessed_spectrograms = np.array(loaded_images)

# Shuffle and split the preprocessed data (75% train / 25% test, fixed seed).
X_train_data, X_test_data = train_test_split(preprocessed_spectrograms, test_size=0.25, random_state=42)

print(f"Loaded {len(X_train_data)} training spectrograms and {len(X_test_data)} testing spectrograms.")

# X_train_data and X_test_data are consumed by the training cell.

Loaded 411 training spectrograms and 137 testing spectrograms.


In [None]:
batch_size=10

# Use the preprocessed data directly
X_train = X_train_data
X_test = X_test_data

# Guarantee at least one step per epoch even if a split holds fewer images than
# one batch (integer division would otherwise give 0 steps).
train_steps = max(1, len(X_train)//batch_size)
val_steps = max(1, len(X_test)//batch_size)

# Stop after 10 epochs without a val_loss improvement of at least 1e-4, and keep
# only the best model (lowest val_loss) on disk.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=10, min_delta=0.0001)
mc = ModelCheckpoint('embdmodel.keras', monitor='val_loss', verbose=1, save_best_only=True, mode='min')

# No `shuffle` argument: Keras ignores it for generator input, and the
# generators already draw random pairs for every batch.
history=siamese_net.fit(
    batch_generator(X_train,batch_size),
    steps_per_epoch=train_steps,
    epochs=50,
    validation_data=batch_generator(X_test,batch_size),
    validation_steps=val_steps,
    callbacks=[es,mc],
)

Epoch 1/50
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step - loss: 0.7129
Epoch 1: val_loss improved from inf to 0.67953, saving model to embdmodel.keras
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 68ms/step - loss: 0.7123 - val_loss: 0.6795
Epoch 2/50
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step - loss: 0.6398
Epoch 2: val_loss improved from 0.67953 to 0.65163, saving model to embdmodel.keras
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 51ms/step - loss: 0.6397 - val_loss: 0.6516
Epoch 3/50
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step - loss: 0.6487
Epoch 3: val_loss improved from 0.65163 to 0.63590, saving model to embdmodel.keras
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 51ms/step - loss: 0.6481 - val_loss: 0.6359
Epoch 4/50
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step - loss: 0.5951
Epoch 4: val_loss improved 