In [1]:
import tensorflow as tf
import os
import shutil
import numpy as np
import matplotlib.pyplot as plt
import librosa
import cv2

: 

: 

In [20]:
# pip install --upgrade tensorflow==2.12.0
# Display the installed TensorFlow version so it can be compared with the pin above.
tf.__version__

'2.12.0'

In [21]:
# Load the saved encoder model from the app's model directory.
encoder = tf.keras.saving.load_model("../app/model/AEv3encoder3seconds")



In [23]:
# Print the encoder's layers, output shapes and parameter counts.
encoder.summary()

Model: "encoder"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_3 (InputLayer)        [(None, 128, 128, 3)]     0         
                                                                 
 conv2d_3 (Conv2D)           (None, 64, 64, 32)        896       
                                                                 
 conv2d_4 (Conv2D)           (None, 32, 32, 64)        18496     
                                                                 
 conv2d_5 (Conv2D)           (None, 16, 16, 128)       73856     
                                                                 
 flatten_1 (Flatten)         (None, 32768)             0         
                                                                 
 dense_2 (Dense)             (None, 100)               3276900   
                                                                 
Total params: 3,370,148
Trainable params: 3,370,148
Non-tra

In [24]:
def create_melspectrogram(y: np.ndarray, sr: int, output_file: str, array_path: str = None):
    """
    Render the mel spectrogram of an audio signal to an image file.

    The spectrogram is converted to decibels and drawn without axes, ticks or
    margins on a canvas sized so that, at dpi=100, the image has one pixel per
    spectrogram cell (width = number of frames, height = number of mel bands).

        Args:
            y : np.ndarray
                audio time-series, passed to ``librosa.feature.melspectrogram``
            sr : number > 0 [scalar]
                sampling rate of ``y``
            output_file: str or pathlib.Path
                file to store the diagram; its parent directory is created
                when missing
            array_path: str or None
                optional path; when given, the (non-dB) mel spectrogram array
                is also written there with ``np.save``

        Returns:
            tuple ``(melspectrogram_array, output_file)``: the array returned by
            ``librosa.feature.melspectrogram`` (before dB conversion) and the
            path the image was written to.
    """
    # dirname is "" for a bare file name; os.makedirs("") would raise, so only
    # create a directory when there actually is one in the path.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    melspectrogram_array = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=2048, hop_length=512)
    mel = librosa.power_to_db(melspectrogram_array)

    # Width and height of the spectrogram in inches (pixels / dpi). The size is
    # given to this figure only, so the global plt.rcParams["figure.figsize"]
    # is left untouched for every other plot in the notebook.
    fig = plt.figure(figsize=(mel.shape[1] / 100, mel.shape[0] / 100))
    try:
        # A single frameless axes covering the whole canvas: no border, no ticks.
        ax = fig.add_axes([0., 0., 1., 1.], frameon=False, xticks=[], yticks=[])
        # NOTE(review): this needs `librosa.display` to be reachable through the
        # bare `import librosa`; older librosa releases required an explicit
        # `import librosa.display` -- confirm for the installed version.
        librosa.display.specshow(mel, ax=ax)   # ,cmap='gray_r'
        fig.savefig(output_file, dpi=100)
    finally:
        # Always release the figure, even when drawing or saving fails.
        plt.close(fig)

    if array_path is not None:
        np.save(array_path, melspectrogram_array)
    return melspectrogram_array, output_file

In [25]:
# def resize_image(image, new_width, new_height):
#     # Resize the image
#     resized_image = cv2.resize(image, (new_width, new_height, 3), interpolation=cv2.INTER_LINEAR)

#     # Return the resized image
#     return resized_image

In [26]:
def predict_audio(audio_path: str, encoder_model, output_file: str = "/tmp/melspectrogram.jpg"):
    """
    Encode a 30-second window of an audio file with ``encoder_model``.

    The audio is loaded with librosa, normalised to exactly 30 seconds
    (zero-padded at the end when shorter, centre-cropped when longer), rendered
    to a mel-spectrogram image via ``create_melspectrogram``, and the first
    1280 pixel columns of that image are cut into ten consecutive 128x128x3
    tiles that are fed to the encoder.

        Args:
            audio_path: str
                path of the audio file to load
            encoder_model:
                object exposing ``predict(batch)``; it receives one array of
                shape (10, 128, 128, 3)
            output_file: str
                where the intermediate spectrogram image is written; callers
                that run concurrently should pass distinct paths

        Returns:
            list with ten entries, one encoder output per tile, in time order.

        Raises:
            OSError: the spectrogram image could not be read back.
            ValueError: the spectrogram image is not 128 pixels high or is
                narrower than 1280 pixels.
    """
    clip_seconds = 30   # length of the analysed window
    tile_size = 128     # tiles are tile_size x tile_size pixels
    n_tiles = 10        # number of tiles cut from the spectrogram image

    y, sr = librosa.load(audio_path)

    clip_len = int(sr * clip_seconds)
    if len(y) < clip_len:
        # file shorter than 30 seconds: pad the end with zeros
        new_array_30_secs = np.zeros(clip_len)
        new_array_30_secs[:len(y)] = y
    else:
        # extract the middle 30 seconds of the file
        half_len = clip_len // 2
        middle = len(y) // 2
        new_array_30_secs = y[middle - half_len:middle + half_len]

    _, image_path = create_melspectrogram(new_array_30_secs, sr, output_file)

    # cv2.imread signals failure by returning None instead of raising.
    # NOTE(review): cv2.imread yields BGR channel order and the pixels are passed
    # on unscaled -- confirm this matches how the encoder's training images
    # were prepared.
    image = cv2.imread(image_path)
    if image is None:
        raise OSError(f"could not read spectrogram image {image_path!r}")

    needed_width = n_tiles * tile_size
    if image.shape[0] != tile_size or image.shape[1] < needed_width:
        raise ValueError(
            f"spectrogram image has shape {image.shape}; expected height {tile_size} "
            f"and width of at least {needed_width}"
        )

    # Stack the tiles into one batch so the encoder runs once instead of
    # once per tile.
    tiles = np.stack([
        image[:, index * tile_size:(index + 1) * tile_size, :]
        for index in range(n_tiles)
    ])
    return list(encoder_model.predict(tiles))

In [27]:
# Path of the audio clip used to try out predict_audio below.
audiofile = "../data/80x27s-islandy-loop-925bpm-132431.mp3"

In [28]:
# Run the sample clip through the loaded encoder.
test = predict_audio(audiofile, encoder)



In [29]:
# Stack the returned predictions to inspect their overall shape.
np.array(test).shape

(10, 100)