In [3]:
import PIL
from PIL import Image
import librosa
import librosa.display
import numpy as np
from numpy import asarray
import IPython.display as ipd
import matplotlib.pyplot as plt
from pydub import AudioSegment
import pickle
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn import preprocessing

import torch
import torchaudio
import torchvision
from PIL import Image

import tensorflow as tf
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Dropout

In [None]:
# Sanity check: load one hip-hop clip and show an inline audio player for it.
audio_path = 'data/genres_original/hiphop/hiphop.00005.wav'
# x is the audio time series as a numpy array; sr is the sampling rate.
x , sr = librosa.load(audio_path)
# Last expression of the cell, so the player is rendered as the cell output.
ipd.Audio(audio_path)

In [None]:
def create_dataset(path, genre_list, dataset, excluded_suffixes=('25_30', '_0_50')):
    """Build three-channel log-mel spectrograms for every WAV file of each genre.

    Every ``.wav`` file found under ``path/<genre>`` is loaded and turned into
    three mel spectrograms computed with different window/hop sizes. Each
    spectrogram is log-scaled, resized to 128 x 1500 and stacked as one channel
    of the final array.

    Parameters
    ----------
    path : str
        Root folder that contains one sub-folder per genre.
    genre_list : iterable of str
        Names of the genre sub-folders; each name is also stored as the label.
    dataset : dict
        Must hold lists under the keys ``'song_id'``, ``'Mel_spectrograms'``
        and ``'label'``. The lists are appended to in place.
    excluded_suffixes : tuple of str, optional
        Files whose id (file name without extension) ends with one of these
        suffixes are skipped entirely. Pass ``()`` to keep every file.

    Returns
    -------
    dict
        The same ``dataset`` object that was passed in.
    """
    import os  # local import so the function does not rely on an extra top-level import

    window_sizes = [25, 50, 100]  # analysis window lengths, in milliseconds
    hop_sizes = [10, 25, 50]      # hop lengths, in milliseconds
    eps = 1e-6                    # keeps log() finite for empty mel bins
    excluded_suffixes = tuple(excluded_suffixes)

    for genre in genre_list:
        # Returns the full path of each file in the genre folder.
        files = librosa.util.find_files(os.path.join(path, genre), ext=['wav'])

        for song in files:
            # File name without extension; independent of where the data lives on disk.
            song_id = os.path.splitext(os.path.basename(song))[0]

            # Skip the whole record (id, spectrogram and label together) so the
            # three lists always stay aligned index by index.
            if song_id.endswith(excluded_suffixes):
                continue

            x, sr = librosa.load(song)
            clip = torch.Tensor(x)  # converted once per song, reused for all channels
            specs_ = []

            for window_ms, hop_ms in zip(window_sizes, hop_sizes):
                # Convert milliseconds to a whole number of samples at this sampling rate.
                window_length = int(round(window_ms * sr / 1000))
                hop_length = int(round(hop_ms * sr / 1000))

                # NOTE(review): n_fft=2205 equals the largest (100 ms) window only when
                # sr == 22050; win_length must not exceed n_fft - confirm if sr can differ.
                spec = torchaudio.transforms.MelSpectrogram(
                    sample_rate=sr,
                    n_fft=2205,
                    win_length=window_length,
                    hop_length=hop_length,
                    n_mels=128,
                )(clip)
                spec = np.log(spec.numpy() + eps)
                # Resize through PIL so every channel ends up with the same 128 x 1500 shape.
                spec = np.asarray(torchvision.transforms.Resize((128, 1500))(Image.fromarray(spec)))
                specs_.append(spec)

            dataset['song_id'].append(song_id)
            dataset["Mel_spectrograms"].append(np.dstack(specs_))
            dataset['label'].append(genre)

    return dataset

In [None]:
# Container filled in place by create_dataset. The 'audio' list is not populated by it.
dataset = { 'song_id':[], 'audio':[], 'Mel_spectrograms':[], 'label':[] }
# Root folder expected to contain one sub-folder per genre listed below.
path = 'data/data_fiveSeconds/'
genre_list = ['jazz', 'rock', 'hiphop', 'metal', 'pop', 'disco', 'blues', 'classical', 'country', 'reggae']
# create_dataset returns the dict it was given, so data_diz and dataset are the same object.
data_diz = create_dataset(path, genre_list, dataset)  

In [None]:
# Serialize the dataset dict returned by create_dataset to a pickle file on disk.
with open('DATA_Mel_spectrograms.pkl', 'wb') as f:
    pickle.dump(data_diz, f)

In [4]:
# Load a previously pickled dataset dict into DATA.
# NOTE(review): this reads '..._small.pkl' from an input folder, not the file written
# by the dump cell above - confirm which dataset is actually intended.
# SECURITY: pickle.load can execute arbitrary code; only load files from a trusted source.
with open('../input/data-mels/DATA_Mel_spectrograms_small.pkl', 'rb') as f:
    DATA = pickle.load(f) 


In [5]:
def prepare_datasets(inputs, targets, split_size, random_state=None):
    """Standardize the inputs and split them into train, validation and test sets.

    Each sample is standardized independently: mean and standard deviation are
    computed over axes 1 and 2 of ``inputs`` (kept as broadcastable dimensions),
    so no statistics are shared between samples or between the resulting splits.

    Parameters
    ----------
    inputs : numpy.ndarray
        Array with the sample index on axis 0 and at least three dimensions.
    targets : array-like
        One target per sample, aligned with ``inputs`` on axis 0.
    split_size : float or int
        ``test_size`` forwarded to both calls of ``train_test_split``. The
        validation set is taken from the full data; the test set is then taken
        from what remains, so with 0.1 the split is roughly 81% / 10% / 9%.
    random_state : int or None, optional
        Seed forwarded to both splits. ``None`` (default) keeps the splits random.

    Returns
    -------
    tuple
        ``(inputs_train, inputs_val, inputs_test,
        targets_train, targets_val, targets_test)``.
    """
    # Scale the data.
    mean = inputs.mean(axis=(1, 2), keepdims=True)
    std = inputs.std(axis=(1, 2), keepdims=True)
    # A constant slice has zero std; dividing by it would produce NaN/inf values.
    std[std == 0] = 1.0
    inputs = (inputs - mean) / std

    # Creating a validation set and a test set.
    inputs_train, inputs_val, targets_train, targets_val = train_test_split(
        inputs, targets, test_size=split_size, random_state=random_state)
    inputs_train, inputs_test, targets_train, targets_test = train_test_split(
        inputs_train, targets_train, test_size=split_size, random_state=random_state)

    return inputs_train, inputs_val, inputs_test, targets_train, targets_val, targets_test


def design_model_1(input_shape, targets):
    """Create a frozen DenseNet121 backbone followed by a small dense classifier.

    Parameters
    ----------
    input_shape : tuple
        Shape of one input sample, passed to DenseNet121 as ``input_shape``.
    targets : array-like
        Labels; only the number of distinct values is used, to size the
        softmax output layer.

    Returns
    -------
    tf.keras.models.Sequential
        The assembled, not yet compiled, model.
    """
    # ImageNet-pretrained convolutional backbone without its classification head.
    backbone = tf.keras.applications.densenet.DenseNet121(
        weights="imagenet",
        include_top=False,
        input_shape=input_shape,
    )
    # Freeze the backbone so only the layers stacked on top of it are trained.
    backbone.trainable = False

    keras_layers = tf.keras.layers
    n_classes = len(np.unique(targets))

    # Classification head: pool -> normalize -> dropout -> flatten -> dense -> softmax.
    stack = [
        backbone,
        keras_layers.MaxPooling2D(pool_size=(3, 3), strides=(2, 2), padding='same'),
        keras_layers.BatchNormalization(),
        keras_layers.Dropout(rate=0.3),
        keras_layers.Flatten(),
        keras_layers.Dense(units=64, activation='relu'),
        keras_layers.Dense(units=n_classes, activation='softmax'),
    ]

    return tf.keras.models.Sequential(stack)

def make_prediction(model, X, y, idx):
    """Predict the genre of one sample and report it next to its true label.

    Parameters
    ----------
    model : object
        Anything exposing ``predict(batch)`` that returns one row of class
        scores per input sample.
    X : array-like
        Input samples, indexed on axis 0.
    y : array-like
        Integer-encoded targets aligned with ``X``.
    idx : int
        Index of the sample to classify (negative indices are supported).

    Returns
    -------
    str
        Name of the predicted genre, or the class index as a string when the
        index has no entry in the lookup table.
    """
    # NOTE(review): assumes integer codes follow this order. The notebook encodes
    # labels with pd.factorize, which numbers them by first appearance - verify
    # that the label order in the data matches this table.
    genre_dict = {
        0 : 'jazz',
        1 : 'rock',
        2 : 'hiphop',
        3 : "metal",
        4 : "pop",
        5 : "disco",
        6 : "blues",
        7 : "classical",
        8 : "country",
        9 : "reggae",
        }

    # Run inference on the requested sample only, not on the whole set;
    # the new leading axis turns the single sample into a batch of one.
    sample = np.asarray(X[idx])[np.newaxis, ...]
    predictions = model.predict(sample)
    predicted_index = int(np.argmax(predictions[0]))
    target_index = int(np.ravel(y[idx])[0])

    predicted_genre = genre_dict.get(predicted_index, str(predicted_index))
    target_genre = genre_dict.get(target_index, str(target_index))
    print("Target: {}, Predicted label: {}".format(target_genre, predicted_genre))

    return predicted_genre

In [6]:
# Turn the lists stored in the loaded dataset dict into arrays.
X = np.array(DATA['Mel_spectrograms'])
y = np.array(DATA['label'])
# pd.factorize maps each distinct label to an integer code; [0] keeps only the codes.
y_encoded = pd.factorize(y.reshape(X.shape[0],))[0]
# Reshape the codes into a column vector with one row per sample.
y_encoded = y_encoded.reshape(X.shape[0],1)
# 0.1 is the split_size used by both successive splits inside prepare_datasets.
inputs_train, inputs_val, inputs_test, targets_train, targets_val, targets_test = prepare_datasets(X, y_encoded, 0.1)

In [10]:
if __name__ == "__main__":

    # Build the classifier; `y` holds the raw labels and is only used to count the classes.
    model = design_model_1(inputs_train.shape[1:], y)

    # Selection of the optimizer, loss type and metrics for performance evaluation.
    # `learning_rate` is the supported argument name; the old `lr` alias is deprecated
    # and rejected by newer Keras releases.
    model.compile(optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001),
                  loss='sparse_categorical_crossentropy',
                  metrics = ['acc']
                  )

    model.summary()

    # Training the model.
    history = model.fit(inputs_train, targets_train,
                        validation_data=(inputs_val, targets_val),
                        epochs=20,
                        batch_size=32
                        )

    # Testing the model on never seen before data (a single sample of the test split).
    make_prediction(model, inputs_test, targets_test, 24)

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
densenet121 (Functional)     (None, 4, 46, 1024)       7037504   
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 2, 23, 1024)       0         
_________________________________________________________________
batch_normalization_1 (Batch (None, 2, 23, 1024)       4096      
_________________________________________________________________
dropout_1 (Dropout)          (None, 2, 23, 1024)       0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 47104)             0         
_________________________________________________________________
dense_2 (Dense)              (None, 64)                3014720   
_________________________________________________________________
dense_3 (Dense)              (None, 10)               

2022-05-21 09:17:35.731406: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 1863936000 exceeds 10% of free system memory.
2022-05-21 09:17:37.630211: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 1863936000 exceeds 10% of free system memory.


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [9]:

def design_model_1(input_shape, targets):
    """Create a frozen DenseNet121 backbone with a regularized dense classifier.

    Compared with a plain dense head, this variant inserts an extra Dropout
    and BatchNormalization between the 64-unit layer and the softmax output.
    NOTE: re-defining this name replaces any earlier ``design_model_1`` in the
    running kernel.

    Parameters
    ----------
    input_shape : tuple
        Shape of one input sample, passed to DenseNet121 as ``input_shape``.
    targets : array-like
        Labels; only the number of distinct values is used, to size the
        softmax output layer.

    Returns
    -------
    tf.keras.models.Sequential
        The assembled, not yet compiled, model.
    """
    # ImageNet-pretrained convolutional backbone without its classification head.
    backbone = tf.keras.applications.densenet.DenseNet121(
        weights="imagenet",
        include_top=False,
        input_shape=input_shape,
    )
    # Freeze the backbone so only the layers stacked on top of it are trained.
    backbone.trainable = False

    keras_layers = tf.keras.layers
    n_classes = len(np.unique(targets))

    # Classification head, listed in the order the data flows through it.
    stack = [
        backbone,
        keras_layers.MaxPooling2D(pool_size=(3, 3), strides=(2, 2), padding='same'),
        keras_layers.BatchNormalization(),
        keras_layers.Dropout(rate=0.3),
        keras_layers.Flatten(),
        keras_layers.Dense(units=64, activation='relu'),
        keras_layers.Dropout(rate=0.3),
        keras_layers.BatchNormalization(),
        keras_layers.Dense(units=n_classes, activation='softmax'),
    ]

    return tf.keras.models.Sequential(stack)



In [10]:
if __name__ == "__main__":

    # Build the classifier; `y` holds the raw labels and is only used to count the classes.
    model = design_model_1(inputs_train.shape[1:], y)

    # Selection of the optimizer, loss type and metrics for performance evaluation.
    # `learning_rate` is the supported argument name; the old `lr` alias is deprecated
    # and rejected by newer Keras releases.
    model.compile(optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001),
                  loss='sparse_categorical_crossentropy',
                  metrics = ['acc']
                  )

    model.summary()

    # Training the model.
    history = model.fit(inputs_train, targets_train,
                        validation_data=(inputs_val, targets_val),
                        epochs=20,
                        batch_size=32
                        )

    # Testing the model on never seen before data (a single sample of the test split).
    make_prediction(model, inputs_test, targets_test, 24)

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
densenet121 (Functional)     (None, 4, 46, 1024)       7037504   
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 2, 23, 1024)       0         
_________________________________________________________________
batch_normalization_1 (Batch (None, 2, 23, 1024)       4096      
_________________________________________________________________
dropout_1 (Dropout)          (None, 2, 23, 1024)       0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 47104)             0         
_________________________________________________________________
dense_2 (Dense)              (None, 64)                3014720   
_________________________________________________________________
dropout_2 (Dropout)          (None, 64)               

2022-05-21 09:46:19.539311: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 1863936000 exceeds 10% of free system memory.
2022-05-21 09:46:21.435616: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 1863936000 exceeds 10% of free system memory.


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
