<h1 style="font-family: Helvetica; font-size: 29px">Load Necassary Libraries </h1>

The code imports necessary libraries for audio processing and machine learning. It defines functions for loading and processing audio data, and also for creating and training a neural network model for music genre classification. The neural network model is based on a convolutional neural network architecture and uses mel-spectrograms as input features. Finally, the code loads the model and uses it to predict the genre of a given audio file.

In [1]:
import os
from pydub import AudioSegment
import librosa
import soundfile as sf
from matplotlib import pyplot as plt
import librosa.display
import numpy as np
import io
from keras.models import load_model
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
import tensorflow as tf


# MP3 to WAV Function

In [2]:
def MP3TOWAV(mp3_folder, wav_folder):
    for filename in os.listdir(mp3_folder):
        if filename.endswith(".mp3"):
            ## Set input and output file paths
            mp3_path = os.path.join(mp3_folder, filename)
            wav_path = os.path.join(wav_folder, os.path.splitext(filename)[0] + ".wav")
            
            # Load MP3 file using pydub
            audio = AudioSegment.from_file(mp3_path)

            # Export the audio file to a WAV file
            audio.export(wav_path, format="wav")

# Normalization function

In [3]:
def Norm(input_folder, output_folder):
    for filename in os.listdir(input_folder):
        if filename.endswith(".wav"):
            input_path = os.path.join(input_folder, filename)
            output_path = os.path.join(output_folder, filename)

            # Load audio file using librosa
            y, sr = librosa.load(input_path, sr=None)

            # Normalize audio using peak amplitude normalization
            y_normalized = librosa.util.normalize(y)

            # Save normalized audio to file
            sf.write(output_path, y_normalized, sr)

# Segmentation function

In [23]:
def Segment(input_folder, output_folder):
    segment_length = 10
    hop_length = 2
    
    for filename in os.listdir(input_folder):
        if filename.endswith(".wav"):
            ## Set input and output file paths
            input_path = os.path.join(input_folder, filename)
            output_path = os.path.join(output_folder, filename)

            # Load audio file using librosa
            y, sr = librosa.load(input_path, sr=None)

            # Calculate segment frame and sample lengths
            segment_frames = int(segment_length * sr)
            hop_frames = int(hop_length * sr)
            total_frames = len(y)
            total_segments = int((total_frames - segment_frames) / hop_frames) + 1

            # Segment audio using a sliding window
            for i in range(total_segments):
                # Calculate start and end frame indices for current segment
                start_frame = i * hop_frames
                end_frame = start_frame + segment_frames

                # Extract audio segment
                y_segment = y[start_frame:end_frame]

                # Set output file path for current segment
                output_segment_path = output_path.replace(".wav", f"_segment{i}.wav")

                # Save audio segment to file
                #librosa.output.write_wav(output_segment_path, y_segment, sr)
                sf.write(output_segment_path, y_segment, sr)



# Extracting MFCC function

In [5]:
def MFCC(input_folder, output_folder):
    # Extracting MFCC
    # Set up parameters for MFCC extraction
    n_fft = 2048
    hop_length = 512
    n_mels = 128
    n_mfcc = 20
    
    # Loop over files in input folder
    for file_name in os.listdir(input_folder):
        # Check if file is a WAV file
        if not file_name.endswith('.wav'):
            continue

        # Load audio file using librosa
        file_path = os.path.join(input_folder, file_name)
        y, sr = librosa.load(file_path, sr=None)

        # Compute MFCCs
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_fft=n_fft, hop_length=hop_length, n_mels=n_mels, n_mfcc=n_mfcc)

        # Save MFCCs to file
        output_path = os.path.join(output_folder, file_name.replace('.wav', '_mfcc.npy'))
        np.save(output_path, mfcc)
    

# Converting to WAV

In [21]:
input_folder = r"G:\F2022-7\NEw\Train 8"
wav_out = r"G:\F2022-7\NEw\Train 8\wav"

MP3TOWAV(input_folder, wav_out)

# Normalization

In [22]:
input_folder = r"G:\F2022-7\NEw\Train 8\wav"
norm_out = r"G:\F2022-7\NEw\Train 8\norm"
Norm(input_folder, norm_out)

# Segmentation 

In [24]:
input_folder = r"G:\F2022-7\NEw\Train 8\norm"
seg_out = r"G:\F2022-7\NEw\hjk"

Segment(input_folder, seg_out)

# MFCC

In [22]:
input_folder = r"D:\Education\Semester 6\CSE 321 Project Based Learning on CSE\Composers Classification\Segmented\El-Waily"
mfcc_out = r"D:\Education\Semester 6\CSE 321 Project Based Learning on CSE\Composers Classification\Valid\Valid 0\MFCC"
MFCC(input_folder, mfcc_out)

# Padding

In [1]:
def preparing_data(data_path):
    # define path to MFCC data and number of classes
    num_classes = 11
    counter = 1
    # define the desired shape
    target_shape = (20, 938)

    # load and preprocess data
    X = []
    y = []
    for composer_idx in range(num_classes):
        composer_path = os.path.join(data_path, f'Test {composer_idx}')
        for file in os.listdir(composer_path):
            file_path = os.path.join(composer_path, file)
            mfcc = np.load(file_path, allow_pickle=True)
            # pad or truncate to ensure consistent shape
            padding = [(0, 0), (0, target_shape[1] - mfcc.shape[1])]

            # pad the MFCC array with zeros
            if mfcc.shape[1] < target_shape[1]:
                mfcc = np.pad(mfcc, padding, mode='constant')
            
            X.append(mfcc)
            y.append(composer_idx)
            print(f"{counter} : {mfcc.shape}")
            counter += 1

    # convert to numpy arrays and one-hot encode labels
    X = np.array(X)
    y = np.array(y)
    y_onehot = to_categorical(y, num_classes=num_classes)

    # Split the data into training and test sets
    #X_train, X_test, y_train, y_test = train_test_split(X, y_onehot, test_size=0.1, stratify=y, random_state=42)

    # Split the test set into validation and test sets
    #X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, stratify=y_test, random_state=42)

    return X, y_onehot


In [4]:
folder = r"D:\Education\Semester 6\CSE 321 Project Based Learning on CSE\Composers Classification\Test_MFCC"

X_test, y_test = preparing_data(folder)

1 : (20, 938)
2 : (20, 938)
3 : (20, 938)
4 : (20, 938)
5 : (20, 938)
6 : (20, 938)
7 : (20, 938)
8 : (20, 938)
9 : (20, 938)
10 : (20, 938)
11 : (20, 938)
12 : (20, 938)
13 : (20, 938)
14 : (20, 938)
15 : (20, 938)
16 : (20, 938)
17 : (20, 938)
18 : (20, 938)
19 : (20, 938)
20 : (20, 938)
21 : (20, 938)
22 : (20, 938)
23 : (20, 938)
24 : (20, 938)
25 : (20, 938)
26 : (20, 938)
27 : (20, 938)
28 : (20, 938)
29 : (20, 938)
30 : (20, 938)
31 : (20, 938)
32 : (20, 938)
33 : (20, 938)
34 : (20, 938)
35 : (20, 938)
36 : (20, 938)
37 : (20, 938)
38 : (20, 938)
39 : (20, 938)
40 : (20, 938)
41 : (20, 938)
42 : (20, 938)
43 : (20, 938)
44 : (20, 938)
45 : (20, 938)
46 : (20, 938)
47 : (20, 938)
48 : (20, 938)
49 : (20, 938)
50 : (20, 938)
51 : (20, 938)
52 : (20, 938)
53 : (20, 938)
54 : (20, 938)
55 : (20, 938)
56 : (20, 938)
57 : (20, 938)
58 : (20, 938)
59 : (20, 938)
60 : (20, 938)
61 : (20, 938)
62 : (20, 938)
63 : (20, 938)
64 : (20, 938)
65 : (20, 938)
66 : (20, 938)
67 : (20, 938)
68 :

  X = np.array(X)


ValueError: could not broadcast input array from shape (20,938) into shape (20,)

In [27]:
path = r"D:\Education\Semester 6\CSE 321 Project Based Learning on CSE\Composers Classification\Preprocessed Data 2\y_val"
np.save(path, y_val)

In [16]:
X_train.shape

(22705, 20, 938)

In [3]:
X_train = np.load(r"D:\Education\Semester 6\CSE 321 Project Based Learning on CSE\Composers Classification\Preprocessed Data\X_train.npy", allow_pickle=True)
X_val = np.load(r"D:\Education\Semester 6\CSE 321 Project Based Learning on CSE\Composers Classification\Preprocessed Data\X_val.npy", allow_pickle=True)
X_test = np.load(r"D:\Education\Semester 6\CSE 321 Project Based Learning on CSE\Composers Classification\Preprocessed Data\X_test.npy", allow_pickle=True)
y_train = np.load(r"D:\Education\Semester 6\CSE 321 Project Based Learning on CSE\Composers Classification\Preprocessed Data\y_train.npy", allow_pickle=True)
y_val = np.load(r"D:\Education\Semester 6\CSE 321 Project Based Learning on CSE\Composers Classification\Preprocessed Data\y_val.npy", allow_pickle=True)
y_test = np.load(r"D:\Education\Semester 6\CSE 321 Project Based Learning on CSE\Composers Classification\Preprocessed Data\y_test.npy", allow_pickle=True)

In [28]:
print(X_train.shape)
print(X_val.shape)
print(y_train.shape)
print(y_val.shape)


(22705, 20, 938)
(2924, 20, 938)
(22705, 11)
(2924, 11)


# Building Model

In [29]:
import tensorflow as tf

# define model architecture
model = tf.keras.Sequential([
    tf.keras.layers.Conv2D(32, (3,3), activation='relu', input_shape=(20, 938, 1)),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.MaxPooling2D((2,2)),
    tf.keras.layers.Dropout(0.3),

    tf.keras.layers.Conv2D(64, (3,3), activation='relu'),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.MaxPooling2D((2,2)),
    tf.keras.layers.Dropout(0.3),

    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(512, activation='relu'),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(11, activation='softmax')
])



# Fitting

In [30]:
# compile model
model.compile(loss='categorical_crossentropy',
              optimizer=tf.keras.optimizers.Adam(),
              metrics=['accuracy'])

# train model
history = model.fit(X_train, y_train, epochs=30, batch_size=32, validation_data=(X_val, y_val))

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [31]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 18, 936, 32)       320       
                                                                 
 batch_normalization (BatchN  (None, 18, 936, 32)      128       
 ormalization)                                                   
                                                                 
 max_pooling2d (MaxPooling2D  (None, 9, 468, 32)       0         
 )                                                               
                                                                 
 dropout (Dropout)           (None, 9, 468, 32)        0         
                                                                 
 conv2d_1 (Conv2D)           (None, 7, 466, 64)        18496     
                                                                 
 batch_normalization_1 (Batc  (None, 7, 466, 64)       2

In [32]:
model.save(r"D:\Education\Semester 6\CSE 321 Project Based Learning on CSE\Composers Classification\Model_1")



INFO:tensorflow:Assets written to: D:\Education\Semester 6\CSE 321 Project Based Learning on CSE\Composers Classification\Model_1\assets


INFO:tensorflow:Assets written to: D:\Education\Semester 6\CSE 321 Project Based Learning on CSE\Composers Classification\Model_1\assets


# Validation

In [4]:
model = load_model(r"D:\Education\Semester 6\CSE 321 Project Based Learning on CSE\Composers Classification\Model")
# Evaluate the model on the test set
loss, accuracy = model.evaluate(X_test, y_test, verbose=2)
print('Test loss:', loss)
print('Test accuracy:', accuracy)

50/50 - 4s - loss: 0.1329 - accuracy: 0.9718 - 4s/epoch - 82ms/step
Test loss: 0.13290460407733917
Test accuracy: 0.9718397855758667


# Prediction

In [2]:
def MP3TOWAV(mp3_file):
    if mp3_file.endswith(".mp3"):
        # Load MP3 file using pydub
        audio = AudioSegment.from_file(mp3_file)
        
        # Export the audio file to a WAV file in memory
        wav_file = io.BytesIO()
        audio.export(wav_file, format="wav")
        wav_file.seek(0)

        # Load the WAV file into memory using librosa.load()
        y, sr = librosa.load(wav_file, sr=None)
        return y, sr

def Norm(audio):
    return librosa.util.normalize(audio)

def Segment(audio, sr):
    segment_length = 10
    hop_length = 2
    Segmented = []
    
    # Calculate segment frame and sample lengths
    segment_frames = int(segment_length * sr)
    hop_frames = int(hop_length * sr)
    total_frames = len(audio)
    total_segments = int((total_frames - segment_frames) / hop_frames) + 1

    # Segment audio using a sliding window
    for i in range(total_segments):
        # Calculate start and end frame indices for current segment
        start_frame = i * hop_frames
        end_frame = start_frame + segment_frames

        # Extract audio segment
        y_segment = audio[start_frame:end_frame]
        Segmented.append(y_segment)
    return Segmented
    
def MFCC(audio, sr):
    # Extracting MFCC
    # Set up parameters for MFCC extraction
    n_fft = 2048
    hop_length = 512
    n_mels = 128
    n_mfcc = 20

    # Compute MFCCs
    mfcc = librosa.feature.mfcc(y=audio, sr=sr, n_fft=n_fft, hop_length=hop_length, n_mels=n_mels, n_mfcc=n_mfcc)
    
    num_classes=11
    target_shape=(20,938)
    
    #Load and preprocess data
    X = []
    y = []
    
    #Padding
    padding = [(0,0), (0, target_shape[1] - mfcc.shape[1])]
    mfcc = np.pad(mfcc, padding, mode='constant')
    
    return mfcc
    

def predict(model_path, file_path):
    audio, sr = MP3TOWAV(file_path)
    norm_audio = Norm(audio)
    segmented_audio = Segment(norm_audio, sr)
    mfcc_audio = []
    for audio_index in segmented_audio:
        audio_mfcc = MFCC(audio_index, sr)
        mfcc_audio.append(audio_mfcc)
    
    arr = np.array(mfcc_audio)
    model = load_model(model_path)
    y = model.predict(arr)
    #predicted_labels = np.argmax(y, axis=1)
    #int(sum(predicted_labels) / len(predicted_labels))
    return y


In [14]:
path = r"D:\Education\Semester 6\CSE 321 Project Based Learning on CSE\Composers Classification\Test\Khalik Hena - Baligh Hamdy.mp3"
model_path = r"D:\Education\Semester 6\CSE 321 Project Based Learning on CSE\Composers Classification\Model"
y = predict(model_path, path)



In [15]:
y = np.argmax(y, axis=1)
print(y)

[ 3  3  3  3  3  6  6  6 10  3  6  7  7  3  9  9  9  9  9 10 10  3  4  4
  4  4  4  4  4  4  4  4  4  4  3  3 10  3  4  4  4  3  1  3  6  6  6  6
  3  3  6  3  7  3  3  3  3  3  3  9  9  9  9  9  9  4  3  7  7  7  7  3
  4  3  3  3  3  7  3  9 10  9  9  9  9  9  9  9  9  9 10 10 10 10 10  3
  3 10  3 10  3  3  7  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9  9
  9  9  3  3  3  3 10  9  3  7  7  3  9  9  9  9  9  9  3  9  9  3 10  9
  3  3  7  7  7  7  7  7  3  3  4  3  3  7  3  7  2  2  2  2  2]


In [6]:
path = r"D:\Education\Semester 6\CSE 321 Project Based Learning on CSE\Composers Classification\Train"
for folder in os.listdir(path):
    folder_path = os.path.join(path, folder)
    for file_name in os.listdir(folder_path):
        y = folder.split()
        print(y)

['Train', '0']
['Train', '0']
['Train', '0']
['Train', '0']
['Train', '0']
['Train', '0']
['Train', '0']
['Train', '0']
['Train', '0']
['Train', '0']
['Train', '0']
['Train', '0']
['Train', '0']
['Train', '1']
['Train', '1']
['Train', '1']
['Train', '1']
['Train', '1']
['Train', '1']
['Train', '10']
['Train', '10']
['Train', '10']
['Train', '10']
['Train', '10']
['Train', '10']
['Train', '10']
['Train', '10']
['Train', '10']
['Train', '10']
['Train', '10']
['Train', '2']
['Train', '2']
['Train', '2']
['Train', '2']
['Train', '2']
['Train', '2']
['Train', '2']
['Train', '2']
['Train', '3']
['Train', '3']
['Train', '3']
['Train', '3']
['Train', '3']
['Train', '3']
['Train', '3']
['Train', '3']
['Train', '3']
['Train', '3']
['Train', '3']
['Train', '3']
['Train', '4']
['Train', '4']
['Train', '4']
['Train', '4']
['Train', '4']
['Train', '4']
['Train', '4']
['Train', '4']
['Train', '4']
['Train', '4']
['Train', '4']
['Train', '4']
['Train', '4']
['Train', '4']
['Train', '4']
['Train', '4']

In [11]:
x = np.load(r"D:\Education\Semester 6\CSE 321 Project Based Learning on CSE\Composers Classification\MFCC\composer_0\1998_segment0_mfcc.npy")
x

array([[-720.06165  , -720.06165  , -720.06165  , ..., -367.31427  ,
        -353.92166  , -343.24902  ],
       [   0.       ,    0.       ,    0.       , ...,   89.510605 ,
         115.980316 ,  133.58696  ],
       [   0.       ,    0.       ,    0.       , ..., -123.17783  ,
         -98.4527   ,  -74.27963  ],
       ...,
       [   0.       ,    0.       ,    0.       , ...,   23.159985 ,
          13.582274 ,    4.8609514],
       [   0.       ,    0.       ,    0.       , ...,   13.891649 ,
           8.62771  ,    2.4468532],
       [   0.       ,    0.       ,    0.       , ...,    7.8098965,
           5.9895554,    5.614708 ]], dtype=float32)