<a href="https://colab.research.google.com/github/gyanasluitel/Music_Genre_Classification/blob/main/preparing_final_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import pandas as pd
import numpy as np

In [2]:
import librosa

In [3]:
import math

In [4]:
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json

In [5]:
!kaggle datasets download -d andradaolteanu/gtzan-dataset-music-genre-classification

Downloading gtzan-dataset-music-genre-classification.zip to /content
 99% 1.20G/1.21G [00:11<00:00, 94.0MB/s]
100% 1.21G/1.21G [00:11<00:00, 116MB/s] 


In [None]:
!unzip gtzan-dataset-music-genre-classification.zip

In [8]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [9]:
path = os.getcwd()
DATASET_PATH = os.path.join(path, 'Data/genres_original')
DATASET_PATH

'/content/Data/genres_original'

In [10]:
#path to sub_data
SUB_DATASET_PATH = os.path.join(os.getcwd(),'drive/MyDrive/Sub_Data/genres_original')
SUB_DATASET_PATH

'/content/drive/MyDrive/Sub_Data/genres_original'

In [11]:
def generating_headers(n_mfcc):
  headers = []
  for i in range(1,n_mfcc+1):
    headers.append(f'mfcc_{i}')
  return headers

In [12]:
def prepare_dataset(datapath, n_mfcc=13, n_fft=248, hop_length=512, num_segments=5):
    """
    Extracts mffcs feature from music dataset and saves them into a dataframe
    along with the genre associated with each audio file.
    """
    #Defining constants
    SAMPLE_RATE = 22050                                 #no of data samples taken per second
    TRACK_DURATION = 30                                 #duration of the track
    SAMPLES_PER_TRACK = SAMPLE_RATE * TRACK_DURATION    #total samples per track

    final_df = pd.DataFrame() # contains all the data samples
    num_samples_per_segment = int(SAMPLES_PER_TRACK / num_segments)
    expected_num_mfcc_vectors_per_segment = math.ceil(num_samples_per_segment/hop_length)
    
    for dirpath, dirnames, filenames in os.walk(DATASET_PATH):
        #only process if the path is not the current working directory
        if dirpath is not DATASET_PATH:
            dirpath_components = dirpath.split('/')
            semantic_label = dirpath_components[-1]
            print(f'\nProcessing: {semantic_label}')

            single_file_df = pd.DataFrame()
            #processing each file within a specific genre
            for file in filenames:
                file_path = os.path.join(dirpath, file)
                label = file_path.split('/')[-2]

                try:
                    #load the audio file
                    signal, sr = librosa.load(file_path, sr = SAMPLE_RATE)

                    #process all segments extracting mfcc and storing data
                    for segment in range(num_segments):
                        #setting the start and finish index of a sample for each segment
                        start_sample = num_samples_per_segment * segment
                        finish_sample = start_sample + num_samples_per_segment

                        #extracting mfcc features for a single segment
                        mfcc = librosa.feature.mfcc(signal[start_sample:finish_sample],
                                                   sr = sr,
                                                   n_mfcc = n_mfcc,
                                                   n_fft = n_fft,
                                                   hop_length = hop_length)
                        mfcc = mfcc.T

                        #store mfcc feature for a single segment only if the number of vectors
                        #for each segment contains expected number of vectors
                        if len(mfcc) == expected_num_mfcc_vectors_per_segment:
                            segment_df = pd.DataFrame(data = mfcc,
                                                     columns= generating_headers(n_mfcc),
                                                     index = range(len(mfcc)))
                            segment_df['Labels'] = [label] * mfcc.shape[0]
                            #nparray.append(mfcc)
                            #concatenating segment dataframe into a single file df
                            single_file_df = pd.concat([single_file_df, segment_df],
                                                      axis=0,
                                                      sort=False,
                                                      ignore_index = True)                        
                except Exception as e:
                    print(f"Error Message: {e}")
            final_df = pd.concat([final_df, single_file_df],
                         axis=0,
                         sort = False,
                         ignore_index = True)
        print('Done with preparing dataset')
    return final_df


In [13]:
df = prepare_dataset(datapath=DATASET_PATH, num_segments=10)

Done with preparing dataset

Processing: reggae


  "Empty filters detected in mel frequency basis. "


Done with preparing dataset

Processing: jazz




Error Message: 


  "Empty filters detected in mel frequency basis. "


Done with preparing dataset

Processing: disco
Done with preparing dataset

Processing: hiphop
Done with preparing dataset

Processing: classical
Done with preparing dataset

Processing: metal
Done with preparing dataset

Processing: pop
Done with preparing dataset

Processing: country
Done with preparing dataset

Processing: rock
Done with preparing dataset

Processing: blues
Done with preparing dataset


In [14]:
df

Unnamed: 0,mfcc_1,mfcc_2,mfcc_3,mfcc_4,mfcc_5,mfcc_6,mfcc_7,mfcc_8,mfcc_9,mfcc_10,mfcc_11,mfcc_12,mfcc_13,Labels
0,-536.042786,32.067459,-49.450306,25.776079,-11.531991,-18.413410,-8.967739,-4.582458,-0.501133,6.618028,12.905916,6.469436,12.735432,reggae
1,-392.235077,28.644016,-73.791199,50.155655,1.933116,-14.058987,1.360568,-14.130725,-4.516984,-18.869156,12.337111,0.084416,-19.625988,reggae
2,-451.695129,17.615837,-53.177261,43.919796,28.546822,-8.240944,3.578257,-8.234414,-9.060442,-26.781670,-4.912760,1.545562,-2.422036,reggae
3,-477.528839,33.314350,-58.991947,36.357792,26.536734,-0.675872,1.368691,-16.134178,-0.317810,-8.654249,-9.246617,-24.636469,-10.302074,reggae
4,-504.078796,35.871197,-48.558861,50.370205,2.761873,-12.647474,7.050649,-2.322120,-6.039096,-31.120485,-4.710315,-3.640983,-0.203701,reggae
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1298175,-455.102631,24.158625,-63.375328,65.074524,3.704499,40.963463,-21.325134,10.105320,-16.304668,-9.928100,-23.563305,4.619114,15.669947,blues
1298176,-447.722656,2.117634,-44.335674,58.148350,12.036886,21.317839,-22.656757,19.766155,-27.212273,-20.932671,-30.610203,16.540192,9.672492,blues
1298177,-449.050079,10.812412,-50.106941,54.213680,1.793259,21.684978,-25.272818,8.188376,-16.318310,9.493264,-36.492809,12.093601,3.433430,blues
1298178,-437.535187,12.875460,-50.128567,54.451469,7.296531,35.563461,-19.915752,4.126825,-17.346703,5.894392,-34.753819,12.200981,-2.462997,blues


In [15]:
df['Labels'].unique()

array(['reggae', 'jazz', 'disco', 'hiphop', 'classical', 'metal', 'pop',
       'country', 'rock', 'blues'], dtype=object)

In [16]:
len(df['Labels'].unique())

10

# Saving Dataframe in a pickle file

In [17]:
import pickle

In [19]:
import os

In [20]:
os.getcwd()

'/content'

In [21]:
os.listdir(os.getcwd())

['.config',
 'kaggle.json',
 'Data',
 'gtzan-dataset-music-genre-classification.zip',
 'drive',
 'sample_data']

In [22]:
df.to_pickle('data.pickle')

In [23]:
DATASET_PATH

'/content/Data/genres_original'