# Quran Reciter Identification (Speech Recognition) using Machine Learning
#### by Muhammad Khurram Chughtai

# 2_Data_Preparation
This notebook prepares the data for training in the following steps:
- Quick look at the downloaded data & select 5 reciters from the downloaded data
- Select the Sura/Aya for the reciters
- Mark the data for training/validation/test such that the same Sura/Aya are used for each reciter
- Select the feature(s) to be extracted
- Extract the feature(s)
- Save the train/val/test data

# Quran organization
Before looking at the data, it is important to understand how the Quran is organized, along with a few related terms that help make sense of the data.

The Quran is the holy book of Islam. It is organized as follows:
- The Quran is divided into "suras" (chapters), each with a unique name, e.g. the first sura is named "Al-Fatiha" (The Opening). There are 114 suras.
- Each "sura" (chapter) consists of 3 or more "ayas" (verses). Each "aya" (verse) consists of one or more Arabic sentences, e.g. the first sura, "Al-Fatiha", consists of 7 ayas. There are a total of 6,236 ayas across all 114 suras.

Let's start by loading all the helper procedures into the Notebook.
Loading helper procedures from helpers.py:
- Create a new cell & execute the following line
        %load helpers.py
- This will put the contents of the file in the cell. Then execute the cell to load everything in memory

**Note**: Start by cutting the cell below and replacing it with a new empty cell

In [2]:
# %load helpers.py
##################
# imports
import os
import csv
import re
import pathlib
import xml.etree.ElementTree as ET
import zipfile
import shutil
import librosa
import math
import numpy as np 
import pandas as pd 
import warnings
import audioread
import time
import matplotlib.pyplot as plt
import seaborn as sns
from pandas import ExcelWriter
from pydub.utils import mediainfo

##################
# Default Constants

# Where the per-reciter ZIP downloads live, and the name of the ZIP inside each reciter folder
zip_data_dir = '../DownloadedReciters'
zip_file_name = '000_versebyverse.zip'

# Root for processed/generated data; extracted audio goes into its "audio" sub-directory
data_dir = '../data'
audio_data_dir = os.path.join(os.getcwd(), data_dir, "audio")

# Quran meta data XML (sura index / number of ayas / start offset)
quran_meta_xml = os.path.join(os.getcwd(), 'Qurandata', "quran-data.xml")

# Bounds used when validating ranges: Sura indices start at 1, aya-in-Quran indices start at 0
SuraIndexMIN, SuraIndexMAX = 1, 114
AyaIndexMIN, AyaIndexMAX = 0, 6236

# Silence warnings in the notebook output, e.g. this one from librosa:
#   UserWarning: PySoundFile failed. Trying audioread instead.
# Note: no category/message is given, so every warning is ignored, not only librosa's.
warnings.filterwarnings(action='ignore')

##################
# Functions
def qsura_ayat_to_labels(suraFrom=None, suraTo=None, ayaFrom=None, ayaTo=None):
    """ Converts either a Sura range or an aya-in-Quran range to MP3 file labels.

       A Sura range takes precedence: ayaFrom/ayaTo are only used when suraFrom/suraTo
       are not both given. If neither pair is complete, two empty lists are returned.
       An out-of-range argument prints an ERROR line and also returns two empty lists.

       The Quran meta data is read from the module-level quran_meta_xml file; every
       <sura> element must carry the 'index', 'ayas' and 'start' attributes.

       :param suraFrom: An integer for Sura index to start from. Valid numbers are SuraIndexMIN to SuraIndexMAX (default: None)
       :type suraFrom: int
       :param suraTo: An integer for Sura index to end to (inclusive) (default: None)
       :type suraTo: int
       :param ayaFrom: An integer for aya-in-Quran index to start from. Valid numbers are AyaIndexMIN to AyaIndexMAX (default: None)
       :type ayaFrom: int
       :param ayaTo: An integer for aya-in-Quran index to end to (inclusive) (default: None)
       :type ayaTo: int
       :return: A list of labels in the form ['001001', '001002', ...], and the matching list of
                aya-in-Quran indices (sura 'start' + aya - 1). Both lists are filled in both modes.
       :rtype: list, list

    """

    # Return lists
    labels_list = list()
    ayainq = list()

    # Pick the selection mode and the bounds its arguments must respect
    if suraFrom is not None and suraTo is not None:
        use_sura = True
        range_args = (('suraFrom', suraFrom), ('suraTo', suraTo))
        idx_min, idx_max = SuraIndexMIN, SuraIndexMAX
    elif ayaFrom is not None and ayaTo is not None:
        use_sura = False
        range_args = (('ayaFrom', ayaFrom), ('ayaTo', ayaTo))
        idx_min, idx_max = AyaIndexMIN, AyaIndexMAX
    else:
        # No complete range given: nothing can be selected, so skip parsing the XML
        return labels_list, ayainq

    for arg_name, arg_value in range_args:
        if arg_value < idx_min or arg_value > idx_max:
            print("ERROR: {} not between {} and {}".format(arg_name, idx_min, idx_max))
            return labels_list, ayainq

    ##################
    # qmeta: Quran Meta Data (<root><suras><sura index= ayas= start= .../>...</suras></root>)
    qmeta_root = ET.parse(quran_meta_xml).getroot()
    for qmeta_sura in qmeta_root.findall('suras/sura'):
        sura_index = int(qmeta_sura.attrib.get('index'))
        sura_ayas = int(qmeta_sura.attrib.get('ayas'))
        sura_start = int(qmeta_sura.attrib.get('start'))

        if use_sura and not (suraFrom <= sura_index <= suraTo):
            continue

        for aya in range(1, sura_ayas + 1):
            # Position of this aya counted over the whole Quran
            aya_in_quran = sura_start + aya - 1
            if not use_sura and not (ayaFrom <= aya_in_quran <= ayaTo):
                continue
            ayainq.append(aya_in_quran)
            labels_list.append("{:03d}{:03d}".format(sura_index, aya))

    return labels_list, ayainq

def report_stats_zip_data(zip_data_dir=zip_data_dir, zip_file_name=zip_file_name):
    """ Reports statistics for the zipped data

       Prints one table row per reciter directory (zip size, number of archive members,
       number of .mp3 members). Entries of zip_data_dir that do not contain zip_file_name
       are reported and skipped, and are not part of the returned list.

       :param zip_data_dir: Directory containing one sub-directory per reciter
       :type zip_data_dir: str
       :param zip_file_name: Name of the zip file expected inside each reciter directory
       :type zip_file_name: str
       :return: A list of the directory (reciter) names that were reported
       :rtype: list

    """

    # Return list
    dir_names = list()

    # Directory names are also names of the reciters
    print("{:30s} {:10s} {:8s} {:9s}".format("Reciter name", "Data Size", "Files", "MP3 Files"))
    print("{:30s} {:10s} {:8s} {:9s}".format("============", "=========", "=====", "========="))
    for dd in os.listdir(zip_data_dir):
        # Each directory has one zip file called 000_versebyverse.zip
        dd_zip_file = os.path.join(zip_data_dir, dd, zip_file_name)
        if not os.path.isfile(dd_zip_file):
            # Stray files or incomplete downloads would otherwise abort the whole report
            print("Skipping (no {} found): {}".format(zip_file_name, dd))
            continue

        # Size
        dd_size_MB = os.path.getsize(dd_zip_file) / (1024 * 1024)

        # Read the member list once and close the archive right away
        with zipfile.ZipFile(dd_zip_file, 'r') as archive:
            archive_names = archive.namelist()
        num_files = len(archive_names)
        mp3_cnt = sum(1 for ff in archive_names if ff.endswith('.mp3'))

        print("{:30s} {:6.0f} MB {:6d} {:12d}".format(dd, dd_size_MB, num_files, mp3_cnt))
        dir_names.append(dd)
    return dir_names

# Directory size (https://stackoverflow.com/questions/1392413/calculating-a-directorys-size-using-python)
def get_dir_size(start_path = '.'):
    """ Adds up the sizes of all files below a directory (recursive).

       Symbolic links are not counted.

       :param start_path: Path to a directory whose size is needed
       :type start_path: str
       :return: Total size of the directory in bytes
       :rtype: int

    """

    # Every file path found while walking the tree
    file_paths = (
        os.path.join(folder, entry)
        for folder, _subfolders, entries in os.walk(start_path)
        for entry in entries
    )

    return sum(os.path.getsize(path) for path in file_paths if not os.path.islink(path))

def report_stats_audio_data(data_dir=audio_data_dir, verbose=False):
    """ Reports statistics for the audio data

       Prints one summary line per reciter directory and collects one row per MP3 file.
       A file that cannot be read still gets a row; the fields that could not be
       determined keep the placeholder value -1.

       Uses get_mp3_file_info(), so lbl_aya_dict needs to be defined before calling.

       :param data_dir: Directory containing one sub-directory of MP3 files per reciter
       :type data_dir: str
       :param verbose: If True, also print one line per MP3 file
       :type verbose: bool
       :return: DataFrame with audio directory details
       :rtype: DataFrame

    """

    column_names = ['ReciterName', 'FileName', 'DataSizeKB', 'BitRate', 'Channels',
                    'Mono/Stereo', 'Duration', 'FileNoExt', 'Sura', 'Aya', 'AyaInQuran']
    # One tuple per MP3 file, in column_names order
    records = list()

    # Directory names are also names of the reciters
    print("{:30s} {:10s} {:4s} {:4s} {:2s} {:6s} {:9s}".format("Reciter name/MP3 File", "Data Size", "MP3s", "kbps", "Ch", "Mono/S", "Duration(sec)"))
    print("{:30s} {:10s} {:4s} {:4s} {:2s} {:6s} {:9s}".format("==============================", "=========", "====", "====", "==", "======", "============="))
    for dd in os.listdir(data_dir):
        reciter_dir = os.path.join(data_dir, dd)
        if not os.path.isdir(reciter_dir):
            # Only reciter directories are reported
            continue

        mp3_files = [ff for ff in os.listdir(reciter_dir) if ff.endswith('.mp3')]
        reciter_size_KB = get_dir_size(reciter_dir) / (1024)
        print("{:30s} {:6.0f} KB {:5d}".format(dd, reciter_size_KB, len(mp3_files)))

        for ff in mp3_files:
            mp3_file = os.path.join(reciter_dir, ff)
            # Size of this file (computed before printing so the verbose line is not stale)
            file_size_KB = os.path.getsize(mp3_file) / (1024)

            duration = channel_layout = bit_rate = channels = bit_rate_kbps = -1
            try:
                # Length of the audio at its native sampling rate
                y, sr = librosa.load(mp3_file, sr=None)
                duration = len(y) / sr

                # Container level details
                info = mediainfo(mp3_file)
                channel_layout = info['channel_layout']
                bit_rate = int(info['bit_rate'])
                channels = int(info['channels'])
                bit_rate_kbps = bit_rate / 1000

            except Exception:
                # Best effort: keep the -1 placeholders for whatever could not be read
                print("Couldn't process, skipping: ", mp3_file)

            if verbose == True:
                # str() because channel_layout is the int placeholder -1 after a failure
                print("{:>30s} {:6.0f} KB {:5s} {:4.0f} {:2d} {:6s} {:13.1f}".format(ff, file_size_KB, "", bit_rate_kbps, channels, str(channel_layout), duration))

            filenoext, sura, aya, ayainq = get_mp3_file_info(ff)
            records.append((dd, ff, int(file_size_KB), int(bit_rate_kbps), channels,
                            channel_layout, duration, filenoext, sura, aya, ayainq))

    # Create a DataFrame with all the info
    df = pd.DataFrame(records, columns=column_names)

    return df

def get_mp3_file_info(file_name):
    """ Splits an MP3 file name into its label, Sura, Aya and aya-in-Quran parts.

       The aya-in-Quran value is looked up in the lbl_aya_dict dictionary, so that
       variable needs to be defined before calling.

       :param file_name: MP3 file name
       :type file_name: str
       :return: label, sura, aya, ayainquran
       :rtype: str, int, int, int

    File name should be ######.mp3, e.g. 001004.mp3
        SuraAya    = 001004
        Sura       = 1
        Aya        = 4
        AyaInQuran = 3
    """

    # Label is the file name without its extension: 3 digits of Sura followed by the Aya
    label, _extension = os.path.splitext(file_name)
    sura_digits, aya_digits = label[:3], label[3:]

    return label, int(sura_digits), int(aya_digits), int(lbl_aya_dict[label])

def audio_data_initialize(dir_name=audio_data_dir):
    """ Initialize audio data directory

       Deletes dir_name together with everything inside it if it already exists, then
       creates it again empty. Missing parent directories are created as well.

       :param dir_name: Directory name to initialize
       :type dir_name: str
       :return: dir_name
       :rtype: str

    """

    target = pathlib.Path(dir_name)

    # If directory exists, delete the directory and all of its content
    if target.exists():
        print("Directory exists, deleting :", dir_name)
        shutil.rmtree(dir_name)

    # Create the directory (parents=True so a missing data directory does not abort the run)
    print("Creating directory :", dir_name)
    target.mkdir(parents=True)

    return dir_name

def populate_audio_files(zip_data_dir=zip_data_dir, zip_file_name=zip_file_name, 
        audio_data_dir=audio_data_dir, reciters=None, suraFrom=None, suraTo=None,
        ayaFrom=None, ayaTo=None):
    """ Populate audio files for the reciters in the given directory.

       For every selected reciter directory in zip_data_dir, extracts the MP3 files of the
       requested Sura or Aya range from the reciter's zip file into audio_data_dir/<reciter>.
       Files missing from an archive are reported and skipped.

       :param zip_data_dir: Directory containing the zipped data
       :type zip_data_dir: str
       :param zip_file_name: Name of the zip file in the directory
       :type zip_file_name: str
       :param audio_data_dir: Directory into which the MP3 files are extracted (one sub-directory per reciter)
       :type audio_data_dir: str
       :param reciters: Name(s) of the reciters. Only their data will be processed, rest will be ignored.
                        A single name may be given as a string. None selects every reciter.
       :type reciters: list of strings
       :param suraFrom: Starting sura
       :type suraFrom: int
       :param suraTo: Ending sura
       :type suraTo: int
       :param ayaFrom: Starting aya
       :type ayaFrom: int
       :param ayaTo: Ending aya
       :type ayaTo: int
       :return: None
       :rtype: None

    """

    # Labels (file names without extension) of the requested Sura/Aya range
    audio_labels, _ayainq_list = qsura_ayat_to_labels(suraFrom=suraFrom, suraTo=suraTo, 
        ayaFrom=ayaFrom, ayaTo=ayaTo)

    # A bare string would otherwise be matched by substring instead of by name
    if isinstance(reciters, str):
        reciters = [reciters]

    for dd in os.listdir(zip_data_dir):
        if reciters is not None and dd not in reciters:
            continue

        # Each directory has one zip file called 000_versebyverse.zip
        dd_zip_file = os.path.join(zip_data_dir, dd, zip_file_name)
        if not os.path.isfile(dd_zip_file):
            print("ERROR: Couldn't find file: ", dd_zip_file)
            continue

        print("Found reciter: ", dd)
        # Create the directory (re-running over an existing directory is allowed)
        reciter_dir = os.path.join(audio_data_dir, dd)
        print("Creating directory :", reciter_dir)
        pathlib.Path(reciter_dir).mkdir(parents=True, exist_ok=True)

        # Size
        dd_size_MB = os.path.getsize(dd_zip_file) / (1024 * 1024)

        num_files_extracted = 0
        with zipfile.ZipFile(dd_zip_file, 'r') as archive:
            # Read the member list once; the set makes each lookup below constant time
            archive_names = archive.namelist()
            archive_name_set = set(archive_names)
            num_files = len(archive_names)
            mp3_cnt = sum(1 for ff in archive_names if ff.endswith('.mp3'))

            print("{:30s} {:6.0f} MB {:6d} {:12d}".format(dd, dd_size_MB, num_files, mp3_cnt))

            for lbl in audio_labels:
                mp3_file = lbl + ".mp3"
                if mp3_file not in archive_name_set:
                    print("ERROR: Couldn't find file: ", mp3_file)
                    continue
                archive.extract(mp3_file, path=reciter_dir)
                num_files_extracted += 1
        print("{} files extracted".format(num_files_extracted))
    print()

    return None


def _pad_audio_to_duration(y, sr, pad_duration):
    """ Zero-pads the signal so it lasts at least pad_duration seconds; never shortens it. """
    if pad_duration is not None and pad_duration > len(y) / sr:
        # size has to be an integer sample count; passed by keyword because newer
        # librosa releases no longer accept it positionally
        return librosa.util.fix_length(y, size=int(pad_duration * sr))
    return y


def _compute_audio_features(y, sr, n_mfcc, n_fft, hop_length, features_list, normalization):
    """ Computes the requested features in the fixed column order used by extract_audio_features.

       :return: list of (column_names, matrix) pairs, one per requested feature
       :rtype: list
    """
    feat = librosa.feature
    # (name in features_list, column prefix, one numbered column per row?, how to compute).
    # hop_length is passed to every feature so that all of them produce the same number of
    # frames; the signal is always passed by keyword (y=...).
    specs = [
        ('mfcc', 'mfcc', True,
            lambda: feat.mfcc(y=y, sr=sr, hop_length=hop_length, n_mfcc=n_mfcc, n_fft=n_fft)),
        ('zcr', 'zcr', False,
            lambda: feat.zero_crossing_rate(y=y, hop_length=hop_length)),
        ('spectral_center', 'spectral_center', False,
            lambda: feat.spectral_centroid(y=y, sr=sr, hop_length=hop_length)),
        ('spectral_rolloff', 'spectral_rolloff', False,
            lambda: feat.spectral_rolloff(y=y + 0.01, sr=sr, hop_length=hop_length)),
        ('chroma', 'chroma', True,
            lambda: feat.chroma_stft(y=y, sr=sr, hop_length=hop_length)),
        ('spectral_bandwidth_2', 'spectral_bandwidth_2', False,
            lambda: feat.spectral_bandwidth(y=y + 0.01, sr=sr, hop_length=hop_length)),
        ('spectral_bandwidth_3', 'spectral_bandwidth_3', False,
            lambda: feat.spectral_bandwidth(y=y + 0.01, sr=sr, hop_length=hop_length, p=3)),
        ('spectral_bandwidth_4', 'spectral_bandwidth_4', False,
            lambda: feat.spectral_bandwidth(y=y + 0.01, sr=sr, hop_length=hop_length, p=4)),
        ('spectral_contrast', 'spcontr', True,
            lambda: feat.spectral_contrast(y=y, sr=sr, hop_length=hop_length)),
        ('spect', 'spect', True,
            lambda: librosa.power_to_db(
                feat.melspectrogram(y=y, sr=sr, n_fft=n_fft, hop_length=hop_length), ref=np.max)),
    ]

    results = list()
    for key, prefix, numbered, compute in specs:
        if key not in features_list:
            continue
        matrix = compute()
        if key == 'mfcc' and normalization:
            # Scale into [-1, 1] by the largest magnitude; skipped for an all-zero matrix
            divby = np.amax(np.abs(matrix))
            if divby > 0:
                matrix = matrix / divby
        if numbered:
            names = ['{}{}'.format(prefix, i) for i in range(1, matrix.shape[0] + 1)]
        else:
            names = [prefix]
        results.append((names, matrix))
    return results


def extract_audio_features(reciter, mp3_file, sr=22050, n_mfcc=13, n_fft=2048, hop_length=512,
    pad_duration=None, read_duration=None, features_list=['mfcc', 'zcr', 'spectral_center', 
    'spectral_rolloff', 'chroma', 'spectral_bandwidth_2', 'spectral_bandwidth_3', 
    'spectral_bandwidth_4', 'spectral_contrast'], shp_0=None, shp_1=None, normalization=True):
    """ Extract the requested audio features.

       The file read is audio_data_dir/reciter/mp3_file. Two ways of calling:
       - shp_0 is None: only the shapes are worked out (and printed). data is an empty list,
         new_shp_0 is the total number of feature rows and new_shp_1 the number of frames.
       - shp_0 and shp_1 given: data is a (shp_1, shp_0) float64 array (frames x feature columns)
         filled from column 0 upwards, in the order mfcc, zcr, spectral_center, spectral_rolloff,
         chroma, spectral_bandwidth_2/3/4, spectral_contrast, spect. The order in features_list
         does not matter. Columns beyond the requested features stay 0.

       If anything fails (e.g. a corrupted MP3 file) a message is printed and all five
       return values are None, so callers never receive partially filled data.

       :param reciter: Name of the reciter
       :type reciter: str
       :param mp3_file: Name of the mp3_file
       :type mp3_file: str
       :param sr: Sampling rate to apply during audio file read with librosa
       :type sr: int
       :param n_mfcc: Number of MFCC features to return by librosa
       :type n_mfcc: int
       :param n_fft: Number of Fast Fourier Transform frequency bins to use with librosa (MFCC and 'spect')
       :type n_fft: int
       :param hop_length: hop_length for librosa, i.e. the number of samples between successive audio frames. Applied to every feature.
       :type hop_length: int
       :param pad_duration: Pad the audio with zeros up to this duration (seconds) if the MP3 file is shorter
       :type pad_duration: int
       :param read_duration: Read only this much duration from the audio file
       :type read_duration: int
       :param features_list: List of features to extract
       :type features_list: list
       :param shp_0: Number of feature columns of the returned data array
       :type shp_0: int
       :param shp_1: Number of frames (rows) of the returned data array
       :type shp_1: int
       :param normalization: Normalize the MFCC data. Only works for the MFCC feature
       :type normalization: bool
       :return: columns, data, feature_shapes, new_shp_0, new_shp_1
       :rtype: list, NumPy array, list, int, int

    """

    # File name is dir/reciter/mp3_file
    file_name = os.path.join(audio_data_dir, reciter, mp3_file)

    # Initialize return variables
    columns = data = feature_shapes = new_shp_0 = new_shp_1 = None

    # Few MP3 files for few reciters were corrupted. Give a message about them & bail out
    try:
        y, sr = librosa.load(file_name, sr=sr, duration=read_duration)
        y = _pad_audio_to_duration(y, sr, pad_duration)

        columns = list()
        feature_shapes = list()

        preallocated = shp_0 is not None and shp_1 is not None
        if preallocated:
            # Always zero-initialized so columns not covered by a feature are never garbage
            data = np.zeros((shp_1, shp_0), dtype=np.float64)
        else:
            data = list()

        # Start index is 0 and gets updated after each feature is copied into "data"
        start_idx = 0
        for names, matrix in _compute_audio_features(
                y, sr, n_mfcc, n_fft, hop_length, features_list, normalization):
            feature_shapes.append(matrix.shape)
            columns.extend(names)
            if preallocated:
                # matrix is (rows, frames); data wants (frames, rows)
                data[:, start_idx:start_idx + matrix.shape[0]] = matrix.T
                start_idx += matrix.shape[0]

        new_shp_0 = shp_0
        new_shp_1 = shp_1
        if shp_0 is None:
            # Shape discovery: add up the rows and check that all features agree on the frames
            new_shp_0 = 0
            prev_shp_1 = None
            for shp in feature_shapes:
                if prev_shp_1 is None:
                    prev_shp_1 = shp[1]
                elif shp[1] != prev_shp_1:
                    print("ERROR: shape[1] are different: {} != {}".format(shp[1], prev_shp_1))
                print("shp[0] :", shp[0])
                new_shp_0 += shp[0]
            new_shp_1 = prev_shp_1
            print("new_shp_0 :", new_shp_0)
            print("new_shp_1 :", new_shp_1)

    except Exception as err:
        print("Couldn't process, skipping: ", file_name)
        print("  reason:", repr(err))
        # Do not hand back half-filled results
        columns = data = feature_shapes = new_shp_0 = new_shp_1 = None

    return columns, data, feature_shapes, new_shp_0, new_shp_1

# Suras: Update 'Set' column with test/train/val in df
def assign_set_sura(row):
    """ Picks the data set (train/validation/test) for a row from its Sura number (row.Sura).

       The Train_Suras, Val_Suras and Test_Suras lists need to be defined before calling.
       Validation is checked first, then test; every other Sura falls into "train".

       :param row: DataFrame row
       :type row: DataFrame row
       :return: "validation", "test" or "train"
       :rtype: str

    """

    # All three lists are read up front; Train_Suras itself is never searched
    _train_suras, val_suras, test_suras = Train_Suras, Val_Suras, Test_Suras

    if row.Sura in val_suras:
        return "validation"
    if row.Sura in test_suras:
        return "test"
    return "train"

# Ayas: Update 'Set' column with test/train/val in df
def assign_set_aya(row):
    """ Picks the data set (train/validation/test) for a row from its aya-in-Quran number (row.AyaInQuran).

       The Train_Ayas, Val_Ayas and Test_Ayas lists need to be defined before calling.
       Validation is checked first, then test; every other aya falls into "train".

       :param row: DataFrame row
       :type row: DataFrame row
       :return: "validation", "test" or "train"
       :rtype: str

    """

    # All three lists are read up front; Train_Ayas itself is never searched
    _train_ayas, val_ayas, test_ayas = Train_Ayas, Val_Ayas, Test_Ayas

    if row.AyaInQuran in val_ayas:
        return "validation"
    if row.AyaInQuran in test_ayas:
        return "test"
    return "train"

# File names: Update 'Set' column with test/train/val in df
def assign_set_filename(row):
    """ Picks the data set (train/validation/test) for a row from its file name (row.FileName).

       The Train_Ayas, Val_Ayas and Test_Ayas lists need to be defined before calling; here
       row.FileName is looked up in them (presumably they hold file names in this case - verify
       against the caller). Validation is checked first, then test; everything else is "train".

       :param row: DataFrame row
       :type row: DataFrame row
       :return: "validation", "test" or "train"
       :rtype: str

    """

    # All three lists are read up front; Train_Ayas itself is never searched
    _train_names, val_names, test_names = Train_Ayas, Val_Ayas, Test_Ayas

    if row.FileName in val_names:
        return "validation"
    if row.FileName in test_names:
        return "test"
    return "train"

def gen_audio_data(df, shp0, shp1, normalization=True):
    """ Extract audio features for the given df which is a Train/Val/Test subset of the main df.

       Needs these variables to be defined before calling: le (label encoder fitted on the
       reciter names), sr, n_mfcc, n_fft, hop_length, pad_duration, read_duration and
       features_list (all passed on to extract_audio_features).

       Rows whose file could not be processed (extract_audio_features returned None values)
       are skipped, so the result can have fewer entries than df.

       :param df: DataFrame with the ReciterName and FileName columns
       :type df: DataFrame
       :param shp0: Number of feature columns per file (see shp_0 of extract_audio_features)
       :type shp0: int
       :param shp1: Number of frames per file (see shp_1 of extract_audio_features)
       :type shp1: int
       :param normalization: Normalize the MFCC data. Only works for the MFCC feature
       :type normalization: bool
       :return: X_arr with shape (files, shp1, shp0), reciters_arr with one one-hot row per file
       :rtype: NumPy arr, NumPy arr
       :raises ValueError: if the data returned for a file does not have the shape (shp1, shp0)

    """

    n_classes = len(list(le.classes_))

    print("shp0 shp1 = ", shp0, shp1)
    print("X_arr initialized to :", (0, shp1, shp0))
    print("reciters_arr initialized to :", (0, n_classes))
    print("normalization :", normalization)

    # Collect per-file results and build the arrays once at the end; growing the arrays
    # with np.append copies everything on each iteration
    feature_rows = list()
    label_rows = list()

    for cnt, (_index, row) in enumerate(df.iterrows(), start=1):
        ReciterName = row['ReciterName']
        FileName = row['FileName']
        # Get audio features
        columns, data, feature_shapes, new_shp_0, new_shp_1 = extract_audio_features(
                reciter=ReciterName, mp3_file=FileName, sr=sr, n_mfcc=n_mfcc, n_fft=n_fft, 
                hop_length=hop_length, pad_duration=pad_duration, read_duration=read_duration, 
                features_list=features_list, shp_0=shp0, shp_1=shp1, normalization=normalization)
        if columns is None or data is None or feature_shapes is None:
            # Skips in case of errors
            continue

        sample = np.asarray(data, dtype=np.float64)
        if sample.shape != (shp1, shp0):
            raise ValueError("Unexpected data shape {} for {}/{}, expected {}".format(
                sample.shape, ReciterName, FileName, (shp1, shp0)))
        feature_rows.append(sample)

        # One-hot encode the reciter
        one_hot = np.zeros(n_classes)
        one_hot[int(le.transform([ReciterName])[0])] = 1
        label_rows.append(one_hot)

        if cnt % 100 == 0:
            print("Processed ", cnt)

    if feature_rows:
        X_arr = np.stack(feature_rows)
        reciters_arr = np.stack(label_rows)
    else:
        X_arr = np.empty((0, shp1, shp0))
        reciters_arr = np.empty((0, n_classes))

    return X_arr, reciters_arr

def filter_duration(row):
    """ Tell whether the row's FileName is present for every reciter in selected_reciters.

       Looks the file up in the notebook-level DataFrame ``df_tmp`` (columns
       'ReciterName' and 'FileName'), so both ``df_tmp`` and ``selected_reciters``
       must be defined before this is called.

       :param row: DataFrame row
       :type row: DataFrame row
       :return: 'Yes' when every selected reciter has this FileName, otherwise the string 'NaN'
       :rtype: str

    """

    frame = df_tmp
    target = row.FileName
    # all() stops at the first reciter for which the file is missing
    present_for_all = all(
        ((frame['ReciterName'] == name) & (frame['FileName'] == target)).any()
        for name in selected_reciters
    )
    return 'Yes' if present_for_all else 'NaN'

print("helpers.py LOADED!")
# End of helpers.py

helpers.py LOADED!


In [3]:
##################
# imports
import os
import csv
import re
import pathlib
import xml.etree.ElementTree as ET
import zipfile
import shutil
import librosa
import math
import numpy as np 
import pandas as pd 
import warnings
import audioread
import time
import matplotlib.pyplot as plt
from pandas import ExcelWriter
from pydub.utils import mediainfo
import seaborn as sns

%matplotlib inline

print("Current dir is: ", os.getcwd())

Current dir is:  D:\Khurram\Udacity\Udacity_ML_Course\MKC\P3_Capstone\identify_reciter


In [4]:
##################
# Default Constants
# Directory where MP3 files have been downloaded
zip_data_dir   = '../DownloadedReciters'
# Name of ZIP file for each reciter
zip_file_name  = '000_versebyverse.zip'
# Directory where processed data & other generated data will be stored
data_dir       = '../data'
# Extracted MP3 files go under <current dir>/<data_dir>/audio
audio_data_dir = os.path.join(os.getcwd(), data_dir, "audio")
# Quran metadata XML, expected under <current dir>/Qurandata
quran_meta_xml = os.path.join(os.getcwd(), 'Qurandata', "quran-data.xml")
# Max/Min index of Sura/Aya. Sura numbers start at 1, while the aya-in-Quran
# index starts at 0 (the lookup table built from these bounds maps label 1001 -> 0).
# NOTE(review): with 6,236 ayas a zero-based index would end at 6235 -- confirm
# how qsura_ayat_to_labels treats ayaTo=6236.
SuraIndexMIN   = 1
SuraIndexMAX   = 114
AyaIndexMIN    = 0
AyaIndexMAX    = 6236

# Suppress this warning from librosa:
# UserWarning: PySoundFile failed. Trying audioread instead.
# NOTE: no category/message filter is given, so this silences ALL warnings
# from here on, not only the librosa one.
warnings.filterwarnings('ignore')


In [5]:
# Create a lookup table between Mp3 filename and AyaInQuran
t0 = time.time()
labels_list, ayainq_list = qsura_ayat_to_labels(ayaFrom=AyaIndexMIN, 
                                                ayaTo=AyaIndexMAX)
lbl_aya_df = pd.DataFrame(list(zip(labels_list, ayainq_list)), 
               columns=['Label', 'AyaInQuran']) 

lbl_aya_dict = dict(zip(labels_list, ayainq_list))
print("DONE in {:0.3} sec\n".format(time.time() - t0))

lbl_aya_df.head(10)

DONE in 0.014 sec



Unnamed: 0,Label,AyaInQuran
0,1001,0
1,1002,1
2,1003,2
3,1004,3
4,1005,4
5,1006,5
6,1007,6
7,2001,7
8,2002,8
9,2003,9


In [6]:
# Report stats on all downloaded data
t0 = time.time()
reciter_names = report_stats_zip_data(zip_data_dir=zip_data_dir, zip_file_name=zip_file_name)
print("DONE in {:0.3} sec\n".format(time.time() - t0))

print("\nDownloaded Reciter name list :\n", reciter_names)

Reciter name                   Data Size  Files    MP3 Files
AbdulBasit                        864 MB   6255         6253
AbdullahBasfar                    435 MB   6239         6236
AbdulSamad                       1643 MB   6240         6238
AbdurrahmaanAs-Sudais             584 MB   6351         6349
AbuBakrAsh-Shaatree               729 MB   6356         6353
Ajami                            1436 MB   6354         6350
Alafasy                           825 MB   6352         6350
AliJaber                          701 MB   6354         6351
FaresAbbad                        594 MB   6357         6353
Ghamadi                           426 MB   6351         6349
HaniRifai                         702 MB   6239         6237
Karim Mansoori-Iran              1015 MB   6352         6348
KhalefaAl-Tunaiji                 757 MB   6238         6236
MaherAlMuaiqly                    586 MB   6350         6348
MinshawyMujawwad                 1650 MB   6351         6349
MohammadalTablaway      

## Reciter selection
The following 5 reciters were selected:
- Ghamadi
- Abdurrahmaan As-Sudais -> AbdurrahmaanAs-Sudais
- Ahmed Ibn Ali Al Ajamy -> Ajami
- Alafasy
- Fares Abbad -> FaresAbbad

Select 3000 Ayas. 60% Ayas will be used for training, 20% for validation, and 20% for test.

In [7]:
# List of reciters we are interested in
selected_reciters = ['Ghamadi', 'AbdurrahmaanAs-Sudais', 'Ajami', 'Alafasy', 'FaresAbbad']
print("Selected reciter: ", selected_reciters)

# List of Sura/Ayas we are interested in
suraFrom = None
suraTo   = None
ayaFrom  = AyaIndexMIN
ayaTo    = 2999

print("Selected Suras/Ayas: ")
print("      suraFrom: ", suraFrom)
print("        suraTo: ", suraTo)
print("       ayaFrom: ", ayaFrom)
print("         ayaTo: ", ayaTo)
print()

Selected reciter:  ['Ghamadi', 'AbdurrahmaanAs-Sudais', 'Ajami', 'Alafasy', 'FaresAbbad']
Selected Suras/Ayas: 
      suraFrom:  None
        suraTo:  None
       ayaFrom:  0
         ayaTo:  2999



In [8]:
# Create the top-level data directory the first time the notebook is run
if not pathlib.Path(data_dir).exists():
    print("Directory doesn't exist ...")
    audio_data_initialize(dir_name=data_dir)

# Start from a clean data directory
# WARNING: the recorded log below shows "Directory exists, deleting", i.e. any
# previously extracted audio is removed and the extraction is repeated.
audio_data_initialize(dir_name=audio_data_dir)

# Populate the data directory with MP3 files for the reciters
t0 = time.time()
populate_audio_files(zip_data_dir=zip_data_dir, zip_file_name=zip_file_name, 
        audio_data_dir=audio_data_dir, reciters=selected_reciters, suraFrom=suraFrom, suraTo=suraTo,
        ayaFrom=ayaFrom, ayaTo=ayaTo)
print("DONE in {:0.3} sec\n".format(time.time() - t0))


Directory exists, deleting : D:\Khurram\Udacity\Udacity_ML_Course\MKC\P3_Capstone\identify_reciter\../data\audio
Creating directory : D:\Khurram\Udacity\Udacity_ML_Course\MKC\P3_Capstone\identify_reciter\../data\audio
Found reciter:  AbdurrahmaanAs-Sudais
Creating directory : D:\Khurram\Udacity\Udacity_ML_Course\MKC\P3_Capstone\identify_reciter\../data\audio\AbdurrahmaanAs-Sudais
AbdurrahmaanAs-Sudais             584 MB   6351         6349
3000 files extracted
Found reciter:  Ajami
Creating directory : D:\Khurram\Udacity\Udacity_ML_Course\MKC\P3_Capstone\identify_reciter\../data\audio\Ajami
Ajami                            1436 MB   6354         6350
3000 files extracted
Found reciter:  Alafasy
Creating directory : D:\Khurram\Udacity\Udacity_ML_Course\MKC\P3_Capstone\identify_reciter\../data\audio\Alafasy
Alafasy                           825 MB   6352         6350
3000 files extracted
Found reciter:  FaresAbbad
Creating directory : D:\Khurram\Udacity\Udacity_ML_Course\MKC\P3_Capstone\

In [9]:
# Report stats on all the extracted MP3 files and get the details in a DataFrame
t0 = time.time()
df = report_stats_audio_data(data_dir=audio_data_dir)
print("DONE in {:0.3} sec\n".format(time.time() - t0))

print("\n\n\nDONE")

Reciter name/MP3 File          Data Size  MP3s kbps Ch Mono/S Duration(sec)
AbdurrahmaanAs-Sudais          356372 KB  3000
Ajami                          871152 KB  3000
Alafasy                        507975 KB  3000
FaresAbbad                     357083 KB  3000
Ghamadi                        264714 KB  3000
DONE in 4.25e+03 sec




DONE


### Save the DataFrame df with all MP3 extracted info for future
Save the DataFrame information in an Excel file. This helps extract the data & use Librosa to read MP3 file information only one time (both are slow processes to run). Once all the data is in the DataFrame "df", we can do remainder of the data preparation by using the df and updating it later as necessary (e.g. to mark ayas for train/val/test, etc.)

**Note**: If you are executing this Notebook for the **first** time, save to the Excel file by executing the cell below. **Skip** the cell below on later runs; otherwise it will save whatever df is currently in memory, which may lead to undesired results!

In [11]:
# SKIP IF DONE EARLIER!!!
# Save the df so the slow MP3 scan does not have to be repeated
from pandas import ExcelWriter
print("data_dir = ", data_dir)
my_excel_fileA = os.path.join(data_dir, 'pd_df_1.xlsx')
# The context manager writes and closes the workbook on exit.
# ExcelWriter.save() was removed in pandas 2.0, and sheet_name must be passed
# by keyword in recent pandas versions.
with ExcelWriter(my_excel_fileA) as writer:
    df.to_excel(writer, sheet_name='Sheet1', index=False)

data_dir =  ../data


In [8]:
# SKIP IF RUNNING THIS NOTEBOOK FOR THE FIRST TIME!!!
# Read info saved earlier to save time
print("data_dir = ", data_dir)
my_excel_fileA = os.path.join(data_dir, 'pd_df_1.xlsx')
# Read Excel file
df = pd.read_excel(my_excel_fileA)

data_dir =  data3


In [12]:
# A quick look at the DataFrame
df.head()

Unnamed: 0,ReciterName,FileName,DataSizeKB,BitRate,Channels,Mono/Stereo,Duration,FileNoExt,Sura,Aya,AyaInQuran
0,AbdurrahmaanAs-Sudais,001001.mp3,24,64,2,stereo,3.003719,1001,1,1,0
1,AbdurrahmaanAs-Sudais,001002.mp3,35,64,2,stereo,4.414331,1002,1,2,1
2,AbdurrahmaanAs-Sudais,001003.mp3,24,64,2,stereo,3.029841,1003,1,3,2
3,AbdurrahmaanAs-Sudais,001004.mp3,27,64,2,stereo,3.369433,1004,1,4,3
4,AbdurrahmaanAs-Sudais,001005.mp3,38,64,2,stereo,4.858413,1005,1,5,4


In [13]:
# A quick look at the DataFrame shape
df.shape

(15000, 11)

In [14]:
# Mark ayas for each reciter for training/test/validation

# Work with a copy of the df.
dfD = df.copy()

# Add a new column to see if Ayas are Common among all reciters
# This was needed when I was ignoring some durations, and wanted
# to make sure I only keep the common Ayas between reciters. 
# It's no longer needed but keeping it just-in-case.
newcol = 'Common'
dfD[newcol] = 'Yes'
df_tmp = dfD

# Save the df with the new column
# (context manager instead of writer.save(), which was removed in pandas 2.0)
my_out_file = os.path.join(data_dir, 'pd_df_2.xlsx')
with ExcelWriter(my_out_file) as writer:
    df_tmp.to_excel(writer, sheet_name='Sheet1', index=False)

# Only select the common ayas between all reciters
# .copy() makes df_tmp an independent frame, so adding the 'Set' column below
# does not write into a filtered slice of dfD (SettingWithCopyWarning).
df_tmp = df_tmp[df_tmp[newcol] == 'Yes'].copy()

# Get all the available ayas in the df & create index boundaries for 
# Train/Val/Test
FileNames = list(df_tmp['FileName'].unique())
print("len = ", len(FileNames))
tot_ayas = len(FileNames)
test_val_ayas = math.ceil(0.2 * tot_ayas)
print("test_val_ayas = ", test_val_ayas)

# NOTE: despite their names, val_start_idx / test_start_idx hold the index of
# the LAST item before the validation / test block (hence the +1 in the slices).
val_start_idx  = tot_ayas - 1 - test_val_ayas
test_start_idx = val_start_idx - test_val_ayas
print("        val_start_idx = ", val_start_idx)
print("       test_start_idx = ", test_start_idx)
# Layout of FileNames: [ train | test | validation ]
Val_Ayas   = FileNames[val_start_idx+1:]
Test_Ayas  = FileNames[test_start_idx+1:val_start_idx+1]
Train_Ayas = FileNames[0:test_start_idx+1]
# The three lists must partition FileNames
assert len(Train_Ayas) + len(Test_Ayas) + len(Val_Ayas) == tot_ayas, "split sizes do not add up"

# assign_set_filename (helpers.py) labels each row with its split
df_tmp['Set'] = df_tmp.apply(assign_set_filename, axis=1)

# Save the df after updating Train/Val/Test info.
my_out_file = os.path.join(data_dir, 'pd_df_3.xlsx')
with ExcelWriter(my_out_file) as writer:
    df_tmp.to_excel(writer, sheet_name='Sheet1', index=False)


len =  3000
test_val_ayas =  600
        val_start_idx =  2399
       test_start_idx =  1799


In [15]:
# Look at Train/Val/Test Ayas to make sure no overlap
lookcol = 'AyaInQuran'

# Value in the 'Set' column -> report line, in the order they are reported
set_reports = {
    "test":       " Test: Number of items = {}, StartIndex = {}, EndIndex = {}",
    "validation": "  Val: Number of items = {}, StartIndex = {}, EndIndex = {}",
    "train":      "Train: Number of items = {}, StartIndex = {}, EndIndex = {}",
}

# Unique aya indices per split, computed once and reused below
unique_ayas = {set_name: df_tmp[df_tmp['Set'] == set_name][lookcol].unique()
               for set_name in set_reports}

for set_name, template in set_reports.items():
    ayas = unique_ayas[set_name]
    print(template.format(len(ayas), ayas[0], ayas[-1]))

# Check(s)
for ayas in unique_ayas.values():
    display(ayas)


 Test: Number of items = 600, StartIndex = 1800, EndIndex = 2399
  Val: Number of items = 600, StartIndex = 2400, EndIndex = 2999
Train: Number of items = 1800, StartIndex = 0, EndIndex = 1799


array([1800, 1801, 1802, 1803, 1804, 1805, 1806, 1807, 1808, 1809, 1810,
       1811, 1812, 1813, 1814, 1815, 1816, 1817, 1818, 1819, 1820, 1821,
       1822, 1823, 1824, 1825, 1826, 1827, 1828, 1829, 1830, 1831, 1832,
       1833, 1834, 1835, 1836, 1837, 1838, 1839, 1840, 1841, 1842, 1843,
       1844, 1845, 1846, 1847, 1848, 1849, 1850, 1851, 1852, 1853, 1854,
       1855, 1856, 1857, 1858, 1859, 1860, 1861, 1862, 1863, 1864, 1865,
       1866, 1867, 1868, 1869, 1870, 1871, 1872, 1873, 1874, 1875, 1876,
       1877, 1878, 1879, 1880, 1881, 1882, 1883, 1884, 1885, 1886, 1887,
       1888, 1889, 1890, 1891, 1892, 1893, 1894, 1895, 1896, 1897, 1898,
       1899, 1900, 1901, 1902, 1903, 1904, 1905, 1906, 1907, 1908, 1909,
       1910, 1911, 1912, 1913, 1914, 1915, 1916, 1917, 1918, 1919, 1920,
       1921, 1922, 1923, 1924, 1925, 1926, 1927, 1928, 1929, 1930, 1931,
       1932, 1933, 1934, 1935, 1936, 1937, 1938, 1939, 1940, 1941, 1942,
       1943, 1944, 1945, 1946, 1947, 1948, 1949, 19

array([2400, 2401, 2402, 2403, 2404, 2405, 2406, 2407, 2408, 2409, 2410,
       2411, 2412, 2413, 2414, 2415, 2416, 2417, 2418, 2419, 2420, 2421,
       2422, 2423, 2424, 2425, 2426, 2427, 2428, 2429, 2430, 2431, 2432,
       2433, 2434, 2435, 2436, 2437, 2438, 2439, 2440, 2441, 2442, 2443,
       2444, 2445, 2446, 2447, 2448, 2449, 2450, 2451, 2452, 2453, 2454,
       2455, 2456, 2457, 2458, 2459, 2460, 2461, 2462, 2463, 2464, 2465,
       2466, 2467, 2468, 2469, 2470, 2471, 2472, 2473, 2474, 2475, 2476,
       2477, 2478, 2479, 2480, 2481, 2482, 2483, 2484, 2485, 2486, 2487,
       2488, 2489, 2490, 2491, 2492, 2493, 2494, 2495, 2496, 2497, 2498,
       2499, 2500, 2501, 2502, 2503, 2504, 2505, 2506, 2507, 2508, 2509,
       2510, 2511, 2512, 2513, 2514, 2515, 2516, 2517, 2518, 2519, 2520,
       2521, 2522, 2523, 2524, 2525, 2526, 2527, 2528, 2529, 2530, 2531,
       2532, 2533, 2534, 2535, 2536, 2537, 2538, 2539, 2540, 2541, 2542,
       2543, 2544, 2545, 2546, 2547, 2548, 2549, 25

array([   0,    1,    2, ..., 1797, 1798, 1799], dtype=int64)

In [16]:
# Once satisfied with updated df_tmp, assign it back to df
# reset_index returns a new frame with a fresh 0..n-1 index; df_tmp is left untouched
df = df_tmp.reset_index(drop=True)

df.head()

Unnamed: 0,ReciterName,FileName,DataSizeKB,BitRate,Channels,Mono/Stereo,Duration,FileNoExt,Sura,Aya,AyaInQuran,Common,Set
0,AbdurrahmaanAs-Sudais,001001.mp3,24,64,2,stereo,3.003719,1001,1,1,0,Yes,train
1,AbdurrahmaanAs-Sudais,001002.mp3,35,64,2,stereo,4.414331,1002,1,2,1,Yes,train
2,AbdurrahmaanAs-Sudais,001003.mp3,24,64,2,stereo,3.029841,1003,1,3,2,Yes,train
3,AbdurrahmaanAs-Sudais,001004.mp3,27,64,2,stereo,3.369433,1004,1,4,3,Yes,train
4,AbdurrahmaanAs-Sudais,001005.mp3,38,64,2,stereo,4.858413,1005,1,5,4,Yes,train


In [17]:
# Check the shapes
# Filter each split once, then report on the resulting frames
df_test  = df.loc[df['Set'] == "test"]
df_valid = df.loc[df['Set'] == "validation"]
df_train = df.loc[df['Set'] == "train"]

print("test: ", df_test.shape)
print("validation: ", df_valid.shape)
print("train: ", df_train.shape)

print(df_test.shape, df_valid.shape, df_train.shape)

test:  (3000, 13)
validation:  (3000, 13)
train:  (9000, 13)
(3000, 13) (3000, 13) (9000, 13)


In [18]:
# Encoding for the reciters as labels
from sklearn import preprocessing
# Fit a LabelEncoder on the reciter names; `le` is used later by gen_audio_data
le = preprocessing.LabelEncoder()
le.fit(df['ReciterName'])

# Show the class names and the integer each one is encoded as
print("Classes:", len(list(le.classes_)), list(le.classes_))
print("Classes:", len(list(le.classes_)), list(le.transform(le.classes_)))
print()
#print(le.transform(df['ReciterName']))

# Create dictionaries for both directions:
# reciter name -> encoded label, and encoded label -> reciter name
dict_reciter_to_label = dict(zip(list(le.classes_), list(le.transform(le.classes_))))
print("dict_reciter_to_label = ", dict_reciter_to_label)
dict_label_to_reciter = {v: k for k, v in dict_reciter_to_label.items()}
print("dict_label_to_reciter = ", dict_label_to_reciter)


Classes: 5 ['AbdurrahmaanAs-Sudais', 'Ajami', 'Alafasy', 'FaresAbbad', 'Ghamadi']
Classes: 5 [0, 1, 2, 3, 4]

dict_reciter_to_label =  {'AbdurrahmaanAs-Sudais': 0, 'Ajami': 1, 'Alafasy': 2, 'FaresAbbad': 3, 'Ghamadi': 4}
dict_label_to_reciter =  {0: 'AbdurrahmaanAs-Sudais', 1: 'Ajami', 2: 'Alafasy', 3: 'FaresAbbad', 4: 'Ghamadi'}


In [19]:
list(le.transform(['Alafasy']))[0]

2

# Extract Audio Features
I wanted to make sure I am able to explore & switch between different features easily so there are many audio features that have been coded in the helper function "extract_audio_features".

From the data frame it is clear that the various MP3 files have different characteristics. Almost all of these need to be "matched" in order to get the correct data. Here is how this was achieved:
- Bit rate (kbps)    - This didn't seem to make a difference, so nothing was done about this characteristic.
- Sample rate (kHz)  - The sample rate needs to be the same across all audio files so features are comparable. Librosa can be given the sample rate as an input. The default is "22050". I left it at default so all the files will be re-sampled at this rate during the read operation.
- Duration (seconds) - The duration needs to be the same across all audio files. Librosa by default reads the whole duration of the file. We can read a smaller duration but can't "pad" the duration during read. There were three options to deal with the varying durations of the audio files:
     - Figure out min duration & only read the minimum duration.
     - Figure out max duration & pad the duration after reading the MP3 file.
     - Figure out an arbitrary number like 3 seconds. If this works then it's great at keeping the data size small!
     
     After experimenting with different durations starting from 30 seconds, I am finally choosing to go with the 3 sec duration.

Other parameters (hop_length, etc.) are Librosa settings and were kept the same for every file during feature extraction.

One other thing to note is that, in order to get the same data array size from each audio file, we first run feature extraction on one audio file to get the "shape" of the data array. The data array is then initialized and the real feature extraction begins.

In [20]:
# Before audio features can be extracted, need to figure out the longest/shortest
# duration and make all audio same duration
duration_max = df['Duration'].max()
print(duration_max)
# Round the longest duration up to whole seconds
new_max_duration = math.ceil(duration_max)
print(new_max_duration)
print("Max duration = {} seconds".format(new_max_duration))

duration_min = df['Duration'].min()
print(duration_min)
# Round the shortest duration up to whole seconds as well
new_min_duration = math.ceil(duration_min)
print(new_min_duration)
print("Min duration = {} seconds".format(new_min_duration))

149.49877551020407
150
Max duration = 150 seconds
0.9806575963718821
1
Min duration = 1 seconds


In [21]:
# Run feature extraction on one audio file to get the "shape" of the data array
# Label-based access to row 0 (the first row after the index reset above).
# DataFrame.lookup was deprecated in pandas 1.2 and removed in 2.0.
reciter = df.at[0, 'ReciterName']
mp3_file = df.at[0, 'FileName']
# Read 3 seconds of every file and pad shorter files to 3 seconds
read_duration = pad_duration = 3
# None on the first call: the shape is reported back by extract_audio_features
shp_0 = shp_1 = None
sr = 22050
n_fft = 2048
n_mfcc = 13
hop_length = 512
features_list = ['spect']
# Directory where the extracted NumPy arrays are saved
savez_dir = '../datamelspect'

print("reciter       = ", reciter)
print("mp3_file      = ", mp3_file)
print("read_duration = ", read_duration)
print("pad_duration  = ", pad_duration)
print("        shp_0 = ", shp_0)
print("        shp_1 = ", shp_1)
print("           sr = ", sr)
print("        n_fft = ", n_fft)
print("       n_mfcc = ", n_mfcc)
print("   hop_length = ", hop_length)
print("features_list = ", features_list)
print("    savez_dir = ", savez_dir)

if not pathlib.Path(savez_dir).exists():
    print("Directory doesn't exist ...")
    audio_data_initialize(dir_name=savez_dir)


reciter       =  AbdurrahmaanAs-Sudais
mp3_file      =  001001.mp3
read_duration =  3
pad_duration  =  3
        shp_0 =  None
        shp_1 =  None
           sr =  22050
        n_fft =  2048
       n_mfcc =  13
   hop_length =  512
features_list =  ['spect']
    savez_dir =  ../datamelspect
Directory doesn't exist ...
Creating directory : ../datamelspect


In [22]:
columns, data, feature_shapes, new_shp_0, new_shp_1 = extract_audio_features(
    reciter=reciter, mp3_file=mp3_file, sr=sr, n_mfcc=n_mfcc, n_fft=n_fft, 
    hop_length=hop_length, pad_duration=pad_duration, read_duration=read_duration, 
    features_list=features_list, shp_0=shp_0, shp_1=shp_1)
print("reciter INIT:")
print(type(data))
print("shape 0/1 :")
print(new_shp_0, new_shp_1)

shp[0] : 128
new_shp_0 : 128
new_shp_1 : 130
reciter INIT:
<class 'list'>
shape 0/1 :
128 130


In [23]:
t0 = time.time()
X_test, y_test = gen_audio_data(df_test, shp0=new_shp_0, shp1=new_shp_1)
print("DONE in {:0.3} sec\n".format(time.time() - t0))


shp0 shp1 =  128 130
X_arr initialized to : (0, 130, 128)
reciters_arr initialized to : (0, 5)
normalization : True
Processed  100
Processed  200
Processed  300
Processed  400
Processed  500
Processed  600
Processed  700
Processed  800
Processed  900
Processed  1000
Processed  1100
Processed  1200
Processed  1300
Processed  1400
Processed  1500
Processed  1600
Processed  1700
Processed  1800
Processed  1900
Processed  2000
Processed  2100
Processed  2200
Processed  2300
Processed  2400
Processed  2500
Processed  2600
Processed  2700
Processed  2800
Processed  2900
Processed  3000
DONE in 9.46e+02 sec



In [24]:
print(X_test.shape, y_test.shape)

(3000, 130, 128) (3000, 5)


In [25]:
np.savez(os.path.join(savez_dir, 'test_arr'), X_test, y_test)

In [26]:
t0 = time.time()
X_valid, y_valid = gen_audio_data(df_valid, shp0=new_shp_0, shp1=new_shp_1)
print("DONE in {:0.3} sec\n".format(time.time() - t0))


shp0 shp1 =  128 130
X_arr initialized to : (0, 130, 128)
reciters_arr initialized to : (0, 5)
normalization : True
Processed  100
Processed  200
Processed  300
Processed  400
Processed  500
Processed  600
Processed  700
Processed  800
Processed  900
Processed  1000
Processed  1100
Processed  1200
Processed  1300
Processed  1400
Processed  1500
Processed  1600
Processed  1700
Processed  1800
Processed  1900
Processed  2000
Processed  2100
Processed  2200
Processed  2300
Processed  2400
Processed  2500
Processed  2600
Processed  2700
Processed  2800
Processed  2900
Processed  3000
DONE in 9.43e+02 sec



In [27]:
print(X_valid.shape, y_valid.shape)

(3000, 130, 128) (3000, 5)


In [28]:
np.savez(os.path.join(savez_dir, 'valid_arr'), X_valid, y_valid)

In [29]:
t0 = time.time()
X_train, y_train = gen_audio_data(df_train, shp0=new_shp_0, shp1=new_shp_1)
print("DONE in {:0.3} sec\n".format(time.time() - t0))


shp0 shp1 =  128 130
X_arr initialized to : (0, 130, 128)
reciters_arr initialized to : (0, 5)
normalization : True
Processed  100
Processed  200
Processed  300
Processed  400
Processed  500
Processed  600
Processed  700
Processed  800
Processed  900
Processed  1000
Processed  1100
Processed  1200
Processed  1300
Processed  1400
Processed  1500
Processed  1600
Processed  1700
Processed  1800
Processed  1900
Processed  2000
Processed  2100
Processed  2200
Processed  2300
Processed  2400
Processed  2500
Processed  2600
Processed  2700
Processed  2800
Processed  2900
Processed  3000
Processed  3100
Processed  3200
Processed  3300
Processed  3400
Processed  3500
Processed  3600
Processed  3700
Processed  3800
Processed  3900
Processed  4000
Processed  4100
Processed  4200
Processed  4300
Processed  4400
Processed  4500
Processed  4600
Processed  4700
Processed  4800
Processed  4900
Processed  5000
Processed  5100
Processed  5200
Processed  5300
Processed  5400
Processed  5500
Processed  56

In [30]:
print(X_train.shape, y_train.shape)

(9000, 130, 128) (9000, 5)


In [31]:
np.savez(os.path.join(savez_dir, 'train_arr'), X_train, y_train)

In [32]:
# Convert the scale of training data
# db_to_power treats X_train as decibel values and converts them back to power
# (ref=1.0); the printed min/max/mean are a quick sanity check of the range.
X_train_raw = librosa.core.db_to_power(X_train, ref=1.0)
print(np.amin(X_train_raw), np.amax(X_train_raw), np.mean(X_train_raw))

1e-08 1.0000008783668917 0.005410723282877886


In [33]:
X_train_log = np.log(X_train_raw)
print(np.amin(X_train_log), np.amax(X_train_log), np.mean(X_train_log))

-18.420680743952367 8.783665059016772e-07 -10.345927531676363


In [34]:
X_valid_raw = librosa.core.db_to_power(X_valid, ref=1.0)
X_valid_log = np.log(X_valid_raw)

In [35]:
def shuffle_same_way(arr1, arr2, seed=None):
    """ Shuffle two equally long arrays with the same random permutation.

       Element i of both inputs ends up at the same position in both outputs, so
       features and labels stay paired.

       :param arr1: First array (e.g. features)
       :type arr1: NumPy arr (any sequence accepted by np.asarray)
       :param arr2: Second array (e.g. labels), same length as arr1
       :type arr2: NumPy arr (any sequence accepted by np.asarray)
       :param seed: Optional seed for a reproducible shuffle. With the default None
                    the global NumPy random state is used, as before.
       :type seed: int or None
       :return: Shuffled arr1, shuffled arr2. If the lengths differ an error is
                printed and the inputs are returned unchanged.
       :rtype: NumPy arr, NumPy arr

    """
    if len(arr1) != len(arr2):
        print("ERROR: len(arr1) {} is different from len(arr2) {}, please fix and re-run!"
             .format(len(arr1), len(arr2)))
        return arr1, arr2
    if seed is None:
        perms = np.random.permutation(len(arr1))
    else:
        # Private generator: reproducible without touching the global random state
        perms = np.random.RandomState(seed).permutation(len(arr1))
    # np.asarray is a no-op for ndarrays and lets plain sequences be index-shuffled too
    return np.asarray(arr1)[perms], np.asarray(arr2)[perms]

In [36]:
# Shuffle features and labels together so every sample keeps its label.
# NOTE: this rebinds X_train / X_valid to the shuffled LOG-scaled arrays, replacing
# the arrays returned by gen_audio_data. Re-running the db_to_power cells after this
# one would convert already-converted data.
X_train, y_train = shuffle_same_way(X_train_log, y_train)
X_valid, y_valid = shuffle_same_way(X_valid_log, y_valid)

In [37]:
print("Shapes are: ", X_train.shape, X_valid.shape, y_train.shape, y_valid.shape)

Shapes are:  (9000, 130, 128) (3000, 130, 128) (9000, 5) (3000, 5)


In [38]:
# Save the shuffled, log-scaled train/validation arrays. The arrays are passed
# positionally, so np.savez stores them under the keys 'arr_0' (X) and 'arr_1' (y).
# NOTE(review): 'test_arr' was saved before the dB -> power -> log conversion and no
# conversion of X_test is visible in this notebook -- confirm the consumer applies
# the same conversion to the test data.
np.savez(os.path.join(savez_dir, 'shuffled_train_log'), X_train, y_train)
np.savez(os.path.join(savez_dir, 'shuffled_valid_log'), X_valid, y_valid)