# Identify Reciter

# 2_Data_Preparation
This notebook prepares the data for training in the following steps:
- Quick look at the downloaded data & select 5 reciters from the downloaded data
- Select the Sura/Aya for the reciter
- Mark the data for training/validation/test such that the same Sura/Aya are used for these
- Select the feature(s) to be extracted
- Extract the feature(s)
- Save the train/val/test data

https://machinelearningmastery.com/develop-bidirectional-lstm-sequence-classification-python-keras/

Let's take care of package imports and variable defnitions & Helper functions before looking at the downloaded data.

In [None]:
# %load helpers.py
# Cut-paste above line without "#" in cell below & execute once to get code in the cell, 
# and then execute again to load the functions

In [2]:
# %load helpers.py
##################
# imports
import os
import csv
import re
import pathlib
import xml.etree.ElementTree as ET
import zipfile
import shutil
import librosa
import math
import numpy as np 
import pandas as pd 
import warnings
import audioread
import time
import matplotlib.pyplot as plt
from pydub.utils import mediainfo

# https://towardsdatascience.com/music-genre-classification-with-python-c714d032f0d8
# https://levelup.gitconnected.com/audio-data-analysis-using-deep-learning-part-1-7f6e08803f60
# https://www.eurasip.org/Proceedings/Eusipco/Eusipco2018/papers/1570434062.pdf

##################
# Constants
debug          = False
zip_data_dir   = '../../L5_Capstone/Audio/Quran'
zip_file_name  = '000_versebyverse.zip'
data_dir       = 'data'
audio_data_dir = os.path.join(os.getcwd(), data_dir, "audio")
quran_meta_xml = os.path.join(os.getcwd(), data_dir, "quran-data.xml")
SuraIndexMIN   = 1
SuraIndexMAX   = 114
AyaIndexMIN    = 0
AyaIndexMAX    = 6236

# Suppress this warning from librosa:
# UserWarning: PySoundFile failed. Trying audioread instead.
warnings.filterwarnings('ignore')

if debug:
    print("quran_meta_xml :", quran_meta_xml)


##################
# Functions
def hello_world():
    print("Hello world from helpers.py!")

def qsura_ayat_to_labels(suraFrom=None, suraTo=None, ayaFrom=None, ayaTo=None):
    """ Converts either Sura or Aya numbers to labels.

       :param suraFrom: An interger for Sura index to start from. Valid numbers are 1 to 114 (default: None)
       :type suraFrom: int
       :param suraTo: An interger for Sura index to end to. Valid numbers are 1 to 114 (inclusive) (default: None)
       :type suraTo: int
       :param ayaFrom: An interger for Aya index to start from (default: None)
       :type ayaFrom: int
       :param ayaTo: An interger for Aya index to end to (inclusive) (default: None)
       :type ayaTo: int
       :return: A list of labels in the form ['001001', '001002', ...], A list of aya number in quran [0, 2, ... AyaIndexMAX]
       :rtype: list, list 

    """

    # Return lists
    labels_list = list()
    ayainq = list()

    useSura = False
    useAya = False
    if suraFrom is not None and suraTo is not None:
        if suraFrom < SuraIndexMIN or suraFrom > SuraIndexMAX:
            print("ERROR: {} not between {} and {}".format('suraFrom', SuraIndexMIN,SuraIndexMAX))
            return labels_list, ayainq
        if suraTo < SuraIndexMIN or suraTo > SuraIndexMAX:
            print("ERROR: {} not between {} and {}".format('suraTo', SuraIndexMIN,SuraIndexMAX))
            return labels_list, ayainq
        useSura = True
    elif ayaFrom is not None and ayaTo is not None:
        if ayaFrom < AyaIndexMIN or ayaFrom > AyaIndexMAX:
            print("ERROR: {} not between {} and {}".format('ayaFrom', AyaIndexMIN,AyaIndexMAX))
            return labels_list, ayainq
        if ayaTo < AyaIndexMIN or ayaTo > AyaIndexMAX:
            print("ERROR: {} not between {} and {}".format('ayaTo', AyaIndexMIN,AyaIndexMAX))
            return labels_list, ayainq
        useAya = True

    ##################
    # qmeta: Quran Meta Data
    qmeta_tree = ET.parse(quran_meta_xml)
    qmeta_root = qmeta_tree.getroot()
    #print("qmeta_root :", qmeta_root)

    # As an Element, root has a tag and a dictionary of attributes:
    qmeta_root_tag = qmeta_root.tag
    qmeta_root_att = qmeta_root.attrib
    #print("qmeta_root_tag = " + qmeta_root_tag)
    #print("qmeta_root_att = ")
    #print(qmeta_root_att)

    # It also has children nodes over which we can iterate:
    for qmeta_suras in qmeta_root:
        qmeta_suras_tag = qmeta_suras.tag
        #qmeta_suras_att = qmeta_suras.attrib
        #print("qmeta_suras_tag = " + qmeta_suras_tag)
        #print("qmeta_suras_att = ")
        #print(qmeta_suras_att)
        if qmeta_suras_tag == "suras":
            for qmeta_sura in qmeta_suras:
                qmeta_sura_tag = qmeta_sura.tag
                #qmeta_sura_att = qmeta_sura.attrib
                #print("qmeta_sura_tag = " + qmeta_sura_tag)
                #print("qmeta_sura_att = ")
                #print(qmeta_sura_att)
                if qmeta_sura_tag == "sura":
                    #print("qmeta_sura :", qmeta_sura)
                    qmeta_sura_index = qmeta_sura.attrib.get('index')
                    qmeta_sura_ayas = qmeta_sura.attrib.get('ayas')
                    qmeta_sura_start = qmeta_sura.attrib.get('start')
                    #print("qmeta_sura_index :", qmeta_sura_index)
                    #print("qmeta_sura_ayas :", qmeta_sura_ayas)
                    #print("qmeta_sura_start :", qmeta_sura_start)

                    if useSura:
                        if int(qmeta_sura_index) >= suraFrom and int(qmeta_sura_index) <= suraTo:
                            #print("  MKC: qmeta_sura_index :", qmeta_sura_index)
                            #print("  MKC: qmeta_sura_ayas :", qmeta_sura_ayas)
                            #print("  MKC: qmeta_sura_start :", qmeta_sura_start)
                            for i in range(1, int(qmeta_sura_ayas)+1):
                                labels_list.append("{:03d}{:03d}".format(int(qmeta_sura_index), i))

                    if useAya:
                        # Get the current sura end ayat
                        sura_start = int(qmeta_sura_start)
                        sura_end = sura_start + (int(qmeta_sura_ayas) - 1)
                        #print("sura start -> end: {} -> {}".format(sura_start, sura_end))
                        for i in range(sura_start, sura_end+1):
                            if i >= ayaFrom and i <= ayaTo:
                                #print("  -> ",i)
                                ayainq.append(i)
                                labels_list.append("{:03d}{:03d}".format(int(qmeta_sura_index), i+1-sura_start))

    #print("labels_list :", labels_list)
    return labels_list, ayainq

"""
# abc
labels_list, ayainq_list = qsura_ayat_to_labels(suraFrom=113, suraTo=114, ayaFrom=None, ayaTo=None)
print("expected_labels_list =", labels_list)
print("ayainq_list =", ayainq_list)
print()
labels_list, ayainq_list = qsura_ayat_to_labels(suraFrom=113, suraTo=115, ayaFrom=None, ayaTo=None)
print("expected_labels_list =", labels_list)
print("ayainq_list =", ayainq_list)
print()

labels_list, ayainq_list = qsura_ayat_to_labels(ayaFrom=0, ayaTo=6)
print("expected_labels_list =", labels_list)
print("ayainq_list =", ayainq_list)
print()
labels_list, ayainq_list = qsura_ayat_to_labels(ayaFrom=1, ayaTo=6)
print("expected_labels_list =", labels_list)
print("ayainq_list =", ayainq_list)
print()
labels_list, ayainq_list = qsura_ayat_to_labels(ayaFrom=1, ayaTo=7)
print("expected_labels_list =", labels_list)
print("ayainq_list =", ayainq_list)
print()
labels_list, ayainq_list = qsura_ayat_to_labels(ayaFrom=0, ayaTo=7)
print("expected_labels_list =", labels_list)
print("ayainq_list =", ayainq_list)
print()
labels_list, ayainq_list = qsura_ayat_to_labels(ayaFrom=0, ayaTo=0)
print("expected_labels_list =", labels_list)
print("ayainq_list =", ayainq_list)
print()
labels_list, ayainq_list = qsura_ayat_to_labels(ayaFrom=0, ayaTo=1)
print("expected_labels_list =", labels_list)
print("ayainq_list =", ayainq_list)
print()
labels_list, ayainq_list = qsura_ayat_to_labels(ayaFrom=290, ayaTo=292)
print("expected_labels_list =", labels_list)
print("ayainq_list =", ayainq_list)
print()
labels_list, ayainq_list = qsura_ayat_to_labels(ayaFrom=6228, ayaTo=6236)
print("expected_labels_list =", labels_list)
print("ayainq_list =", ayainq_list)
print()
labels_list, ayainq_list = qsura_ayat_to_labels(ayaFrom=6228, ayaTo=6237)
print("expected_labels_list =", labels_list)
print("ayainq_list =", ayainq_list)
print()
labels_list, ayainq_list = qsura_ayat_to_labels(ayaFrom=-1, ayaTo=6236)
print("expected_labels_list =", labels_list)
print("ayainq_list =", ayainq_list)
print()
# Create a dictonary of lookup between labels and aya number in Quran
labels_list, ayainq_list = qsura_ayat_to_labels(ayaFrom=0, ayaTo=6236)
lbl_aya_df = pd.DataFrame(list(zip(labels_list, ayainq_list)), 
               columns=['Label', 'AyaInQuran']) 

lbl_aya_dict = dict(zip(labels_list, ayainq_list))

print()
#print("labels_list :", labels_list)
# abc
"""
# abc

def report_stats_zip_data(zip_data_dir=zip_data_dir, zip_file_name=zip_file_name):
    """ Reports statitics for the zipped data

       :param zip_data_dir: Directory containing the zipped data
       :type zip_data_dir: str
       :param zip_file_name: Name of the zip file in the directory
       :type zip_file_name: str
       :return: A list of directory names in zip_data_dir
       :rtype: list 

    """

    # Return list
    dir_names = list()

    # Directory names are also names of the reciters
    print("{:30s} {:10s} {:8s} {:9s}".format("Reciter name", "Data Size", "Files", "MP3 Files"))
    print("{:30s} {:10s} {:8s} {:9s}".format("============", "=========", "=====", "========="))
    for dd in os.listdir(zip_data_dir):
        # Each directory has one zip file called 000_versebyverse.zip
        dd_zip_file = zip_data_dir + "/" + dd + "/" + zip_file_name
        # Size
        dd_size_bytes = os.path.getsize(dd_zip_file)
        dd_size_MB = dd_size_bytes / (1024 * 1024)
        # Number of files
        archive = zipfile.ZipFile(dd_zip_file, 'r')
        num_files = len(archive.namelist())
        # Mp3 files
        mp3_cnt = 0
        for ff in archive.namelist():
            if ff.endswith('.mp3'):
                mp3_cnt += 1

        print("{:30s} {:6.0f} MB {:6d} {:12d}".format(dd, dd_size_MB, num_files, mp3_cnt))
        dir_names.append(dd)
    return dir_names

# Directory size (https://stackoverflow.com/questions/1392413/calculating-a-directorys-size-using-python)
def get_dir_size(start_path = '.'):
    total_size = 0
    for dirpath, dirnames, filenames in os.walk(start_path):
        for f in filenames:
            fp = os.path.join(dirpath, f)
            # skip if it is symbolic link
            if not os.path.islink(fp):
                total_size += os.path.getsize(fp)

    return total_size

def report_stats_audio_data(data_dir=audio_data_dir):
    """ Reports statitics for the audio data

       :param data_dir: Directory containing the audio data
       :type data_dir: str
       :param zip_file_name: Name of the zip file in the directory
       :type zip_file_name: str
       :return: DataFrame with audio directory details
       :rtype: DataFrame 

    """

    column_names = list()
    column_names.append('ReciterName')
    column_names.append('FileName')
    column_names.append('DataSizeKB')
    column_names.append('BitRate')
    column_names.append('Channels')
    column_names.append('Mono/Stereo')
    column_names.append('Duration')
    column_names.append('FileNoExt')
    column_names.append('Sura')
    column_names.append('Aya')
    column_names.append('AyaInQuran')

    sv_reciter   = list()
    sv_file_name = list()
    sv_data_size = list()
    sv_bit_rate  = list()
    sv_ch        = list()
    sv_ms        = list()
    sv_dur       = list()
    sv_filenoext = list()
    sv_sura      = list()
    sv_aya       = list()
    sv_ayainq    = list()

    # Directory names are also names of the reciters
    print("{:30s} {:10s} {:4s} {:4s} {:2s} {:6s} {:9s}".format("Reciter name/MP3 File", "Data Size", "MP3s", "kbps", "Ch", "Mono/S", "Duration(sec)"))
    print("{:30s} {:10s} {:4s} {:4s} {:2s} {:6s} {:9s}".format("==============================", "=========", "====", "====", "==", "======", "============="))
    for dd in os.listdir(data_dir):
        reciter_dir = os.path.join(data_dir, dd)
        # Size
        #dd_size_bytes = os.path.getsize(reciter_dir)
        dd_size_bytes = get_dir_size(reciter_dir)
        dd_size_KB = dd_size_bytes / (1024)
        # Number of files
        num_files = len(os.listdir(reciter_dir))
        # Mp3 files
        mp3_cnt = 0
        for ff in os.listdir(reciter_dir):
            if ff.endswith('.mp3'):
                mp3_cnt += 1

        print("{:30s} {:6.0f} KB {:5d}".format(dd, dd_size_KB, mp3_cnt))

        for ff in os.listdir(reciter_dir):
            if not ff.endswith('.mp3'):
                continue
            mp3_file = os.path.join(reciter_dir, ff)

            duration = channel_layout = bit_rate = channels = bit_rate_kbps = -1
            try:
                # Look at some audio features
                y, sr = librosa.load(mp3_file, sr=None)
                # Get the length of the audio
                duration = librosa.core.get_duration(y=y, sr=sr)
                duration = len(y) / sr

                # Sample rate
                info = mediainfo(mp3_file)
                #print(info)
                channel_layout = info['channel_layout']
                bit_rate = int(info['bit_rate'])
                channels = int(info['channels'])
                #artist = info['artist']
                artist = ""
                bit_rate_kbps = bit_rate / 1000
            
            except:
                print("Couldn't process, skipping: ", mp3_file)

            #print("{:>30s} {:6.0f} KB {:5d} {:12d} {:5.1f}".format(ff, dd_size_KB, 0, int(info['sample_rate']), duration))
            #with audioread.audio_open(mp3_file) as input_file:
            #    sr_native = input_file.samplerate
            #    n_channels = input_file.channels
            #print(sr_native, n_channels)
            #print("{:>30s} {:6.0f} KB {:5s} {:4.0f} {:2d} {:6s} {:13.1f}".format(ff, dd_size_KB, "", bit_rate_kbps, channels, channel_layout, duration))

            # Size
            dd_size_bytes = os.path.getsize(mp3_file)
            dd_size_KB = dd_size_bytes / (1024)

            sv_reciter.append(dd)
            sv_file_name.append(ff)
            sv_data_size.append(int(dd_size_KB))
            sv_bit_rate.append(int(bit_rate_kbps))
            sv_ch.append(channels)
            sv_ms.append(channel_layout)
            sv_dur.append(duration)
            filenoext, sura, aya, ayainq = get_mp3_file_info(ff)
            sv_filenoext.append(filenoext)
            sv_sura.append(sura)
            sv_aya.append(aya)
            sv_ayainq.append(ayainq)
            #break
        #break
    
    # Create a DataFrame with all the info
    df = pd.DataFrame(list(zip(sv_reciter, sv_file_name, sv_data_size, 
        sv_bit_rate, sv_ch, sv_ms, sv_dur, sv_filenoext, sv_sura, sv_aya, sv_ayainq)), 
               columns=column_names) 

    return df

lbl_aya_dict = dict()
def get_mp3_file_info(file_name):
    """ Get info from MP3 file name

       :param file_name: MP3 file name
       :type file_name: str
       :return: label, sura, aya, ayainquran
       :rtype: str 

    File name should be ######.mp3, e.g. 001004.mp3
        SuraAya    = 001004
        Sura       = 1
        Aya        = 4
        AyaInQuran = 3
    """

    # Return items
    suraaya   = ''
    sura      = -1
    aya       = -1
    ayainquan = -1

    suraaya = os.path.splitext(file_name)[0]
    sura = int(suraaya[:3])
    aya  = int(suraaya[3:])
    ayainquan = int(lbl_aya_dict[suraaya])

    return suraaya, sura, aya, ayainquan

def audio_data_initialize(dir_name=audio_data_dir):
    """ Initialize audio data directory

       :param dir_name: Directory name to initialize
       :type dir_name: str
       :return: dir_name
       :rtype: str 

    """

    # If directory exists, delete the directory 
    if pathlib.Path(dir_name).exists():
        print("Directory exists, deleting :", dir_name)
        #pathlib.Path(dir_name).rmdir()
        shutil.rmtree(dir_name)

    # Create the directory
    print("Creating directory :", dir_name)
    pathlib.Path(dir_name).mkdir()

    return dir_name

def populate_audio_files(zip_data_dir=zip_data_dir, zip_file_name=zip_file_name, 
        audio_data_dir=audio_data_dir, reciters=None, suraFrom=None, suraTo=None,
        ayaFrom=None, ayaTo=None, checkOnly=False):
    """ Populate audio files for the reciters in the given directory

       :param zip_data_dir: Directory containing the zipped data
       :type zip_data_dir: str
       :param zip_file_name: Name of the zip file in the directory
       :type zip_file_name: str
       :return: None
       :rtype: None

    """

    audio_labels, ayainq_list = qsura_ayat_to_labels(suraFrom=suraFrom, suraTo=suraTo, 
        ayaFrom=ayaFrom, ayaTo=ayaTo)
    #print("Audio labels: ", audio_labels)
    #print("Aya in Quran labels: ", ayainq_list)

    for dd in os.listdir(zip_data_dir):
        if dd not in reciters:
            continue

        print("Found reciter: ", dd)
        # Create the directory
        reciter_dir = os.path.join(audio_data_dir, dd)
        print("Creating directory :", reciter_dir)
        pathlib.Path(reciter_dir).mkdir()

        # Each directory has one zip file called 000_versebyverse.zip
        dd_zip_file = zip_data_dir + "/" + dd + "/" + zip_file_name
        # Size
        dd_size_bytes = os.path.getsize(dd_zip_file)
        dd_size_MB = dd_size_bytes / (1024 * 1024)
        # Number of files
        archive = zipfile.ZipFile(dd_zip_file, 'r')
        num_files = len(archive.namelist())
        # Mp3 files
        mp3_cnt = 0
        for ff in archive.namelist():
            if ff.endswith('.mp3'):
                mp3_cnt += 1

        print("{:30s} {:6.0f} MB {:6d} {:12d}".format(dd, dd_size_MB, num_files, mp3_cnt))

        num_files_extracted = 0
        for lbl in audio_labels:
            mp3_file = lbl + ".mp3"
            if mp3_file not in archive.namelist():
                print("ERROR: Couldn't find file: ", mp3_file)
                continue
            archive.extract(mp3_file, path=reciter_dir)
            num_files_extracted += 1
        print("{} files extracted".format(num_files_extracted))
    print()

    return None

# abc
"""
reciter_names = report_stats_zip_data(zip_data_dir=zip_data_dir, zip_file_name=zip_file_name)
print("Reciter name list :", reciter_names)
"""
# abc

"""
selected_reciters = ['AbdulBasit', 'AbdullahBasfar']
"""
# abc
"""
selected_reciters = ['AbdulBasit', 'AbdullahBasfar', 'AbdulSamad', 'AbdurrahmaanAs-Sudais', 'AbuBakrAsh-Shaatree', 'Ajami', 'Alafasy', 'AliJaber', 'FaresAbbad', 'Ghamadi', 'HaniRifai', 'Karim Mansoori-Iran', 'KhalefaAl-Tunaiji', 'MaherAlMuaiqly', 'MinshawyMujawwad', 'MohammadalTablaway', 'MuhammadAyyoub', 'MuhammadJibreel', 'Parhizgar', 'SaoodbinIbraaheemAsh-Shuraym', 'Sudais']
audio_data_initialize(dir_name=audio_data_dir)

suraFrom = None
suraTo = None
ayaFrom = 0
ayaTo = 1
populate_audio_files(zip_data_dir=zip_data_dir, zip_file_name=zip_file_name, 
        audio_data_dir=audio_data_dir, reciters=selected_reciters, suraFrom=suraFrom, suraTo=suraTo,
        ayaFrom=ayaFrom, ayaTo=ayaTo, checkOnly=True)

df = report_stats_audio_data(data_dir=audio_data_dir)

print(df)
"""

def extract_audio_features(self, list_of_audiofiles):

    data = np.zeros(
        (len(list_of_audiofiles), self.timeseries_length, 33), dtype=np.float64
    )
    target = []

    for i, file in enumerate(list_of_audiofiles):
        y, sr = librosa.load(file)
        mfcc = librosa.feature.mfcc(
            y=y, sr=sr, hop_length=self.hop_length, n_mfcc=13
        )
        spectral_center = librosa.feature.spectral_centroid(
            y=y, sr=sr, hop_length=self.hop_length
        )
        chroma = librosa.feature.chroma_stft(y=y, sr=sr, hop_length=self.hop_length)
        spectral_contrast = librosa.feature.spectral_contrast(
            y=y, sr=sr, hop_length=self.hop_length
        )

        splits = re.split("[ .]", file)
        genre = re.split("[ /]", splits[1])[3]
        target.append(genre)

        data[i, :, 0:13] = mfcc.T[0:self.timeseries_length, :]
        data[i, :, 13:14] = spectral_center.T[0:self.timeseries_length, :]
        data[i, :, 14:26] = chroma.T[0:self.timeseries_length, :]
        data[i, :, 26:33] = spectral_contrast.T[0:self.timeseries_length, :]

        print(
            "Extracted features audio track %i of %i."
            % (i + 1, len(list_of_audiofiles))
        )

    return data, np.expand_dims(np.asarray(target), axis=1)

def extract_audio_features(reciter, mp3_file, sr=22050, n_mfcc=13, n_fft=2048, hop_length=512,
    pad_duration=None, read_duration=None, features_list=['mfcc', 'zcr', 'spectral_center', 
    'spectral_rolloff', 'chroma', 'spectral_bandwidth_2', 'spectral_bandwidth_3', 
    'spectral_bandwidth_4', 'spectral_contrast'], shp_0=None, shp_1=None):

    file_name = os.path.join(audio_data_dir, reciter, mp3_file)

    columns = data = feature_shapes = new_shp_0 = new_shp_1 = None

    try:
        y , sr = librosa.load(file_name, sr=sr, duration=read_duration)
        orig_duration = len(y) / sr
        #print("pad_duration = ", pad_duration)
        #print("read_duration = ", read_duration)
        #print("orig_duration = ", orig_duration)
        # Pad the duration
        if pad_duration is not None:
            if pad_duration > orig_duration:
                new_len_y = pad_duration * sr
                y = librosa.util.fix_length(y, new_len_y)
            elif pad_duration <= orig_duration:
                # Nothing to be done!
                pass
        duration = len(y) / sr
        #print("FINAL: duration = ", duration)

        # Column names
        columns = list()

        # Feature shapes
        feature_shapes = list()

        #print("shp_0 :", shp_0)
        #print("shp_1 :", shp_1)
        if shp_0 is not None and shp_1 is not None:
            if 'spect' in features_list:
                #data = np.empty(
                #    (shp_0, shp_1), dtype=np.float64
                #)
                data = np.empty(
                    (shp_1, shp_0), dtype=np.float64
                )
            else:
                data = np.zeros(
                    (shp_1, shp_0), dtype=np.float64
                )
            #data = np.empty(
            #  (0, shp_0, shp_1)
            #)
            #print("data initialized:")
            #print(type(data))
            #print(data.shape)
        else:
            data = list()
            #print(type(data))

        # Start index is 0 and gets updated after feature is concatenated to "data"
        start_idx = 0
        if 'mfcc' in features_list:
            #spect = librosa.feature.melspectrogram(y=y, sr=sr,n_fft=n_fft, hop_length=hop_length)
            mfcc = librosa.feature.mfcc(
                y=y, sr=sr, hop_length=hop_length, n_mfcc=n_mfcc, n_fft=n_fft
            )
            feature_shapes.append(mfcc.shape)
            #print("mfcc:")
            #print(mfcc.shape)
            #print(mfcc.T)
            for i in range(1, mfcc.shape[0]+1):
                columns.append('mfcc{}'.format(i))
            if shp_0 is not None:
                #data = np.append(data, [mfcc.T], axis=0)
                data[:, start_idx:start_idx+mfcc.shape[0]] = mfcc.T[0:mfcc.shape[1], :]
                start_idx += mfcc.shape[0]
                #print("mfcc start_idx updated to: ", start_idx)
        if 'zcr' in features_list:
            zcr = librosa.feature.zero_crossing_rate(y)
            feature_shapes.append(zcr.shape)
            #print("zcr:")
            #print(zcr.shape)
            #print(zcr.T)
            #print(zcr.shape[1])
            columns.append('zcr')
            if shp_0 is not None:
                data[:, start_idx:start_idx+zcr.shape[0]] = zcr.T[0:zcr.shape[1], :]
                start_idx += zcr.shape[0]
                #print("zcr start_idx updated to: ", start_idx)
        if 'spectral_center' in features_list:
            spectral_center = librosa.feature.spectral_centroid(
                y=y, sr=sr, hop_length=hop_length
            )
            feature_shapes.append(spectral_center.shape)
            #print("spectral_center:")
            #print(spectral_center.shape)
            columns.append('spectral_center')
            if shp_0 is not None:
                data[:, start_idx:start_idx+spectral_center.shape[0]] = spectral_center.T[0:spectral_center.shape[1], :]
                start_idx += spectral_center.shape[0]
                #print("spectral_center start_idx updated to: ", start_idx)
        if 'spectral_rolloff' in features_list:
            #spectral_rolloff = librosa.feature.spectral_rolloff(y+0.01, sr=sr)[0]
            spectral_rolloff = librosa.feature.spectral_rolloff(y+0.01, sr=sr)
            feature_shapes.append(spectral_rolloff.shape)
            #print("spectral_rolloff:")
            #print(spectral_rolloff.shape)
            #print(spectral_rolloff)
            columns.append('spectral_rolloff')
            if shp_0 is not None:
                data[:, start_idx:start_idx+spectral_rolloff.shape[0]] = spectral_rolloff.T[0:spectral_rolloff.shape[1], :]
                start_idx += spectral_rolloff.shape[0]
                #print("spectral_rolloff start_idx updated to: ", start_idx)
        if 'chroma' in features_list:
            chroma = librosa.feature.chroma_stft(y=y, sr=sr, hop_length=hop_length)
            feature_shapes.append(chroma.shape)
            #print("chroma:")
            #print(chroma.shape)
            for i in range(1, chroma.shape[0]+1):
                columns.append('chroma{}'.format(i))
            if shp_0 is not None:
                data[:, start_idx:start_idx+chroma.shape[0]] = chroma.T[0:chroma.shape[1], :]
                start_idx += chroma.shape[0]
                #print("chroma start_idx updated to: ", start_idx)
        if 'spectral_bandwidth_2' in features_list:
            spectral_bandwidth_2 = librosa.feature.spectral_bandwidth(y+0.01, sr=sr)
            feature_shapes.append(spectral_bandwidth_2.shape)
            #print("spectral_bandwidth_2:")
            #print(spectral_bandwidth_2.shape)
            columns.append('spectral_bandwidth_2')
            if shp_0 is not None:
                data[:, start_idx:start_idx+spectral_bandwidth_2.shape[0]] = spectral_bandwidth_2.T[0:spectral_bandwidth_2.shape[1], :]
                start_idx += spectral_bandwidth_2.shape[0]
                #print("spectral_bandwidth_2 start_idx updated to: ", start_idx)
        if 'spectral_bandwidth_3' in features_list:
            spectral_bandwidth_3 = librosa.feature.spectral_bandwidth(y+0.01, sr=sr, p=3)
            feature_shapes.append(spectral_bandwidth_3.shape)
            #print("spectral_bandwidth_3:")
            #print(spectral_bandwidth_3.shape)
            columns.append('spectral_bandwidth_3')
            if shp_0 is not None:
                data[:, start_idx:start_idx+spectral_bandwidth_3.shape[0]] = spectral_bandwidth_3.T[0:spectral_bandwidth_3.shape[1], :]
                start_idx += spectral_bandwidth_3.shape[0]
                #print("spectral_bandwidth_3 start_idx updated to: ", start_idx)
        if 'spectral_bandwidth_4' in features_list:
            spectral_bandwidth_4 = librosa.feature.spectral_bandwidth(y+0.01, sr=sr, p=4)
            feature_shapes.append(spectral_bandwidth_4.shape)
            #print("spectral_bandwidth_4:")
            #print(spectral_bandwidth_4.shape)
            columns.append('spectral_bandwidth_4')
            if shp_0 is not None:
                data[:, start_idx:start_idx+spectral_bandwidth_4.shape[0]] = spectral_bandwidth_4.T[0:spectral_bandwidth_4.shape[1], :]
                start_idx += spectral_bandwidth_4.shape[0]
                #print("spectral_bandwidth_4 start_idx updated to: ", start_idx)
        if 'spectral_contrast' in features_list:
            spectral_contrast = librosa.feature.spectral_contrast(
                y=y, sr=sr, hop_length=hop_length
            )
            feature_shapes.append(spectral_contrast.shape)
            #print("spectral_contrast:")
            #print(spectral_contrast.shape)
            #print(spectral_contrast)
            #print(spectral_contrast.T)
            for i in range(1, spectral_contrast.shape[0]+1):
                columns.append('spcontr{}'.format(i))
            if shp_0 is not None:
                data[:, start_idx:start_idx+spectral_contrast.shape[0]] = spectral_contrast.T[0:spectral_contrast.shape[1], :]
                start_idx += spectral_contrast.shape[0]
                #print("spectral_contrast start_idx updated to: ", start_idx)
        if 'spect' in features_list:
            spect = librosa.feature.melspectrogram(y=y, sr=sr,n_fft=n_fft, hop_length=hop_length)
            spect = librosa.power_to_db(spect, ref=np.max)
            feature_shapes.append(spect.shape)
            #print("spect:")
            #print(spect.shape)
            for i in range(1, spect.shape[0]+1):
                columns.append('spect{}'.format(i))
            if shp_0 is not None:
                data[:, start_idx:start_idx+spect.shape[0]] = spect.T[0:spect.shape[1], :]
                start_idx += spect.shape[0]
                #print("spect start_idx updated to: ", start_idx)

        new_shp_0 = shp_0
        new_shp_1 = shp_1
        if shp_0 is None:
            #print(feature_shapes)
            new_shp_0 = 0
            for i, shp in enumerate(feature_shapes):
                if i == 0:
                    prev_shp_1 = shp[1]
                else:
                    if shp[1] != prev_shp_1:
                        print("ERROR: shape[1] are different: {} != {}".format(shp[1], prev_shp_1))
                print("shp[0] :", shp[0])
                new_shp_0 += shp[0]
            new_shp_1 = prev_shp_1
            print("new_shp_0 :", new_shp_0)
            print("new_shp_1 :", new_shp_1)

        #print("duration :", duration)

    except:
            print("Couldn't process, skipping: ", file_name)

    return columns, data, feature_shapes, new_shp_0, new_shp_1


# abc
"""
# Need to match audio lenghts for all files in order to get arrays of same size for the training
new_duration = math.ceil(16.770612244897958)
n_fft = 2048
hop_length = 512
#hop_length = 1024

# List of features we want to use
features_list=['mfcc', 'zcr', 'spectral_center', 
    'spectral_rolloff', 'chroma', 'spectral_bandwidth_2', 'spectral_bandwidth_3', 
    'spectral_bandwidth_4', 'spectral_contrast']
features_list = list()
features_list.append('mfcc')
features_list.append('zcr')

# Use one MP3 file to get the array sizes of each feature so starting array can be initialized properly
reciter = 'Ajami'
mp3_file = '001001.mp3'
shp_0 = shp_1 = read_duration = None
read_duration = 0.1
new_duration = None
"""
# abc

"""
columns, data, feature_shapes = extract_audio_features(reciter, mp3_file, sr=22050, n_mfcc=13, n_fft=n_fft, hop_length=hop_length, 
    pad_duration=new_duration, read_duration=read_duration, 
    features_list=features_list, shp_0=shp_0, shp_1=shp_1)
print(columns)
# Check to make sure all shapes[1] are the same, and add up all shapes[0]
print(feature_shapes)
shp_0 = 0
for i, shp in enumerate(feature_shapes):
    if i == 0:
        prev_shp_1 = shp[1]
    else:
        if shp[1] != prev_shp_1:
            print("ERROR: shape[1] are different: {} != {}".format(shp[1], prev_shp_1))
    print("shp[0] :", shp[0])
    shp_0 += shp[0]
shp_1 = prev_shp_1
print("shp_0 :", shp_0)
print("shp_1 :", shp_1)

# Initialize the start array
X_arr = np.empty((0, shp_1, shp_0))
print("X_arr initialized: ")
print(type(X_arr))
print(X_arr.shape)

# Start gathering the data
columns, data, feature_shapes = extract_audio_features(reciter, mp3_file, sr=22050, n_mfcc=13, n_fft=n_fft, hop_length=hop_length, 
    pad_duration=new_duration, read_duration=read_duration, 
    features_list=features_list, shp_0=shp_0, shp_1=shp_1)
#print(data)
print(type(data))
print(data.shape)
#X_arr[0, :, :] = [data]
X_arr = np.append(X_arr, [data], axis=0)
print("X_arr :")
print(type(X_arr))
print(X_arr.shape)
print(X_arr)
exit()
mp3_file = '001002.mp3'
columns, data, feature_shapes = extract_audio_features(reciter, mp3_file, sr=22050, n_mfcc=13, n_fft=n_fft, hop_length=hop_length, 
    pad_duration=new_duration, read_duration=read_duration, 
    features_list=features_list, shp_0=shp_0, shp_1=shp_1)

X_arr = np.append(X_arr, [data], axis=0)
print(type(X_arr))
print(X_arr.shape)

# Load an audio file and plot it
reciter = 'AbdullahBasfar'
columns, data, feature_shapes = extract_audio_features(reciter, mp3_file, sr=22050, n_mfcc=13, n_fft=n_fft, hop_length=hop_length, 
    pad_duration=new_duration, read_duration=read_duration, 
    features_list=features_list, shp_0=shp_0, shp_1=shp_1)

X_arr = np.append(X_arr, [data], axis=0)
print(type(X_arr))
print(X_arr.shape)
"""
# abc
# MKC1

Train_Suras = Val_Suras = Test_Suras = None
# Add test/train/val column in df
def assign_set_sura(row):
    train = Train_Suras
    val   = Val_Suras
    test  = Test_Suras
    if row.Sura in val:
        return "validation"
    elif row.Sura in test:
        return "test"
    else:
        return "train"

Train_Ayas = Val_Ayas = Test_Ayas = None
# Add test/train/val column in df
def assign_set_aya(row):
    train = Train_Ayas
    val   = Val_Ayas
    test  = Test_Ayas
    if row.AyaInQuran in val:
        return "validation"
    elif row.AyaInQuran in test:
        return "test"
    else:
        return "train"

def assign_set_filename(row):
    train = Train_Ayas
    val   = Val_Ayas
    test  = Test_Ayas
    if row.FileName in val:
        return "validation"
    elif row.FileName in test:
        return "test"
    else:
        return "train"

"""
labels_list, ayainq_list = qsura_ayat_to_labels(ayaFrom=0, ayaTo=6236)
lbl_aya_df = pd.DataFrame(list(zip(labels_list, ayainq_list)), 
               columns=['Label', 'AyaInQuran']) 

lbl_aya_dict = dict(zip(labels_list, ayainq_list))

ff = '001004.mp3'
#print(lbl_aya_dict[ff])

#filenoext, sura, aya, ayainq = get_mp3_file_info(ff, dict_lkup=lbl_aya_dict)
filenoext, sura, aya, ayainq = get_mp3_file_info(ff)
print(filenoext, sura, aya, ayainq)
"""

def gen_audio_data(df, shp0, shp1):
    print("shp0 shp1 = ", shp0, shp1)
    X_arr = np.empty((0, shp1, shp0))
    print("X_arr initialized to :", X_arr.shape)
    reciters_arr = np.empty((0, len(list(le.classes_))))
    print("reciters_arr initialized to :", reciters_arr.shape)

    cnt = 0
    for index, row in df.iterrows():
        cnt += 1
        ReciterName = row['ReciterName']
        FileName = row['FileName']
        # Get audio features
        columns, data, feature_shapes, new_shp_0, new_shp_1 = extract_audio_features(
                reciter=ReciterName, mp3_file=FileName, sr=sr, n_mfcc=n_mfcc, n_fft=n_fft, 
                hop_length=hop_length, pad_duration=pad_duration, read_duration=read_duration, 
                features_list=features_list, shp_0=shp0, shp_1=shp1)
        if columns == None and data == None and feature_shapes == None:
            # Skips in case of errors
            continue

        X_arr = np.append(X_arr, [data], axis=0)
                            
        reciters_list = [0 for i in range(0, len(list(le.classes_)))]
        reciters_index = list(le.transform([ReciterName]))[0]
        reciters_list[reciters_index] = 1
        reciters_arr = np.append(reciters_arr, [reciters_list], axis=0)
            
        if cnt % 100 == 0:
            print("Processed ", cnt)
        #if cnt == 10:
        #    break

    return X_arr, reciters_arr

def filter_duration(row):
    my_df = df_tmp
    FileName = row.FileName
    #print("FileName =", FileName)
    not_found = False
    for rec in selected_reciters:
        #print("  rec =", rec)
        if ((my_df['ReciterName'] == rec) & (my_df['FileName'] == FileName)).any():
            pass
        else:
            not_found = True
            #print("not_found =", not_found)
            break
    
    if not_found == True:
        return 'NaN'
    else:
        return 'Yes'

"""
# D:\\Khurram\\Udacity\\Udacity_ML_Course\\MKC\\P3_Capstone\\identify_reciter\\data\\audio\\Ajami\\033050.mp3
ReciterName = 'Ajami'
#FileName    = '033050.mp3'
FileName    = '033051.mp3'

read_duration = pad_duration = 30
#read_duration = pad_duration = None
shp_0 = shp_1 = None
sr = 22050
n_fft = 2048
n_mfcc = 13
hop_length = 1024
#features_list = ['spect']
features_list = ['mfcc']

columns, data, feature_shapes, new_shp_0, new_shp_1 = extract_audio_features(
        reciter=ReciterName, mp3_file=FileName, sr=sr, n_mfcc=n_mfcc, n_fft=n_fft, 
        hop_length=hop_length, pad_duration=pad_duration, read_duration=read_duration, 
        features_list=features_list, shp_0=shp_0, shp_1=shp_1)

labels_list, ayainq_list = qsura_ayat_to_labels(ayaFrom=0, ayaTo=6236)
lbl_aya_df = pd.DataFrame(list(zip(labels_list, ayainq_list)), 
               columns=['Label', 'AyaInQuran']) 

lbl_aya_dict = dict(zip(labels_list, ayainq_list))

audio_data_dir = os.path.join(os.getcwd(), data_dir, "audiotest")
df = report_stats_audio_data(data_dir=audio_data_dir)
"""

"""
from pandas import ExcelWriter
my_excel_fileA = 'pd_df_1.xlsx'
# Read Excel file
df = pd.read_excel(my_excel_fileA)

#selected_reciters = ['AbdulBasit', 'AbdurrahmaanAs-Sudais', 'Ajami', 'Alafasy', 'FaresAbbad']
dfD = df.copy()
newcol = 'Common'
dfD[newcol] = 'NaN'

#print(dfD.head())
df_tmp = dfD[dfD['Duration'] >= 30]

my_out_file = 'pd_df_2.xlsx'
writer = ExcelWriter(my_out_file)
df_tmp.to_excel(writer,'Sheet1',index=False)
writer.save()

df_tmp[newcol] = df_tmp.apply(filter_duration, axis=1)
df_tmp = df_tmp[df_tmp[newcol] == 'Yes']

FileNames = list(df_tmp['FileName'].unique())
print("len = ", len(FileNames))
tot_ayas = len(FileNames)
test_val_ayas = math.ceil(0.2 * tot_ayas)
print("test_val_ayas = ", test_val_ayas)

val_start_idx  = tot_ayas - 1 - test_val_ayas
test_start_idx = val_start_idx - test_val_ayas
print("        val_start_idx = ", val_start_idx)
print("       test_start_idx = ", test_start_idx)
Val_Ayas   = FileNames[val_start_idx+1:tot_ayas+1]
Test_Ayas  = FileNames[test_start_idx+1:val_start_idx+1]
Train_Ayas = FileNames[0:test_start_idx+1]
print("Val_Ayas = ", Val_Ayas)

df_tmp['Set'] = df_tmp.apply(assign_set_filename, axis=1)

my_out_file = 'pd_df_3.xlsx'
writer = ExcelWriter(my_out_file)
df_tmp.to_excel(writer,'Sheet1',index=False)
writer.save()
"""

print("helpers.py LOADED!")
# End of helpers.py

helpers.py LOADED!


In [3]:
##################
# imports
import os
import csv
import re
import pathlib
import xml.etree.ElementTree as ET
import zipfile
import shutil
import librosa
import math
import numpy as np 
import pandas as pd 
import warnings
import audioread
import time
import matplotlib.pyplot as plt
from pandas import ExcelWriter
from pydub.utils import mediainfo

%matplotlib inline

print("Current dir is: ", os.getcwd())

Current dir is:  D:\Khurram\Udacity\Udacity_ML_Course\MKC\P3_Capstone\identify_reciter


In [4]:
##################
# Constants
debug          = False
zip_data_dir   = '../../L5_Capstone/Audio/Quran'
zip_file_name  = '000_versebyverse.zip'
data_dir       = 'datamfcc'
audio_data_dir = os.path.join(os.getcwd(), data_dir, "audio")
quran_meta_xml = os.path.join(os.getcwd(), "Qurandata", "quran-data.xml")
SuraIndexMIN   = 1
SuraIndexMAX   = 114
AyaIndexMIN    = 0
AyaIndexMAX    = 6236

# Suppress this warning from librosa:
# UserWarning: PySoundFile failed. Trying audioread instead.
warnings.filterwarnings('ignore')

In [5]:
# Create a lookup table between Mp3 filename and ayaInQuran
t0 = time.time()
labels_list, ayainq_list = qsura_ayat_to_labels(ayaFrom=AyaIndexMIN, 
                                                ayaTo=AyaIndexMAX)
lbl_aya_df = pd.DataFrame(list(zip(labels_list, ayainq_list)), 
               columns=['Label', 'AyaInQuran']) 

lbl_aya_dict = dict(zip(labels_list, ayainq_list))
print("DONE in {:0.3} sec\n".format(time.time() - t0))

lbl_aya_df.tail()

DONE in 0.017 sec



Unnamed: 0,Label,AyaInQuran
6231,114002,6231
6232,114003,6232
6233,114004,6233
6234,114005,6234
6235,114006,6235


In [8]:
# Report stats on all downloaded data
t0 = time.time()
reciter_names = report_stats_zip_data(zip_data_dir=zip_data_dir, zip_file_name=zip_file_name)
print("DONE in {:0.3} sec\n".format(time.time() - t0))

print("\nDownloaded Reciter name list :\n", reciter_names)

Reciter name                   Data Size  Files    MP3 Files
AbdulBasit                        864 MB   6255         6253
AbdullahBasfar                    435 MB   6239         6236
AbdulSamad                       1643 MB   6240         6238
AbdurrahmaanAs-Sudais             584 MB   6351         6349
AbuBakrAsh-Shaatree               729 MB   6356         6353
Ajami                            1436 MB   6354         6350
Alafasy                           825 MB   6352         6350
AliJaber                          701 MB   6354         6351
FaresAbbad                        594 MB   6357         6353
Ghamadi                           426 MB   6351         6349
HaniRifai                         702 MB   6239         6237
Karim Mansoori-Iran              1015 MB   6352         6348
KhalefaAl-Tunaiji                 757 MB   6238         6236
MaherAlMuaiqly                    586 MB   6350         6348
MinshawyMujawwad                 1650 MB   6351         6349
MohammadalTablaway      

## Reciter selection
The following 5 reciters were selected:
- Abdul Basit -> AbdulBasit
- Abdurrahmaan As-Sudais -> AbdurrahmaanAs-Sudais
- Ahmed Ibn Ali Al Ajamy -> Ajami
- Alafasy
- Fares Abbad -> Fares Abbad

Select 10 small suras. Roughly 60% suras will be used for training, 20% for validation, and 20% for test.

In [6]:
# List of reciters we are interested in
selected_reciters = ['AbdulBasit', 'AbdurrahmaanAs-Sudais', 'Ajami', 'Alafasy', 'FaresAbbad']
print("Selected reciter: ", selected_reciters)

# List of Sura/Ayas we are interested in
suraFrom = None
suraTo   = None
ayaFrom  = AyaIndexMIN
ayaTo    = 300
# Aya: 60% training, 20% val, 20% test
test_val_ayas = math.ceil(0.2 * (ayaTo - ayaFrom))
print("Selected Suras/Ayas: ")
print("      suraFrom: ", suraFrom)
print("        suraTo: ", suraTo)
print("       ayaFrom: ", ayaFrom)
print("         ayaTo: ", ayaTo)
print(" test_val_ayas: ", test_val_ayas)
print()  

if suraFrom is not None and suraTo is not None:
    # Train/Val/Test Suras
    Train_Suras = [105, 106, 107, 108, 111, 113, 114]
    Val_Suras   = [104, 110]
    Test_Suras  = [109, 112]
    # Check for overlap between Val/Test/Train, if any overlap then resolve
    intersection_vtr = list(set(Val_Suras).intersection(Test_Suras))
    intersection_trt = list(set(Train_Suras).intersection(Test_Suras))
    intersection_trv = list(set(Train_Suras).intersection(Val_Suras))

else:
    # Train/Val/Test Ayas
    val_start_idx  = ayaTo - test_val_ayas
    test_start_idx = val_start_idx - test_val_ayas
    print("        val_start_idx = ", val_start_idx)
    print("       test_start_idx = ", test_start_idx)
    #Train_Ayas = [i for i in range(ayaFrom-1, ayaTo)]
    #Val_Ayas   = [i for i in range(ayaTo - (ayaTo - (2 * test_val_ayas)), ayaTo - (1 * test_val_ayas))]
    #Test_Ayas  = [i for i in range(ayaTo - (ayaTo - (1 * test_val_ayas)), ayaTo)]
    Val_Ayas   = [i for i in range(val_start_idx+1, ayaTo+1)]
    Test_Ayas  = [i for i in range(test_start_idx+1, val_start_idx+1)]
    Train_Ayas = [i for i in range(ayaFrom, test_start_idx+1)]
    print("         Train_Ayas # = ", len(Train_Ayas))
    print("           Val_Ayas # = ", len(Val_Ayas))
    print("          Test_Ayas # = ", len(Test_Ayas))
    print("Train+Val+Test_Ayas # = ", len(Train_Ayas)+len(Val_Ayas)+len(Test_Ayas))

    # Check for overlap between Val/Test/Train, if any overlap then resolve
    intersection_vtr = list(set(Val_Ayas).intersection(Test_Ayas))
    intersection_trt = list(set(Train_Ayas).intersection(Test_Ayas))
    intersection_trv = list(set(Train_Ayas).intersection(Val_Ayas))

print()  
if len(intersection_vtr) != 0:
    print("ERROR: Overlap between Val & Test - Please fix!", intersection_vtr)
if len(intersection_trt) != 0:
    print("ERROR: Overlap between Train & Test - Please fix!", intersection_trt)
if len(intersection_trv) != 0:
    print("ERROR: Overlap between Train & Val - Please fix!", intersection_trv)
if len(intersection_vtr) == 0 and len(intersection_trt) == 0 and len(intersection_trv) == 0:
    print("SUCCESS: No overlap between Val & Test & Train - OK to continue")
else:
    print("ERROR: Overlaps found, please fix before continuing!")
# Todo:
# Add number of ayats for each sura to see how ayats are balanced. However true check will be how much audio length
# per test/train/val

Selected reciter:  ['AbdulBasit', 'AbdurrahmaanAs-Sudais', 'Ajami', 'Alafasy', 'FaresAbbad']
Selected Suras/Ayas: 
      suraFrom:  None
        suraTo:  None
       ayaFrom:  0
         ayaTo:  300
 test_val_ayas:  60

        val_start_idx =  240
       test_start_idx =  180
         Train_Ayas # =  181
           Val_Ayas # =  60
          Test_Ayas # =  60
Train+Val+Test_Ayas # =  301

SUCCESS: No overlap between Val & Test & Train - OK to continue


In [13]:
if not pathlib.Path(data_dir).exists():
    print("Directory doesn't exist ...")
    audio_data_initialize(dir_name=data_dir)

# Start from a clean data directory
audio_data_initialize(dir_name=audio_data_dir)

# Populate the data directory with MP3 files for the reciters
t0 = time.time()
populate_audio_files(zip_data_dir=zip_data_dir, zip_file_name=zip_file_name, 
        audio_data_dir=audio_data_dir, reciters=selected_reciters, suraFrom=suraFrom, suraTo=suraTo,
        ayaFrom=ayaFrom, ayaTo=ayaTo)
print("DONE in {:0.3} sec\n".format(time.time() - t0))


Directory doesn't exist ...
Creating directory : datamfcc
Creating directory : D:\Khurram\Udacity\Udacity_ML_Course\MKC\P3_Capstone\identify_reciter\datamfcc\audio
Found reciter:  AbdulBasit
Creating directory : D:\Khurram\Udacity\Udacity_ML_Course\MKC\P3_Capstone\identify_reciter\datamfcc\audio\AbdulBasit
AbdulBasit                        864 MB   6255         6253
301 files extracted
Found reciter:  AbdurrahmaanAs-Sudais
Creating directory : D:\Khurram\Udacity\Udacity_ML_Course\MKC\P3_Capstone\identify_reciter\datamfcc\audio\AbdurrahmaanAs-Sudais
AbdurrahmaanAs-Sudais             584 MB   6351         6349
301 files extracted
Found reciter:  Ajami
Creating directory : D:\Khurram\Udacity\Udacity_ML_Course\MKC\P3_Capstone\identify_reciter\datamfcc\audio\Ajami
Ajami                            1436 MB   6354         6350
301 files extracted
Found reciter:  Alafasy
Creating directory : D:\Khurram\Udacity\Udacity_ML_Course\MKC\P3_Capstone\identify_reciter\datamfcc\audio\Alafasy
Alafasy    

In [14]:
# Report stats on all the extracted MP3 files and get the details in a DataFrame
t0 = time.time()
df = report_stats_audio_data(data_dir=audio_data_dir)
print("DONE in {:0.3} sec\n".format(time.time() - t0))

print("\n\n\nDONE")

Reciter name/MP3 File          Data Size  MP3s kbps Ch Mono/S Duration(sec)
AbdulBasit                      78609 KB   301
AbdurrahmaanAs-Sudais           47721 KB   301
Ajami                          108051 KB   301
Alafasy                         58687 KB   301
FaresAbbad                      55796 KB   301
DONE in 4.59e+02 sec




DONE


In [31]:
# SKIP IF DONE EARLIER!!!
# Save the df 
from pandas import ExcelWriter
my_excel_fileA = os.path.join(data_dir, 'pd_df_1.xlsx')
writer = ExcelWriter(my_excel_fileA)
df.to_excel(writer,'Sheet1',index=False)
writer.save()

In [7]:
# Read info saved earlier to save time
my_excel_fileA = os.path.join(data_dir, 'pd_df_1.xlsx')
# Read Excel file
df = pd.read_excel(my_excel_fileA)

In [8]:
# A quick look at the DataFrame
df.head()

Unnamed: 0,ReciterName,FileName,DataSizeKB,BitRate,Channels,Mono/Stereo,Duration,FileNoExt,Sura,Aya,AyaInQuran
0,AbdulBasit,001001.mp3,34,64,2,stereo,4.336327,1001,1,1,0
1,AbdulBasit,001002.mp3,42,64,2,stereo,5.276735,1002,1,2,1
2,AbdulBasit,001003.mp3,32,64,2,stereo,3.996735,1003,1,3,2
3,AbdulBasit,001004.mp3,36,64,2,stereo,4.519184,1004,1,4,3
4,AbdulBasit,001005.mp3,44,64,2,stereo,5.590204,1005,1,5,4


In [9]:
# A quick look at the DataFrame
df.shape

(1505, 11)

In [32]:
from pandas import ExcelWriter

def filter_duration(row):
    my_df = df_tmp
    FileName = row.FileName
    #print("FileName =", FileName)
    not_found = False
    for rec in selected_reciters:
        #print("  rec =", rec)
        if ((my_df['ReciterName'] == rec) & (my_df['FileName'] == FileName)).any():
            pass
        else:
            not_found = True
            #print("not_found =", not_found)
            break
    
    if not_found == True:
        return 'NaN'
    else:
        return 'Yes'

def assign_set_filename(row):
    train = Train_Ayas
    val   = Val_Ayas
    test  = Test_Ayas
    if row.FileName in val:
        return "validation"
    elif row.FileName in test:
        return "test"
    else:
        return "train"

#selected_reciters = ['AbdulBasit', 'AbdurrahmaanAs-Sudais', 'Ajami', 'Alafasy', 'FaresAbbad']
dfD = df.copy()
newcol = 'Common'
#dfD[newcol] = 'NaN'
dfD[newcol] = 'Yes'

#print(dfD.head())
#df_tmp = dfD[dfD['Duration'] >= 30]
df_tmp = dfD

my_out_file = os.path.join(data_dir, 'pd_df_2.xlsx')
writer = ExcelWriter(my_out_file)
df_tmp.to_excel(writer,'Sheet1',index=False)
writer.save()

#df_tmp[newcol] = df_tmp.apply(filter_duration, axis=1)
df_tmp = df_tmp[df_tmp[newcol] == 'Yes']

FileNames = list(df_tmp['FileName'].unique())
print("len = ", len(FileNames))
tot_ayas = len(FileNames)
test_val_ayas = math.ceil(0.2 * tot_ayas)
print("test_val_ayas = ", test_val_ayas)

val_start_idx  = tot_ayas - 1 - test_val_ayas
test_start_idx = val_start_idx - test_val_ayas
print("        val_start_idx = ", val_start_idx)
print("       test_start_idx = ", test_start_idx)
Val_Ayas   = FileNames[val_start_idx+1:tot_ayas+1]
Test_Ayas  = FileNames[test_start_idx+1:val_start_idx+1]
Train_Ayas = FileNames[0:test_start_idx+1]
#print("Val_Ayas = ", Val_Ayas)

df_tmp['Set'] = df_tmp.apply(assign_set_filename, axis=1)

my_out_file = os.path.join(data_dir, 'pd_df_3.xlsx')
writer = ExcelWriter(my_out_file)
df_tmp.to_excel(writer,'Sheet1',index=False)
writer.save()


len =  301
test_val_ayas =  61
        val_start_idx =  239
       test_start_idx =  178


In [14]:
####### SKIP THIS IF USING FILENAMES ABOVE
# Using Aya or Sura?
AyaOrSura = 'Aya'

#Val_Ayas   = [i for i in range(val_start_idx+1, ayaTo+1)]
#Test_Ayas  = [i for i in range(test_start_idx+1, val_start_idx+1)]
#Train_Ayas = [i for i in range(ayaFrom, test_start_idx+1)]

#Train_Suras = [105, 106, 107, 108, 111, 113, 114]
#Val_Suras   = [104, 110]
#Test_Suras  = [109, 112]   
print(Val_Ayas)
print(Test_Ayas)

df_tmp = df.copy()
if AyaOrSura == 'Sura':
    df_tmp['Set'] = df_tmp.apply(assign_set_sura, axis=1)
    pass
elif AyaOrSura == 'Aya':
    df_tmp['Set'] = df_tmp.apply(assign_set_aya, axis=1)
    pass

display(df_tmp[df_tmp['Set'] == 'test'])

[1120, 1121, 1122, 1123, 1124, 1125, 1126, 1127, 1128, 1129, 1130, 1131, 1132, 1133, 1134, 1135, 1136, 1137, 1138, 1139, 1140, 1141, 1142, 1143, 1144, 1145, 1146, 1147, 1148, 1149, 1150, 1151, 1152, 1153, 1154, 1155, 1156, 1157, 1158, 1159, 1160, 1161, 1162, 1163, 1164, 1165, 1166, 1167, 1168, 1169, 1170, 1171, 1172, 1173, 1174, 1175, 1176, 1177, 1178, 1179, 1180, 1181, 1182, 1183, 1184, 1185, 1186, 1187, 1188, 1189, 1190, 1191, 1192, 1193, 1194, 1195, 1196, 1197, 1198, 1199, 1200, 1201, 1202, 1203, 1204, 1205, 1206, 1207, 1208, 1209, 1210, 1211, 1212, 1213, 1214, 1215, 1216, 1217, 1218, 1219, 1220, 1221, 1222, 1223, 1224, 1225, 1226, 1227, 1228, 1229, 1230, 1231, 1232, 1233, 1234, 1235, 1236, 1237, 1238, 1239, 1240, 1241, 1242, 1243, 1244, 1245, 1246, 1247, 1248, 1249, 1250, 1251, 1252, 1253, 1254, 1255, 1256, 1257, 1258, 1259, 1260, 1261, 1262, 1263, 1264, 1265, 1266, 1267, 1268, 1269, 1270, 1271, 1272, 1273, 1274, 1275, 1276, 1277, 1278, 1279, 1280, 1281, 1282, 1283, 1284, 1285, 128

Unnamed: 0,ReciterName,FileName,DataSizeKB,BitRate,Channels,Mono/Stereo,Duration,FileNoExt,Sura,Aya,AyaInQuran,Set
840,AbdulBasit,006052.mp3,343,64,2,stereo,43.807347,006052,6,52,840,test
841,AbdulBasit,006053.mp3,233,64,2,stereo,29.701224,006053,6,53,841,test
842,AbdulBasit,006054.mp3,432,64,2,stereo,55.170612,006054,6,54,842,test
843,AbdulBasit,006055.mp3,93,64,2,stereo,11.859592,006055,6,55,843,test
844,AbdulBasit,006056.mp3,231,64,2,stereo,29.492245,006056,6,56,844,test
...,...,...,...,...,...,...,...,...,...,...,...,...
6715,FaresAbbad,007162.mp3,90,49,2,stereo,14.942041,007162,7,162,1115,test
6716,FaresAbbad,007163.mp3,142,48,2,stereo,23.902041,007163,7,163,1116,test
6717,FaresAbbad,007164.mp3,115,49,2,stereo,19.226122,007164,7,164,1117,test
6718,FaresAbbad,007165.mp3,108,49,2,stereo,17.972245,007165,7,165,1118,test


In [11]:
# Look at Sura or AyaInQuran
lookcol = 'AyaInQuran'

print(" Test: Number of items = {}, StartIndex = {}, EndIndex = {}".format(len(df_tmp[df_tmp['Set'] == "test"][lookcol].unique()), df_tmp[df_tmp['Set'] == "test"][lookcol].unique()[0], df_tmp[df_tmp['Set'] == "test"][lookcol].unique()[-1] ))
print("  Val: Number of items = {}, StartIndex = {}, EndIndex = {}".format(len(df_tmp[df_tmp['Set'] == "validation"][lookcol].unique()), df_tmp[df_tmp['Set'] == "validation"][lookcol].unique()[0], df_tmp[df_tmp['Set'] == "validation"][lookcol].unique()[-1] ))
print("Train: Number of items = {}, StartIndex = {}, EndIndex = {}".format(len(df_tmp[df_tmp['Set'] == "train"][lookcol].unique()), df_tmp[df_tmp['Set'] == "train"][lookcol].unique()[0], df_tmp[df_tmp['Set'] == "train"][lookcol].unique()[-1] ))

# Check(s)
display(df_tmp[df_tmp['Set'] == "test"][lookcol].unique())
display(df_tmp[df_tmp['Set'] == "validation"][lookcol].unique())
display(df_tmp[df_tmp['Set'] == "train"][lookcol].unique())


 Test: Number of items = 61, StartIndex = 179, EndIndex = 239
  Val: Number of items = 61, StartIndex = 240, EndIndex = 300
Train: Number of items = 179, StartIndex = 0, EndIndex = 178


array([179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191,
       192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204,
       205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217,
       218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230,
       231, 232, 233, 234, 235, 236, 237, 238, 239], dtype=int64)

array([240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252,
       253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265,
       266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278,
       279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291,
       292, 293, 294, 295, 296, 297, 298, 299, 300], dtype=int64)

array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
        13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
        26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
        39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
        52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
        65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
        78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
        91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
       104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
       117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
       130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
       143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155,
       156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168,
       169, 170, 171, 172, 173, 174, 175, 176, 177, 178], dtype=

In [12]:
# Once satisfied with updated df_tmp, assign it back to df
df = df_tmp.copy()

df.reset_index(drop=True, inplace=True)

df.head()

Unnamed: 0,ReciterName,FileName,DataSizeKB,BitRate,Channels,Mono/Stereo,Duration,FileNoExt,Sura,Aya,AyaInQuran,Common,Set
0,AbdulBasit,001001.mp3,34,64,2,stereo,4.336327,1001,1,1,0,Yes,train
1,AbdulBasit,001002.mp3,42,64,2,stereo,5.276735,1002,1,2,1,Yes,train
2,AbdulBasit,001003.mp3,32,64,2,stereo,3.996735,1003,1,3,2,Yes,train
3,AbdulBasit,001004.mp3,36,64,2,stereo,4.519184,1004,1,4,3,Yes,train
4,AbdulBasit,001005.mp3,44,64,2,stereo,5.590204,1005,1,5,4,Yes,train


In [13]:
# Check the shapes
print("test: ",df[df['Set'] == "test"].shape)
print("validation: ",df[df['Set'] == "validation"].shape)
print("train: ",df[df['Set'] == "train"].shape)

test:  (305, 13)
validation:  (305, 13)
train:  (895, 13)


In [14]:
df_test  = df[df['Set'] == "test"]
df_valid = df[df['Set'] == "validation"]
df_train = df[df['Set'] == "train"]

print(df_test.shape, df_valid.shape, df_train.shape)

(305, 13) (305, 13) (895, 13)


In [15]:
# Encoding for the reciters as lables
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le.fit(df['ReciterName'])

"""
le.transform(["tokyo", "tokyo", "paris"])
list(le.inverse_transform([2, 2, 1]))
"""
print("Classes:", len(list(le.classes_)), list(le.classes_))
print("Classes:", len(list(le.classes_)), list(le.transform(le.classes_)))
print()
#print(le.transform(df['ReciterName']))

# Create a dictionary
dict_reciter_to_label = dict(zip(list(le.classes_), list(le.transform(le.classes_))))
print("dict_reciter_to_label = ", dict_reciter_to_label)
dict_label_to_reciter = {v: k for k, v in dict_reciter_to_label.items()}
print("dict_label_to_reciter = ", dict_label_to_reciter)


Classes: 5 ['AbdulBasit', 'AbdurrahmaanAs-Sudais', 'Ajami', 'Alafasy', 'FaresAbbad']
Classes: 5 [0, 1, 2, 3, 4]

dict_reciter_to_label =  {'AbdulBasit': 0, 'AbdurrahmaanAs-Sudais': 1, 'Ajami': 2, 'Alafasy': 3, 'FaresAbbad': 4}
dict_label_to_reciter =  {0: 'AbdulBasit', 1: 'AbdurrahmaanAs-Sudais', 2: 'Ajami', 3: 'Alafasy', 4: 'FaresAbbad'}


In [16]:
list(le.transform(['Alafasy']))[0]

3

# Extract Audio Features
I wanted to make sure I am able to explore & switch between different features easily so there are many audio features that have been coded in the helper function below.

From the data frame it is clear that the various MP3 files have different characteristics. Almost all of these need to be "matched" in order to get the correct data. Here is how this was acheived:
- Bit rate (kbps)    - This didn't seem to make a difference, so nothing was done about this characteristic.
- Sample rate (kHz)  - The sample rate needs to be the same across all audio files so features are comparable. Librosa can be given the sample rate as an input. The default is "22050". I left it at default so all the files will be re-sampled at this rate during the read operation.
- Duration (seconds) - The duration needs to be the same across all audio files. Librosa by default reads the whole duration of the file. We can read a smaller duration but can't "pad" the duration during read. There were two options to deal with the varying durations of the audio files:
     - Figure out min duration & only read the minimum duration.
     - Figure out max duration & pad the duration after reading the MP3 file.
     
     I am choosing to go with the min duration which will help reduce the size of the data for train/test/val.

Other parameters (hop_length, etc.) are related to the Librosa and were made the same during feature extraction.

One other thing to note is that in order to get same data array sizes from each audio file, we first run feature extraction on one audio file to get the "shape" of the data array. Then the data array is initilized and real feature extraction begins.

In [17]:
# Before audio features can be extracted, need to figure out the longest/shortest
# duration and make all audio same duration
duration_max = df['Duration'].max()
print(duration_max)
# Round it up
new_max_duration = math.ceil(duration_max)
print(new_max_duration)
print("Max duration = {} seconds".format(new_max_duration))

duration_min = df['Duration'].min()
print(duration_min)
# Round it up
new_min_duration = math.ceil(duration_min)
print(new_min_duration)
print("Min duration = {} seconds".format(new_min_duration))
# %load helpers.py

166.7395918367347
167
Max duration = 167 seconds
2.93984126984127
3
Min duration = 3 seconds


In [18]:
# Run feature extraction on one audio file to get the "shape" of the data array
#reciter = df[0]['ReciterName']
reciter = df.lookup([0], ['ReciterName'])[0]
mp3_file = df.lookup([0], ['FileName'])[0]
#read_duration = pad_duration = new_min_duration
read_duration = pad_duration = 30
shp_0 = shp_1 = None
sr = 22050
n_fft = 2048
n_mfcc = 13
hop_length = 1024
features_list = ['mfcc']

print("reciter       = ", reciter)
print("mp3_file      = ", mp3_file)
print("read_duration = ", read_duration)
print("pad_duration  = ", pad_duration)
print("        shp_0 = ", shp_0)
print("        shp_1 = ", shp_1)
print("           sr = ", sr)
print("        n_fft = ", n_fft)
print("       n_mfcc = ", n_mfcc)
print("   hop_length = ", hop_length)
print("features_list = ", features_list)


reciter       =  AbdulBasit
mp3_file      =  001001.mp3
read_duration =  30
pad_duration  =  30
        shp_0 =  None
        shp_1 =  None
           sr =  22050
        n_fft =  2048
       n_mfcc =  13
   hop_length =  1024
features_list =  ['mfcc']


In [19]:
columns, data, feature_shapes, new_shp_0, new_shp_1 = extract_audio_features(
    reciter=reciter, mp3_file=mp3_file, sr=sr, n_mfcc=n_mfcc, n_fft=n_fft, 
    hop_length=hop_length, pad_duration=pad_duration, read_duration=read_duration, 
    features_list=features_list, shp_0=shp_0, shp_1=shp_1)
print("reciter INIT:")
print(type(data))
#print(data.shape)
print("shape 0/1 :")
print(new_shp_0, new_shp_1)

shp[0] : 13
new_shp_0 : 13
new_shp_1 : 646
reciter INIT:
<class 'list'>
shape 0/1 :
13 646


In [20]:
t0 = time.time()
X_test, y_test = gen_audio_data(df_test, shp0=new_shp_0, shp1=new_shp_1)
print("DONE in {:0.3} sec\n".format(time.time() - t0))


shp0 shp1 =  13 646
X_arr initialized to : (0, 646, 13)
reciters_arr initialized to : (0, 5)
Processed  100
Processed  200
Processed  300
DONE in 1.18e+02 sec



In [21]:
print(X_test.shape, y_test.shape)

(305, 646, 13) (305, 5)


In [28]:
np.savez(os.path.join(data_dir, 'test_arr'), X_test, y_test)

In [22]:
t0 = time.time()
X_valid, y_valid = gen_audio_data(df_valid, shp0=new_shp_0, shp1=new_shp_1)
print("DONE in {:0.3} sec\n".format(time.time() - t0))


shp0 shp1 =  13 646
X_arr initialized to : (0, 646, 13)
reciters_arr initialized to : (0, 5)
Processed  100
Processed  200
Processed  300
DONE in 1.13e+02 sec



In [23]:
print(X_valid.shape, y_valid.shape)

(305, 646, 13) (305, 5)


In [29]:
np.savez(os.path.join(data_dir, 'valid_arr'), X_valid, y_valid)

In [24]:
t0 = time.time()
X_train, y_train = gen_audio_data(df_train, shp0=new_shp_0, shp1=new_shp_1)
print("DONE in {:0.3} sec\n".format(time.time() - t0))


shp0 shp1 =  13 646
X_arr initialized to : (0, 646, 13)
reciters_arr initialized to : (0, 5)
Processed  100
Processed  200
Processed  300
Processed  400
Processed  500
Processed  600
Processed  700
Processed  800
DONE in 3.12e+02 sec



In [25]:
print(X_train.shape, y_train.shape)

(895, 646, 13) (895, 5)


In [30]:
np.savez(os.path.join(data_dir, 'train_arr'), X_train, y_train)

In [33]:
# Convert the scale of training data
X_train_raw = librosa.core.db_to_power(X_train, ref=1.0)
print(np.amin(X_train_raw), np.amax(X_train_raw), np.mean(X_train_raw))

4.864203307980052e-72 1.0198236647671823e+22 1.5216094321311794e+16


In [34]:
X_train_log = np.log(X_train_raw)
print(np.amin(X_train_log), np.amax(X_train_log), np.mean(X_train_log))

-164.20422375326532 50.67650178054088 -6.3850888847565175


In [35]:
X_valid_raw = librosa.core.db_to_power(X_valid, ref=1.0)
X_valid_log = np.log(X_valid_raw)

In [36]:
def unison_shuffled_copies(a, b):
    assert len(a) == len(b)
    p = np.random.permutation(len(a))
    return a[p], b[p]

In [37]:
X_train, y_train = unison_shuffled_copies(X_train_log, y_train)
X_valid, y_valid = unison_shuffled_copies(X_valid_log, y_valid)

In [38]:
print("Shapes are: ", X_train.shape, X_valid.shape, y_train.shape, y_valid.shape)

Shapes are:  (895, 646, 13) (305, 646, 13) (895, 5) (305, 5)


In [40]:
np.savez(os.path.join(data_dir, 'shuffled_train_log'), X_train, y_train)
np.savez(os.path.join(data_dir, 'shuffled_valid_log'), X_valid, y_valid)

# Don't use db_to_power, just use the raw data


In [41]:
# Convert the scale of training data
X_train_raw = X_train
print(np.amin(X_train_raw), np.amax(X_train_raw), np.mean(X_train_raw))

-164.20422375326532 50.67650178054088 -6.385088884756508


In [42]:
X_train_log = X_train_raw
print(np.amin(X_train_log), np.amax(X_train_log), np.mean(X_train_log))

-164.20422375326532 50.67650178054088 -6.385088884756508


In [43]:
X_valid_raw = X_valid
X_valid_log = X_valid_raw

In [44]:
def unison_shuffled_copies(a, b):
    assert len(a) == len(b)
    p = np.random.permutation(len(a))
    return a[p], b[p]

In [45]:
X_train, y_train = unison_shuffled_copies(X_train_log, y_train)
X_valid, y_valid = unison_shuffled_copies(X_valid_log, y_valid)

In [46]:
print("Shapes are: ", X_train.shape, X_valid.shape, y_train.shape, y_valid.shape)

Shapes are:  (895, 646, 13) (305, 646, 13) (895, 5) (305, 5)


In [47]:
np.savez(os.path.join(data_dir, 'shuffled_train'), X_train, y_train)
np.savez(os.path.join(data_dir, 'shuffled_valid'), X_valid, y_valid)