# Feature Extraction

After exploring the dataset features in the [Feature Visualization](./Feature%20Visualization.ipynb) notebook, the dataset needs to be preprocessed and prepared as input for the neural network models. This notebook will accomplish the following:

* Some genres have very few tracks, which can easily lead to class imbalance and model bias. These genres should be removed or integrated into another genre.


* A directory of spectrograms will be created for the *fma_medium* tracks.


* A **Pandas** dataframe will be created for use by **TensorFlow's** `ImageDataGenerator`.

----

## Imports

In [1]:
# Access audio files and metadata.
import os
import utils

# Data Management
import pandas as PD
import gc

# Spectrogram Creation Functions
from multiprocessing import Pool, cpu_count
import convert
from spectrogram_generation import worker

----

## Pathname Constants

In [2]:
# Project root: the working directory the kernel was started in.
MAIN_DIR = os.getcwd()

# Location of the FMA metadata folder and its track table.
METADATA_DIR = os.path.join(MAIN_DIR, 'fma_metadata')
TRACKS_FILE = os.path.join(METADATA_DIR, 'tracks.csv')

# Output root for the generated spectrogram images.
# On Windows, os.path.join('D:', 'Spectrograms') produces the drive-relative
# path 'D:Spectrograms' (relative to the current directory on drive D).
# Appending the separator to the drive anchors the path at the drive root.
SPECTROGRAM_DIR = os.path.join('D:' + os.sep, 'Spectrograms')

----

## Creating the Metadata Dataframe

### Retrieving Raw Metadata

In [3]:
# Load the full track table, then keep only the rows that belong to 'fma_medium'.
# NOTE(review): the `<=` comparison presumably relies on utils.load returning
# 'subset' as an ordered categorical — confirm in utils.
tracks = utils.load(TRACKS_FILE)
in_medium_subset = tracks['set', 'subset'] <= 'medium'
medium_tracks = tracks[in_medium_subset]

# Pair each track's top-level genre with its dataset split, aligned on the index.
genre_column = medium_tracks['track']['genre_top']
split_column = medium_tracks['set']['split']
RAW_metadata = PD.merge(
    genre_column,
    split_column,
    left_index=True,
    right_index=True,
)

RAW_metadata.head(10)

Unnamed: 0_level_0,genre_top,split
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1
2,Hip-Hop,training
3,Hip-Hop,training
5,Hip-Hop,training
10,Pop,training
134,Hip-Hop,training
136,Rock,training
139,Folk,training
140,Folk,training
141,Folk,training
148,Experimental,validation


### Metadata Processing

In [4]:
# The following genres need to be removed/changed due to limitations.
# -------------------------------------------------
# Easy Listening:      REMOVE. The 'Easy Listening' tracks can be classified into the
#                      other genres. Additionally, there are only 21 tracks within
#                      this genre.
#
# Spoken:              REMOVE. Consists of tracks that involve speech rather than music.
#
# Old-Time / Historic: CHANGE. Just rename to 'Historic'.
REMOVED_GENRES = ['Spoken', 'Easy Listening']
RENAMED_GENRES = {'Old-Time / Historic': 'Historic'}

# Make the cell safe to re-run: once it has executed, 'track_id' is a regular
# column and the index is 0..n-1. Restore the track_id index so the pathnames
# below are always derived from the track ids and not from row positions.
if 'track_id' in RAW_metadata.columns:
    RAW_metadata = RAW_metadata.set_index('track_id')

# Filter, derive the new columns, and move track_id into a column in one chain.
# Using .assign on the filtered frame builds a new frame instead of writing
# into a slice, which avoids pandas' SettingWithCopyWarning.
RAW_metadata = (
    RAW_metadata[~RAW_metadata['genre_top'].isin(REMOVED_GENRES)]
    .assign(
        Genre=lambda frame: frame['genre_top'].replace(RENAMED_GENRES),
        # Image file name for each track, based on its track_id (the index).
        Pathname=lambda frame: [f'{ID}.png' for ID in frame.index],
    )
    .reset_index()
)

RAW_metadata.head(10)

Unnamed: 0,track_id,genre_top,split,Genre,Pathname
0,2,Hip-Hop,training,Hip-Hop,2.png
1,3,Hip-Hop,training,Hip-Hop,3.png
2,5,Hip-Hop,training,Hip-Hop,5.png
3,10,Pop,training,Pop,10.png
4,134,Hip-Hop,training,Hip-Hop,134.png
5,136,Rock,training,Rock,136.png
6,139,Folk,training,Folk,139.png
7,140,Folk,training,Folk,140.png
8,141,Folk,training,Folk,141.png
9,148,Experimental,validation,Experimental,148.png


----

## Creating Spectrograms

### Initializing the Data Directory

In [5]:
# Spectrogram types to generate: output folder name -> conversion function.
SPECTROGRAMS = {
    'Log': convert.log_spectrogram,
    'Q-Power': convert.Q_power_spectrogram,
    'Tempogram': convert.tempogram,
    'Mel': convert.mel_spectrogram,
    'Log-Mel': convert.log_mel_spectrogram,
    'Chromagram': convert.linear_chromagram,
    'Q-Power-Chromagram': convert.Q_power_chromagram
}

# Re-key the mapping by the full directory pathname of each spectrogram type.
# Building a new dict (instead of inserting and deleting keys while looping)
# keeps the original ordering and cannot drop an entry by accident.
SPECTROGRAMS = {
    os.path.join(SPECTROGRAM_DIR, spectrogram): converter
    for spectrogram, converter in SPECTROGRAMS.items()
}

# Creating the data directories.
# makedirs(exist_ok=True) also creates missing parent directories and has no
# gap between an "exists" check and the creation.
os.makedirs(SPECTROGRAM_DIR, exist_ok=True)
for pathname in SPECTROGRAMS:
    os.makedirs(pathname, exist_ok=True)

### Generating Data

In [8]:
def pending_spectrogram_jobs(metadata, spectrogram_dirs):
    """Collect the spectrogram jobs whose output files do not exist yet.

    Parameters
    ----------
    metadata : DataFrame
        Must provide the columns 'track_id' and 'Pathname' (image file name).
    spectrogram_dirs : dict
        Maps an output directory to the value that is handed to the worker
        for that directory (here: the spectrogram conversion function).

    Returns
    -------
    list of tuple
        One ``(track_id, {output_file: value})`` entry per track that is
        missing at least one output file. Tracks whose files all exist are
        skipped, so no process is created for them unnecessarily.
    """
    jobs = []
    for _, row in metadata.iterrows():
        missing = {}
        for directory, spectrogram in spectrogram_dirs.items():
            target = os.path.join(directory, row['Pathname'])
            if not os.path.exists(target):
                missing[target] = spectrogram
        if missing:
            jobs.append((row['track_id'], missing))
    return jobs


if __name__ == '__main__':
    # Determining the number of workers: leave one core free, but never drop
    # to zero (a zero step would make range() below raise ValueError).
    N_WORKERS = max(1, cpu_count() - 1)

    # Jobs list: only tracks that still need at least one spectrogram.
    CREATE_list = pending_spectrogram_jobs(RAW_metadata, SPECTROGRAMS)

    # Uses the jobs list to process the audio files into spectrograms,
    # one batch of at most N_WORKERS tracks at a time.
    for START_N in range(0, len(CREATE_list), N_WORKERS):
        CREATE_jobs = CREATE_list[START_N : START_N + N_WORKERS]
        # Report the real end of the batch (the last batch may be shorter).
        END_N = START_N + len(CREATE_jobs)
        print(f'Indices: {START_N}-{END_N}')

        # One process per job in the batch; a fresh pool is created per batch.
        # NOTE(review): presumably worker returns the track_id on failure and
        # None on success — confirm in spectrogram_generation.worker.
        with Pool(processes = len(CREATE_jobs)) as pool:
            results = pool.map(worker, CREATE_jobs)
        fails = [fail for fail in results if fail is not None]

        # Removing failed tracks from the metadata & logging info.
        RAW_metadata = RAW_metadata[~RAW_metadata['track_id'].isin(fails)]
        for fail in fails:
            print(f'FAIL: {fail}')

### Saving the Metadata Files

In [7]:
# Split the metadata by the dataset's own 'split' column: rows marked
# 'validation' form the test set, every other row goes to the training set.
# Only the image file name and the genre label are kept.
is_validation = RAW_metadata['split'] == 'validation'
TRAIN_metadata = RAW_metadata.loc[~is_validation, ['Pathname', 'Genre']]
TEST_metadata = RAW_metadata.loc[is_validation, ['Pathname', 'Genre']]

# Write both tables next to the notebook.
TRAIN_metadata.to_csv('./TRAIN_metadata.csv')
TEST_metadata.to_csv('./TEST_metadata.csv')