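The purpose of this notebook is to:
- iterate through the unprocessed data in `X:\Training Data\Unprocessed`
    - for each song
        - create a tensor of one-hot notes from its `notes.chart`
        - create a tensor of the audio waveform
        - save them as `notes.npy` and `song.npy` in `X:\Training Data\Processed\<track pack>\<song>`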

In [1]:

import os
import sys
from pathlib import Path

import numpy as np
from tqdm import tqdm

# Make the sibling 'Preprocessing' folder importable; this must run before the
# project-local imports below. (`sys` was previously used here without being imported.)
sys.path.insert(1, str(Path().resolve().parent / 'Preprocessing'))
from chart_functions import chart2tensor, get_configuration
from audio_functions import music2tensor

# Root folders for the raw song folders and for the generated .npy files.
unprocessed_data_path = Path(r'X:\Training Data\Unprocessed')
processed_data_path = Path(r'X:\Training Data\Processed')

In [2]:
def check_multiple_audio_files(fileList):
    """Report whether a folder listing contains more than one audio file.

    A file counts as audio when its name ends with '.ogg', '.mp3' or '.wav'
    (the comparison is case-sensitive).

    Args:
        fileList: iterable of file-name strings, e.g. the third item yielded by os.walk.

    Returns:
        True if two or more names have an audio suffix, otherwise False.
    """
    audio_suffixes = ('.ogg', '.mp3', '.wav')
    # str.endswith accepts a tuple, so each name is tested against all suffixes at once.
    num_files = sum(1 for track in fileList if track.endswith(audio_suffixes))
    return num_files > 1

def get_audio_file_name(fileList):
    """Return the first name in fileList that ends with '.ogg', '.mp3' or '.wav'.

    Args:
        fileList: iterable of file-name strings.

    Returns:
        The first matching file name, in the order given.

    Raises:
        NameError: if no name in fileList has one of the audio suffixes.
    """
    audio_suffixes = ('.ogg', '.mp3', '.wav')
    first_match = next((name for name in fileList if name.endswith(audio_suffixes)), None)
    if first_match is None:
        raise NameError('Error: audio file not present')
    return first_match

In [3]:
wrong_format_charts = []    # Holds audio paths of songs whose notes.chart could not be converted
multiple_audio_songs = []   # Holds paths to song folders with multiple audio files
processed = []              # Holds paths to song folders that were successfully processed
song_size = 0               # Total audio data size, in gigabytes
notes_size = 0              # Total note data size, in gigabytes

# Existing notes.npy files are deleted and rebuilt on every run because an earlier run
# saved the same array for many songs. Set to False to reuse existing notes.npy files.
overwrite_notes = True

for dirName, subdirList, fileList in tqdm(os.walk(unprocessed_data_path)):  # Walk through training data directory
    if not fileList:  # If file list is empty
        continue

    # Song folders are expected at <unprocessed root>\<track pack>\<song>. Deriving the
    # names relative to the root avoids hard-coded path indices, which raise IndexError
    # for files sitting directly in the root or in a track pack folder.
    relative_parts = Path(dirName).relative_to(unprocessed_data_path).parts
    if len(relative_parts) != 2:
        print('\n\nSkipping {}: not a <track pack>\\<song> folder'.format(dirName))
        continue
    track_pack_, song_ = relative_parts

    processed_path = processed_data_path / track_pack_ / song_
    unprocessed_path = unprocessed_data_path / track_pack_ / song_
    processed_song_path = processed_path / 'song.npy'
    processed_notes_path = processed_path / 'notes.npy'

    print('\n\nProcessing {}, {}'.format(track_pack_, song_))
    if overwrite_notes and processed_notes_path.exists():
        os.remove(processed_notes_path)

    # Skip the song if there is more than one audio file. The folder path is recorded
    # because no single audio path has been chosen at this point.
    if check_multiple_audio_files(fileList):
        multiple_audio_songs.append(unprocessed_path)
        print('{}, {} contains multiple audio files, skipping'.format(track_pack_, song_))
        continue
    audio_file_name = get_audio_file_name(fileList)  # raises NameError if the folder has no audio file
    unprocessed_song_path = unprocessed_path / audio_file_name

    # Create note tensor for song
    try:
        note_tensor = np.array(chart2tensor(unprocessed_path / 'notes.chart', print_release_notes = False)).astype(int)
    except Exception:  # not a bare except, so KeyboardInterrupt can still stop the run
        print('{}, {} .chart file is in the wrong format, skipping'.format(track_pack_, song_))
        wrong_format_charts.append(unprocessed_song_path)
        continue

    # Make folder in 'Processed' (including the track pack folder) if it doesn't already exist
    processed_path.mkdir(parents=True, exist_ok=True)

    # Convert the audio unless song.npy is already on disk
    if processed_song_path.exists():
        print('{} audio has already been processed'.format(processed_path.name))
    else:
        song = music2tensor(unprocessed_song_path)
        np.save(processed_song_path, song)
    song_size += processed_song_path.stat().st_size / 1e9

    # Save the notes unless notes.npy is already on disk
    if processed_notes_path.exists():
        print('{} notes have already been procesed'.format(processed_path.name))
    else:
        np.save(processed_notes_path, note_tensor)
    notes_size += processed_notes_path.stat().st_size / 1e9

    processed.append(unprocessed_path)
        

10it [00:00, 92.25it/s]

Processing Angevil Hero II, 1. Andy McKee - Ouray
1. Andy McKee - Ouray audio has already been processed


Processing Angevil Hero II, 1. Joe Satriani - The Forgotten (Part One + Part Two)
1. Joe Satriani - The Forgotten (Part One + Part Two) audio has already been processed


Processing Angevil Hero II, 1. John 5 - 27 Needles
1. John 5 - 27 Needles audio has already been processed


Processing Angevil Hero II, 1. Lordi - Mr. Killjoy
1. Lordi - Mr. Killjoy audio has already been processed


Processing Angevil Hero II, 1. Megadeth - Kick the Chair
1. Megadeth - Kick the Chair audio has already been processed


Processing Angevil Hero II, 1. Paul Gilbert - 2 Become 1 (Spice Girls Cover)
1. Paul Gilbert - 2 Become 1 (Spice Girls Cover) audio has already been processed


Processing Angevil Hero II, 1. Protest the Hero - Limb from Limb (with Intro)
1. Protest the Hero - Limb from Limb (with Intro) audio has already been processed


Processing Angevil Hero II, 1. Pro

In [6]:
# Report the accumulated on-disk size of the processed audio and note arrays.
song_report = '{:4.2f} Gigabytes of song data'.format(song_size)
notes_report = '{:3.2f} Gigabytes of notes data'.format(notes_size)
print(song_report)
print(notes_report)

59.22 Gigabytes of song data
0.75 Gigabytes of notes data
