# This notebook has been productionized in ~/Preprocessing/preprocess_functions.py

The purpose of this notebook is to 
- iterate through the unprocessed data
    - for each song
        - create tensor of one-hot notes
        - create tensor of audio waveform
        - place them in \Training Data\Processed

In [2]:
import os
import sys
from pathlib import Path

# The project-local modules imported below are not on the default import path,
# so the sibling 'Preprocessing' directory has to be added BEFORE importing
# them (presumably that is where chart_functions / audio_functions live).
sys.path.insert(1, str(Path().resolve().parent / 'Preprocessing'))

import numpy as np
from tqdm import tqdm

from audio_functions import music2tensor
from chart_functions import chart2tensor, get_configuration

# Root folders of the raw input data and of the generated .npy output.
unprocessed_data_path = Path(r'X:\Training Data\Unprocessed')
processed_data_path = Path(r'X:\Training Data\Processed')

In [3]:
def check_multiple_audio_files(fileList):
    """Tell whether a folder listing contains more than one audio file.

    A file counts as audio when its name ends in '.ogg', '.mp3' or '.wav'
    (the comparison is case-sensitive).

    Args:
        fileList: iterable of file-name strings, e.g. the third item yielded
            by ``os.walk``.

    Returns:
        True if two or more names are audio files, otherwise False.
    """
    audio_suffixes = ('.ogg', '.mp3', '.wav')
    audio_count = sum(1 for name in fileList if name.endswith(audio_suffixes))
    return audio_count > 1

def get_audio_file_name(fileList):
    """Return the first audio file name found in a folder listing.

    A file counts as audio when its name ends in '.ogg', '.mp3' or '.wav'
    (the comparison is case-sensitive).

    Args:
        fileList: iterable of file-name strings.

    Returns:
        The first name in ``fileList`` that has an audio extension.

    Raises:
        NameError: if no name in ``fileList`` has an audio extension.
    """
    audio_suffixes = ('.ogg', '.mp3', '.wav')
    first_audio = next(
        (name for name in fileList if name.endswith(audio_suffixes)),
        None,
    )
    if first_audio is None:
        raise NameError('Error: audio file not present')
    return first_audio

In [3]:
# Walk the unprocessed data tree and, for every song folder, write the audio
# tensor (song.npy) and the note tensor (notes.npy) into the matching folder
# under the processed data root, while tracking skipped songs and total sizes.
wrong_format_charts = []    # Holds paths to charts not in .chart format
multiple_audio_songs = []   # Holds paths to song folders with multiple audio files
processed = []              # Holds paths to song folders that were successfully processed
song_size = 0               # Total audio data size, in gigabytes
notes_size = 0              # Total note data size, in gigabytes

# Debug limiter: the loop stops once this many songs made it through every step.
max_songs = 2
i = 0                       # Number of songs that made it through every step
for dirName, subdirList, fileList in tqdm(os.walk(unprocessed_data_path)):  # Walk through training data directory
    if not fileList:  # If file list is empty
        continue

    # Song folders live at <root>/<track pack>/<song>. Working relative to the
    # root avoids hard-coded split indices that only fit one root depth, and
    # lets folders that are too shallow be skipped instead of raising IndexError.
    relative_parts = Path(dirName).relative_to(unprocessed_data_path).parts
    if len(relative_parts) < 2:
        continue
    track_pack_, song_ = relative_parts[0], relative_parts[1]

    processed_path = processed_data_path / track_pack_ / song_
    unprocessed_path = unprocessed_data_path / track_pack_ / song_
    processed_song_path = processed_path / 'song.npy'
    processed_notes_path = processed_path / 'notes.npy'

    print('\n\nProcessing {}, {}'.format(track_pack_, song_))
    # NOTE(review): existing notes are always deleted so they get regenerated
    # (an earlier run saved the same array for many songs). While this stays in
    # place, the "notes already processed" branch below cannot normally trigger.
    if processed_notes_path.exists():
        os.remove(processed_notes_path)

    # Skip the song if there is more than one audio file
    if check_multiple_audio_files(fileList):
        # The audio path is not known here (there are several candidates), so
        # record the song folder itself.
        multiple_audio_songs.append(unprocessed_path)
        print('{}, {} contains multiple audio files, skipping'.format(track_pack_, song_))
        continue

    audio_file_name = get_audio_file_name(fileList)
    unprocessed_song_path = unprocessed_path / audio_file_name

    # Create note tensor for song
    try:
        note_tensor = np.array(
            chart2tensor(unprocessed_path / 'notes.chart', print_release_notes=False)
        ).astype(int)
    except Exception:
        # Any failure while reading/converting the chart is treated as a chart
        # in the wrong format; KeyboardInterrupt/SystemExit still propagate.
        print('{}, {} .chart file is in the wrong format, skipping'.format(track_pack_, song_))
        wrong_format_charts.append(unprocessed_song_path)
        continue

    # Make folder in 'Processed' (and its track pack folder) if it doesn't already exist
    processed_path.mkdir(parents=True, exist_ok=True)

    # Convert the audio unless it has already been processed
    if processed_song_path.exists():
        print('{} audio has already been processed'.format(processed_path.name))
    else:
        song = music2tensor(unprocessed_song_path)
        np.save(processed_song_path, song)
    song_size += processed_song_path.stat().st_size / 1e9

    # Save the notes unless they have already been processed
    if processed_notes_path.exists():
        print('{} notes have already been procesed'.format(processed_path.name))
    else:
        np.save(processed_notes_path, note_tensor)
    notes_size += processed_notes_path.stat().st_size / 1e9

    processed.append(unprocessed_path)
    i += 1
    if i >= max_songs:
        break
        

3it [00:00, 74.95it/s]

Processing Angevil Hero II, 1. Andy McKee - Ouray
{'tick': [228, 255, 283, 309, 336, 362, 388, 414, 427, 440, 440, 466, 493, 519, 545, 571, 597, 623, 636, 648, 648, 673, 685, 697, 710, 722, 734, 747, 759, 771, 794, 807, 820, 833, 869, 881, 894, 906, 918, 930, 942, 942, 966, 990, 996, 1016, 1029, 1068, 1079, 1091, 1102, 1114, 1126, 1140, 1152, 1164, 1190, 1202, 1214, 1228, 1266, 1278, 1291, 1303, 1315, 1328, 1340, 1340, 1364, 1388, 1400, 1412, 1425, 1462, 1462, 1474, 1487, 1499, 1511, 1523, 1535, 1548, 1560, 1585, 1598, 1610, 1623, 1661, 1672, 1684, 1696, 1709, 1721, 1733, 1733, 1754, 1779, 1804, 1817, 1857, 1869, 1882, 1894, 1906, 1917, 1929, 1942, 1955, 1980, 1991, 2003, 2015, 2054, 2066, 2078, 2090, 2102, 2113, 2123, 2123, 2150, 2173, 2186, 2198, 2211, 2224, 2224, 2248, 2248, 2272, 2280, 2289, 2297, 2297, 2322, 2330, 2339, 2347, 2347, 2372, 2380, 2389, 2397, 2405, 2422, 2422, 2447, 2460, 2472, 2484, 2496, 2507, 2519, 2532, 2544, 2569, 2569, 2594, 2594, 2619, 2

In [6]:
# Report the totals accumulated by the processing loop, one line per dataset.
size_report = (
    '{:4.2f} Gigabytes of song data'.format(song_size),
    '{:3.2f} Gigabytes of notes data'.format(notes_size),
)
print(*size_report, sep='\n')

59.22 Gigabytes of song data
0.75 Gigabytes of notes data


In [4]:
# Investigating song length mismatch glitch
# Recomputes the audio tensor and reloads the saved notes through the path
# variables left behind by the processing loop. That loop must have run in the
# current kernel session first; otherwise these names are undefined and the
# cell fails with NameError (as in the recorded output).
song = music2tensor(unprocessed_song_path)
note_tensor = np.load(processed_notes_path)

NameError: name 'unprocessed_song_path' is not defined

In [8]:
# Show which audio file and which song folder the loop variables currently
# point at (both are set inside the processing loop).
print(unprocessed_song_path)
print(unprocessed_path)

X:\Training Data\Unprocessed\Angevil Hero II\1. Joe Satriani - The Forgotten (Part One + Part Two)\song.ogg
X:\Training Data\Unprocessed\Angevil Hero II\1. Joe Satriani - The Forgotten (Part One + Part Two)


In [7]:
# Load one hand-picked song independently of the loop variables: convert its
# 'guitar.ogg' audio with music2tensor and read the previously saved notes
# array from the matching folder under 'Processed'.
song = music2tensor(Path(r'X:\Training Data\Unprocessed\Anti Hero\Noel Pix - Die Legende (m9)') / 'guitar.ogg')
note_tensor = np.load(Path(r'X:\Training Data\Processed\Anti Hero\Noel Pix - Die Legende (m9)') / 'notes.npy')


In [8]:
# Compare the shapes of the audio and note arrays. In the recorded run they
# were (3, 81, 20072) and (20072,): the last audio axis has the same length as
# the notes array (presumably the time axis - TODO confirm).
print(song.shape)
print(note_tensor.shape)

(3, 81, 20072)
(20072,)
