# Dance Dance Convolution - Revisited

A revision of the original "Dance Dance Convolution" paper that incorporates newer machine learning and AI techniques to (hopefully) improve on the original model's results.

## Imports

In [1]:
import os
import sys
import pickle
import logging
from tqdm import tqdm
from pathlib import Path
from collections import defaultdict
from os.path import isfile, join, splitext, basename, normpath, exists

import numpy as np
import matplotlib.pyplot as plt
import librosa
import librosa.display

# Weights and Biases (WandB)
import wandb 
from wandb.keras import WandbCallback

# Keras
# IMPORTANT: Do not mix tensorflow and keras imports for layers or optimizers.
import keras
from tensorflow.keras.utils import to_categorical
from keras.optimizers import adam_v2
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Conv2D
from keras.layers import MaxPool2D
from keras.layers import Flatten
from keras.layers import Input
from keras.layers import Bidirectional
from keras.layers import LSTM
from keras.layers import Concatenate
from keras.layers import TimeDistributed
from keras.models import Model

# Custom Functions
from sm_parsing import stepfile_parser
from post_process import add_measure_timestamps, log_spectrogram

## Loading and Cleaning Dataset

In [2]:
# Path to dataset
base_path = "./dataset"

# Extension of the required stepfile
stepfile_ext = ".sm"

# Allowed audio and stepfile extensions (lowercase; compared case-insensitively)
audio_exts = [".ogg", ".mp3", ".wav"]
steps_exts = [".sm", ".ssc"]

# Song packs inside dataset
# (Ignores files that are not directories. Sorted so that the processing
# order does not depend on the order returned by the filesystem.)
song_packs = sorted(f for f in os.listdir(base_path) if not isfile(join(base_path, f)))

print(f"Found a total of {len(song_packs)} song packs.")

# =======================================
# SONG PACK CLEANING AND DATA EXTRACTION
# =======================================

# WARNING: this cell permanently deletes files from disk.
# Files that don't add anything to training are deleted (videos, images, txts, etc.)
# Wanted file extensions
wanted_ext = audio_exts + steps_exts

# Counter for the number of files deleted.
files_deleted = 0

# Counter for the number of songs encountered
songs_encountered = 0

# Dictionary that will get one entry for each pack
pack_data = {}

# For every song pack
for pack_name in song_packs:

    # Root folder of the current pack (normalized so it can be compared
    # against the folders visited by 'os.walk')
    pack_root = normpath(join(base_path, pack_name))

    # Dict that stores all relevant filepaths for a song inside a pack
    # All unseen keys are assigned an empty list by default
    song_data = defaultdict(list)

    # Go through every folder in the song pack (including subfolders)
    for path, _, files in os.walk(join(base_path, pack_name)):

        # Files that live directly in the pack folder are outside of any song
        # folder. The check compares full paths instead of folder names, so a
        # song folder that happens to share its name with a pack is not skipped.
        is_pack_root = normpath(path) == pack_root

        # Name of the folder that contains the files (song folder).
        # 'normpath' strips off any trailing slashes and 'basename' returns
        # the last part of the path. It is the same for every file in 'files'.
        parent_name = basename(normpath(path))

        # For every file inside the current folder
        for file in files:

            if is_pack_root:
                print(f"Found '{file}' outside of a song folder. Ignoring file.")
                continue

            # The file extension is extracted
            _, ext = splitext(file)

            # File is deleted if it has an unwanted extension.
            # The comparison is case-insensitive so that files such as
            # 'song.MP3' or 'chart.SM' are kept instead of being deleted.
            # Any error raised by 'os.remove' propagates unchanged.
            if ext.lower() not in wanted_ext:
                os.remove(join(path, file))
                files_deleted += 1

            # All the paths that relate to a song are stored in a dict
            # according to their name and the songpack they belong to
            else:
                song_data[parent_name].append(join(path, file))

    # The "song_data" is stored inside the "pack_data"
    # (This is to prevent two packs having the same title for a
    # song and risking overwriting the data for one song.)
    pack_data[pack_name] = song_data

    # We add the number of songs in the pack to "songs_encountered"
    songs_encountered += len(song_data)

# Printout after cleaning
if files_deleted == 0:
    print(f"Dataset already clean. {songs_encountered} songs found. 0 files deleted.")
else:
    print(f"Dataset cleaned successfully. {songs_encountered} songs found. {files_deleted} files deleted.")


Found a total of 6 song packs.
Found 'group.ini' outside of a song folder. Ignoring file.
Dataset already clean. 230 songs found. 0 files deleted.


## Check if Songs Have Both Audio and Note Data

In [3]:
# Dict for the path of all audio files
audio_paths = defaultdict(dict)

# Number of songs with both audio and a stepfile
complete_songs = 0

# For every song in each pack
for pack_name, pack_songs in pack_data.items():

    # The song names are copied into a list before looping: incomplete songs
    # are deleted from the dict inside the loop, and deleting from a dict
    # while iterating over it raises a RuntimeError.
    for song_name in list(pack_songs):

        # All the file paths found for the current song
        song_files = pack_songs[song_name]

        # We get all the extensions found for a song
        # (lowercased, so 'Song.OGG' counts as an '.ogg' file)
        song_folder_exts = {splitext(path)[1].lower() for path in song_files}

        # Check one or more audio extensions were found inside the song folder
        audio_check = any(audio_ext in song_folder_exts for audio_ext in audio_exts)

        # Check if the required stepfile extension was found
        sm_check = stepfile_ext in song_folder_exts

        # If the song doesnt pass both checks, the song gets deleted from the dict
        if not (audio_check and sm_check):
            del pack_songs[song_name]
            print(f"Song '{song_name}' of pack '{pack_name}' does not contain one of the required files for training. Removing song from dataset.")
            continue

        # We extract the path(s) for the song's audio file(s)
        # A path is extracted only if it has one of the required extensions
        audio_paths[pack_name][song_name] = [
            path for path in song_files if splitext(path)[1].lower() in audio_exts
        ]

        # Increase the number of complete songs by one
        complete_songs += 1


print(f"Complete songs: {complete_songs} / {songs_encountered} (Contained both audio and a stepfile file).")


Complete songs: 230 / 230 (Contained both audio and a stepfile file).


## Tag Parsing Stepfiles

In [5]:
# Counter for songs successfully processed
successfully_processed = 0

# Dict to store the tag data for each song in the pack
tag_data = defaultdict(dict)

# Tags that every parsed stepfile must contain
# (defined once, since they do not change between songs)
required_tags = ['offset', 'bpms', 'notes']

# For every pack and song in the dataset
for pack_name in pack_data.keys():
    for song_name in tqdm(pack_data[pack_name], desc=f"{pack_name}"):

        # From all the paths of the current song, take the first one whose
        # file extension is the required stepfile extension. The extension
        # itself is compared (not a substring of the whole path), so a folder
        # or file name that merely contains ".sm" is not picked by mistake.
        stepfile_path = next(
            (path for path in pack_data[pack_name][song_name]
             if splitext(path)[1].lower() == stepfile_ext),
            None,
        )

        # If the song has no stepfile, the program skips the current song
        if stepfile_path is None:
            print(f"No '{stepfile_ext}' file found for song '{song_name}' in song pack '{pack_name}'. Skipping song.")
            continue

        # Step file content is extracted as text
        with open(stepfile_path, 'r', encoding="utf-8") as stepfile:
            stepfile_txt = stepfile.read()

        # The text of each song is parsed and turned into a dict of tags
        song_tags = stepfile_parser(stepfile_txt)

        # Check if resulting dictionary keys contain all the required tags
        if not all((item in song_tags.keys()) for item in required_tags):
            raise Exception(f"Song '{song_name}' of pack '{pack_name}' does not contain all of the required tags: 'offset', 'bpms' and 'notes'.")

        # Only validated songs are stored and counted
        tag_data[pack_name][song_name] = song_tags
        successfully_processed += 1


# Tag data is saved to a pickle file
with open('tag_data.pickle', 'wb') as handle:
    pickle.dump(tag_data, handle, protocol=pickle.HIGHEST_PROTOCOL)
    print("Tag data successfully saved.")

# Successful files
print(f"Number of succesfully processed songs: {successfully_processed} / {songs_encountered}")

Fraxtil's Arrow Arrangements: 100%|██████████| 20/20 [00:17<00:00,  1.17it/s]
Fraxtil's Beast Beats: 100%|██████████| 20/20 [00:17<00:00,  1.16it/s]
In The Groove: 100%|██████████| 67/67 [00:47<00:00,  1.43it/s]
In The Groove 2: 100%|██████████| 66/66 [00:36<00:00,  1.82it/s]
KDA - ALL OUT: 100%|██████████| 7/7 [00:03<00:00,  2.32it/s]
Tsunamix III: 100%|██████████| 50/50 [00:13<00:00,  3.72it/s]


Number of succesfully processed songs: 230 / 230


## Alternative: Load Tag Data

In [4]:
# Restore the previously parsed tag data from disk.
# NOTE: 'pickle.load' can execute arbitrary code; only load pickle files
# that were generated by this notebook.
with open('tag_data.pickle', 'rb') as handle:
    tag_data = pickle.load(handle)

print("Tag data succesfully loaded.")

Tag data succesfully loaded.


## Add Timestamps to all Charts

In [5]:
# Post-processed charts, indexed as measure_data[pack][song]
measure_data = defaultdict(dict)

# Walk over every (pack, song) pair of the parsed tag data
for pack_name, pack_songs in tag_data.items():
    for song_name, song_tags in pack_songs.items():

        # 'add_measure_timestamps' receives the tags of one song and its
        # result is stored under the same pack and song names
        measure_data[pack_name][song_name] = add_measure_timestamps(song_tags)

## Audio to Spectrogram

Based off the following article: https://towardsdatascience.com/getting-to-know-the-mel-spectrogram-31bca3e2d9d0

In [None]:
# Dict for all spectrogram data (all songs)
audio_data = defaultdict(dict)

# Hyperparameters for Librosa's STFT (the same for every song)
# Both the window size and the stride are given in milliseconds.
# "n_mels" is the number of frequency bins left after applying the "Mel Scale".
# We use three different window sizes to capture different amounts of "detail" in the signal.
window_sizes = [23, 46, 93]
stride = 10
n_mels = 80

# For every pack and song in the dataset
for pack_name in tag_data.keys():

    # We use "file=sys.stdout" to make the output nicer
    for song_name in tqdm(tag_data[pack_name].keys(), desc=f"{pack_name}", file=sys.stdout):

        # ===================
        # LOADING AUDIO
        # ===================

        # Extract the path to the song's audio (first audio file found)
        audio_path = audio_paths[pack_name][song_name][0]

        # Audio gets loaded
        raw_audio, sample_rate = librosa.load(audio_path)

        # ===================
        # SPECTROGRAM (STFT)
        # ===================

        # The stride (hop) in samples is shared by all the window sizes
        hop_length = int(round(stride * sample_rate / 1e3))

        # One decibel-scaled mel spectrogram (n_mels x frames) per window size
        spectrograms = []

        # For every window size (in ms)
        for window_size in window_sizes:

            # Window size in samples for the Short Time Fourier Transform (STFT)
            n_fft = int(round(window_size * sample_rate / 1e3))

            # Spectrogram is generated (keyword arguments are used because newer
            # librosa versions no longer accept the audio positionally)
            spectrogram = librosa.feature.melspectrogram(
                y=raw_audio, sr=sample_rate, n_fft=n_fft, hop_length=hop_length, n_mels=n_mels
            )

            # Spectrogram gets scaled into decibels
            spectrograms.append(librosa.power_to_db(spectrogram, ref=np.max))

        # All spectrograms are trimmed to the lowest number of columns (frames)
        # found. Trimming after the loop handles a longer spectrogram appearing
        # either before or after a shorter one.
        num_frames = min(spec.shape[1] for spec in spectrograms)

        if any(spec.shape[1] > num_frames for spec in spectrograms):
            sys.stderr.flush()
            print("Spectrogram with a higher dimensionality found. Slicing to match the remaining sequences.")

        # ===================
        # SAVING AUDIO DATA
        # ===================

        # Build an array with the shape: (Time x N Mel x 3)
        # The spectrograms are stacked along a new last axis (N Mel x Time x 3)
        # and the first two axes are then swapped. A plain 'reshape' would keep
        # the memory order and mix values of different frames and window sizes.
        spectrogram_data = np.stack(
            [spec[:, :num_frames] for spec in spectrograms], axis=-1
        ).transpose(1, 0, 2)

        # Index of every frame in the STFT: 0, 1, ..., num_frames - 1
        frame_indices = np.arange(num_frames)

        # Convert the spectrogram frames into seconds (timestamps)
        # (Get in which second a frame occurs)
        time_data = librosa.frames_to_time(frame_indices, sr=sample_rate, hop_length=hop_length)

        # Add the current "spectrogram matrix" to "audio_data"
        audio_data[pack_name][song_name] = {"spectrogram": spectrogram_data, "time": time_data}

# The resulting audio data is stored in a pickle file
with open('audio_data.pickle', 'wb') as handle:
    pickle.dump(audio_data, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Alternative: Load Audio Data

If the audio data generation was run before, the user can re-load all the previously processed assets.

In [6]:
# Restore the previously generated spectrogram data from disk.
# NOTE: 'pickle.load' can execute arbitrary code; only load pickle files
# that were generated by this notebook.
with open('audio_data.pickle', 'rb') as handle:
    audio_data = pickle.load(handle)

## Weights and Biases (WandB) Setup

In [7]:
# Only errors from the "wandb" logger are shown, which removes the
# excessive notifications from WandB during training
logger = logging.getLogger("wandb")
logger.setLevel(logging.ERROR)

# Environment variables read by WandB:
# - WANDB_SILENT silences the WandB stdout output
# - WANDB_NOTEBOOK_NAME is the name of the current notebook according to WandB
os.environ.update({
    "WANDB_SILENT": "True",
    "WANDB_NOTEBOOK_NAME": "Dance Dance Convolution - Revisited.ipynb",
})

## Dataset Creation and Augmentation

The spectrogram data is used to create the input of the neural network ($X$), while the step data is used to generate the output data ($y$). The data is doubled (augmented) by horizontally flipping all steps. This is a suitable augmentation, as it does not destroy the underlying patterns in the songs.

In [74]:
from sklearn.preprocessing import OneHotEncoder

# ======================
# STEP CLASS ENCODING
# ======================

# Each arrow of a chart row holds one of 9 step types:
# 0 - No Note
# 1 - Normal Note
# 2 - Hold Head
# 3 - Hold/Roll Tail
# 4 - Roll Head
# M - Mine (or "bad" note)
# K - Automatic keysound
# L - Lift note
# F - Fake note
num_step_classes = 9

# Encoder for the step type, fitted on the integers 0..8
# (values never seen during the fit are encoded as an all-zero row)
OHEnc_step = OneHotEncoder(handle_unknown='ignore')
OHEnc_step.fit(np.arange(0, num_step_classes).reshape(-1, 1))

# Encoder for the chart difficulty, fitted on the difficulty names
difficulties = np.array(["Beginner", "Easy", "Medium", "Hard", "Challenge", "Edit"])
OHEnc_diff = OneHotEncoder(handle_unknown='ignore')
OHEnc_diff.fit(difficulties.reshape(-1, 1))

# Containers for the network inputs and outputs
X_data = []
y_data = []

# Every pack and song is visited once more
# ("file=sys.stdout" makes the progress bar output nicer)
for pack in tag_data.keys():
    for song in tqdm(tag_data[pack].keys(), desc=f"{pack}", file=sys.stdout):

        # =======================
        # MEASURE DATA (OUTPUT)
        # =======================

        # Charts of the current song that use a single pad, by difficulty
        single_charts = measure_data[pack][song]["dance-single"]

        for difficulty in single_charts.keys():

            # The difficulty name as a one hot vector
            difficulty_enc = OHEnc_diff.transform(np.array([[difficulty]])).toarray()

            # Chart of the current difficulty. Columns used below:
            # 0-3 are the four arrows, 4 the timestamp and 5 the BPM.
            chart = single_charts[difficulty]

            # Timestamp of every chart row (used to time the steps)
            step_timestamps = chart[:,4]

            # P(Step): 1 when any of the four arrows of the row is non-zero
            P_step = 1*np.any(chart[:,0:4], axis=1, keepdims=True)

            # BPM: BPM value of each row, kept as a column
            BPM_step = chart[:,5:6]

            # One hot encoded step type of each arrow
            # (columns in order: Left, Down, Up, Right)
            P_left  = OHEnc_step.transform(chart[:,0:1]).toarray()
            P_down  = OHEnc_step.transform(chart[:,1:2]).toarray()
            P_up    = OHEnc_step.transform(chart[:,2:3]).toarray()
            P_right = OHEnc_step.transform(chart[:,3:4]).toarray()

            # All the parts are joined side by side to form the output of
            # the neural network for a "singles" song chart.
            output_nn = np.hstack((P_step, BPM_step, P_left, P_down, P_up, P_right))

            # Stored sample:
            # - Chart data with the shape of the output of the neural net
            # - One hot encoded difficulty (to condition the output)
            # - The step timestamps for timing each step
            y_data.append((output_nn, difficulty_enc, step_timestamps))



Fraxtil's Arrow Arrangements: 100%|██████████| 20/20 [00:00<00:00, 48.56it/s]
Fraxtil's Beast Beats: 100%|██████████| 20/20 [00:00<00:00, 54.42it/s]
In The Groove: 100%|██████████| 67/67 [00:00<00:00, 70.39it/s]
In The Groove 2: 100%|██████████| 66/66 [00:00<00:00, 72.69it/s]
KDA - ALL OUT: 100%|██████████| 7/7 [00:00<00:00, 76.19it/s]
Tsunamix III: 100%|██████████| 50/50 [00:00<00:00, 74.07it/s]


In [75]:
# Number of samples stored in "y_data": one (output, difficulty, timestamps)
# tuple is appended for every "dance-single" chart of every song.
len(y_data)

1148

## Data Augmentation

Charts can be flipped horizontally (mirrored) without altering their choreography in any way. To add additional training samples, we flip all charts.

In [None]:
# PENDING

## Prepare the Data (Reshape and Train/Test/Valid Splits)

In [None]:
# PENDING

## Data Generator (Spectrograms)

(1412, 38)

In [45]:
class TimeSliceGenerator(keras.utils.Sequence):
    """Keras 'Sequence' that serves (X, y) samples in batches.

    The samples are addressed through an index array, which is reshuffled at
    the end of every epoch when 'shuffle' is enabled.

    TODO: the extraction of the time slices themselves is still pending. For
    now every element of 'X_data' / 'y_data' is treated as one ready-to-use
    sample, and all samples must share the same shape so they can be stacked.

    Args:
        X_data: Indexable collection with the input samples.
        y_data: Indexable collection with the target samples. Must have the
            same length as 'X_data'.
        batch_size: Number of samples per batch (positive integer).
        shuffle: If True, the sample order is shuffled when the generator is
            created and after every epoch.

    Raises:
        ValueError: If 'batch_size' is not positive or if 'X_data' and
            'y_data' have different lengths.
    """

    def __init__(self, X_data, y_data, batch_size=32, shuffle=True):
        super().__init__()

        if batch_size <= 0:
            raise ValueError(f"'batch_size' must be a positive integer. Got {batch_size}.")

        if len(X_data) != len(y_data):
            raise ValueError(
                f"'X_data' and 'y_data' must have the same length. Got {len(X_data)} and {len(y_data)}."
            )

        self.X_data = X_data
        self.y_data = y_data
        self.batch_size = batch_size
        self.shuffle = shuffle

        # Total number of samples (read by '__len__')
        self.n = len(X_data)

        # Order in which the samples are served
        self.indices = np.arange(self.n)
        self.on_epoch_end()

    def on_epoch_end(self):
        """Reshuffles the sample order (only when 'shuffle' is enabled)."""
        if self.shuffle:
            np.random.shuffle(self.indices)

    def __getitem__(self, index):
        """Returns batch number 'index' as a tuple of arrays (X_batch, y_batch)."""
        if not 0 <= index < len(self):
            raise IndexError(f"Batch index {index} is out of range for {len(self)} batches.")

        # Sample indices that belong to the requested batch
        batch_ids = self.indices[index * self.batch_size:(index + 1) * self.batch_size]

        X_batch = np.stack([np.asarray(self.X_data[i]) for i in batch_ids])
        y_batch = np.stack([np.asarray(self.y_data[i]) for i in batch_ids])

        return X_batch, y_batch

    def __len__(self):
        """Number of full batches per epoch (an incomplete last batch is dropped)."""
        return self.n // self.batch_size

## Keras Model

In [51]:
# ==================
# SETTINGS
# ==================

# WandB: 0. Weights and Biases login (only if first time use)
# WandB: 1. New run declaration with all the parameters to track.
# NOTE(review): the keys of "loss_function" and "loss_weights" ("OUT_Stp", ...)
# are the names of the Python variables below, not the layer names
# ("Prob-Step", ...), and the model has a single output layer ("Concat-Outs").
# Keras presumably matches dictionary losses by output name - TODO confirm
# that these dictionaries are accepted when training starts.
run = wandb.init(project="DanceDanceConvolutionX", entity="sanoli",
                 config={
                    "learning_rate": 0.001, 
                    "epochs": 10, 
                    "batch_size": 32,  
                    "loss_function": {
                       "OUT_Stp": "binary_crossentropy",
                       "OUT_bpm": "mse",
                       "OUT_PLeft" : "categorical_crossentropy",
                       "OUT_PRight": "categorical_crossentropy",
                       "OUT_PUp"   : "categorical_crossentropy",
                       "OUT_PDown" : "categorical_crossentropy"
                    },
                    "loss_weights": {
                       "OUT_Stp": 1,
                       "OUT_bpm": 1,
                       "OUT_PLeft" : 1,
                       "OUT_PRight": 1,
                       "OUT_PUp"   : 1,
                       "OUT_PDown" : 1
                    },
                    "architecture": "CNN + BiLSTM + MLP",  
                    "dataset": "Fraxtil, KDA, ITG"
                 })


# Network Weight initialization
# It makes use of the 'glorot' initializer, also known as Xavier's initializer
initializer = keras.initializers.GlorotNormal()

# Hyperparameters: 
# - batch_size: Number of samples fed to the NN at the same time
# - time_batch_size: Number of time slices fed to the network at the same time
#   (Kinda like "time batches")
batch_size = 32
time_batch_size = 5

# Difficulties
# - The OHE is casted as a float
# - We repeat the pattern as many times down as there are "time slices"
# - We add an additional 1st dimension to facilitate the concatenation
# NOTE(review): this placeholder vector has 5 entries, while the difficulty
# encoder of the dataset cell is fitted on 6 difficulty names. It is also a
# constant (batch dimension of 1) instead of a per-sample model input.
difficulty = np.array([[0,0,0,0,1]]).astype(float)
difficulty = np.tile(difficulty, (time_batch_size,1))
difficulty = np.reshape(difficulty, (-1,time_batch_size, 5))

# ==================
# LAYERS
# ==================

# BLOCK 1: CONVOLUTION
# - TimeDistributed is used to add a time dimension to the layers
# - Rule of thumb: Dimension of convolution and max pooling has to match
# - The charts difficulty (OHE) is concatenated with the flattened tensor
IN  = Input(shape=(time_batch_size, 15, 80, 3), name="Input")
CL1 = TimeDistributed(Conv2D(filters=10, kernel_size=(7,3), strides=1, activation="relu", padding="same"), name="Conv2D-1")(IN)
MP1 = TimeDistributed(MaxPool2D(pool_size=3, strides=1), name="MaxPool2D-1")(CL1)
CL2 = TimeDistributed(Conv2D(filters=20, kernel_size=(3,3), strides=1, activation="relu", padding="same"), name="Conv2D-2")(MP1)
MP2 = TimeDistributed(MaxPool2D(pool_size=3, strides=3), name="MaxPool2D-2")(CL2)
F1  = TimeDistributed(Flatten(), name="Flatten")(MP2)
CC1 = Concatenate(axis=2, name="Concat-Diff")([F1, difficulty])

# BLOCK 2: FULLY CONNECTED
D1  = Dense(units=256, activation="relu", kernel_initializer=initializer, name="Dense-1")(CC1)
D2  = Dense(units=128, activation="relu", kernel_initializer=initializer, name="Dense-2")(D1)

# BLOCK 3: RECURRENT NET
BL1 = Bidirectional(LSTM(200, return_sequences=True), name="BiLSTM-1")(D2)
BL2 = Bidirectional(LSTM(200, return_sequences=True), name="BiLSTM-2")(BL1)
BL3 = Bidirectional(LSTM(200, return_sequences=True), name="BiLSTM-3")(BL2)

# BLOCK 4: FULLY CONNECTED 2 (ELECTRIC BOOGALOO)
D3  = Dense(units=128, activation="relu", kernel_initializer=initializer, name="Dense-3")(BL3)
D4  = Dense(units= 64, activation="relu", kernel_initializer=initializer, name="Dense-4")(D3)

# BLOCK 5: SEPARATE OUTPUTS
# (Python slices, end index excluded. 1 + 1 + 4*9 = 38 values in total)
# y[0]     = Probability of step. Sigmoid
# y[1]     = BPM value for current step. ReLu
# y[2:11]  = Probability distribution of "Left" step (One for each note type. 9 in total). Softmax
# y[11:20] = Probability distribution of "Down" step (One for each note type. 9 in total). Softmax
# y[20:29] = Probability distribution of "Up" step (One for each note type. 9 in total). Softmax
# y[29:38] = Probability distribution of "Right" step (One for each note type. 9 in total). Softmax
OUT_Stp    = Dense(units=1, activation="sigmoid", name="Prob-Step")(D4)
OUT_bpm    = Dense(units=1, activation="relu", name="BPM")(D4)
OUT_PLeft  = Dense(units=9, activation="softmax", name="Prob-Left")(D4)
OUT_PRight = Dense(units=9, activation="softmax", name="Prob-Right")(D4)
OUT_PUp    = Dense(units=9, activation="softmax", name="Prob-Up")(D4)
OUT_PDown  = Dense(units=9, activation="softmax", name="Prob-Down")(D4)

# BLOCK 6: FINAL CONCATENATION
# The heads are joined in the order Left, Down, Up, Right. This is the layout
# documented above and the one used when the target arrays are built
# (step, BPM, left, down, up, right), so predictions and targets line up.
OUT = Concatenate(name="Concat-Outs")([OUT_Stp, OUT_bpm, OUT_PLeft, OUT_PDown, OUT_PUp, OUT_PRight])

# ==================
# MODEL CREATION
# ==================

# A model is created with all the previous parts
model = Model(inputs=IN, outputs=OUT, name="DDCX")

# WandB: 2. Stores the inputs and hyperparameters of the model
config = wandb.config

# The optimizer is selected
optimizer = adam_v2.Adam(learning_rate=config.learning_rate)

# The model is compiled
model.compile(optimizer=optimizer, metrics=["accuracy"], loss=config.loss_function, loss_weights=config.loss_weights)

# The keras summary is stored as a WandB log.
# 'model.summary()' prints the summary and returns None, so its lines are
# captured through 'print_fn' to have an actual text to log. The lambda
# ignores any extra keyword argument that Keras may pass to the print function.
summary_lines = []
model.summary(print_fn=lambda line, **kwargs: summary_lines.append(line))
summary_text = "\n".join(summary_lines)

# The summary is still shown in the notebook
print(summary_text)

wandb.log({"summary": wandb.Html("<pre>" + summary_text + "</pre>")})

wandb: wandb version 0.12.2 is available!  To upgrade, please run:
wandb:  $ pip install wandb --upgrade


Model: "DDCX"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Input (InputLayer)              [(None, 5, 15, 80, 3 0                                            
__________________________________________________________________________________________________
Conv2D-1 (TimeDistributed)      (None, 5, 15, 80, 10 640         Input[0][0]                      
__________________________________________________________________________________________________
MaxPool2D-1 (TimeDistributed)   (None, 5, 13, 78, 10 0           Conv2D-1[0][0]                   
__________________________________________________________________________________________________
Conv2D-2 (TimeDistributed)      (None, 5, 13, 78, 20 1820        MaxPool2D-1[0][0]                
_______________________________________________________________________________________________

## Model Training

In [None]:
# The model is trained
model.fit(X_train, y_train, epochs=config.epochs, batch_size=config.batch_size, validation_data=(X_valid, y_valid), 
       callbacks=[
            # Data is sent to Weights and Biases
            WandbCallback(
                data_type="image",                      # Images are generated in the report
                monitor="accuracy",                     # Accuracy is the monitored metric
                mode="max",                             # Tracks increases in accuracy
                save_model=True,                        # Save the model when a new maximum in accuracy is reached
                validation_data=(X_valid, y_valid),     # WandB makes predictions mid-training and shows them in the dashboard
            ),          
       ])

# The model is evaluated with the test set
loss, accuracy = model.evaluate(X_test, y_test)

# Test error rate as a percentage (computed once, then printed and logged)
test_error_rate = round((1-accuracy)*100, 2)
print(f"Test Error Rate: {test_error_rate}")

# The results are logged in WandB
wandb.log({"Test Error Rate" : test_error_rate})

# The WandB "run" is finished
# ('run.join()' is a deprecated alias of 'run.finish()', so only 'finish' is called)
run.finish()