# Dance Dance Convolution - Revisited

This notebook revisits the original "Dance Dance Convolution" paper, incorporating newer machine learning and AI techniques in the hope of improving on the original model's results.

## Imports

In [1]:
import os 
from os.path import isfile, join, splitext, basename, normpath
from pathlib import Path
from collections import defaultdict
import numpy as np
from sm_parsing import stepfile_parser

## Loading and Cleaning Dataset

In [2]:
# Path to dataset
base_path = "./dataset"

# Song packs inside the dataset: every sub-directory of "base_path".
# The list is sorted because "os.listdir" returns entries in arbitrary order,
# and a fixed order keeps the processing order (and the log below) reproducible.
song_packs = sorted(f for f in os.listdir(base_path) if os.path.isdir(join(base_path, f)))

print(f"Found a total of {len(song_packs)} song packs.")

# =======================================
# SONG PACK CLEANING AND DATA EXTRACTION
# =======================================
# WARNING: this cell permanently deletes files from the dataset folder.

# Files that don't add anything to training are deleted (videos, images, txts, etc.)
# Wanted file extensions (lowercase; the comparison below is case-insensitive)
wanted_ext = [".ssc", ".sm", ".ogg", ".ini"]

# Counter for the number of files deleted.
files_deleted = 0

# Counter for the number of songs encountered
songs_encountered = 0

# Dictionary that will get one entry for each pack:
# pack name -> {song folder name -> list of file paths kept for that song}
pack_data = {}

# For every song pack
for pack_name in song_packs:

    # Root folder of the current pack
    pack_dir = join(base_path, pack_name)

    # Stores all relevant filepaths for each song inside the pack.
    # All unseen keys are assigned an empty list by default.
    song_data = defaultdict(list)

    # Go through every folder below the pack root (the root itself included)
    for path, _, files in os.walk(pack_dir):

        # Files that sit directly in the pack root are outside of any song
        # folder, so they are ignored (neither indexed nor deleted).
        # The check compares the path itself instead of the folder name, so a
        # song folder that happens to share its name with a pack is still kept.
        if path == pack_dir:
            for file in files:
                print(f"Found '{file}' outside of a song folder. Ignoring file.")
            continue

        # Name of the folder that directly contains the files (song folder).
        # 'normpath' strips off any trailing slashes and 'basename' returns
        # the last part of the path. It is the same for every file in "files".
        parent_name = basename(normpath(path))

        # For every file inside the current folder
        for file in files:

            # The file extension is extracted
            _, ext = splitext(file)

            # File is deleted if it has an unwanted extension.
            # The extension is lowercased first so that wanted files such as
            # "SONG.SM" or "track.OGG" are not deleted by mistake.
            # A failed deletion raises the original OSError, which stops the cell.
            if ext.lower() not in wanted_ext:
                os.remove(join(path, file))
                files_deleted += 1

            # All the paths that relate to a song are stored in a dict
            # according to their name and the songpack they belong to
            else:
                song_data[parent_name].append(join(path, file))

    # The "song_data" is stored inside the "pack_data"
    # (This is to prevent two packs having the same title for a
    # song and risking overwriting the data for one song.)
    pack_data[pack_name] = song_data

    # We add the number of songs in the pack to "songs_encountered"
    songs_encountered += len(song_data)

# Printout after cleaning
if files_deleted == 0:
    print(f"Dataset already clean. {songs_encountered} songs found. 0 files deleted.")
else:
    print(f"Dataset cleaned successfully. {songs_encountered} songs found. {files_deleted} files deleted.")


Found a total of 6 song packs.
Found 'group.ini' outside of a song folder. Ignoring file.
Dataset already clean. 230 songs found. 0 files deleted.


## Data Extraction from Stepfiles

In [3]:
# Extension of the required stepfile (lowercase; matched case-insensitively)
stepfile_ext = ".sm"

# Counter for songs successfully processed
successfully_processed = 0

# For every pack in the dataset
for pack_name, pack_songs in pack_data.items():

    # For every song in the song pack (song folder name -> list of file paths)
    for song_name, song_paths in pack_songs.items():

        # Out of the paths that belong to the current song, take the first one
        # whose actual file extension is the one we need. Comparing the real
        # extension (instead of searching for ".sm" anywhere in the path)
        # avoids false matches on folder names or on names like "song.sm.bak".
        stepfile_path = next(
            (path for path in song_paths if splitext(path)[1].lower() == stepfile_ext),
            None,
        )

        # If the song has no stepfile with the required extension,
        # the program skips the current song
        if stepfile_path is None:
            print(f"No '{stepfile_ext}' found for song '{song_name}' in song pack '{pack_name}'. Skipping song.")
            continue

        try:
            # Step file content is extracted as text
            with open(stepfile_path, 'r', encoding="utf-8") as stepfile:
                stepfile_txt = stepfile.read()

            # The text of each song is parsed by "stepfile_parser"
            # NOTE(review): "stepfile_tags" is overwritten on every iteration
            # and not stored anywhere in this cell - confirm whether the
            # parsed result should be collected per song.
            stepfile_tags = stepfile_parser(stepfile_txt)

        # Reading, decoding or parsing failed. The song is still skipped so
        # that one bad file does not stop the run, but the real error is
        # reported instead of claiming that the stepfile is missing.
        except Exception as error:
            print(
                f"Could not process '{stepfile_path}' for song '{song_name}' in song pack "
                f"'{pack_name}' ({type(error).__name__}: {error}). Skipping song."
            )
            continue

        # Increase the number of files successfully processed
        successfully_processed += 1

# Successful files
print(f"Number of succesfully processed songs: {successfully_processed} / {songs_encountered}")

Number of succesfully processed songs: 230 / 230
