In [1]:
import os
import glob
import zipfile
import hashlib
import pandas as pd
from tqdm import tqdm

In [2]:
def create_file_hash(file_path, chunk_size=1 << 20):
    """Create a SHA-256 hash of the file contents.

    The file is read in fixed-size chunks so that hashing a large file does
    not require holding the whole file in memory at once.

    Args:
        file_path: Path of the file to hash.
        chunk_size: Number of bytes to read per iteration (default 1 MiB).

    Returns:
        The hexadecimal SHA-256 digest of the file's bytes.
    """
    hasher = hashlib.sha256()
    with open(file_path, 'rb') as f:
        # iter() with a sentinel stops as soon as read() returns b'' (EOF).
        for chunk in iter(lambda: f.read(chunk_size), b''):
            hasher.update(chunk)
    return hasher.hexdigest()

def create_data_hash(data):
    """Return the hexadecimal SHA-256 digest of an in-memory bytes object."""
    hasher = hashlib.sha256()
    hasher.update(data)
    digest = hasher.hexdigest()
    return digest

def get_map_audio_path(map_data, file_options=None):
    """Get the name of the map audio file.

    Looks for the first line of the beatmap that starts with ``AudioFilename:``
    and returns its value. When ``file_options`` is given, the value is matched
    against those names, first exactly and then case-insensitively.

    Args:
        map_data: Raw bytes of a beatmap (.osu) file.
        file_options: Optional collection of candidate file names (for example
            the names inside the archive the beatmap came from).

    Returns:
        The audio file name as a string. With ``file_options`` the returned
        value is the matching entry from ``file_options``. Returns ``None``
        when the beatmap has no ``AudioFilename:`` line or when no entry of
        ``file_options`` matches.
    """
    key = b'AudioFilename:'
    audio_path = None
    # Scan line by line so a missing key or a missing trailing newline cannot
    # produce a garbage slice; splitlines() also handles CRLF line endings.
    for line in map_data.splitlines():
        line = line.lstrip()
        if line.startswith(key):
            audio_path = line[len(key):].decode('utf-8').strip()
            break

    if audio_path is None:
        return None

    if file_options is None or audio_path in file_options:
        return audio_path

    # Fall back to a case-insensitive match against the candidates.
    wanted = audio_path.lower()
    for file_option in file_options:
        if file_option.lower() == wanted:
            return file_option

    return None

def get_map_mode(map_data):
    """Get the mode of the map.

    Returns the value of the first line that starts with ``Mode:``, decoded
    and stripped of surrounding whitespace.

    Args:
        map_data: Raw bytes of a beatmap (.osu) file.

    Returns:
        The mode as a string (for example ``'0'``). When the beatmap has no
        ``Mode:`` line, ``'0'`` is returned, since the .osu format treats a
        missing mode as the default osu! standard mode.
    """
    key = b'Mode:'
    # Match only at the start of a line so text such as a song title that
    # happens to contain "Mode:" is not mistaken for the setting.
    for line in map_data.splitlines():
        line = line.lstrip()
        if line.startswith(key):
            return line[len(key):].decode('utf-8').strip()
    return '0'

def change_map_audio_path(map_data, new_name):
    """Return a copy of the beatmap with its audio file name replaced.

    The first ``AudioFilename:`` entry is rewritten as
    ``AudioFilename: <new_name>``; every other byte, including the line
    ending of the rewritten line, is left as it was.

    Args:
        map_data: Raw bytes of a beatmap (.osu) file.
        new_name: New audio file name (string) to store in the beatmap.

    Returns:
        The modified beatmap as bytes.

    Raises:
        ValueError: If the beatmap has no ``AudioFilename:`` entry. Without
            this check the slicing below would silently corrupt the data.
    """
    key = b'AudioFilename:'
    start_idx = map_data.find(key)
    if start_idx == -1:
        raise ValueError('beatmap has no AudioFilename entry')

    end_idx = map_data.find(b'\n', start_idx)
    if end_idx == -1:
        # The entry is on the last line and the file has no trailing newline.
        end_idx = len(map_data)
    elif map_data[end_idx - 1:end_idx] == b'\r':
        # Keep the '\r' of a CRLF line ending instead of dropping it.
        end_idx -= 1

    return map_data[:start_idx] \
        + b'AudioFilename: ' + new_name.encode('utf-8') \
        + map_data[end_idx:]

In [3]:
# Input directory (read by later cells) and root of the formatted output.
raw_data_path = 'raw_beatmaps/'
formatted_data_path = 'formatted_beatmaps/'

# Output locations, all placed directly under formatted_data_path.
songs_dir, maps_dir, song_mapping_path = (
    os.path.join(formatted_data_path, leaf)
    for leaf in ('songs', 'maps', 'song_mapping.csv')
)

In [4]:
# Create the output directories; exist_ok=True makes re-running this cell safe.
# The loop variable is not called `dir` so the builtin dir() is not shadowed
# for the rest of the kernel session.
for out_dir in (songs_dir, maps_dir):
    os.makedirs(out_dir, exist_ok=True)

In [5]:
# Filled by the extraction loop: written beatmap file name -> song file name.
song_mapping = {}
# glob returns matches in arbitrary order; sorting makes the song/map numbering
# assigned by the extraction loop reproducible between runs.
osz_paths = sorted(glob.glob(os.path.join(raw_data_path, '*.osz')))

In [6]:
# Counters used to name the written files, plus an index of every audio file
# saved so far (content hash -> saved song name) so that identical audio is
# written only once across all archives.
song_idx = 0
map_idx = 0
song_hashes = {}

for osz_path in tqdm(osz_paths):
    try:
        with zipfile.ZipFile(osz_path, 'r') as archive:
            zip_files = set(archive.namelist())
            # Per-archive cache: audio path inside the archive -> content hash.
            loaded_audio = {}
            # Iterate in sorted order so map numbering does not depend on the
            # iteration order of the set.
            for file_name in sorted(zip_files):
                if not file_name.endswith('.osu'):
                    continue
                try:
                    # Load beatmap file
                    with archive.open(file_name, 'r') as beatmap_file:
                        map_data = beatmap_file.read()

                    # Skip map if not standard mode
                    if get_map_mode(map_data) != '0':
                        continue

                    # Get name of the paired audio file
                    audio_path = get_map_audio_path(map_data, zip_files)
                    if audio_path is None:
                        raise FileNotFoundError(
                            'audio file referenced by the beatmap is not in the archive')

                    if audio_path in loaded_audio:
                        # Other difficulties have used it: reuse the same hash
                        audio_hash = loaded_audio[audio_path]
                    else:
                        with archive.open(audio_path, 'r') as audio_file:
                            audio_data = audio_file.read()
                        audio_hash = create_data_hash(audio_data)

                        # Save the audio only if this content has not been
                        # stored before. Writing here, right after reading,
                        # guarantees the bytes written belong to audio_hash.
                        if audio_hash not in song_hashes:
                            # NOTE(review): the saved name always ends in .mp3,
                            # whatever the extension of the source audio file.
                            saved_name = f'{song_idx}.mp3'
                            saved_path = os.path.join(songs_dir, saved_name)
                            with open(saved_path, 'wb') as song_file:
                                song_file.write(audio_data)
                            song_hashes[audio_hash] = saved_name
                            song_idx += 1

                        # Cache only after the song is known to be stored.
                        loaded_audio[audio_path] = audio_hash

                    new_song_name = song_hashes[audio_hash]

                    # Change the audio file name in the beatmap file
                    new_map_data = change_map_audio_path(map_data, new_song_name)

                    # Copy osu file to maps directory
                    new_map_name = f'{map_idx}.osu'
                    new_map_path = os.path.join(maps_dir, new_map_name)
                    with open(new_map_path, 'wb') as new_map_file:
                        new_map_file.write(new_map_data)

                    map_idx += 1
                    song_mapping[new_map_name] = new_song_name
                except Exception as e:
                    # Best effort: report the failing beatmap and keep going.
                    print(f'Error with file {osz_path}/{file_name}: {e}')
    except Exception as e:
        # Best effort: report the unreadable archive and keep going.
        print(f'Error with file {osz_path}: {e}')

 57%|█████▋    | 9821/17150 [04:20<02:55, 41.76it/s]

Error with file raw_beatmaps\386117.osz: File is not a zip file


 76%|███████▌  | 13060/17150 [05:41<02:00, 34.07it/s]

Error with file raw_beatmaps\6335.osz/An Cafe - Kakusei Heroism ~The Hero Without A Name~ (Weinmagier) [Insane].osu: Bad CRC-32 for file 'Darker_than_Black_-_Kakusei_Heroism_The_Hero_Without_A_Name.mp3'
Error with file raw_beatmaps\6335.osz/An Cafe - Kakusei Heroism ~The Hero Without A Name~ (Weinmagier) [Normal].osu: Bad CRC-32 for file 'Darker_than_Black_-_Kakusei_Heroism_The_Hero_Without_A_Name.mp3'
Error with file raw_beatmaps\6335.osz/An Cafe - Kakusei Heroism ~The Hero Without A Name~ (Weinmagier) [Konata!].osu: Bad CRC-32 for file 'Darker_than_Black_-_Kakusei_Heroism_The_Hero_Without_A_Name.mp3'


 86%|████████▌ | 14673/17150 [06:28<01:03, 39.03it/s]

Error with file raw_beatmaps\772055.osz: File is not a zip file


100%|██████████| 17150/17150 [07:40<00:00, 37.23it/s]


In [10]:
# Save song mapping
# The DataFrame gets its own name so `song_mapping` stays a dict: this cell can
# be re-run, and the extraction loop can still add entries to the dict.
# Passing `columns=` also works when the mapping is empty, where assigning
# `.columns` afterwards would raise a length-mismatch error.
song_mapping_df = pd.DataFrame.from_dict(
    song_mapping, orient='index', columns=['song'])
song_mapping_df.to_csv(song_mapping_path)