In [41]:
import os
from pydub import AudioSegment
import numpy as np

from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, Model
from keras.layers import Dense, Embedding, LSTM, Bidirectional, Input, concatenate, Dropout, SpatialDropout1D, Flatten, BatchNormalization, Conv1D, MaxPooling1D
from keras.utils.np_utils import to_categorical
from keras import regularizers
from keras.optimizers import Adam
from keras.callbacks import TensorBoard

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


## Raw data preprocessing:

In [2]:
MAP_DIR_PATH = 'MAPS\\PROTOTYPE MAPS\\'
MAP_NAME = 'Levels'

In [3]:
# Load the beatmap hitcirlces

with open(os.path.join(MAP_DIR_PATH, MAP_NAME + '.osu')) as f:
    content = [x.strip() for x in f.readlines()]

hitobjects_index = [x for x in content].index('[HitObjects]')
hitobjects = [x.split(',')[:3] for x in content[hitobjects_index + 1:]]

for i in range(10):
    print(hitobjects[i])

['376', '307', '14036']
['176', '192', '14888']
['336', '192', '15101']
['200', '248', '15314']
['312', '136', '15527']
['256', '272', '15741']
['256', '112', '15954']
['312', '248', '16167']
['200', '136', '16380']
['56', '56', '16806']


In [4]:
# Preprocesses the labels

print(len(hitobjects))
# X and y cordinate labels
time_values = []
y_x = []
y_y = []

map_start = int(hitobjects[0][-1])

for y in hitobjects:
    y_x.append(y[0])
    y_y.append(y[1])
    time_values.append(int(y[-1]) - map_start)
    
print('Time_values:', time_values[1], 'x:', y_x[1], 'y:', y_y[1])

233
Time_values: 852 x: 176 y: 192


In [5]:
# Load the song

map_song = AudioSegment.from_file(os.path.join(MAP_DIR_PATH, MAP_NAME + '.mp3'), format='mp3')

print('Length before: ', len(map_song))
song_in_map_range = map_song[map_start:int(hitobjects[-1][-1])]
print('Length after: ', len(song_in_map_range))

print('Length of samples in one timestep: ', len(song_in_map_range[0].get_array_of_samples()))

Length before:  100127
Length after:  78405
Length of samples in one timestep:  88


In [132]:
OUTPUTS_PER_SECOND = 20
INPUTS_PER_TIMESTEP = 250

# Loop over each second of the song and get the data and labels for timesteps
# loop over the song ms by ms, if ms has hitcircle on it, append 1(coordinates included) else 0 (coordinates not included)
X = []
y = []
timing_matches = 0
for i in range(0,  len(song_in_map_range)):
    X.append(np.asarray(song_in_map_range[i].get_array_of_samples()))
    if i in time_values:
        y.append([y_x[timing_matches], y_y[timing_matches]])
        timing_matches += 1
    else:
        y.append(['nan', 'nan'])
    

        
print(y[0])
print(y[1])
print('Total timing matches', len([i for i in y if i[0] != 'nan']), 'out of', len(y))
print('Len of X:', len(X), '\nexample values:', X[0][0:5])

print('\n\nX and y pairs:', '\nX:', X[0][0:5], '\ny:', y[0])

['376', '307']
['nan', 'nan']
Total timing matches 232 out of 78405
Len of X: 78405 
example values: [ -174  9910  2588 10408  5989]


X and y pairs: 
X: [ -174  9910  2588 10408  5989] 
y: ['376', '307']


In [135]:
# Reduce X dimensionality
array_min = min(X[0])
array_max = max(X[0])

maxval = 5000 # HYPERPARAMETER to reduce dimensionality

print('Array min:',array_min, 'array max:',array_max)

divider = array_max / maxval
print('Divider:', divider)

print('Before:', X[0][0:10])
print('After:',X[0][0:10] // divider)

print('Max:',max(X[0] // divider))
print('Min:',min(X[0] // divider))

X = X // divider

Array min: -17411 array max: 18648
Divider: 3.7296
Before: [ -174  9910  2588 10408  5989 13656  7966 14921  3180  8682]
After: [ -47. 2657.  693. 2790. 1605. 3661. 2135. 4000.  852. 2327.]
Max: 4999.0
Min: -4669.0


## Preparing data for the classifier:

Turn coordinate values into embeddings

In [151]:
from sklearn.model_selection import train_test_split

stringified_X = []
for ind, x in enumerate(X):
    listed_str = []
    
    for i in x:        
        listed_str.append(str(int(i)))
    stringified_X.append(' '.join(listed_str))

stringified_y = [' '.join(i) for i in y]

X_train, X_val, y_train, y_val = train_test_split(stringified_X, stringified_y, test_size=0.25, shuffle=False)

# turn coordinate values into a big string
print(stringified_y[0:10])
print(stringified_y_val[0:10])
print(X_train[0][0:100])
print(X_val[0][0:100])

['376 307', 'nan nan', 'nan nan', 'nan nan', 'nan nan', 'nan nan', 'nan nan', 'nan nan', 'nan nan', 'nan nan']
['nan nan', '144 192', 'nan nan', 'nan nan', 'nan nan', 'nan nan', 'nan nan', 'nan nan', 'nan nan', 'nan nan']
-47 2657 693 2790 1605 3661 2135 4000 852 2327 -1193 504 -554 1387 2179 3177 2645 1942 199 -1511 -17
-823 -815 -798 -554 -573 -408 -217 -275 25 -185 105 -467 382 -742 676 -790 756 -839 902 -729 786 -61


In [158]:
label_vocab_size = 1500 # Don't really need more than 1000, but for some reason a bigger vocab_size is good for the tokenizer
label_seq_len = 2

features_vocab_size = 15000
feature_seq_len = 88

def tokenization_processing(x_to_tok, x_val_to_tok, vocab_size=1500, seq_len=2, pad=False):

    tokenizer = Tokenizer(num_words=vocab_size,
                                       filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                                       split=" ",
                                       char_level=False, # Could be very very interesting
                                       oov_token=None)

    tokenizer.fit_on_texts(x_val_to_tok + x_to_tok)
    print('True vocab size:', len(tokenizer.word_index))
    
    X_tokenized = tokenizer.texts_to_sequences(x_to_tok)
    X_val_tokenized = tokenizer.texts_to_sequences(x_val_to_tok)
    
    if pad:
        X_tokenized = pad_sequences(X_tokenized, maxlen=seq_len)
        X_val_tokenized = pad_sequences(X_val_tokenized, maxlen=seq_len)
    
    print('Before:', x_val_to_tok[0])
    print('After:', X_val_tokenized[0])
    print('Before:', x_to_tok[0])
    print('After:',X_tokenized[0])
    
    return [X_tokenized, X_val_tokenized]

Y_tokenized, y_val_tokenized = tokenization_processing(stringified_y, stringified_y_val)
X_train_tokenized, X_val_tokenized = tokenization_processing(X_train, X_val, vocab_size=features_vocab_size, seq_len=feature_seq_len, pad=True)

True vocab size: 179
Before: nan nan
After: [1, 1]
Before: 376 307
After: [148, 60]
True vocab size: 8786
Before: -823 -815 -798 -554 -573 -408 -217 -275 25 -185 105 -467 382 -742 676 -790 756 -839 902 -729 786 -610 109 -731 -337 -582 -224 -183 -153 -207 -263 -422 -254 -387 -112 -447 -39 -737 -112 -909 -105 -872 118 -697 223 -451 39 -269 -40 -57 163 247 324 345 415 230 683 162 1041 118 1277 197 1258 396 941 338 688 245 685 482 489 544 42 236 -172 75 -42 151 243 252 497 245 621 96 818 102 1065 236
After: [ 608  545  972  718  758  255   56   95  412  563  128  243  217  670
  761  439  483 1082 1088  883  555  751  111  361  582  262  130  119
  120  417  527  667   31  166  108  479   52  510  108  992  128 1112
  458  485   75  700   52  468  427   78  392  517  182   83  601  552
  475  543  769  458 1107  385 1380  386  733   91  940  441  804  219
  661  710   29  440   27   73   29  532  160  398  190  441  825  461
  936    8 1309  440]
Before: -47 2657 693 2790 1605 3661 2135 40

In [7]:
# Preprocess the audio data

OUTPUTS_PER_SECOND = 20
INPUTS_PER_TIMESTEP = 250

# Loop over each second of the song and get the audio data for timesteps
X = []
for ind, i in enumerate(range(0,  len(song_in_map_range), 1000)):
    X.append([])
    
    # Step in timestep frequency, eg 4
    for j in range(0, len(song_in_map_range[i:i+1000]), 1000//INPUTS_PER_TIMESTEP):
        X[ind].append(song_in_map_range[j].get_array_of_samples())
        
    # This doesn't work because of the slicing step:
    #X.append(song_in_map_range[i:i+1000:1000//INPUTS_PER_TIMESTEP])            
    

print('Length of the song (X): ', len(X))
print('Length of one second: ', len(X[0]))

Length of the song (X):  79
Length of one second:  250
