# Problem Statement

The Bach chorales dataset is made of 382 chorales composed by Johann Sebastian Bach. Each chorale is 100 to 640 time steps long, and each time step contains 4 integers, where each integer corresponds to a note's index on a piano.

We are trying to train a model that can predict the next four notes from a sequene of time steps.

# Import Libraries

In [50]:
from tensorflow import keras
from keras.utils import pad_sequences
from keras.models import Sequential
from keras.layers import SimpleRNN, Dense
import pandas as pd
import numpy as np
import os

# Load Data

In [2]:
def readfiles(path):
    X = []
    for filename in os.listdir(path):
        file_path = os.path.join(path, filename)
        df = pd.read_csv(file_path)
        X.append(df.values.tolist())
    return X

In [3]:
# Load Data
train = readfiles("./jsb_chorales/train")
test = readfiles("./jsb_chorales/test")
valid = readfiles("./jsb_chorales/valid")


In [4]:
print(len(train), 'train sequences')
print(len(valid), 'valid sequences')
print(len(test), 'test sequences')

229 train sequences
76 valid sequences
77 test sequences


The sequences do not have uniform length. For efficient computation, it is best to pad the sequences so that they have the same length.

In [5]:
def maxlen(set):
    return max(len(seq) for seq in set)
print(maxlen(train))
print(maxlen(valid))
print(maxlen(test))

516
576
640


In [6]:
maxlen = 576 # setting maxlen to the longest sequence in the valid set

train_pad = pad_sequences(train, dtype=np.float32, maxlen=maxlen)
valid_pad = pad_sequences(valid, dtype=np.float32, maxlen=maxlen)
test_pad = pad_sequences(test, dtype=np.float32, maxlen=maxlen)

In [7]:
print('train_pad shape:', train_pad.shape)
print('valid_pad shape:', valid_pad.shape)
print('test_pad shape:', test_pad.shape)

train_pad shape: (229, 576, 4)
valid_pad shape: (76, 576, 4)
test_pad shape: (77, 576, 4)


We need to split the data into x_train, y_train, x_valid, y_valid, etc.

In [13]:
x_train, y_train = train_pad[:, :-1], train_pad[:, -1:]
x_valid, y_valid = valid_pad[:, :-1], valid_pad[:, -1:]
x_test, y_test = test_pad[:, :-1], test_pad[:, -1:]

# Build a RNN

## SimpleRNN

In [69]:
model_rnn = Sequential([
    keras.Input(shape=train_pad.shape[1:]),
    SimpleRNN(4)
])

In [70]:
model_rnn.summary()

In [71]:
optimizer = keras.optimizers.Adam(learning_rate=0.01)
model_rnn.compile(loss="mse", optimizer=optimizer, metrics=['accuracy'])

In [72]:
model_rnn.fit(x_train, y_train, batch_size=16, epochs=10, validation_data=(x_valid, y_valid))

Epoch 1/10
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 51ms/step - accuracy: 0.9926 - loss: 3501.3745 - val_accuracy: 0.9868 - val_loss: 3500.0366
Epoch 2/10
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 35ms/step - accuracy: 0.9957 - loss: 3431.4453 - val_accuracy: 0.9868 - val_loss: 3494.2632
Epoch 3/10
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 36ms/step - accuracy: 0.9949 - loss: 3405.9041 - val_accuracy: 0.9868 - val_loss: 3494.2632
Epoch 4/10
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 35ms/step - accuracy: 0.9963 - loss: 3419.1072 - val_accuracy: 0.9868 - val_loss: 3494.2632
Epoch 5/10
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 40ms/step - accuracy: 0.9926 - loss: 3423.5281 - val_accuracy: 0.9868 - val_loss: 3494.2632
Epoch 6/10
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 38ms/step - accuracy: 0.9963 - loss: 3397.7573 - val_accuracy: 0.9868 - val_loss: 3494.263

<keras.src.callbacks.history.History at 0x2a5a14cb8d0>

In [73]:
loss, accuracy = model_rnn.evaluate(x_valid, y_valid, batch_size=16)
print('Test loss:', loss)
print('Test accuracy:', accuracy)

[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.9739 - loss: 3530.7993
Test loss: 3494.26318359375
Test accuracy: 0.9868420958518982
