# Problem Statement

The Bach chorales dataset is made of 382 chorales composed by Johann Sebastian Bach. Each chorale is 100 to 640 time steps long, and each time step contains 4 integers, where each integer corresponds to a note's index on a piano.

We are trying to train a model that can predict the next four notes from a sequene of time steps.

# Import Libraries

In [1]:
from tensorflow import keras
from keras.utils import pad_sequences
from keras.models import Sequential
from keras.layers import SimpleRNN
import pandas as pd
import numpy as np
import os

# Load Data

In [2]:
def readfiles(path):
    X = []
    for filename in os.listdir(path):
        file_path = os.path.join(path, filename)
        df = pd.read_csv(file_path)
        X.append(df.values.tolist())
    return X

In [3]:
# Load Data
train = readfiles("./jsb_chorales/train")
test = readfiles("./jsb_chorales/test")
valid = readfiles("./jsb_chorales/valid")


In [4]:
print(len(train), 'train sequences')
print(len(valid), 'valid sequences')
print(len(test), 'test sequences')

229 train sequences
76 valid sequences
77 test sequences


The sequences do not have uniform length. For efficient computation, it is best to pad the sequences so that they have the same length.

In [5]:
def maxlen(set):
    return max(len(seq) for seq in set)
print(maxlen(train))
print(maxlen(valid))
print(maxlen(test))

516
576
640


In [6]:
maxlen = 576 # setting maxlen to the longest sequence in the valid set

train_pad = pad_sequences(train, dtype=np.float32, maxlen=maxlen)
valid_pad = pad_sequences(valid, dtype=np.float32, maxlen=maxlen)
test_pad = pad_sequences(test, dtype=np.float32, maxlen=maxlen)

In [7]:
print('train_pad shape:', train_pad.shape)
print('valid_pad shape:', valid_pad.shape)
print('test_pad shape:', test_pad.shape)

train_pad shape: (229, 576, 4)
valid_pad shape: (76, 576, 4)
test_pad shape: (77, 576, 4)


In [13]:
x_train, y_train = train_pad[:, :-1], train_pad[:, -1:]
x_valid, y_valid = valid_pad[:, :-1], valid_pad[:, -1:]
x_test, y_test = test_pad[:, :-1], test_pad[:, -1:]

# Build a RNN

## SimpleRNN

In [21]:
model_rnn = Sequential([
    keras.Input(shape=(None, train_pad.shape[-1])),
    SimpleRNN(1)
])

In [22]:
optimizer = keras.optimizers.Adam(learning_rate=0.01)
model_rnn.compile(loss="mse", optimizer=optimizer)

We need to split the data into x_train, y_train, x_valid, y_valid, etc.

In [23]:
model_rnn.fit(x_train, y_train, batch_size=5, epochs=10, validation_data=(x_valid, y_valid))

Epoch 1/10
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 38ms/step - loss: 3331.5400 - val_loss: 3389.9473
Epoch 2/10
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 34ms/step - loss: 3307.6670 - val_loss: 3389.9473
Epoch 3/10
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 34ms/step - loss: 3325.5991 - val_loss: 3389.9473
Epoch 4/10
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 34ms/step - loss: 3286.7603 - val_loss: 3389.9473
Epoch 5/10
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 34ms/step - loss: 3288.3254 - val_loss: 3389.9473
Epoch 6/10
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 34ms/step - loss: 3311.0630 - val_loss: 3389.9473
Epoch 7/10
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 34ms/step - loss: 3305.2632 - val_loss: 3389.9473
Epoch 8/10
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 33ms/step - loss: 3304.0364 - val_loss: 3389.9473


<keras.src.callbacks.history.History at 0x2a597c10dd0>