In [189]:
from ast import literal_eval
import pandas as pd
from operator import itemgetter
file_path = r"..\data\00_raw\bach\bach_bwv1.6.csv"


def transform_data(data: pd.DataFrame) -> pd.DataFrame:
    """function performs all transformations so that all partitures have the same structure"""

    static_cols = ["measure", "composer", "corpus"]
    partition_cols = [x for x in data.columns if x not in static_cols]
    for col in partition_cols:
        data[col] = data[col].apply(lambda x: literal_eval(x))

    data = data.\
        melt(
            id_vars=static_cols, var_name="partition", value_name="notes"
        ).\
        explode("notes")

    na_mask = data.notes.isnull()
    data.loc[na_mask, "notes"] = data.loc[na_mask,"notes"].fillna("[None, None]").apply(eval)

    data["node_id"] = range(len(data))
    split = data["notes"].transform({"pitch": itemgetter(0), "duration": itemgetter(1)})
    data["pitch"] = split.pitch
    data["duration"] = split.duration.apply(lambda x: f"{x:.4f}")

    return data
    
data = pd.read_csv(file_path, sep=";", na_values="None")
data.head()

Unnamed: 0,horn__,soprano,alto,tenor,bass,measure,composer,corpus
0,"[['F4', 1.0]]","[['F4', 1.0]]","[['C4', 1.0]]","[['A3', 1.0]]","[['F3', 1.0]]",0,bach,bwv1.6
1,"[['G4', 0.5], ['C4', 0.5], ['F4', 0.5], ['F3',...","[['C5', 1.0], ['A4', 1.0], ['F4', 1.0], ['C5',...","[['C4', 1.0], ['C4', 1.0], ['D4', 1.0], ['F4',...","[['G3', 1.0], ['A3', 1.0], ['A3', 1.0], ['A3',...","[['E3', 1.0], ['F3', 1.0], ['D3', 1.0], ['A2',...",1,bach,bwv1.6
2,"[['F4', 0.25], ['B-4', 0.25], ['A4', 0.25], ['...","[['D5', 1.0], ['D5', 1.0], ['C5', 1.0], ['C5',...","[['B-4', 1.0], ['B-4', 1.0], ['A4', 1.0], ['A4...","[['F3', 1.0], ['F4', 1.0], ['F4', 1.0], ['F4',...","[['B-2', 0.5], ['C3', 0.5], ['D3', 0.5], ['E3'...",2,bach,bwv1.6
3,"[['F4', 0.5], ['G4', 0.25], ['A4', 0.25], ['B-...","[['D5', 1.0], ['E5', 1.0], ['F5', 1.0], ['E5',...","[['B-4', 1.0], ['B-4', 1.0], ['C5', 0.5], ['D5...","[['F4', 1.0], ['G4', 1.0], ['F4', 0.5], ['D4',...","[['B-3', 1.0], ['A3', 0.5], ['G3', 0.5], ['A3'...",3,bach,bwv1.6
4,"[['C5', 0.5], ['B4', 0.25], ['A4', 0.25], ['B4...","[['D5', 1.0], ['D5', 1.0], ['C5', 1.0], ['A4',...","[['A4', 1.0], ['G4', 1.0], ['G4', 1.0], ['F4',...","[['F4', 0.5], ['E4', 0.5], ['D4', 1.0], ['E4',...","[['F3', 1.0], ['G3', 1.0], ['C3', 1.0], ['F3',...",4,bach,bwv1.6


In [190]:
uniformed_data = transform_data(data)
uniformed_data.head()

Unnamed: 0,measure,composer,corpus,partition,notes,node_id,pitch,duration
0,0,bach,bwv1.6,horn__,"[F4, 1.0]",0,F4,1.0
1,1,bach,bwv1.6,horn__,"[G4, 0.5]",1,G4,0.5
1,1,bach,bwv1.6,horn__,"[C4, 0.5]",2,C4,0.5
1,1,bach,bwv1.6,horn__,"[F4, 0.5]",3,F4,0.5
1,1,bach,bwv1.6,horn__,"[F3, 0.5]",4,F3,0.5


In [191]:
import pathlib
from tqdm import tqdm
from functools import reduce

all_files = []
for path in tqdm(pathlib.Path("..\\data\\00_raw\\bach\\").iterdir()):
    if path.is_file():
        data = pd.read_csv(path, sep=";")
        all_files.append(transform_data(data))

all_files = reduce(lambda x, y: pd.concat([x,y]), all_files)
all_files.head()

408it [00:07, 57.19it/s]


Unnamed: 0,measure,composer,corpus,partition,notes,node_id,pitch,duration
0,0,bach,bwv1.6,horn__,"[F4, 1.0]",0,F4,1.0
1,1,bach,bwv1.6,horn__,"[G4, 0.5]",1,G4,0.5
1,1,bach,bwv1.6,horn__,"[C4, 0.5]",2,C4,0.5
1,1,bach,bwv1.6,horn__,"[F4, 0.5]",3,F4,0.5
1,1,bach,bwv1.6,horn__,"[F3, 0.5]",4,F3,0.5


In [192]:
# filter out corpuses with chords
bad_corpus = ["bwv248.9-1", "bwv846", "bwv248.23-2"]
all_files = all_files[all_files.corpus.apply(lambda x: x not in bad_corpus)]
all_files.pitch = all_files.pitch.fillna("P")


In [193]:
# save preliminary data
all_files.to_csv(r"..\data\01_preprocessed\bach.csv", sep=";", index=False)

## Duration vocabulary

In [194]:
all_files.duration.value_counts()

1.0000    55776
0.5000    39690
2.0000     6925
0.2500     5463
3.0000     2341
1.5000     1456
4.0000     1107
0.7500      150
0.1250       86
6.0000       21
8.0000        4
0.0000        3
Name: duration, dtype: int64

In [195]:
import numpy as np
duration_vocab = [f"{x:.4f}" for x in np.arange(.0,8.1,.125)]
duration_vocab[:15]

['0.0000',
 '0.1250',
 '0.2500',
 '0.3750',
 '0.5000',
 '0.6250',
 '0.7500',
 '0.8750',
 '1.0000',
 '1.1250',
 '1.2500',
 '1.3750',
 '1.5000',
 '1.6250',
 '1.7500']

## Pitch vocabulary

In [143]:
all_files.pitch.value_counts()

D4      7905
G4      6905
A4      6823
E4      6785
A3      6066
        ... 
C#2        1
D#2        1
G-5        1
F##4       1
F##3       1
Name: pitch, Length: 85, dtype: int64

In [196]:
octaves = range(1,7,1)
accents = ["", "#", "##", "-", "--"]
pitches = ["A", "B", "C", "D", "E", "F", "G"]

pitch_vocab = []
for pitch in pitches:
    for accent in accents:
        for octave in octaves:
            pitch_vocab.append(f"{pitch}{accent}{octave}")

pitch_vocab[:15]

['A1',
 'A2',
 'A3',
 'A4',
 'A5',
 'A6',
 'A#1',
 'A#2',
 'A#3',
 'A#4',
 'A#5',
 'A#6',
 'A##1',
 'A##2',
 'A##3']

In [197]:
all_files.head(10)

Unnamed: 0,measure,composer,corpus,partition,notes,node_id,pitch,duration
0,0,bach,bwv1.6,horn__,"[F4, 1.0]",0,F4,1.0
1,1,bach,bwv1.6,horn__,"[G4, 0.5]",1,G4,0.5
1,1,bach,bwv1.6,horn__,"[C4, 0.5]",2,C4,0.5
1,1,bach,bwv1.6,horn__,"[F4, 0.5]",3,F4,0.5
1,1,bach,bwv1.6,horn__,"[F3, 0.5]",4,F3,0.5
1,1,bach,bwv1.6,horn__,"[A3, 0.5]",5,A3,0.5
1,1,bach,bwv1.6,horn__,"[F3, 0.5]",6,F3,0.5
1,1,bach,bwv1.6,horn__,"[A3, 0.5]",7,A3,0.5
1,1,bach,bwv1.6,horn__,"[C4, 0.5]",8,C4,0.5
2,2,bach,bwv1.6,horn__,"[F4, 0.25]",9,F4,0.25


In [198]:
all_files[(all_files.corpus == "bwv1.6") & (all_files.measure == 0)]

Unnamed: 0,measure,composer,corpus,partition,notes,node_id,pitch,duration
0,0,bach,bwv1.6,horn__,"[F4, 1.0]",0,F4,1.0
21,0,bach,bwv1.6,soprano,"[F4, 1.0]",154,F4,1.0
42,0,bach,bwv1.6,alto,"[C4, 1.0]",225,C4,1.0
63,0,bach,bwv1.6,tenor,"[A3, 1.0]",315,A3,1.0
84,0,bach,bwv1.6,bass,"[F3, 1.0]",409,F3,1.0


In [199]:
all_files[all_files.duration.isnull()]

Unnamed: 0,measure,composer,corpus,partition,notes,node_id,pitch,duration


In [200]:
durations = all_files.\
    groupby(["corpus", "partition","measure"]).\
    agg({"duration": sum}).\
    reset_index().\
    groupby("corpus").\
    agg({"duration": max})

durations.reset_index().duration.value_counts()


3.0000                                  114
4.0000                                   56
3.00001.0000                             51
2.00001.00001.0000                       40
2.00002.0000                             38
2.00001.0000                             36
1.50000.50002.0000                       16
1.00001.00001.00001.0000                 10
1.50000.50001.00001.0000                 10
1.00001.00002.0000                        6
3.00000.50000.5000                        5
1.50000.50001.0000                        5
1.50000.50001.00000.50000.5000            3
2.00001.50000.5000                        3
6.0000                                    2
1.50001.00000.50001.0000                  2
8.0000                                    1
2.00000.50000.50000.50000.5000            1
2.00000.50000.50001.0000                  1
1.50000.50000.50000.25000.25001.0000      1
3.00000.75000.2500                        1
1.50001.00000.50000.50000.5000            1
1.00001.50001.00000.5000        

In [201]:
all_files.partition.value_counts()

bass                        25995
tenor                       25367
alto                        24624
soprano                     21035
continuo                     1417
timpani                      1287
trumpet__                    1279
violin__                     1181
horn__                       1167
violin__.1                   1138
viola                        1077
oboe__                        996
horn__.1                      926
trumpet__.1                   858
oboe__.1                      831
trumpet__.2                   762
violin                        451
bass_continuo                 303
tenor_viola                   273
organ                         272
oboe                          172
recorders                     156
alto_viloin__                 146
soprano__.1                   129
soprano__                     124
soprano_oboe_____violin_      121
recorder__.2                  116
oboe__.2                      112
recorder__.1                  112
cornet__.1    

# Data preprocessing

In [202]:
import tensorflow as tf

ids_from_pitches = tf.keras.layers.StringLookup(
    vocabulary=pitch_vocab, mask_token=None
)

ids_from_durations = tf.keras.layers.StringLookup(
    vocabulary=duration_vocab, mask_token=None
)

tenors = all_files[all_files.partition == "tenor"]
pitch_ids = ids_from_pitches(tenors.pitch)
duration_ids = ids_from_durations(tenors.duration)


In [203]:
pitch_ids

<tf.Tensor: shape=(25367,), dtype=int64, numpy=array([  3, 183,   3, ...,  33,  64,  33], dtype=int64)>

In [204]:
duration_ids

<tf.Tensor: shape=(25367,), dtype=int64, numpy=array([9, 9, 9, ..., 3, 9, 9], dtype=int64)>