In [55]:
from music21 import *
import numpy as np
import pandas as pd
import glob
from sklearn import tree
from sklearn.model_selection import *
import graphviz

### Using music21 and SKLearn to classify hymns as Bach versus non-Bach

Load Bach corpus and test some features of music21

In [3]:
bach_corpus = corpus.search('bach', fileExtensions='xml')

In [15]:
hymn = converter.parse('holdroyd/holdroyd1.xml')

In [None]:
hymn.show()

In [11]:
bach_corpus

<music21.metadata.bundles.MetadataBundle {412 entries}>

In [27]:
bach_corpus[0].parse()

<music21.stream.Score 0x7fa0362d4190>

Two feature extractor functions, number of key changes and number of accidentals

In [6]:
# calculate number of key changes in a stream
def num_key_changes(stream):
    p = stream.parts[0]
    pFlat = p.flat
    keySigs = pFlat.getElementsByClass('KeySignature')
    # for k in keySigs:
    #     print(k.measureNumber)
    return len(keySigs)

In [7]:
# calculate number of accidentals in a stream
def num_accidentals(stream):
    allPitches = stream.flat.pitches
    accidentals = 0
    for p in allPitches:
        if p.accidental is not None and p.accidental.name != 'natural':
            accidentals += 1
    return accidentals

Test custom feature extractors and jSymbolic features

In [28]:
fe = features.jSymbolic.DirectionOfMotionFeature(bach_corpus[0].parse())
feature = fe.extract()
feature.vector

[0.48743718592964824]

In [8]:
num_key_changes(bach_corpus[0].parse())

1

In [9]:
num_accidentals(bach_corpus[0].parse())

38

In [None]:
bach_key_changes = []
for piece in bach_corpus:
    chorale = piece.parse()
    bach_key_changes.append(num_key_changes(chorale))

bach_key_changes

In [None]:
bach_num_accidentals = []
for piece in bach_corpus:
    chorale = piece.parse()
    bach_num_accidentals.append(num_accidentals(chorale))

bach_num_accidentals

In [None]:
bach_motion = []
for piece in bach_corpus:
    chorale = piece.parse()
    feature = features.jSymbolic.DirectionOfMotionFeature(chorale)
    bach_motion.append(feature.extract().vector)

bach_motion

#### Constructing the corpus consisting of BWV 250-438 chorales and 104 other hymns from various composers

Parse each MusicXML file and create pandas dataframe of corpus 

In [4]:
rows = []
files = glob.glob('non_bach/*.xml')
for file in files:
    hymn = converter.parse(file)
    # print(hymn.notes.streamLength)
    if len(hymn.getElementsByClass(stream.Part)[0]) == 0:
        continue
    name = (file.split('/')[1]).split('.')[0]
    rows.append([hymn, name, 0])

df = pd.DataFrame(rows, columns=['hymn', 'name', 'is_bach'])
df

Unnamed: 0,hymn,name,is_bach
0,"[<music21.text.TextBox 'Stuttgart,...'>, <musi...",4_hassler,0
1,"[<music21.text.TextBox 'Von Himmel...'>, <musi...",6_hassler,0
2,"[<music21.text.TextBox 'To the Wor...'>, <musi...",11_doane,0
3,"[<music21.text.TextBox 'Vater Unse...'>, <musi...",5_hassler,0
4,"[<music21.text.TextBox 'Jesus Is T...'>, <musi...",stebbins1,0
...,...,...,...
99,"[<music21.text.TextBox 'Scatter Su...'>, <musi...",excell8,0
100,"[<music21.text.TextBox 'Helmsley, ...'>, <musi...",madan1,0
101,"[<music21.text.TextBox 'Labor On'>, <music21.t...",19_doane,0
102,"[<music21.text.TextBox 'Christ the...'>, <musi...",walsh1,0


In [5]:
bach_corpus = corpus.search('bach', fileExtensions='xml')
rows = []
for piece in bach_corpus[143:354]:
    hymn = piece.parse()
    if len(hymn.getElementsByClass(stream.Part)[0]) == 0:
        continue
    # get bwv number of each piece
    name = ''.join((hymn.corpusFilepath.split('/')[1]).split('.')[0:-1])
    rows.append([hymn, name, 1])

df2 = pd.DataFrame(rows, columns=['hymn', 'name', 'is_bach'])
df2

Unnamed: 0,hymn,name,is_bach
0,"[<music21.text.TextBox 'PDF © 2004...'>, <musi...",bwv256,1
1,"[<music21.text.TextBox 'PDF © 2004...'>, <musi...",bwv250,1
2,"[<music21.text.TextBox 'PDF © 2004...'>, <musi...",bwv251,1
3,"[<music21.text.TextBox 'PDF © 2004...'>, <musi...",bwv252,1
4,"[<music21.text.TextBox 'BWV 253'>, <music21.te...",bwv253,1
...,...,...,...
206,"[<music21.text.TextBox 'PDF © 2004...'>, <musi...",bwv434,1
207,"[<music21.text.TextBox 'PDF © 2004...'>, <musi...",bwv435,1
208,"[<music21.text.TextBox 'PDF © 2004...'>, <musi...",bwv436,1
209,"[<music21.text.TextBox 'PDF © 2004...'>, <musi...",bwv437,1


In [6]:
dataset = pd.concat([df, df2], ignore_index=True)
dataset

Unnamed: 0,hymn,name,is_bach
0,"[<music21.text.TextBox 'Stuttgart,...'>, <musi...",4_hassler,0
1,"[<music21.text.TextBox 'Von Himmel...'>, <musi...",6_hassler,0
2,"[<music21.text.TextBox 'To the Wor...'>, <musi...",11_doane,0
3,"[<music21.text.TextBox 'Vater Unse...'>, <musi...",5_hassler,0
4,"[<music21.text.TextBox 'Jesus Is T...'>, <musi...",stebbins1,0
...,...,...,...
310,"[<music21.text.TextBox 'PDF © 2004...'>, <musi...",bwv434,1
311,"[<music21.text.TextBox 'PDF © 2004...'>, <musi...",bwv435,1
312,"[<music21.text.TextBox 'PDF © 2004...'>, <musi...",bwv436,1
313,"[<music21.text.TextBox 'PDF © 2004...'>, <musi...",bwv437,1


Create list of jSymbolic features used in corpus construction and initialize dataframe for each feature

In [7]:
jsymbolic_features = []
fs = features.jSymbolic.extractorsById
for k in fs:
    for i in range(len(fs[k])):
        if fs[k][i] is not None:
            if k != 'I':
                n = fs[k][i].__name__
                if fs[k][i] not in features.jSymbolic.featureExtractors:
                    continue
                jsymbolic_features.append([k, i])

print(len(jsymbolic_features))

58


In [29]:
for feature in jsymbolic_features:
    if feature in [['M', 10], ['T', 1], ['T', 2], ['T', 3], ['M', 1], ['M', 6], ['P', 12], ['P', 13], ['P', 14], ['P', 15], ['P', 16], ['M', 10], ['M', 11], ['M', 12], ['M', 13], ['M', 14], ['M', 15], ['M', 16], ['M', 17], ['M', 18], ['M', 19], ['M', 9], ['M', 7], ['P', 2], ['P', 3], ['P', 4], ['P', 5], ['P', 6], ['P', 19], ['P', 21], ['R', 19], ['R', 20], ['R', 21], ['R', 22], ['R', 23], ['R', 24], ['R', 25], ['M', 18], ['M', 19]]:
        jsymbolic_features.remove(feature)

print(jsymbolic_features)

[['M', 2], ['M', 3], ['M', 4], ['M', 5], ['M', 8], ['M', 17], ['P', 1], ['P', 7], ['P', 8], ['P', 9], ['P', 10], ['P', 11], ['P', 20], ['P', 22], ['R', 15], ['R', 17], ['R', 18], ['R', 30], ['R', 31], ['R', 32], ['R', 33], ['R', 34], ['R', 35], ['R', 36]]


In [15]:
test_dataset = dataset
test_dataset

Unnamed: 0,hymn,name,is_bach,AverageMelodicIntervalFeature,MostCommonMelodicIntervalFeature,DistanceBetweenMostCommonMelodicIntervalsFeature,MostCommonMelodicIntervalPrevalenceFeature,NumberOfCommonMelodicIntervalsFeature,AmountOfArpeggiationFeature,ChromaticMotionFeature,...,AverageTimeBetweenAttacksFeature,AverageTimeBetweenAttacksForEachVoiceFeature,InitialTempoFeature,InitialTimeSignatureFeature,CompoundOrSimpleMeterFeature,TripleMeterFeature,QuintupleMeterFeature,ChangesOfMeterFeature,DurationFeature,AverageNumberOfIndependentVoicesFeature
0,"[<music21.text.TextBox 'Stuttgart,...'>, <musi...",4_hassler,0,3,0,2,0,6,0,0,...,0,0,120,4,0,0,0,0,16,0
1,"[<music21.text.TextBox 'Von Himmel...'>, <musi...",6_hassler,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"[<music21.text.TextBox 'To the Wor...'>, <musi...",11_doane,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"[<music21.text.TextBox 'Vater Unse...'>, <musi...",5_hassler,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"[<music21.text.TextBox 'Jesus Is T...'>, <musi...",stebbins1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
310,"[<music21.text.TextBox 'PDF © 2004...'>, <musi...",bwv434,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
311,"[<music21.text.TextBox 'PDF © 2004...'>, <musi...",bwv435,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
312,"[<music21.text.TextBox 'PDF © 2004...'>, <musi...",bwv436,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
313,"[<music21.text.TextBox 'PDF © 2004...'>, <musi...",bwv437,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [24]:
feature_cols = []
for js_feature in jsymbolic_features:
    f = features.jSymbolic.getExtractorByTypeAndNumber(js_feature[0], js_feature[1])
    test_dataset[f.__name__] = np.nan
    feature_cols.append(f.__name__)

In [28]:
test_dataset

Unnamed: 0,hymn,name,is_bach,AverageMelodicIntervalFeature,MostCommonMelodicIntervalFeature,DistanceBetweenMostCommonMelodicIntervalsFeature,MostCommonMelodicIntervalPrevalenceFeature,NumberOfCommonMelodicIntervalsFeature,AmountOfArpeggiationFeature,ChromaticMotionFeature,...,AverageTimeBetweenAttacksFeature,AverageTimeBetweenAttacksForEachVoiceFeature,InitialTempoFeature,InitialTimeSignatureFeature,CompoundOrSimpleMeterFeature,TripleMeterFeature,QuintupleMeterFeature,ChangesOfMeterFeature,DurationFeature,AverageNumberOfIndependentVoicesFeature
0,"[<music21.text.TextBox 'Stuttgart,...'>, <musi...",4_hassler,0,3.3,0.0,2.0,0.4,6,0.5,0,...,0.483871,0,120.0,4.0,0.0,0.0,0.0,0.0,16.0,0
1,"[<music21.text.TextBox 'Von Himmel...'>, <musi...",6_hassler,0,,,,,0,,0,...,,0,,,,,,,,0
2,"[<music21.text.TextBox 'To the Wor...'>, <musi...",11_doane,0,,,,,0,,0,...,,0,,,,,,,,0
3,"[<music21.text.TextBox 'Vater Unse...'>, <musi...",5_hassler,0,,,,,0,,0,...,,0,,,,,,,,0
4,"[<music21.text.TextBox 'Jesus Is T...'>, <musi...",stebbins1,0,,,,,0,,0,...,,0,,,,,,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
310,"[<music21.text.TextBox 'PDF © 2004...'>, <musi...",bwv434,1,,,,,0,,0,...,,0,,,,,,,,0
311,"[<music21.text.TextBox 'PDF © 2004...'>, <musi...",bwv435,1,,,,,0,,0,...,,0,,,,,,,,0
312,"[<music21.text.TextBox 'PDF © 2004...'>, <musi...",bwv436,1,,,,,0,,0,...,,0,,,,,,,,0
313,"[<music21.text.TextBox 'PDF © 2004...'>, <musi...",bwv437,1,,,,,0,,0,...,,0,,,,,,,,0


For each feature in list, extract feature using jSymbolic function for each piece in the corpus

In [30]:
for js_feature in jsymbolic_features:
    f = features.jSymbolic.getExtractorByTypeAndNumber(js_feature[0], js_feature[1])
    for index, row in test_dataset.iterrows():
        feature = f(row['hymn'])
        try:
            fe = feature.extract().vector
        except:
            fe = [0]
        # print('feature', f, 'result', fe)
        test_dataset.at[index,f.__name__] = fe[0]
        # print(test_dataset.at[index,f.__name__])

In [31]:
test_dataset

Unnamed: 0,hymn,name,is_bach,AverageMelodicIntervalFeature,MostCommonMelodicIntervalFeature,DistanceBetweenMostCommonMelodicIntervalsFeature,MostCommonMelodicIntervalPrevalenceFeature,NumberOfCommonMelodicIntervalsFeature,AmountOfArpeggiationFeature,ChromaticMotionFeature,...,AverageTimeBetweenAttacksFeature,AverageTimeBetweenAttacksForEachVoiceFeature,InitialTempoFeature,InitialTimeSignatureFeature,CompoundOrSimpleMeterFeature,TripleMeterFeature,QuintupleMeterFeature,ChangesOfMeterFeature,DurationFeature,AverageNumberOfIndependentVoicesFeature
0,"[<music21.text.TextBox 'Stuttgart,...'>, <musi...",4_hassler,0,3.300000,0.0,2.0,0.400000,6,0.500000,0,...,0.483871,0,120.0,4.0,0.0,0.0,0.0,0.0,16.000,0
1,"[<music21.text.TextBox 'Von Himmel...'>, <musi...",6_hassler,0,3.790123,2.0,1.0,0.271605,0,0.345679,0,...,,0,120.0,4.0,0.0,0.0,0.0,0.0,16.250,0
2,"[<music21.text.TextBox 'To the Wor...'>, <musi...",11_doane,0,2.818182,1.0,1.0,0.363636,0,0.363636,0,...,,0,120.0,4.0,0.0,0.0,0.0,0.0,32.000,0
3,"[<music21.text.TextBox 'Vater Unse...'>, <musi...",5_hassler,0,3.022222,2.0,3.0,0.266667,0,0.422222,0,...,,0,120.0,4.0,0.0,0.0,0.0,0.0,24.000,0
4,"[<music21.text.TextBox 'Jesus Is T...'>, <musi...",stebbins1,0,0.400000,0.0,2.0,0.800000,0,0.800000,0,...,,0,120.0,6.0,1.0,0.0,0.0,0.0,23.625,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
310,"[<music21.text.TextBox 'PDF © 2004...'>, <musi...",bwv434,1,2.005952,2.0,1.0,0.386905,0,0.238095,0,...,,0,120.0,4.0,0.0,0.0,0.0,0.0,18.000,0
311,"[<music21.text.TextBox 'PDF © 2004...'>, <musi...",bwv435,1,2.230047,2.0,1.0,0.352113,0,0.258216,0,...,,0,120.0,4.0,0.0,0.0,0.0,0.0,22.000,0
312,"[<music21.text.TextBox 'PDF © 2004...'>, <musi...",bwv436,1,2.293578,2.0,1.0,0.408257,0,0.256881,0,...,,0,120.0,4.0,0.0,0.0,0.0,0.0,28.000,0
313,"[<music21.text.TextBox 'PDF © 2004...'>, <musi...",bwv437,1,2.403654,2.0,1.0,0.388704,0,0.229236,0,...,,0,120.0,4.0,0.0,0.0,0.0,0.0,64.000,0


Create training dataset from corpus for use in decision tree training

In [40]:
test_dataset.to_csv('dataset1.csv')

In [41]:
test_dataset = test_dataset.fillna(0)

In [42]:
X = test_dataset.loc[:, feature_cols]
X.shape

(315, 28)

In [44]:
y = test_dataset.is_bach
y

0      0
1      0
2      0
3      0
4      0
      ..
310    1
311    1
312    1
313    1
314    1
Name: is_bach, Length: 315, dtype: int64

In [50]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [51]:
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)

In [54]:
# clf.score(X_train, y_train)
clf.score(X_test, y_test)

0.9682539682539683

In [60]:
class_names = ['non-bach', 'bach']

array([0, 1])

In [61]:
dot_data = tree.export_graphviz(clf,
                                out_file=None,
                                feature_names=feature_cols,  
                                # class_names=clf.classes_,  
                                filled=True, rounded=True,  
                                special_characters=True)
graph = graphviz.Source(dot_data)
graph.render("bach") 

'bach.pdf'