# Parse and prepare a dataset of abc music notations

Download the [Nottingham Dataset](https://github.com/jukedeck/nottingham-dataset), or this [dataset of abc music notation from Henrik Norbeck](http://norbeck.nu/abc/download.asp) (select the 'one big zip file (549 kilobytes)' link at the end of the page).

If we use the Henrik Norbeck dataset, the first thing we are going to do is parse all the files and concatenate their text into one 'big' text file.

We will train our model using Char-RNN for TF, you can clone it from [https://github.com/sherjilozair/char-rnn-tensorflow](https://github.com/sherjilozair/char-rnn-tensorflow)

## ABC Notations

We will need some software to work with `abc` and `mid` files. On Ubuntu you can install it with:

```
$ sudo apt-get install abcmidi timidity
```

On Mac:


```
$ brew install abcmidi timidity
```

Mac users can also install [EasyABC](https://www.nilsliberg.se/ksp/easyabc/) to read the files.

Here’s a simple example:

```
X: 1
T:"Hello world in abc notation"
M:4/4
K:C
"Am" C, D, E, F,|"F" G, A, B, C|"C"D E F G|"G" A B e c
```

To test the installation we can listen to this by saving the above snippet into a `hello.abc` file and running (Mac and Ubuntu):

```
$ abc2midi hello.abc -o hello.mid && timidity hello.mid
```

In [1]:
import os

# Folder holding the .abc files. For the Henrik Norbeck dataset use
# '/home/gu-ma/Downloads/hn201809' instead.
input_folder_fp = '/Users/guillaume/Downloads/nottingham-dataset/ABC_cleaned'
abc_raw_txt = ''
abc_all_txt = ''

# Walk the input folder recursively, listing every file that is found and
# collecting the content of those with an .abc extension (case-insensitive).
abc_chunks = []
for folder, _subfolders, names in os.walk(input_folder_fp):
    print(folder)
    for name in names:
        print('\t- %s ' % name)
        if not name.lower().endswith('.abc'):
            continue
        with open(os.path.join(folder, name), 'r') as abc_file:
            abc_chunks.append(abc_file.read())

# Concatenate the files in the order in which they were visited.
abc_raw_txt = ''.join(abc_chunks)

# Preview the beginning of the concatenated text.
print('\nabc_raw_txt:\n--\n' + abc_raw_txt[:1000])

/Users/guillaume/Downloads/nottingham-dataset/ABC_cleaned
	- playford.abc 
	- morris.abc 
	- reelsm-q.abc 
	- reelsr-t.abc 
	- jigs.abc 
	- xmas.abc 
	- slip.abc 
	- ashover.abc 
	- reelsh-l.abc 
	- waltzes.abc 
	- hpps.abc 
	- reelsu-z.abc 
	- reelsd-g.abc 
	- reelsa-c.abc 

abc_raw_txt:
--

X: 1
T:The Alderman's Hat
% Nottingham Music Database
S:Playford
Y:AB
M:4/4
L:1/4
K:Gm
P:A
D|"Gm"GB2G|"D7"^Fd2D|"Gm"GB2G|"D7"^FA2D|"Gm"GB2G|"D"^Fg -"Cm"gc|\
"Gm/d"B2 "D7"A2|"Gm"G3:|
P:B
d|"Gm"dg2d|"Gm"gb2d|"F"cf2c|"F"fa2c|"Eb"Be2d/2c/2|"Gm"dg -"Cm"gc|\
"Gm/d"B2 "D7"A2|"Gm"G3:|


X: 2
T:CAL 1
% Nottingham Music Database
S:Mark Knopfler
Y:AABABA
M:9/8
L:1/8
K:G
P:A
|G2E |"G"D3 -D2E G2B|"Em"A3 G3 -G2A|"C"A3 G3 E3|"G"D3 -D3 B2A|\
"Em"G3 -G2F G2A|
"G"B3 -B2A GAB|"Am"A3 -A2B A2G|E3 F3 G3|"D/f+"A3 A3 G3|\
"G"B3 -B2A B2d|
"C"e3 "G/b"d3 B3|"D"A3 -A3 G2A|"Em"B3 A3 G3|"C"E3 -E3 D3|\
"G"D3 -D3 -D3|"G"D6|
P:B
A3|"D"A3 B3 c3|"G"d3 -d2c B2d|"Am"c3 -c3 -c3|"G/b"B3 A3 G3|
"C"A3 B3 c3|"Em"B3 -B3 A2G|"Am"E3 c3 B2G|"

Then we remove the 'unnecessary' parts and clean up the text.

In [4]:
import re

# Helper function to extract (and optionally delete) chunks of text from abc_raw_txt
def extract_text(regex, txt, delete):
    """Collect every match of ``regex`` found in ``txt``, one match per line.

    Args:
        regex: Pattern searched with ``re.S`` (``.`` also matches newlines).
            It should hold at most one capturing group so that ``findall``
            yields plain strings.
        txt: Text the matches are extracted from.
        delete: When true, the matches are also removed from the module-level
            ``abc_raw_txt`` (the global, not ``txt``); that global is then
            stripped of leading/trailing whitespace and of blank lines.
            When false, ``abc_raw_txt`` is left untouched.

    Returns:
        The matches concatenated, each one followed by a newline.
    """
    global abc_raw_txt

    pattern = re.compile(regex, re.S)
    output = ''.join(match + "\n" for match in pattern.findall(txt))

    if delete:
        # Remove the matches, then drop the empty lines they leave behind.
        # This only happens on request: a plain extraction must not rewrite
        # the shared text as a side effect.
        remaining = pattern.sub('', abc_raw_txt).strip()
        abc_raw_txt = ''.join(
            line for line in remaining.splitlines(True) if line.strip()
        )
    return output

# Helper function to delete everything matching a pattern from a text
def delete_lines(regex, txt):
    """Return ``txt`` without the matches of ``regex`` and without blank lines.

    The pattern is applied with ``re.S``. After the matches are removed, the
    text is stripped of leading/trailing whitespace and every line that is
    empty or whitespace-only is dropped.
    """
    stripped = re.sub(regex, '', txt, flags=re.S).strip()
    kept = []
    for line in stripped.splitlines(True):
        if line.strip():
            kept.append(line)
    return ''.join(kept)

# Extract the intro text: everything from "This file" up to the end of the
# line containing "- Question(s)" (presumably the preamble of the Henrik
# Norbeck files - TODO confirm). Called with delete=True, extract_text also
# removes the matches from the global abc_raw_txt and strips its blank lines.
useless_txt = extract_text(r'(This file.*?- Questions?.[^\n]*)', abc_raw_txt, True)

# Keep a copy of the text without the intro (but with headers, chords and
# comments). Blank lines have already been stripped at this point.
abc_all_txt = abc_raw_txt

# Delete quoted text (chord symbols / annotations such as "Gm").
# NOTE(review): the pattern removes everything from the first double quote to
# the end of the line, so a line like `D|"Gm"GB2G|...` is reduced to `D|` and
# the notes that follow the first chord are lost - confirm this is intended.
abc_raw_txt = delete_lines(r'".[^\n]*', abc_raw_txt)
# Delete comments (from '%' to the end of the line)
abc_raw_txt = delete_lines(r'%.[^\n]*', abc_raw_txt)
# Delete the 'W:' fields (lyrics)
abc_raw_txt = delete_lines(r'W:.[^\n]*', abc_raw_txt)

# Extract the tune headers: from each 'X:' field up to the end of the next
# 'K:' line. The headers are removed from abc_raw_txt at the same time.
abc_headers_txt = extract_text(r'(X:.*?K:.[^\n]*)', abc_raw_txt, True)

# Preview the first 1000 characters of both results
print('\nabc_raw_txt:\n--\n' + abc_raw_txt[:1000])
print('\nabc_headers_txt:\n--\n' + abc_headers_txt[:1000])


abc_raw_txt:
--
P:A
D|
P:B
d|
P:A
|G2E |
P:B
A3|
D2G |
P:P
B2c2||
P:A
P:B
P:A
f/2g/2|
P:B
c/2d/2|
P:A
P:B
P:A
|:D|
|
P:B
|:g|fe dc|
a|
P:A
A|
P:B
a|
|:A|
K:G
|:e/2f/2|
|:
P:A
^c|
P:B
c/2B/2|
|:
P:A
g|
P:B
g|
P:C
c|
Q:1/4=180
D|:
M:6/8
L:1/8
Q:1/4=135
|:
|:
M:4/4
L:1/4
Q:1/4=180
|:
M:4/4
L:1/4
|:
P:A
C|
P:B
|:F3/2F/2F AGF A/2c3/2C|F2F A2G F2C|F3/2F/2F AGF A/2c3/2C|F3/2C/2F A2G F2C:|
P:C
c2B AGF Ac2|G2G G2A B3|c2B AGF A/2c3/2C|F3/2C/2F A2G F2C:|
P:A
D| g2 fe|d2 eF/2G/2|BB cA|G3 :|
P:B
d|BG Bd|gg eg|fd ec|d2 ef|g2 f|
P:C
e|
P:A
E|
P:B
A2e c/2A3/2c|d2B G2E|
P:C
c3/2B/2A F2E|EFG A2E|
P:D
cde f3|Bcd e3|cde f3|Bcd e2(3e/2f/2g/2|a2e c/2A3/2c|d2B G2E||
P:A
FG|A3B ABcd|B2G2 G2AB|c2c2 cBAG|F2F2 F2:|
P:B
fe|d2c2 c2FG|A2A2 A2fe|d2c2 B2A2|G2F2 F2fe|
d2c2 c2FG|A2A2 A2fe|d2c2 A2F2|G2F2 F2:|F2|
P:A
D|
P:B
B/2c/2|
P:A
P:B
efe e2f|g3 f2e|d2B BdB|A2F A2A|Bcd e2f|d3 d3||
P:A
P:B
P:C
M:4/4
L:1/4
M:6/8
L:1/8
P:A
A|
K:G
P:B
d|
P:C
d|
P:A
G/2A/2|
P:B
d|
P:C
d|
M:4/4
L:1/4
P:A
D/2E/2|
P:B
(3D/2F/2A/2|
P:A
D/2E

In [5]:
# Report the size (in characters) of the tune bodies, then of the headers.
for text in (abc_raw_txt, abc_headers_txt):
    print(len(text))

22260
63186


Once we have what we need we can save the file to disk

In [6]:
# The three output files are written next to the input .abc files.
output_raw_fp = os.path.join(input_folder_fp, 'abc_raw.txt')
output_all_fp = os.path.join(input_folder_fp, 'abc_all.txt')
output_header_fp = os.path.join(input_folder_fp, 'abc_headers.txt')

# Write each text to its destination: tune bodies, full text, then headers.
for destination_fp, content in (
    (output_raw_fp, abc_raw_txt),
    (output_all_fp, abc_all_txt),
    (output_header_fp, abc_headers_txt),
):
    with open(destination_fp, 'w') as out_file:
        out_file.write(content)

Now that we have our input text file ready, we can run it through our RNN. We will use char-rnn for TensorFlow, which you can download and install from [here](https://github.com/sherjilozair/char-rnn-tensorflow).

In [60]:
import shutil
import subprocess

charrnn_folder_fp = '/home/gu-ma/Documents/Projects/201809-HSLU-COMPPX/References/char-rnn-tensorflow'

# Fail early with a clear message if the char-rnn checkout is not where we
# expect it, instead of creating a stray directory tree below.
if not os.path.isdir(charrnn_folder_fp):
    raise FileNotFoundError('char-rnn-tensorflow folder not found: %s' % charrnn_folder_fp)

# The training text goes to <char-rnn>/data/abc/input.txt. The 'abc' data
# folder may not exist yet, and shutil.move does not create missing parent
# directories, so create it first.
charrnn_data_fp = os.path.join(charrnn_folder_fp, 'data', 'abc')
os.makedirs(charrnn_data_fp, exist_ok=True)

# We try with the full text first
shutil.move(output_all_fp, os.path.join(charrnn_data_fp, 'input.txt'))

'/home/gu-ma/Documents/Projects/201809-HSLU-COMPPX/References/char-rnn-tensorflow/data/abc/input.txt'

Go to the directory and run the training