## Notebook to extract pitch and rhythm features from ABC-notated melodies

All files are iterated through and pitch and rhythm vectors are extracted.

The features are saved to a .pkl file and to a tab-separated .tsv file, along with related melodies (in terms of date).

In [1]:
import music21 
import os
import csv
import pickle

### Function to extract pitch and rhythm vectors:

In [6]:
def extract_features(input_folder):
    ''' 
    Function to iterate through a folder of .abc files, extracting pitch and rhythm vectors and storing them.

    The folder is walked recursively (os.walk); files whose path does not end in '.abc' are ignored.

    Input:
    - input_folder (str): folder-path for the folder containing .abc files

    Returns:
    - features_dict (dict): dictionary containing each unique file-name as keys and tuples of (pitch_vector, rhythm_vector) as values
        - pitch_vector (list): differences between the MIDI numbers of consecutive notes
        - rhythm_vector (list): quarterLength of each note, rounded to 2 decimal places
    - exceptions (list): list of all files which could not be processed

    Notes:
    - Keys are bare file names (not paths), so two files with the same name in
      different sub-folders overwrite each other.
    - A tune whose pitch or rhythm vector is empty (e.g. fewer than two notes, so
      no interval can be formed) is reported as empty and not stored.
    '''

    # Dict to store features, keyed by file name:
    features_dict = {}

    # List to store the names of empty tunes:
    empty_tunes = []

    # Created once, before the walk, so that failures from every sub-folder are
    # kept and the list exists even when the walk yields no directories at all:
    exceptions = []

    # Iterate through all .abc tunes:
    for root, _, files in os.walk(input_folder):
        # Progress is reported per directory: count/num_tunes cover every file
        # in the current directory, including non-.abc files.
        num_tunes = len(files)
        for count, file_name in enumerate(files, start=1):
            file_path = os.path.join(root, file_name)
            if not file_path.endswith('.abc'):
                continue

            try:
                # Parse abc file into music21 stream:
                score = music21.converter.parse(file_path)

                # Single pass over the note-like elements, collecting MIDI pitch
                # numbers and relative note lengths, where:
                #   1.0  = Crotchet (quarter note)
                #   1.5  = Dotted Crotchet
                #   0.5  = Quaver (eighth note)
                #   0.25 = Semiquaver (sixteenth note)
                #   etc...
                pitches = []
                rhythm_vector = []
                for element in score.recurse().notes:
                    if element.isNote:
                        pitches.append(element.pitch.midi)
                    if element.isNote or element.isRest:
                        rhythm_vector.append(round(float(element.quarterLength), 2))

                # Differences between consecutive pitches give the pitch vector:
                pitch_vector = [nxt - cur for cur, nxt in zip(pitches, pitches[1:])]

                # If a tune has an empty pitch and/or rhythm vector, skip:
                if (not pitch_vector) or (not rhythm_vector):
                    print('Found empty tune: ', file_name)
                    empty_tunes.append(file_name)
                else:
                    # Store as a pair: (pitch_vector, rhythm_vector)
                    features_dict[file_name] = (pitch_vector, rhythm_vector)

                if count % 100 == 0:
                    print(f'Processed: {count} tunes out of {num_tunes}')
            except Exception:
                # Best-effort: record the file and carry on. Catching Exception
                # (not a bare except) lets KeyboardInterrupt stop a long run.
                exceptions.append(file_name)

    print('Number of exceptions:', len(exceptions))
    print('Empty files: ', empty_tunes)

    return features_dict, exceptions

### Call function to extract features:

In [7]:
# Run the extraction over the folder of .abc tunes; keep both the features and the failed files:
features_dict, exceptions = extract_features(input_folder='./Dataset/tune_abc_files')

Processed: 100 tunes out of 13836
Processed: 200 tunes out of 13836
Processed: 300 tunes out of 13836
Processed: 400 tunes out of 13836
Found empty tune:  1830_Daventry (Continued).abc
Processed: 500 tunes out of 13836
Processed: 600 tunes out of 13836
Found empty tune:  1830_Portugal New (Continued).abc
Processed: 700 tunes out of 13836
Processed: 800 tunes out of 13836
Found empty tune:  1825_Rofs Castel. JBut.102 *missing*.abc
Found empty tune:  1830_Sprowton (Lodge) Continued.abc
Processed: 900 tunes out of 13836
Processed: 1000 tunes out of 13836
Processed: 1100 tunes out of 13836
Processed: 1200 tunes out of 13836
Found empty tune:  1798_Chapter of Kings.(2 voices) TJD.09.abc
Processed: 1300 tunes out of 13836
Processed: 1400 tunes out of 13836
Processed: 1500 tunes out of 13836
Processed: 1600 tunes out of 13836
Processed: 1700 tunes out of 13836
Processed: 1800 tunes out of 13836
Processed: 1900 tunes out of 13836
Processed: 2000 tunes out of 13836
Processed: 2100 tunes out of 

In [8]:
# Display the names of the files that extract_features could not process:
exceptions

['1721_Burlington House. PDF2.215.abc',
 '1721_Happy Couple. PFD2.300.abc',
 '1815_Lady Mary Ramseys Strathspey. MBe.74.abc',
 '1810_Ashleys Hornpipe. RH.448.abc',
 '1750_Tom Jolly. JJo2.091.abc',
 '1762_Parson in the Suds. JBa.43, The.abc',
 '1762_Mopping Nelly,Part of. JBa.72.abc',
 '1748_Sprightly Nancy. JJo4.028.abc',
 '1694_Jenny Come Down To Jocko. HA.040.abc',
 '1770_King & the Miller.  WCl.22, The.abc',
 '1694_Rood House Rant. HA.010.abc',
 '1779_Shutter Hornpipe. JBi.81.abc',
 '1751_Junketting Bout. JJo6.54, The.abc',
 '1815_Heel & Fling. MBe.41.abc',
 '1799_Quick Step. JMP.001.abc',
 '1721_Whitsun-holidays. Or... PFD2.230.abc',
 '1762_Quakers Hornpipe. JBa.36, The.abc',
 '1851_Morpeth Rant. WTG.[036].abc',
 '1694_Gigg. HA.006, A.abc',
 '1825_Isle Of France.JBut.1002, The.abc',
 '1770_Kick ye Buckett. WCl.28.abc',
 '1762_Creef Fair. JBa.84.abc',
 '1748_Cheese and Bread. SenH.071.abc',
 "1827_John Clare's H'pipe, aka. JC.181.abc",
 '1827_Saxe Coburg. JC.205.abc',
 '1812_Royal R

### Save to .pkl:

In [9]:
# Write the features dictionary to a pickle file (binary mode):
with open('./Dataset/file_features_clean.pkl', mode='wb') as pkl_file:
    pickle.dump(features_dict, pkl_file)

### Save to TSV File

1651, 1740, 1744, 1745, 1748, 1750, 1751, 1756, 1758, 1759, 1762, 1765, 1767, 1772, 1774, 1776, 1777, 1778, 1779, 1780, 1781, 1782, 1783, 1793, 1794, 1798, 1799, 1800, 1801, 1803, 1809, 1810, 1825, 1830, 1837, 1888, 1890, 1900, 1904, 1905

In [10]:
# Sanity check: show the type of the extracted features object:
type(features_dict)

dict

In [11]:
# Column names for the header row of the .tsv:
headers = ['file_name', 'pitch_vector', 'rhythm_vector']

with open('files_features_clean.tsv', mode='w', newline='') as tsv_file:
    writer = csv.writer(tsv_file, delimiter='\t')

    # Header row first:
    writer.writerow(headers)

    # Then one row per tune: file name, pitch vector, rhythm vector.
    # (The vectors are lists; csv writes each one as its string representation.)
    writer.writerows(
        [name, vectors[0], vectors[1]]
        for name, vectors in features_dict.items()
    )