## Notebook to create normalised feature vectors (length invariant)



**Normalisation process:**


Pitch:
- Count occurences of each pitch interval

Rhythm:
- Count occurences of each note length interval

In [1]:
import pandas as pd
import numpy as np
import pickle

### Load original file features data:

In [4]:
tunes_data=pd.read_csv('./Dataset/files_features_clean.tsv',sep='\t')

In [3]:
tunes_data

Unnamed: 0,file_name,pitch_vector,rhythm_vector
0,1795_Gretna Green. VWMLa.193.abc,"[-2, -5, 0, 0, -1, 1, -3, 8, 0, 0, -1, 3, -3, ...","[0.25, 0.25, 1.0, 0.5, 0.75, 0.25, 0.5, 1.0, 0..."
1,1827_Bath Waltz. JC.168.abc,"[9, -4, -5, 9, -4, 4, 1, -3, -2, -1, 3, -3, -4...","[0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.0, 1.0, 0.25,..."
2,1860_Saxon Waltz. TLY.030.abc,"[1, -1, -4, 9, -2, -2, -1, -9, 5, 4, -2, -7, 4...","[1.0, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, ..."
3,1756_Better Day Better Deed. WCD3.abc,"[2, -2, -2, -1, -2, 2, 1, -1, -2, -2, -5, 2, 2...","[1.0, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, ..."
4,1833_Green Hills of Tyrol. LW.099.abc,"[5, 2, 2, -4, 4, 1, 2, 2, -5, 5, -2, -2, -3, -...","[0.5, 0.5, 0.5, 1.5, 0.5, 0.5, 0.5, 1.5, 0.5, ..."
...,...,...,...
13611,1837_Untitled.#15. JBs.117.abc,"[0, 5, -5, -3, -4, 0, 2, 2, 1, -1, -2, 0, 0, 0...","[0.75, 0.25, 2.0, 1.5, 0.5, 1.0, 0.5, 0.5, 0.5..."
13612,1834_Irish Washerwoman. BF12.12.abc,"[-1, -4, 0, -5, 5, 0, 4, -4, 4, 3, -2, -1, 1, ...","[1.0, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, ..."
13613,1758_Charles's Jigg THO1.045.abc,"[-7, 0, 0, 9, -2, -7, 0, 0, 9, -2, -2, -1, 1, ...","[0.5, 0.5, 0.5, 1.0, 0.5, 0.5, 0.5, 0.5, 1.0, ..."
13614,"1699_SPRING. TBr.05, THE.abc","[7, -2, 2, -2, -2, 2, -3, 5, -8, -4, 5, 2, 0, ...","[1.0, 2.0, 1.0, 0.5, 0.5, 1.0, 1.0, 1.0, 1.0, ..."


### Create and save Normalised pitch vectors:

In [5]:
normalised_pitch_vectors = {}

count = 0

# Iterate through each tune:
for tune in tunes_data.index:
    intervals = dict.fromkeys([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], 0)
    tune_name  = tunes_data['file_name'][tune]
    tune_pitch_vector = tunes_data['pitch_vector'][tune]

    # Count the occurences of each interval:
    for interval in tune_pitch_vector:
        try:
            intervals[int(interval)] += 1
        except:
            pass
    
    # Save to dict:
    normalised_pitch_vectors[tune_name] = list(intervals.values())
    count += 1
    if count % 100 == 0:
        print(f'Processed: {count} tunes')

# Save to pickle file: 
with open('normalised_pitch_vectors_clean.pkl', 'wb') as handle:
    pickle.dump(normalised_pitch_vectors, handle, protocol=pickle.HIGHEST_PROTOCOL)

Processed: 100 tunes
Processed: 200 tunes
Processed: 300 tunes
Processed: 400 tunes
Processed: 500 tunes
Processed: 600 tunes
Processed: 700 tunes
Processed: 800 tunes
Processed: 900 tunes
Processed: 1000 tunes
Processed: 1100 tunes
Processed: 1200 tunes
Processed: 1300 tunes
Processed: 1400 tunes
Processed: 1500 tunes
Processed: 1600 tunes
Processed: 1700 tunes
Processed: 1800 tunes
Processed: 1900 tunes
Processed: 2000 tunes
Processed: 2100 tunes
Processed: 2200 tunes
Processed: 2300 tunes
Processed: 2400 tunes
Processed: 2500 tunes
Processed: 2600 tunes
Processed: 2700 tunes
Processed: 2800 tunes
Processed: 2900 tunes
Processed: 3000 tunes
Processed: 3100 tunes
Processed: 3200 tunes
Processed: 3300 tunes
Processed: 3400 tunes
Processed: 3500 tunes
Processed: 3600 tunes
Processed: 3700 tunes
Processed: 3800 tunes
Processed: 3900 tunes
Processed: 4000 tunes
Processed: 4100 tunes
Processed: 4200 tunes
Processed: 4300 tunes
Processed: 4400 tunes
Processed: 4500 tunes
Processed: 4600 tun

In [6]:
normalised_pitch_vectors

{'1795_Gretna Green. VWMLa.193.abc': [12,
  14,
  13,
  13,
  5,
  4,
  0,
  2,
  2,
  0,
  0,
  0,
  0],
 '1827_Bath Waltz. JC.168.abc': [4, 9, 10, 10, 8, 4, 0, 2, 0, 4, 0, 0, 0],
 '1860_Saxon Waltz. TLY.030.abc': [1, 9, 12, 2, 17, 7, 0, 6, 0, 12, 0, 0, 0],
 '1756_Better Day Better Deed. WCD3.abc': [1,
  22,
  49,
  4,
  1,
  8,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 '1833_Green Hills of Tyrol. LW.099.abc': [8,
  10,
  33,
  10,
  12,
  16,
  0,
  7,
  0,
  4,
  0,
  0,
  0],
 "1875_My Love She's But a Lassie Yet WES.073.abc": [6,
  28,
  26,
  9,
  7,
  1,
  1,
  1,
  1,
  1,
  0,
  0,
  0],
 '1825_Come Haste to the Wedding. JBut.288.abc': [14,
  13,
  30,
  25,
  9,
  2,
  1,
  0,
  2,
  0,
  0,
  0,
  0],
 '1765_Come if you can. THO2.003.abc': [2,
  6,
  3,
  13,
  15,
  6,
  2,
  3,
  0,
  0,
  0,
  0,
  0],
 '1825_Lovely Maley JBut.471.abc': [4, 24, 36, 11, 8, 7, 0, 2, 0, 0, 0, 0, 0],
 '1756_Corelis Gavot. WCD3.abc': [0, 10, 18, 8, 1, 6, 0, 4, 0, 0, 0, 0, 0],
 '1810_To all good Las

### Create and save normalised rhythm vectors:

In [7]:
normalised_rhythm_vectors = {}

count = 0

# Iterate through erach tune:
for tune in tunes_data.index:
    note_lengths = dict.fromkeys([0.0, 0.25, 0.5, 0.75, 1.0, 1.25, 1.5, 1.75, 2.0, 2.25, 2.5], 0)
    tune_name  = tunes_data['file_name'][tune]
    tune_rhythm_vector = tunes_data['rhythm_vector'][tune]
    tune_rhythm_vector = tune_rhythm_vector.split(",")

    # Count the occurences of each interval:
    for note_length in tune_rhythm_vector:
        try:
            note_lengths[float(note_length)] += 1
        except:
            pass
    
    # Save to dict:
    normalised_rhythm_vectors[tune_name] = list(note_lengths.values())
    count += 1
    if count % 100 == 0:
        print(f'Processed: {count} tunes')

    # if count > 0:
    #     break

# Save to pickle file:
with open('./Dataset/normalised_rhythm_vectors_clean.pkl', 'wb') as handle:
    pickle.dump(normalised_rhythm_vectors, handle, protocol=pickle.HIGHEST_PROTOCOL)

Processed: 100 tunes
Processed: 200 tunes
Processed: 300 tunes
Processed: 400 tunes
Processed: 500 tunes
Processed: 600 tunes
Processed: 700 tunes
Processed: 800 tunes
Processed: 900 tunes
Processed: 1000 tunes
Processed: 1100 tunes
Processed: 1200 tunes
Processed: 1300 tunes
Processed: 1400 tunes
Processed: 1500 tunes
Processed: 1600 tunes
Processed: 1700 tunes
Processed: 1800 tunes
Processed: 1900 tunes
Processed: 2000 tunes
Processed: 2100 tunes
Processed: 2200 tunes
Processed: 2300 tunes
Processed: 2400 tunes
Processed: 2500 tunes
Processed: 2600 tunes
Processed: 2700 tunes
Processed: 2800 tunes
Processed: 2900 tunes
Processed: 3000 tunes
Processed: 3100 tunes
Processed: 3200 tunes
Processed: 3300 tunes
Processed: 3400 tunes
Processed: 3500 tunes
Processed: 3600 tunes
Processed: 3700 tunes
Processed: 3800 tunes
Processed: 3900 tunes
Processed: 4000 tunes
Processed: 4100 tunes
Processed: 4200 tunes
Processed: 4300 tunes
Processed: 4400 tunes
Processed: 4500 tunes
Processed: 4600 tun

In [8]:
normalised_rhythm_vectors

{'1795_Gretna Green. VWMLa.193.abc': [0, 8, 41, 3, 8, 0, 2, 0, 0, 0, 0],
 '1827_Bath Waltz. JC.168.abc': [2, 20, 23, 0, 4, 0, 1, 0, 0, 0, 0],
 '1860_Saxon Waltz. TLY.030.abc': [0, 0, 53, 0, 0, 0, 11, 0, 1, 0, 0],
 '1756_Better Day Better Deed. WCD3.abc': [0, 0, 74, 0, 4, 0, 3, 0, 0, 0, 0],
 '1833_Green Hills of Tyrol. LW.099.abc': [0, 0, 79, 0, 7, 0, 9, 0, 2, 0, 0],
 "1875_My Love She's But a Lassie Yet WES.073.abc": [0,
  22,
  39,
  8,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 '1825_Come Haste to the Wedding. JBut.288.abc': [2,
  8,
  77,
  0,
  2,
  0,
  3,
  0,
  0,
  0,
  0],
 '1765_Come if you can. THO2.003.abc': [0, 42, 5, 0, 0, 0, 0, 0, 0, 0, 0],
 '1825_Lovely Maley JBut.471.abc': [0, 0, 83, 0, 5, 0, 0, 0, 0, 0, 0],
 '1756_Corelis Gavot. WCD3.abc': [0, 14, 0, 14, 16, 0, 0, 0, 0, 0, 0],
 '1810_To all good Lasses. TS.002.abc': [0, 10, 73, 6, 5, 0, 0, 0, 4, 0, 0],
 '1779_Swab the Decks. JBi.55.abc': [0, 6, 52, 0, 3, 0, 0, 0, 0, 0, 0],
 '1731_Downfall of a Ginn a Hornpipe.abc': [0,
  0,