# Day 5: Reely Not That Different

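Using [Levenshtein distance](https://en.wikipedia.org/wiki/Levenshtein_distance) to find near-identical openings among [ABC Tunes](http://www.norbeck.nu/abc/) (traditional Irish music): the first four measures of every tune are compared pairwise, and the closest pairs are listed.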

In [1]:
import Levenshtein
import glob
import itertools

In [2]:
# glob returns matches in arbitrary, filesystem-dependent order; sort them so
# the tune indices and the ordering of ties downstream are reproducible.
filelist = sorted(glob.glob('tunes/hnr*.abc'))

In [3]:
# Read every matched file and glue the contents together, in filelist order.
pieces = []

for filename in filelist:
    with open(filename, 'r') as fp:
        pieces.append(fp.read())

text = ''.join(pieces)

In [4]:
# Cut the combined text at every 'X:' marker. Whatever precedes the first
# marker is dropped, so each element is the text that follows one 'X:'.
records = text.split('X:')
tunes = records[1:]

In [5]:
# A tune is keyed by its first MEASURES_PER_KEY measures, and its opening
# measure must hold at least MIN_MEASURE_CHARS characters after cleaning.
MEASURES_PER_KEY = 4
MIN_MEASURE_CHARS = 4

lines = []    # cleaned opening phrases, one string per kept tune
master = []   # one dict per kept tune: 'id_', 'name', 'first_line'

for index, tune in enumerate(tunes):
    key_parts = tune.split('K:')
    title_parts = tune.split('T:')
    if len(key_parts) < 2 or len(title_parts) < 2:
        # No 'K:' or 'T:' field: skip the record instead of raising IndexError.
        continue

    # splitlines() copes with both CRLF and LF line endings. Element 0 is the
    # remainder of the 'K:' line itself, so the lines after it start at index 1.
    body = key_parts[1].splitlines()[1:]
    if not body:
        # Nothing follows the 'K:' line, so there is no first line to compare.
        continue

    title_lines = title_parts[1].splitlines()
    name = title_lines[0] if title_lines else ''

    measures = []
    for measure in body[0].split('|'):
        # Order matters: '1 ' has to go before the remaining spaces are removed.
        # NOTE(review): ':' and '1 ' look like repeat / first-ending markers - confirm.
        cleaned = measure.replace(':', '').replace('1 ', '').replace(' ', '').strip()
        measures.append(cleaned)

    if len(measures) < MEASURES_PER_KEY:
        continue

    if len(measures[0]) < MIN_MEASURE_CHARS:
        continue

    first_line = " ".join(measures[:MEASURES_PER_KEY])

    lines.append(first_line)
    master.append({
        'id_' : index,
        'name' : name,
        'first_line' : first_line
    })

In [6]:
# Sanity check: the first cleaned phrase next to the record it came from.
(lines[0], master[0])

('BG~G2BGcG BG~G2Bdgd BG~G2BdcB ADFGABcA',
 {'first_line': 'BG~G2BGcG BG~G2Bdgd BG~G2BdcB ADFGABcA',
  'id_': 0,
  'name': 'Flogging Reel, The'})

In [7]:
# Each entry of `lines` is already a single string, so a shallow copy of the
# list gives the same corpus that joining every entry with '' would.
corpus = list(lines)

In [8]:
# itertools.combinations returns a one-shot iterator. Materialise it so the
# cell that consumes `all_pairs` can be re-run on its own without silently
# producing an empty result.
all_pairs = list(itertools.combinations(corpus, 2))

In [9]:
# One row per pair: [first phrase, second phrase, Levenshtein distance].
data = [
    [left, right, Levenshtein.distance(left, right)]
    for left, right in all_pairs
]

In [10]:
# Order the rows in place by ascending distance (the third column).
data.sort(key=lambda row: row[2])

In [11]:
# Keep the first 25 rows of `data` (ordered by distance in the previous cell).
TOP_N = 25
top_25 = data[:TOP_N]

In [12]:
# Display the selected rows; each one is [phrase, phrase, distance].
top_25

[['cE~E2cded cE~E2G2AB cE~E2cded cABG~A3B',
  'cE~E2cded cE~E2G2AB cE~E2cded cABG~A3B',
  0],
 ['FEDEFA~A2 BAdABAdA FEDEFA~A2 BAdAFE~E2',
  'FEDEFA~A2 BAdABAdA FEDEFA~A2 BAdAFE~E2',
  0],
 ['~b3a~g3e dB~B2dega ~b3a~g3e dBGABA~A2',
  '~b3a~g3e dB~B2dega ~b3a~g3e dBGABA~A2',
  0],
 ['AG~G2AGFD AG~G2Addc AG~G2AGFD FEFGABcA',
  'AG~G2AGFD AG~G2Addc AG~G2AGFD =FEFGABcA',
  1],
 ['~B3G~A3G FDADBDAF DGGFG2ge fddcABcA',
  '~B3GABAG FDADBDAF DGGFG2ge fddcABcA',
  3],
 ['ea~a2eg~g2 ea~a2ABcd ea~a2efge afged2cd',
  'ea~a2efgf ea~a2ABcd ea~a2efge afged2cd',
  3],
 ['G2BdcAFA GABdg2fg ec~c2dB~B2 cAABAFDF',
  '~G3BcABA GABdg2fg ec~c2dB~B2 cAABAFDF',
  4],
 ['~E3FABcA ~A2cABFAF ~E3FABcA BF~F2BFAF',
  '~E3FABcd eAcABAFA ~E3FABcA BF~F2BFAF',
  5],
 ['EAABe2dB e2dBGABG EAAB~e3f gedBBAAG',
  'EAABe2dB eBdBGAAG EAABedef gedBBAAG',
  5],
 ['DFEFD2AB c2cGEFGE DFEFD2AB cAGEED~D2',
  'DFEFD2dB cAAGEFGE DFEFD2dB cAGEEDDE',
  6],
 ['e2ABcdec d2BGdG(3Bcd eAABcdef gedgeAAd',
  'e2ABcdec dGBGdG(3Bcd e2ABcdea gedBB

In [14]:
def get_tune_name(tune):
    """Return the name of the tune whose cleaned first line equals ``tune``.

    The module-level ``master`` list is scanned in order and the ``'name'`` of
    the first entry whose ``'first_line'`` matches is returned. When several
    entries share the same first line, every lookup therefore resolves to the
    earliest one.

    Parameters:
        tune: the first-line string to look up.

    Returns:
        The ``'name'`` value of the first matching entry.

    Raises:
        StopIteration: if no entry in ``master`` has that first line.
    """
    matches = (item['name'] for item in master if item['first_line'] == tune)
    # The next() builtin works on Python 2.6+ and Python 3; the generator's
    # .next() method only exists on Python 2.
    return next(matches)

In [15]:
# Report each selected pair as "name ~ name with L(x, y) = distance".
for first, second, distance in top_25:
    # print(...) with one parenthesised argument behaves identically on
    # Python 2 and Python 3; the bare print statement is Python 2 only.
    print("{} ~ {} with L(x, y) = {}".format(get_tune_name(first), get_tune_name(second), distance))

Scotch Mary ~ Scotch Mary with L(x, y) = 0
Graf Spee, The ~ Graf Spee, The with L(x, y) = 0
Primrose Lass, The ~ Primrose Lass, The with L(x, y) = 0
Steampacket, The ~ Steampacket, The with L(x, y) = 1
Upstairs in a Tent ~ Peg McGrath's with L(x, y) = 3
Mick Hoy's ~ Fair Haired Lass, The with L(x, y) = 3
Patricia Wilmot's Reel ~ Patricia Wilmot's Reel with L(x, y) = 4
McFadden's Handsome Daughter ~ Kit O'Connor with L(x, y) = 5
Sweetheart Reel, The ~ Temple Hill with L(x, y) = 5
Scartaglen Reel, The ~ Scartaglen Reel, The with L(x, y) = 6
Eel in the Sink, The ~ Blackthorn, The with L(x, y) = 6
Solus Lillis' Reel ~ Mama's Pet with L(x, y) = 7
Old Maids of Galway, The ~ Paddy Gone to France with L(x, y) = 7
Finbar Dwyer's ~ Finbar Dwyer's with L(x, y) = 7
Union Reel, The ~ Moher Reel with L(x, y) = 7
Drag Her round the Road ~ Drag Her round the Road with L(x, y) = 8
Coalminer's Reel, The ~ Denis Murphy's with L(x, y) = 8
Dublin Reel, The ~ Dublin Reel, The with L(x, y) = 8
Old Gorman's R