# Chord Cleaning

### Imports

In [1]:
import pandas as pd
import numpy as np
import re
import ast
from collections import Counter
import matplotlib.pyplot as plt
import os
import json

### Getting workable DF

In [2]:
# Location of the raw chords CSV, read from the DATA_PATH environment variable
# (None when the variable is not set).
data_path = os.environ.get('DATA_PATH')

In [3]:
# Load the raw dataset, then keep only the columns this notebook works with.
raw_df = pd.read_csv(data_path)
kept_columns = ['artist_name', 'song_name', 'chords', 'genres', 'popularity']
slim_raw_df = raw_df[kept_columns]
slim_raw_df.head(3)

Unnamed: 0,artist_name,song_name,chords,genres,popularity
0,Justin Bieber,"10,000 Hours",{3: 'G G/B ...,"['canadian pop', 'pop', 'post-teen pop']",100
1,Justin Bieber,2 Much,"{1: 'Intro: F#m7 D2 ', 4: 'F#m7 ', 8: 'D2 ', ...","['canadian pop', 'pop', 'post-teen pop']",100
2,Justin Bieber,2u (feat. David Guetta),{1: 'Em D C C...,"['canadian pop', 'pop', 'post-teen pop']",100


## Cleaning

### Clean DF

In [4]:
chords_ser_raw = slim_raw_df.chords.copy()


def _flatten_song_chords(raw_song):
    """Turn one raw ``chords`` cell into a single space-separated string.

    The cell holds the string form of a dict (its keys look like measure
    numbers -- TODO confirm). The dict values are joined with spaces,
    parentheses are removed and runs of whitespace are collapsed to one space.
    """
    chords_by_key = ast.literal_eval(raw_song)
    joined = ' '.join(chords_by_key.values())
    without_parens = re.sub(r'[()]', '', joined)
    return re.sub(r'\s+', ' ', without_parens)


# One flattened chord string per song, indexed 0..n-1.
chords_ser_lists = pd.Series(
    [_flatten_song_chords(raw_song) for raw_song in chords_ser_raw],
    index=range(len(chords_ser_raw)),
    dtype='object',
)

chords_ser_lists.head()

0    G G/B C G G G/B C G G Em C G G Em C G G Em C G...
1    Intro: F#m7 D2 F#m7 D2 F#m7 D2 E F#m7 A/C# E D...
2    Em D C C D Em Em D C C D Em Em D C Am D Em G C...
3     Intro: Em Bm Am C 2x Em Bm Am C Em Bm Am C Bm...
4    Intro: Gm - Dm - C - C x2 Gm Dm C C Gm Dm C C ...
dtype: object

In [5]:
# Count every space-separated token across all songs to see how many distinct
# "chord" spellings the flattened data contains.
cleaned_chords_counter = Counter()
for song in chords_ser_lists:
    try:
        tokens = song.split(' ')
    except AttributeError:
        # Entry without a .split method (i.e. not a string): show it and stop.
        # Only AttributeError is caught so that Ctrl-C and real bugs still surface.
        print(song)
        break
    cleaned_chords_counter.update(tokens)
# Splitting on a single space produces '' for leading/trailing/doubled spaces.
del cleaned_chords_counter['']

cleaned_chord_count_df = pd.Series(dict(cleaned_chords_counter)).to_frame('chord_count')
cleaned_cc_df = cleaned_chord_count_df.sort_values(by='chord_count', ascending=False).reset_index(names=['chord'])
print('the total amount of unique "chords" is: ', len(cleaned_cc_df))

the total amount of unique "chords" is:  87096


### String Cleaning (disabled — takes ages to run)

In [6]:
# raw_chords_ser = chords_ser_lists.copy()
# cleaned_chords_ser = pd.Series(index=range(len(raw_chords_ser)), dtype='object')

In [7]:
# # bar charts
# re1 = r'\S*[|]\S+[|]'
# # repeat symbols 
# re2 = r'%|2x|x[0-9]|x\s+[0-9]|\s-\s|\s/\s'
# # non-useful punctuation
# re3 = r'[|,.*?]|\\|~'
# # more than 3 numbers in a row
# re4 = r'[0-9]{4,}'
# # words starting with letters after G
# re5 = r'\s+[H-Zh-z]\w*' 
# # special words
# words = ['intro', 'interlude', 'bridge', 'INTRODUÇÃO', 'instrumental', 
#          'solo', 'chorus', 'riff', 'break', 'guitar', 'verse', 'dução', 
#          'final', 'once', '-once', 'hammer', 'outro', '-stop', 'repeat']
# re6 = r"|".join(words)

# full_re = re.compile("|".join([re1, re2, re3, re4, re5, re6]), re.IGNORECASE)

In [8]:
# for i, song in enumerate(raw_chords_ser):
    
#     cleaned_song = ''
#     j = 0
#     for line in song:

#         # remove unhelpful symbols (as shown above)
#         subs = re.sub(full_re, ' ', line, )
        
#         # remove parentheses, brackets, and colons
#         cleaned_string = re.sub(r'[(){}:]', '', subs)

#         # remove multiple spaces
#         mult_sp_rem_ch = re.sub(r'\s+', ' ', cleaned_string)
        
#         # remove front and trailing white spaces
#         beg_end_sp_rem_song = re.sub(r'^\s+|\s+$', '', mult_sp_rem_ch)

#         if j < len(song)-1:
#             cleaned_song += beg_end_sp_rem_song + ' '
#             j += 1
#         else:
#             cleaned_song += beg_end_sp_rem_song
            
#     cleaned_chords_ser.iloc[i] = cleaned_song

## Creating List of Correctly Formatted Chords

In [9]:
# Root notes used as the columns of the chord tables. Both the sharp and the
# flat spelling of each accidental are listed (e.g. 'C#' and 'Db').
major_chords = [
    'C', 'C#', 'Db',
    'D', 'D#', 'Eb',
    'E',
    'F', 'F#', 'Gb',
    'G', 'G#', 'Ab',
    'A', 'A#', 'Bb',
    'B',
]

In [10]:
# Row labels are chord-quality suffixes ('' stands for the plain major chord);
# columns are the root notes in ``major_chords``. The cells start out empty.
chord_quality_suffixes = [
    'dim', 'm', '', 'aug', '5',
    'dim7', 'm7b5', 'm7', 'mM7', '7', 'M7',
    '7#5', '7b5',
    'm6', '6', '67', '69',
    'm9', '9', '7b9', '7#9', 'M9',
    'm11', '11', '13', 'm13',
    'sus2', 'sus4', 'sus47', 'susb9', 'sus4b9',
    # 'add2', 'add4', 'add6', 'add9',
    # 'madd2', 'madd4', 'madd6', 'madd9',
    # '7add4', '7addb6', '7add6'
]
chords_df = pd.DataFrame(index=chord_quality_suffixes, columns=major_chords)


In [11]:
# Fill every cell with root + quality suffix, e.g. row 'm7', column 'C#' -> 'C#m7'.
for quality in chords_df.index:
    # Once the rename below has run, the plain-major row is labelled 'major'.
    # Map that label back to the empty suffix so re-running this cell does not
    # overwrite the row with 'Cmajor', 'C#major', ...
    suffix = '' if quality == 'major' else quality
    chords_df.loc[quality, :] = [root + suffix for root in major_chords]
chords_df.rename(index={'': 'major'}, inplace=True)
# chords_df.index

In [12]:
# Two consecutive octaves of the chromatic scale (sharp and flat spellings), so
# that "root position + interval" (at most 11 semitones) stays inside the list.
sharp_notes = ['C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B'] * 2
flats_notes = ['C', 'Db', 'D', 'Eb', 'E', 'F', 'Gb', 'G', 'Ab', 'A', 'Bb', 'B'] * 2

# Roots whose slash-chord bass notes are spelled with sharps; get_slash_notes
# uses the flat spelling for every other root that has no '#'.
# NOTE(review): 'Fm' is listed as a sharp key -- possibly meant 'F#m'; confirm.
sharp_keys = ['D', 'E', 'G', 'A', 'B', 'Dm', 'Em', 'Fm', 'Bm']
flats_keys = ['C', 'F', 'Cm', 'Gm', 'Am']

# Slash-chord row labels of the form '<quality>/<bass degree>'.
slash_indexes = [
    'dim/b7',
    'm/2', 'm/b3', 'm/3', 'm/4', 'm/5', 'm/6', 'm/b7', 'm/7',
    'm7/b3', 'm7/4', 'm7/b5', 'm7/5', 'm7/b7', 'm7/7',
    '/2', '/b3', '/3', '/4', '/5', '/6', '/b7', '/7',
    '7/b2', '7/2', '7/b3', '7/3', '7/4', '7/5', '7/b6', '7/6', '7/7',
    '9/b3', '9/3', '9/4', '9/5', '9/6', '9/7',
    'M7/2', 'M7/b3', 'M7/3', 'M7/4', 'M7/5', 'M7/6', 'M7/b7',
]

# Bass degree label -> number of semitones above the root.
whole_to_half = {
    'b2': 1, '2': 2,
    'b3': 3, '3': 4,
    '4': 5,
    'b5': 6, '5': 7,
    'b6': 8, '6': 9,
    'b7': 10, '7': 11,
}

In [13]:
# big function
def get_slash_notes(slash_indexes, columns):
    """Build the table of slash chords for every root in ``columns``.

    For each root and each label in ``slash_indexes`` (of the form
    '<quality>/<degree>') the bass note is looked up ``whole_to_half[degree]``
    semitones above the root, and the cell is set to
    '<root><quality>/<bass note>' (e.g. root 'C', label 'm/5' -> 'Cm/G').

    Roots listed in ``sharp_keys`` or containing '#' use ``sharp_notes`` for
    the lookup; every other root uses ``flats_notes``.

    Parameters
    ----------
    slash_indexes : sequence of str
        Row labels; each must contain a '/' followed by a key of
        ``whole_to_half``.
    columns : iterable of str
        Root notes; each must appear in the note list selected for it.

    Returns
    -------
    pandas.DataFrame
        Frame indexed by ``slash_indexes`` with one column per root.

    Raises
    ------
    ValueError
        If a root is not present in the selected note list.
    KeyError
        If a degree is not a key of ``whole_to_half``.
    """
    slash_chord = pd.DataFrame(index=slash_indexes, columns=columns)

    for root in columns:
        # Pick the spelling (sharps or flats) used for this root's bass notes.
        if root in sharp_keys or '#' in root:
            notes = sharp_notes
        else:
            notes = flats_notes

        # The root position is found once per root; the table row for each
        # label only depends on it and on the label itself.
        root_pos = notes.index(root)
        for label in slash_indexes:
            parts = label.split('/')
            quality, degree = parts[0], parts[1]
            bass_note = notes[root_pos + whole_to_half[degree]]
            slash_chord.loc[label, root] = f'{root}{quality}/{bass_note}'

    return slash_chord

In [14]:
# Stack the slash-chord rows underneath the basic chord-quality rows.
slash_rows = get_slash_notes(slash_indexes, chords_df.columns)
slash_chords_df = pd.concat([chords_df, slash_rows])
slash_chords_df.index

Index(['dim', 'm', 'major', 'aug', '5', 'dim7', 'm7b5', 'm7', 'mM7', '7', 'M7',
       '7#5', '7b5', 'm6', '6', '67', '69', 'm9', '9', '7b9', '7#9', 'M9',
       'm11', '11', '13', 'm13', 'sus2', 'sus4', 'sus47', 'susb9', 'sus4b9',
       'dim/b7', 'm/2', 'm/b3', 'm/3', 'm/4', 'm/5', 'm/6', 'm/b7', 'm/7',
       'm7/b3', 'm7/4', 'm7/b5', 'm7/5', 'm7/b7', 'm7/7', '/2', '/b3', '/3',
       '/4', '/5', '/6', '/b7', '/7', '7/b2', '7/2', '7/b3', '7/3', '7/4',
       '7/5', '7/b6', '7/6', '7/7', '9/b3', '9/3', '9/4', '9/5', '9/6', '9/7',
       'M7/2', 'M7/b3', 'M7/3', 'M7/4', 'M7/5', 'M7/6', 'M7/b7'],
      dtype='object')

## Normalize chords into the correct format

In [15]:
# Flatten the chord table row by row into one list holding every accepted
# chord spelling (same order as before: all of row 0, then row 1, ...).
yes_chords = [chord for row in slash_chords_df.values for chord in row]

In [16]:
# Alternative spelling -> canonical spelling.
# ORDER MATTERS: merge_chords applies only the FIRST key (in insertion order)
# that occurs in a token, so a key must come before any shorter key that is a
# substring of it ('maj7' before 'maj', '/5+' before '5+'); otherwise the
# longer key can never be reached.
substitutions = {'º': 'dim',
                 '*': 'dim',
                 '°': 'dim',
                 'hdim7': 'm7b5',
                 'm75-': 'm7b5',
                 'm5-/7': 'm7b5',
                 'maj7': 'M7',
                 'maj': '',
                 ',': '',
                 '/5+': 'aug',
                 '5+': 'aug',
                 '7M': 'M7',
                 'm7+': 'mM7',
                 '7+': '7#5',
                 '9add6': '69',
                 '/9-': '7b9',
                 '79': '9',
                 '711': '11',
                 '7b13': 'm13',
                 '713': '13',
                 '4/7/9': '11',
                 '59': '9',
                 '/9': '9',
                 '/13-': 'm6',
                 '/13': '6',
                 'add6': '13',
                 '2': '9',
                 '4': '11',
                 '6': '13',
                 'sus': 'sus4',
                 'add9': '9',
                 'm7/5-': 'm7b5',
                 }

In [17]:
# Extra slash-chord spellings that merge_chords accepts unchanged, in addition
# to the generated ``yes_chords`` list.
enharmonics = [
    'Gm/Bb', 'F#/Bb', 'B/Eb',
    'Am/F', 'G/Bb', 'F#/D',
    'B/C', 'E/Ab', 'B/G',
]

In [18]:
def merge_chords(chords_column):
    """Normalise chord spellings song by song and collect the rejected tokens.

    Every song string is split on whitespace and each token is handled as
    follows, in this order:

    1. tokens containing 'x' are rejected;
    2. tokens present in ``yes_chords`` or ``enharmonics`` are kept unchanged;
    3. otherwise the first key of ``substitutions`` (in dict order) that occurs
       in the token is replaced by its value and the result is kept;
    4. any other token is rejected.

    Within each song, consecutive repetitions of the same chord are collapsed
    into one.

    Parameters
    ----------
    chords_column : iterable of str
        One whitespace-separated chord string per song.

    Returns
    -------
    merged_chords : list of list of str
        The kept chords for each song, in input order.
    rejects_df : pandas.DataFrame
        One column 'rejects' holding how often each rejected token occurred,
        indexed by token and sorted in descending order.
    """
    # Set membership instead of scanning the yes_chords / enharmonics lists
    # for every single token.
    accepted = set(yes_chords)
    accepted.update(enharmonics)

    rejected = []

    def check_subs(chord):
        """Return the normalised chord, or None (and record it) if rejected."""
        if 'x' in chord:
            rejected.append(chord)
            return None
        if chord in accepted:
            return chord
        for key, replacement in substitutions.items():
            if key in chord:
                # Only the first matching key is applied.
                # NOTE: the rewritten token is returned as-is; it is not
                # checked against yes_chords again.
                return chord.replace(key, replacement)
        rejected.append(chord)
        return None

    merged_chords = []
    for song in chords_column:
        kept = [ch for ch in map(check_subs, song.split()) if ch is not None]
        # Drop a chord when it merely repeats the one directly before it.
        deduped = [ch for pos, ch in enumerate(kept) if pos == 0 or ch != kept[pos - 1]]
        merged_chords.append(deduped)

    rejects_df = (
        pd.Series(dict(Counter(rejected)))
        .to_frame('rejects')
        .sort_values(by='rejects', ascending=False)
    )

    return merged_chords, rejects_df

In [19]:
# Normalise every song; keep the per-song chord lists and the rejected-token counts.
merged, rejects = merge_chords(chords_column=chords_ser_lists)

In [20]:
# Cleaned chord list of the first song (spot check of the merge result).
merged[0]

['G',
 'G/B',
 'C',
 'G',
 'G/B',
 'C',
 'G',
 'Em',
 'C',
 'G',
 'Em',
 'C',
 'G',
 'Em',
 'C',
 'G',
 'Em',
 'C',
 'G',
 'Em',
 'C',
 'G',
 'Em',
 'C',
 'G',
 'Em',
 'C',
 'G',
 'G/B',
 'C',
 'G',
 'G/B',
 'C',
 'G',
 'Em',
 'C',
 'G',
 'Em',
 'C',
 'G',
 'Em',
 'C',
 'G',
 'Em',
 'C',
 'G',
 'Em',
 'C',
 'G',
 'Em',
 'C',
 'G',
 'Em',
 'C',
 'G',
 'Em',
 'C',
 'G',
 'Bm',
 'C',
 'Bm',
 'C',
 'G',
 'Em',
 'C',
 'G',
 'Em',
 'C',
 'G',
 'Em',
 'C',
 'G',
 'Em',
 'C',
 'G',
 'Em',
 'C',
 'G',
 'Em',
 'C',
 'G',
 'Em',
 'C',
 'G',
 'Em',
 'C',
 'G']

In [21]:
# Most frequently rejected tokens.
rejects.head(20)

Unnamed: 0,rejects
Intro:,41954
|,38288
-,34551
Intro,16296
Bridge,10232
2x,9154
x2,9037
/,8069
Bridge:,8020
Solo,4767


In [22]:
# final_chords_counter = Counter()
# for song in merged:
#     final_chords_counter.update(song)
# del final_chords_counter['']

# final_chord_count_df = pd.Series(dict(final_chords_counter)).to_frame('chord_count')
# final_cc_df = final_chord_count_df.sort_values(by='chord_count', ascending=False).reset_index(names=['chord'])
# print('the total amount of unique "chords" is: ', len(final_cc_df))

In [23]:
# Same columns as slim_raw_df, with the raw chord strings replaced by the
# cleaned per-song chord lists (matched to rows by index label).
final_df = slim_raw_df.assign(chords=pd.Series(merged))
final_df.head()

Unnamed: 0,artist_name,song_name,chords,genres,popularity
0,Justin Bieber,"10,000 Hours","[G, G/B, C, G, G/B, C, G, Em, C, G, Em, C, G, ...","['canadian pop', 'pop', 'post-teen pop']",100
1,Justin Bieber,2 Much,"[F#m7, D9, F#m7, D9, F#m7, D9, E, F#m7, A/C#, ...","['canadian pop', 'pop', 'post-teen pop']",100
2,Justin Bieber,2u (feat. David Guetta),"[Em, D, C, D, Em, D, C, D, Em, D, C, Am, D, Em...","['canadian pop', 'pop', 'post-teen pop']",100
3,Justin Bieber,All Around The World,"[Em, Bm, Am, C, Em, Bm, Am, C, Em, Bm, Am, C, ...","['canadian pop', 'pop', 'post-teen pop']",100
4,Justin Bieber,All Around The World (acoustic),"[Gm, Dm, C, Gm, Dm, C, Gm, Dm, C, Gm, A#, Cdim...","['canadian pop', 'pop', 'post-teen pop']",100


In [24]:
# load df to json
# Serialise in pandas' "table" layout, then re-parse the string so it can be
# written back out with indentation.
result = final_df.to_json(orient='table')
parsed = json.loads(result)
filepath = 'data/cleaned_data.json'

# write to json
# open() does not create missing folders, so make sure the target one exists.
output_dir = os.path.dirname(filepath)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)
with open(filepath, "w", encoding='utf-8') as jsonFile:
    json.dump(parsed, jsonFile, indent=4)
# Report success only once the file has been closed, i.e. fully written.
print(f'new data added sucessfully to {filepath}')

# read from json
new_json_df = pd.read_json(filepath, orient='table')
new_json_df.head()

new data added sucessfully to data/cleaned_data.json


Unnamed: 0,artist_name,song_name,chords,genres,popularity
0,Justin Bieber,"10,000 Hours","[G, G/B, C, G, G/B, C, G, Em, C, G, Em, C, G, ...","['canadian pop', 'pop', 'post-teen pop']",100
1,Justin Bieber,2 Much,"[F#m7, D9, F#m7, D9, F#m7, D9, E, F#m7, A/C#, ...","['canadian pop', 'pop', 'post-teen pop']",100
2,Justin Bieber,2u (feat. David Guetta),"[Em, D, C, D, Em, D, C, D, Em, D, C, Am, D, Em...","['canadian pop', 'pop', 'post-teen pop']",100
3,Justin Bieber,All Around The World,"[Em, Bm, Am, C, Em, Bm, Am, C, Em, Bm, Am, C, ...","['canadian pop', 'pop', 'post-teen pop']",100
4,Justin Bieber,All Around The World (acoustic),"[Gm, Dm, C, Gm, Dm, C, Gm, Dm, C, Gm, A#, Cdim...","['canadian pop', 'pop', 'post-teen pop']",100


# Unused Code

In [25]:
# first_raw = ast.literal_eval(slim_raw_df.loc[0, 'chords'])
# first_raw

In [26]:
# rejects_counter = Counter()
# for r in rejects:
#     for value in song.values():
#         l = value.split(' ')
#         chords_counter.update(l)

# rejects_df = pd.Series(dict(rejects_counter)).to_frame('rejects')
# sorted_rej_df = rejects_df.sort_values(by='rejects', ascending=False)
# sorted_rej_df

In [27]:
# # big function
# def translate_slashes(slash_indexes, columns):
#     slash_chord = pd.DataFrame(index=slash_indexes, columns=columns)
#     i = 0 
    
#     def find_slash(chord, notes, i):
#         for si in slash_indexes:
#             s = si.split('/')
#             h = whole_to_half[s[1]]
#             sc = f'{chord}{s[0]}/{notes[i+h]}'
#             slash_chord.loc[si, chord] = sc
    
#     def sort_f_s(chord, notes):
#         for n in notes:
#             if n in chord :
#                 i = notes.index(chord[0])
#                 find_slash(chord, notes, i)
#             elif len(chord) > 1:
#                 if '#' == chord[1]:
#                     i = sharp_notes.index(chord[:2])
#                     find_slash(chord, sharp_notes, i)
#                 elif 'b' == chord[1]:
#                     i = flats_notes.index(chord[:2])
#                     find_slash(chord, flats_notes, i)
    
#     for chord in columns:
#         sort_f_s(chord, sharp_notes)
#         sort_f_s(chord, flats_notes)
                
            
#     return slash_chord

In [28]:
# slash_chords_df = pd.concat([chords_df, get_slash_notes(slash_indexes, chords_df.columns)])


In [29]:
# r'(?<!\s)/(?=[0-9])'

In [30]:
# pd.set_option('display.max_rows', 50)
# cleaned_cc_df[:50]

In [31]:
# for idx, c in enumerate(cleaned_chords_ser):
#     if ' x ' in c:
#         print(idx, c)

In [32]:
# for m in raw_chords_ser[101]:
#     print(m)

In [33]:
# letters = list(string.ascii_uppercase)[:7]
# non_chord_filter = [chord for chord in sorted_cc_df.index if chord[0] in letters]
# cc_df = sorted_cc_df[sorted_cc_df.index.isin(non_chord_filter)]
# cc_df.iloc[100:200]

### Test

In [34]:
# # editing repeats
# test_chords = 'Intro: Gm - Dm - C - C   x2 Gm7   F % Csus4/D'
# chords_wo_reps = ''

# # find repeat symbols (%, - , x2, 2x) and replace with preceding chord
# if re.findall(r'%|2x|x2|\s-\s', test_chords):
#     reps = re.findall(r'(\S+)(?=\s*(%|2x|x2|\s-\s))|([A-Z]\S+)', test_chords)
#     for x in reps:
#         if x[0] != '':
#             chords_wo_reps += x[0] + ' ' + x[0] + ' '
#         else:
#             chords_wo_reps += x[2] + ' '
# else:
#     chords_wo_reps = test_chords
# print(test_chords)
# print(chords_wo_reps)

In [35]:
# # cleaning up chords
# test2_chords = '      Intro: G-F-G  FMm7,    G\\F\\F\\F,       Amsus6 GM7/D#   (F3)    BbFbCG     '

# # remove non-useful punctuation
# no_punc_ch = re.sub(r'-+|[()|,.]|\\', ' ', test2_chords)

# # remove spaces
# mult_sp_rem_ch = re.sub(r'\s+', ' ', no_punc_ch)
# beg_sp_rem_ch = re.sub(r'^\s+', '', mult_sp_rem_ch)
# end_sp_rem_ch = re.sub(r'\s+$', '', beg_sp_rem_ch)
    
# end_sp_rem_ch

In [36]:
# cleaned_chords_counter = Counter()
# for song in cleaned_chords_ser:
#     try:
#         l = song.split(' ')
#     except:
#         print(song)
#         break
#     cleaned_chords_counter.update(l)
# del cleaned_chords_counter['']

# cleaned_chord_count_df = pd.Series(dict(cleaned_chords_counter)).to_frame('chord_count')
# cleaned_cc_df = cleaned_chord_count_df.sort_values(by='chord_count', ascending=False).reset_index(names=['chord'])
# print('the total amount of unique "chords" is: ', len(cleaned_cc_df))