# Slimming down cleaning process and make it able to take in any new data
##### Started 14.12.22

### Imports

In [1]:
import pandas as pd
import random
import string
import re
import json
import ast
from collections import Counter


### Getting workable DF

In [2]:
data_path = '~/code/emilycardwell/final-project-data/data/raw/kaggle_raw.csv'
raw_df = pd.read_csv(data_path)
raw_df.head(3)

Unnamed: 0.1,Unnamed: 0,artist_name,song_name,chords&lyrics,chords,lyrics,tabs,lang,artist_id,followers,genres,popularity,name_e_chords
0,0,Justin Bieber,"10,000 Hours",\nCapo on 3rd fret\n\t \t\t \r\n\r\nVerse 1:\...,{3: 'G G/B ...,"{0: '\nCapo on 3rd fret\n\t \t\t ', 1: '', 2:...","{89: ""I-----, I'm gonna love you ""}",en,1uNFoZAHBGtllmzznpCI3s,44606973.0,"['canadian pop', 'pop', 'post-teen pop']",100,justin-bieber
1,1,Justin Bieber,2 Much,\n\t \t\t\r\nIntro: F#m7 D2 \r\n\r\nVerse 1:...,"{1: 'Intro: F#m7 D2 ', 4: 'F#m7 ', 8: 'D2 ', ...","{0: '\n\t \t\t', 2: '', 3: 'Verse 1:', 5: ' ...",{},en,1uNFoZAHBGtllmzznpCI3s,44606973.0,"['canadian pop', 'pop', 'post-teen pop']",100,justin-bieber
2,2,Justin Bieber,2u (feat. David Guetta),\n\t \t\t\r\nEm D C ...,{1: 'Em D C C...,"{0: '\n\t \t\t', 2: ""No limit in the sky that...",{},en,1uNFoZAHBGtllmzznpCI3s,44606973.0,"['canadian pop', 'pop', 'post-teen pop']",100,justin-bieber


In [3]:
slim_raw_df = raw_df[['artist_name', 'song_name', 'chords', 'genres', 'popularity']]
slim_raw_df.head(3)

Unnamed: 0,artist_name,song_name,chords,genres,popularity
0,Justin Bieber,"10,000 Hours",{3: 'G G/B ...,"['canadian pop', 'pop', 'post-teen pop']",100
1,Justin Bieber,2 Much,"{1: 'Intro: F#m7 D2 ', 4: 'F#m7 ', 8: 'D2 ', ...","['canadian pop', 'pop', 'post-teen pop']",100
2,Justin Bieber,2u (feat. David Guetta),{1: 'Em D C C...,"['canadian pop', 'pop', 'post-teen pop']",100


In [4]:
split_raw_df = slim_raw_df.copy()
splits = []

for row in split_raw_df.chords:
    row_dict = ast.literal_eval(row)
    for key, val in row_dict.items():
        p1 = re.sub(r'\||\s-|-[^/]|,|\.|\*', ' ', val)
        p2 = re.sub(r'[()]', '', p1)
        s = re.sub(r'^\s+|\s+$', '', p2)
        row_dict[key] = re.sub(r'\s+', ' ', s)
    splits.append(row_dict)

split_raw_df.chords = splits
len(splits)

135783

In [5]:
r = random.randint(0, 135782)
print(slim_raw_df.chords[r])
print(split_raw_df.chords[r])

{1: 'Introdução: D  D4  D ', 3: 'D                             D7M ', 5: 'Am7                       C ', 7: 'C7M                Em7 ', 9: 'A4/7           A7         D ', 11: 'D                     D7M ', 13: 'D6                     Am7 ', 15: 'C                 Em7 ', 17: 'A4/7          A7         D               D4    D ', 19: 'D                  D7M ', 21: 'Am7                     C ', 23: 'Em7      A4/7       A7 ', 25: 'Em7         A4/7        A7      D          D4           D '}
{1: 'Introdução: D D4 D', 3: 'D D7M', 5: 'Am7 C', 7: 'C7M Em7', 9: 'A4/7 A7 D', 11: 'D D7M', 13: 'D6 Am7', 15: 'C Em7', 17: 'A4/7 A7 D D4 D', 19: 'D D7M', 21: 'Am7 C', 23: 'Em7 A4/7 A7', 25: 'Em7 A4/7 A7 D D4 D'}


#### See Chord Frequency

In [6]:
chords_counter = Counter()
for song in splits:
    for value in song.values():
        l = value.split(' ')
        chords_counter.update(l)
del chords_counter['']

chord_count_df = pd.Series(dict(chords_counter)).to_frame('chord_count')
sorted_cc_df = chord_count_df.sort_values(by='chord_count', ascending=False)
len(sorted_cc_df)

25637

In [7]:
letters = list(string.ascii_uppercase)[:7]
non_chord_filter = [chord for chord in sorted_cc_df.index if chord[0] in letters]
cc_df = sorted_cc_df[sorted_cc_df.index.isin(non_chord_filter)]

# pd.set_option('display.max_rows', 1190)
## sorted_cc_df.head(70*17)
pd.set_option('display.max_rows', 100)
cc_df.iloc[100:200]

Unnamed: 0,chord_count
F7+,5598
Bbm7,5490
Gmaj7,5407
C7+,5372
G7+,5316
Amaj7,5227
C#5,5177
Bb9,5056
A7+,4993
A7/13,4972


In [8]:
# slim_chord_counts_dict = {}
# for chord, count in chords_count_dict.items():
#     if count <= low_freq_to_remove:
#         pass
#     else:
#         slim_chord_counts_dict[chord] = count

## Creating List of Correctly Formatted Chords

In [9]:
major_chords = ['C', 'C#', 'Db', 'D', 'D#', 'Eb', 'E', 'F', 'F#', 'Gb', 'G', 'G#', 'Ab', 'A', 'A#', 'Bb', 'B']

In [51]:
chords_df = pd.DataFrame(index=['', 'm', 'dim', 'aug', '5', 'aug5', '5b7',
                                'dim7', 'hdim7', 'm7', 'm-M7', '7', 'aug7', 'M7',  
                                'm9', '9', '7b9',  'M9', 'm11', '11',
                                'add2', 'add4', 'add6', 'add9', 
                                'madd2', 'madd4', 'madd6', 'madd9', 
                                '7add4', '7addb6', '7add6', 
                                'msus2', 'msus4', '7sus2', '7sus4', 'sus2', 'sus4'
                                ],
                         columns=major_chords)


In [62]:
for idx in chords_df.index:
    new_chords = [m + idx for m in major_chords]
    chords_df.loc[idx, :] = new_chords
chords_df.rename(index={'': 'major'}, inplace=True)
chords_df.head(3)

Unnamed: 0,C,C#,Db,D,D#,Eb,E,F,F#,Gb,G,G#,Ab,A,A#,Bb,B
major,C,C#,Db,D,D#,Eb,E,F,F#,Gb,G,G#,Ab,A,A#,Bb,B
m,Cm,C#m,Dbm,Dm,D#m,Ebm,Em,Fm,F#m,Gbm,Gm,G#m,Abm,Am,A#m,Bbm,Bm
dim,Cdim,C#dim,Dbdim,Ddim,D#dim,Ebdim,Edim,Fdim,F#dim,Gbdim,Gdim,G#dim,Abdim,Adim,A#dim,Bbdim,Bdim


In [63]:
sharp_notes = ['C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B', 
                'C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B']
flats_notes = ['C', 'Db', 'D', 'Eb', 'E', 'F', 'Gb', 'G', 'Ab', 'A', 'Bb', 'B', 
                'C', 'Db', 'D', 'Eb', 'E', 'F', 'Gb', 'G', 'Ab', 'A', 'Bb', 'B']
sharp_keys = ['D', 'E', 'G', 'A', 'B', 'Dm', 'Em', 'Fm', 'Bm']
flats_keys = ['C', 'F', 'Cm', 'Gm', 'Am']

slash_indexes = ['dim/b7', 'm/2', 'm/b3', 'm/3', 'm/4', 'm/5', 'm/6','m/b7', 'm/7',
                'm7/b3', 'm7/4', 'm7/b5', 'm7/5', 'm7/7', 
                '/2', '/b3', '/3', '/4', '/5', '/6','/b7', '/7', 
                '7/b2', '7/2', '7/b3', '7/3', '7/4', '7/5', '7/b6', '7/6','7/7', 
                '9/b3', '9/3', '9/4', '9/5', '9/6', '9/7',
                'M7/2', 'M7/b3', 'M7/3', 'M7/4', 'M7/5', 'M7/6','M7/b7']

whole_to_half = {'b2': 1, '2': 2, 'b3': 3, '3': 4, '4': 5, 'b5': 6,
                 '5': 7, 'b6': 8,'6': 9, 'b7': 10, '7': 11}

In [80]:
# big function
def get_slash_notes(slash_indexes, columns):
    slash_chord = pd.DataFrame(index=slash_indexes, columns=columns)
    i = 0 
    
    def find_slash(chord, notes, i):
        for si in slash_indexes:
            s = si.split('/')
            h = whole_to_half[s[1]]
            sc = f'{chord}{s[0]}/{notes[i+h]}'
            slash_chord.loc[si, chord] = sc
    
    def sort_f_s(chord, notes):
        for n in notes:
            if len(chord) > 1:
                i = notes.index(chord)
                find_slash(chord, notes, i)
            else:
                i = notes.index(chord[0])
                find_slash(chord, notes, i)
    
    for chord in columns:
        if chord in sharp_keys or '#' in chord:
            sort_f_s(chord, sharp_notes)
        else:
            sort_f_s(chord, flats_notes)
                
            
    return slash_chord

In [81]:
slash_chords_df = pd.concat([chords_df, get_slash_notes(slash_indexes, chords_df.columns)])
slash_chords_df

Unnamed: 0,C,C#,Db,D,D#,Eb,E,F,F#,Gb,G,G#,Ab,A,A#,Bb,B
major,C,C#,Db,D,D#,Eb,E,F,F#,Gb,G,G#,Ab,A,A#,Bb,B
m,Cm,C#m,Dbm,Dm,D#m,Ebm,Em,Fm,F#m,Gbm,Gm,G#m,Abm,Am,A#m,Bbm,Bm
dim,Cdim,C#dim,Dbdim,Ddim,D#dim,Ebdim,Edim,Fdim,F#dim,Gbdim,Gdim,G#dim,Abdim,Adim,A#dim,Bbdim,Bdim
aug,Caug,C#aug,Dbaug,Daug,D#aug,Ebaug,Eaug,Faug,F#aug,Gbaug,Gaug,G#aug,Abaug,Aaug,A#aug,Bbaug,Baug
5,C5,C#5,Db5,D5,D#5,Eb5,E5,F5,F#5,Gb5,G5,G#5,Ab5,A5,A#5,Bb5,B5
aug5,Caug5,C#aug5,Dbaug5,Daug5,D#aug5,Ebaug5,Eaug5,Faug5,F#aug5,Gbaug5,Gaug5,G#aug5,Abaug5,Aaug5,A#aug5,Bbaug5,Baug5
5b7,C5b7,C#5b7,Db5b7,D5b7,D#5b7,Eb5b7,E5b7,F5b7,F#5b7,Gb5b7,G5b7,G#5b7,Ab5b7,A5b7,A#5b7,Bb5b7,B5b7
dim7,Cdim7,C#dim7,Dbdim7,Ddim7,D#dim7,Ebdim7,Edim7,Fdim7,F#dim7,Gbdim7,Gdim7,G#dim7,Abdim7,Adim7,A#dim7,Bbdim7,Bdim7
hdim7,Chdim7,C#hdim7,Dbhdim7,Dhdim7,D#hdim7,Ebhdim7,Ehdim7,Fhdim7,F#hdim7,Gbhdim7,Ghdim7,G#hdim7,Abhdim7,Ahdim7,A#hdim7,Bbhdim7,Bhdim7
m7,Cm7,C#m7,Dbm7,Dm7,D#m7,Ebm7,Em7,Fm7,F#m7,Gbm7,Gm7,G#m7,Abm7,Am7,A#m7,Bbm7,Bm7


## Shove chords into correct format

In [82]:
substitutions = {'': 'maj', '': ',', 'dim': 'º', 'aug': '5+', 'M7': ['7M', 'maj7'], 
                 'hdim7': 'm7b5', 'm-M7': 'm7+', 'aug7': '7+', '9': '79', 'dim/b7': 'm5-/7', 
                 '7add4': '711', '7addb6': '7b13', '7add6': '713', '9add4': '4/7/9', '9add6': '69', 
                 '/b2': '/9-', '/2': '/9', '/b5': '/5-', '/b6': ['/13-', '/5+'], '/6': '/13'
                 }

re_subs = {'madd2': r'^2', 'madd4': r'^4', 'madd6': r'^6', 
           'add2': r'^2', 'add4': r'^4', 'add6': r'^6', 'sus4': r'sus$'}

#### For next time:
- do re_subs

In [83]:
rejects = {}

def merge_chords(chords_column):
    chords_column_copy = chords_column.copy()
    merged_chords = []
    
    for song in chords_column_copy:
        merged_song = {}
        for measure, chords in song.items():
            ch_list = chords.split()
            for idx, ch in enumerate(ch_list):
                if ch == '%':
                    ch1 = ch.replace('%', ch_list[idx-1])
                    ch_list[idx] = ch1
                elif ch in substitutions.values():
                    ch2 = ch.replace(ch, list(filter(lambda x: ch in substitutions[x], substitutions))[0])
                    ch_list[idx] = ch2
                elif ch in slash_chords_df.columns or ch in slash_chords_df.values:
                    ch_list[idx] = ch
                else:
                    if ch in rejects.keys():
                        rejects[ch] += 1
                    else:
                        rejects[ch] = 1
                merged_song[measure] = ch_list
            merged_chords.append(merged_song)
    
    return pd.Series(merged_chords)

In [84]:
merge_chords(split_raw_df.chords)

In [None]:
print('Rejects:', {c: q for c, q in rejects.items() if q > 100})

Rejects: {'Bridge:': 8023, 'Intro:': 41999, 'D2': 1869, 'A/C#': 11215, 'D/F#': 31648, '2x': 9228, 'x2': 9225, '1': 11152, '11': 2091, '6': 5653, '8': 5515, '12': 3852, '9': 7005, '2': 25632, '7': 12357, '10': 4658, '0': 27765, 'Intro': 16819, '2x:': 294, 'INTRO:': 2826, 'break': 504, 'Interlude': 2268, 'play': 146, 'once': 300, 'Bbmaj7': 2150, 'F7M': 10231, 'C7M': 12663, 'X': 954, '3': 18148, 'Outro:': 1101, 'Eb7M': 2430, 'Bridge': 10244, 'x': 5907, 'C7M/G': 197, 'Cm6': 1378, 'Dm6': 4593, '4x': 2808, 'D4': 4883, 'hide': 2022, 'this': 2094, 'tab': 2045, '5': 14052, 'Final': 1266, 'intro:': 667, 'G6/E': 120, 'Final:': 983, 'N': 2213, 'pause': 225, 'B/D#': 4058, 'F#m711': 434, 'A7M9': 316, 'o': 210, 'final': 134, 'h7': 317, 'Solo': 4864, 'D7+': 4761, '4': 11498, 'hold': 555, '/': 8589, 'Refrão': 149, 'bridge': 623, 'm': 1640, '6x': 195, 'c': 376, 'D7M': 12295, 'intro': 2749, 'x4': 2481, '1:': 411, 'D#maj7': 226, '2:': 460, 'I': 532, 'BRIDGE': 1097, ':': 4446, 'Dmaj7': 6204, 'Aº': 447, 'E4

In [None]:
# # big function
# def translate_slashes(slash_indexes, columns):
#     slash_chord = pd.DataFrame(index=slash_indexes, columns=columns)
#     i = 0 
    
#     def find_slash(chord, notes, i):
#         for si in slash_indexes:
#             s = si.split('/')
#             h = whole_to_half[s[1]]
#             sc = f'{chord}{s[0]}/{notes[i+h]}'
#             slash_chord.loc[si, chord] = sc
    
#     def sort_f_s(chord, notes):
#         for n in notes:
#             if n in chord :
#                 i = notes.index(chord[0])
#                 find_slash(chord, notes, i)
#             elif len(chord) > 1:
#                 if '#' == chord[1]:
#                     i = sharp_notes.index(chord[:2])
#                     find_slash(chord, sharp_notes, i)
#                 elif 'b' == chord[1]:
#                     i = flats_notes.index(chord[:2])
#                     find_slash(chord, flats_notes, i)
    
#     for chord in columns:
#         sort_f_s(chord, sharp_notes)
#         sort_f_s(chord, flats_notes)
                
            
#     return slash_chord

In [None]:
# slash_chords_df = pd.concat([chords_df, get_slash_notes(slash_indexes, chords_df.columns)])
