# Utility

In [3]:
def addToDict(chord_dict, chord_num):
    """Bump the tally stored under ``chord_num`` and hand the dict back.

    The dictionary is modified in place: a key seen for the first time
    starts at 1, otherwise its count grows by one. The very same
    dictionary object is returned so calls can be chained or reassigned.
    """
    chord_dict[chord_num] = chord_dict.get(chord_num, 0) + 1
    return chord_dict
    

# Cleaning the Lyrics

In [1]:
# The Genius API returns some bad data, including movie and play scripts.
# To remove the majority of those cases, we drop a percentage of the longest songs.
# The correct percentile is determined in the EDA analysis.
def removeLongSongs(df, perc):
    """Drop the longest songs, keeping rows below a lyric-length percentile.

    The length of each entry is ``len(str(value))`` of the ``dirty_lyrics``
    column, so non-string values (e.g. NaN) are measured by their string
    representation.

    Args:
        df: DataFrame containing a ``dirty_lyrics`` column. It is not
            modified; no helper column is added to it.
        perc: Percentile (0-100) passed to ``np.percentile``. Rows whose
            length is strictly below that percentile value are kept, so
            rows exactly at the cutoff are dropped.

    Returns:
        A new DataFrame with the same columns as ``df`` and only the
        rows that passed the length filter (original index preserved).
    """
    # Nothing to filter; also avoids taking a percentile of no data.
    if len(df) == 0:
        return df.copy()

    # Keep the lengths in a standalone Series instead of a temporary
    # 'len' column, so the caller's frame is left untouched and an
    # existing column named 'len' is not clobbered.
    lengths = df['dirty_lyrics'].apply(lambda x: len(str(x)))
    cutoff = np.percentile(lengths.to_numpy(), perc)
    return df.loc[lengths < cutoff].copy()

In [2]:
def clean_lyrics(lyrics):
    """Strip Genius-specific markup from a raw lyrics string.

    Two substitutions are applied in order, each replacing its matches
    with the empty string:

    1. bracketed tags such as ``[Verse 1]`` or ``[Chorus]``;
    2. any run of non-whitespace characters containing ``Embed``.

    Case and punctuation are deliberately left alone here. For LDA the
    lowercasing and punctuation removal happen in ``lyrics_to_words``;
    other parts of the pipeline may need to do that themselves.
    """
    patterns = (
        r'\[(.*?)\]',      # verse/chorus/bridge etc. inside []
        r"\S*Embed\S*",    # genius *embed* words
    )
    cleaned_lyrics = lyrics
    for pattern in patterns:
        cleaned_lyrics = re.sub(pattern, '', cleaned_lyrics)
    return cleaned_lyrics

In [3]:
def read_stopwords(filename):
    """Load a stopword file into a dict keyed by word.

    Every line of ``filename`` has its trailing whitespace (including
    the newline) removed and becomes a key mapped to ``1``. Duplicate
    lines collapse into a single key.
    """
    with open(filename) as handle:
        return dict.fromkeys((row.rstrip() for row in handle), 1)

In [4]:
def lyrics_to_words(lyrics):
    """Turn a lyrics string into a list of lemmatized content words.

    Processing steps:

    1. lowercase the text and split it with ``nltk.wordpunct_tokenize``;
    2. drop stopwords (NLTK English list, the words read from
       ``jockers.stopwords`` and the token ``'s``);
    3. drop tokens that are a single punctuation character;
    4. keep only tokens for which ``str.isalnum()`` is true;
    5. lemmatize what is left with ``WordNetLemmatizer``.

    Args:
        lyrics: The lyrics text as a single string.

    Returns:
        The surviving tokens, lemmatized, in their original order.
    """
    # for lemmatization
    lemma = WordNetLemmatizer()

    # lowercase and tokenize
    tokens = nltk.wordpunct_tokenize(lyrics.lower())

    # Stopwords live in a set so each membership test is O(1); the
    # previous list made every token lookup a linear scan.
    # NOTE(review): the set is rebuilt (and the file re-read) on every
    # call; consider caching it if this runs once per song.
    stop_words = set(stopwords.words('english'))
    stop_words.update(read_stopwords("jockers.stopwords"))
    stop_words.add("'s")

    # ASCII punctuation plus the quotation marks that string.punctuation
    # does not cover.
    exclude = set(string.punctuation) | {'»', '«', '“', '”'}

    # Single pass applying the same three filters in the same order.
    return [
        lemma.lemmatize(token)
        for token in tokens
        if token not in stop_words
        and token not in exclude
        and token.isalnum()
    ]

# Cleaning the Tab

In [6]:
# Flat note names and the sharp (or natural) spelling used in their place.
flat_to_sharp_dict = {
    'Db': 'C#',
    'Eb': 'D#',
    'Gb': 'F#',
    'Ab': 'G#',
    'Bb': 'A#',
    'Cb': 'B',
}

# Major and minor chord names for every root. Order matters to code that
# scans this list front to back: each sharp root comes before its natural
# root, and each minor name comes before the matching major name.
chord_base = [
    root + quality
    for root in ('C#', 'C', 'D#', 'D', 'E', 'F#', 'F', 'G#', 'G', 'A#', 'A', 'B')
    for quality in ('m', '')
]


## Utility Functions

In [4]:
def sepExt(chord):
    """Split a chord name into its base chord and the remaining extension.

    The base is one of the names in ``chord_base`` (a root, optionally
    sharp, optionally minor). Examples of the intended split::

        Em     -> ('Em', '')
        Emaj7  -> ('E',  'maj7')   # the 'm' belongs to 'maj', not minor
        Emmaj  -> ('Em', 'maj')

    Args:
        chord: Chord name as a string, sharps spelled with '#'.

    Returns:
        A ``(base, ext)`` tuple, or ``None`` when no name from
        ``chord_base`` occurs in ``chord``.
    """
    # Pass 1: the base normally sits at the start of the chord name, so
    # prefer a prefix match. This keeps a later capital letter (e.g. the
    # bass note in 'G/D') from being mistaken for the root, and leaves
    # the rest of the name intact in the extension.
    for note in chord_base:
        if chord.startswith(note):
            ext = chord[len(note):]
            # 'Xm' + 'aj...' means the name is really 'X' + 'maj...';
            # skip the minor candidate and let the major one match.
            if ext[:2] == 'aj':
                continue
            return note, ext

    # Pass 2: legacy behaviour for names that do not start with a known
    # base -- take the first base found anywhere in the string and remove
    # every occurrence of it to form the extension.
    for note in chord_base:
        if note in chord:
            ext = chord.replace(note, '')
            if ext[:2] == 'aj':
                continue
            return note, ext

    # No known base anywhere in the name.
    return None

In [6]:
def convertToSharp(chords):
    """Rewrite flat note names as sharps in a comma-separated chord string.

    Each comma-separated chord is scanned for the keys of
    ``flat_to_sharp_dict``; every flat found in the original chord name
    is replaced by its mapped spelling. The chords are joined back with
    commas in their original order.
    """
    converted = []
    for original in chords.split(','):
        respelled = original
        for flat, sharp in flat_to_sharp_dict.items():
            # The test is against the untouched name, not the partially
            # rewritten one.
            if flat in original:
                respelled = respelled.replace(flat, sharp)
        converted.append(respelled)
    return ','.join(converted)

## Move Over Chords for Capo


In [8]:
import pandas as pd
# Comma-separated lookup table indexed by its "Chords" column.
# NOTE(review): judging by how it is queried, each column is a root note
# and each cell the distance to the row's chord -- confirm against the CSV.
chord_map = pd.read_csv('ChordDistanceMap.csv', index_col="Chords")

In [2]:
def getShiftedChord(chord, shift):
    """Return ``chord`` moved by ``shift`` according to ``chord_map``.

    The chord is first split with ``sepExt`` into a base and an
    extension (e.g. ``Dmmaj9`` -> ``Dm`` and ``maj9``). A minor base has
    its ``m`` removed for the table lookup and put back afterwards. The
    new root is the first index label of ``chord_map`` whose value in
    the root's column equals ``shift``; the extension is appended
    unchanged.
    """
    root, suffix = sepExt(chord)

    # chord_map columns are looked up by the bare root, so remember the
    # minor quality and take the 'm' off.
    is_minor = 'm' in root
    if is_minor:
        root = root.replace('m', '')

    distances = chord_map[root]
    target = chord_map.loc[distances == shift, root].index[0]

    return target + 'm' + suffix if is_minor else target + suffix

In [36]:
def moveForCapo(chords, capo):
    """Shift every chord in a comma-separated string by ``capo``.

    Plain chords go straight through ``getShiftedChord``. A chord
    containing ``/`` is split on the slash and its first two parts are
    shifted separately, then rejoined with ``/``. The result is a
    comma-separated string in the original order.
    """
    def _shift(symbol):
        # Shift one chord symbol, handling the slash form.
        if '/' not in symbol:
            return getShiftedChord(symbol, capo)
        parts = symbol.split('/')
        return '{}/{}'.format(
            getShiftedChord(parts[0], capo),
            getShiftedChord(parts[1], capo),
        )

    return ','.join([_shift(symbol) for symbol in chords.split(',')])
    

# Getting Key Information

## Greer Key Estimate

In [7]:
# Pull in the key table: one line of space-separated chord names per key.
keyTable = []
# The context manager closes the file once the table is read; the
# encoding is stated explicitly because the file name declares UTF-8 and
# the platform default may differ.
with open('key_table_UTF-8.txt', encoding='utf-8') as keyTableFile:
    for line in keyTableFile:
        chords = line.split(' ')
        # The last field of every line is discarded.
        # NOTE(review): presumably the newline left after a trailing
        # space -- confirm against the file.
        keyTable.append(chords[:-1])

In [8]:
"""
Ben Ma
Python 3.x
Contains the utility function findTonicNumNo7.
"""

def findTonicNumNo7(songChords):
    """Pick the row of ``keyTable`` that best matches a song's chords.

    ``songChords`` is a list of chord names; ``keyTable`` is a list of
    lists, one per key (example row for the key of C:
    ``C Dm Em F G G7 Am Bdim``). Every ``"7"`` is removed from the song's
    chord names before comparison. Each chord that equals an entry of a
    key row adds 1 to that key's score, or 0.9 when the entry sits at
    position 1, 2 or 7 of the row.

    Returns:
        The index of the highest-scoring row (0 thru 11 for C thru B).
        The earliest row wins ties, and 0 is returned when nothing
        matches.
    """
    # Change 7ths to their plain form without touching the caller's list.
    plain_chords = [name.replace("7", "") for name in songChords]

    best_key = 0
    best_score = 0
    for key_index, key_chords in enumerate(keyTable):
        score = 0
        for name in plain_chords:
            # Only the first matching position in the row counts.
            for position, candidate in enumerate(key_chords):
                if name != candidate:
                    continue
                # Tiebreaker: the ii, iii, and vii are weighted less.
                score += 0.9 if position in (1, 2, 7) else 1
                break
        # Strictly greater, so an earlier key keeps the lead on a tie.
        if score > best_score:
            best_score, best_key = score, key_index
    return best_key

In [4]:
def pickMainKey(song):
    """Choose one key for a song from its four key estimates.

    ``song`` is indexed by ``'spotify_key'``, ``'greer_key'``,
    ``'firstNote_key'`` and ``'lastNote_key'``. Each estimate casts one
    vote:

    * if one key has strictly more votes than any other, it wins;
    * otherwise, if the first-note and last-note keys agree, that key
      is returned;
    * otherwise the Greer estimate is returned.
    """
    votes = {}
    for source in ('spotify_key', 'greer_key', 'firstNote_key', 'lastNote_key'):
        votes = addToDict(votes, song[source])

    # Most-voted key first.
    ranked = sorted(votes.items(), key=operator.itemgetter(1), reverse=True)
    leader, leader_votes = ranked[0]

    # Unanimous, or a clear winner over the runner-up.
    if len(ranked) == 1 or leader_votes > ranked[1][1]:
        return leader

    # Tie at the top: fall back on the note-based estimates, then Greer.
    if song['firstNote_key'] == song['lastNote_key']:
        return song['firstNote_key']
    return song['greer_key']
    

## First Chord Key

In [9]:
# Note name -> number, counting up from C = 0 to B = 11 (sharp spellings only).
pitch_class = {
    note: number
    for number, note in enumerate(
        ['C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B']
    )
}

In [1]:
def ChordToNum(chord_base):
    """Look up the ``pitch_class`` number for a chord base name.

    Every ``m`` is removed from the name before the lookup, so ``'Am'``
    and ``'A'`` give the same number. A name that is not a key of
    ``pitch_class`` after that raises ``KeyError``.

    Note: inside this function the parameter hides the module-level
    ``chord_base`` list.
    """
    root = chord_base.replace('m', '')
    return pitch_class[root]