In [4]:
import pandas as pd
import string
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import numpy as np

In [None]:
# Read the raw chords-and-lyrics CSV; the path is relative to the notebook's
# working directory.
kaggle_raw_df = pd.read_csv('./data/raw/chords_and_lyrics.csv')
# Keep only the artist, genre and chord columns (all rows) for the analysis.
kaggle_slim_df = kaggle_raw_df.loc[:, ['artist_name', 'genres', 'chords']]

In [2]:
def remove_symbols(chords):
    """Truncate every chord at the first annotation symbol it contains.

    Each chord is cut just before the earliest delimiter character found in
    it, so slash-bass notes, parenthesised extensions and similar trailing
    annotations are dropped (``"C/G"`` -> ``"C"``, ``"D(add9)"`` -> ``"D"``).
    Chords without any delimiter are returned unchanged.

    Parameters
    ----------
    chords : iterable of str
        Chord tokens to strip.

    Returns
    -------
    list of str
        One truncated chord per input chord, in the same order.
    """
    # NOTE(review): the lowercase letter "t" is treated as a delimiter (e.g.
    # "Guitar" -> "Gui"); presumably it targets stray text tokens - confirm.
    delimiters = frozenset("/',-t()\\|~")

    truncated = []
    for chord in chords:
        # Cut at the *earliest* delimiter rather than testing the delimiters
        # one kind at a time; otherwise a chord carrying two different
        # symbols (e.g. "C(add9)/G") would keep everything up to whichever
        # symbol happened to be tested first.
        cut = next(
            (pos for pos, char in enumerate(chord) if char in delimiters),
            len(chord),
        )
        truncated.append(chord[:cut])

    return truncated

In [None]:
def merge_symbols(chords):
    """Rewrite alternative chord spellings into one common notation.

    The following substitutions are applied to every chord, in this order:
    ``*`` and ``°`` become ``dim``, ``+`` becomes ``aug``, ``minor`` and
    ``min`` become ``m``, and ``major`` and ``maj`` are removed.

    Parameters
    ----------
    chords : iterable of str
        Chord tokens to normalise.

    Returns
    -------
    list of str
        One respelled chord per input chord, in the same order.
    """
    # Longer spellings come before their prefixes ("minor" before "min",
    # "major" before "maj") so the full word is consumed in one step.
    # NOTE(review): removing "maj" turns "Cmaj7" into "C7"; confirm that
    # merging those two spellings is intended.
    respellings = (
        ('*', 'dim'),
        ('°', 'dim'),
        ('+', 'aug'),
        ('minor', 'm'),
        ('min', 'm'),
        ('major', ''),
        ('maj', ''),
    )

    merged_chords = []
    for chord in chords:
        # Apply every substitution, not just the first one that matches, so a
        # chord that mixes notations (e.g. "Cmin7+") is fully normalised.
        for old, new in respellings:
            chord = chord.replace(old, new)
        merged_chords.append(chord)

    return merged_chords

In [3]:
def clean_chords(chords_column):
    """Turn a column of raw chord strings into lists of cleaned chords.

    For each row the string is split on whitespace, tokens whose first
    character is not one of the uppercase letters A-G are discarded, the
    remaining tokens are passed through ``remove_symbols`` and
    ``merge_symbols``, and consecutive duplicate chords are collapsed into
    a single occurrence.

    Parameters
    ----------
    chords_column : iterable
        Iterable of whitespace-separated chord strings, one per song.
        Missing entries (``None`` or NaN) are accepted.

    Returns
    -------
    list of list of str
        One list of chords per input row, in the same order. Missing rows
        yield an empty list so the result stays aligned with the input.
    """
    root_letters = set(string.ascii_uppercase[:7])
    cleaned = []

    for row in chords_column:
        # A missing cell would otherwise crash on .split(). Only genuinely
        # missing values are tolerated; any other non-string type still
        # raises so that unexpected input is not silently discarded.
        if row is None or (isinstance(row, float) and row != row):
            cleaned.append([])
            continue

        # Only tokens that begin with a chord root letter
        raw_chords = [token for token in row.split() if token[0] in root_letters]

        # Strip annotation symbols, then merge spellings into one format
        merged_chords = merge_symbols(remove_symbols(raw_chords))

        # Drop a chord when it merely repeats the one directly before it
        non_repeating_chords = [
            chord
            for idx, chord in enumerate(merged_chords)
            if idx == 0 or chord != merged_chords[idx - 1]
        ]

        cleaned.append(non_repeating_chords)

    return cleaned

In [None]:
# Clean every song's raw chord string into a list of chord symbols.
splits = clean_chords(kaggle_slim_df['chords'])
# assign() returns a new DataFrame, so kaggle_slim_df keeps its raw chord
# strings and this cell can be re-run without feeding already-cleaned lists
# back into clean_chords.
kaggle_cleaned_df = kaggle_slim_df.assign(chords=splits)

In [None]:
def count_chords(cleaned_df):
    """Count how often each chord occurs across all songs.

    Parameters
    ----------
    cleaned_df : mapping or DataFrame
        Object whose ``'chords'`` entry is an iterable of songs, each song
        being an iterable of chord names.

    Returns
    -------
    dict
        Maps each chord name to its total number of occurrences over every
        song.
    """
    chord_totals = Counter()

    # Read from the argument rather than a notebook-level variable so the
    # function counts whichever frame the caller passes in.
    for song in cleaned_df['chords']:
        chord_totals.update(song)

    return dict(chord_totals)

In [None]:
chords_dict = count_chords(kaggle_cleaned_df)

# Histogram of the per-chord totals: each chord contributes one observation
# (its total count), so the plot shows how many chords are rare vs. common.
fig, ax = plt.subplots()
sns.histplot(x=list(chords_dict.values()), bins=100, ax=ax)
ax.set(
    title='Distribution of chord occurrence counts',
    xlabel='Total occurrences of a chord',
    ylabel='Number of distinct chords',
);

In [None]:
def reduce_low_count(chord_count_dict, min_count=20):
    '''Drop chords that occur fewer than ``min_count`` times.

    Only use this one if you have too many chords that happen only a few
    times; the distribution of chord counts shows whether that is the case.

    Parameters
    ----------
    chord_count_dict : dict
        Maps chord name to its occurrence count.
    min_count : int, optional
        Smallest count a chord needs in order to be kept (default 20).

    Returns
    -------
    dict
        New dict holding only the chords whose count is not below
        ``min_count``; the input dict is left unmodified.
    '''
    return {
        chord: count
        for chord, count in chord_count_dict.items()
        if not count < min_count
    }

In [None]:
# Filter out the rarely used chords, then tabulate the remaining totals as a
# one-column frame indexed by chord name.
slim_chord_count_dict = reduce_low_count(chords_dict)
chord_count_df = pd.Series(slim_chord_count_dict, name='chord_count').to_frame()
chord_count_df