In [1]:
# Scraping and data parsing
import json
import os
import re

import requests as req
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# mathematical operations
import math
import random

import numpy as np
import pandas as pd

# preparing ML datasets
from sklearn.model_selection import train_test_split

# plotting
import matplotlib.pyplot as plt

#debugging
import pdb
from pprint import pprint as pp

# User input

In [2]:
# The Explore listing is limited to 100 pages of 50 tabs each,
# so 5000 tabs is the most that can be collected.
MAX_TABS = 100 * 50
# number of tabs wanted for dataset; scraping works a whole page at a time,
# so this is effectively rounded up to the nearest 50
num_tabs = MAX_TABS

# Functions

In [3]:
def get_request_ug(url):
    '''
    Makes a get request to the specified Ultimate Guitar URL and returns relevant JSON data

    input
        url: address of the Ultimate Guitar page to fetch
    returns
        the object stored under store -> page -> data in the JSON that the
        page embeds in the 'data-content' attribute of its js-store element
    raises
        requests.HTTPError: the final response carries an error status
        ValueError: the page has no js-store element to read from
    '''

    # Retry strategy: up to 5 attempts on transient server errors, with a backoff between attempts
    retries = Retry(total=5, backoff_factor=0.1, status_forcelist=[500, 502, 503, 504])

    # Using the session as a context manager releases its connections once the request is done
    with req.Session() as session:
        session.mount('https://', HTTPAdapter(max_retries=retries))
        res = session.get(url, timeout=30)  # a stalled connection must not hang the whole scrape

    # Fail with the HTTP status rather than with a confusing parsing error further down
    res.raise_for_status()

    # Name the parser explicitly so the result does not depend on which parsers are installed
    soup = BeautifulSoup(res.content, 'html.parser')

    # Find the element that carries the page's data store
    store = soup.find("div", {'class': 'js-store'})
    if store is None:
        raise ValueError('no js-store element found at ' + str(url))

    # Cast to json and drill down the Ultimate-Guitar-specific hierarchy
    page_data = json.loads(store['data-content'])['store']['page']['data']

    return page_data

In [4]:
def scrape_page(page):
    '''
    Scrapes one page of the Ultimate Guitar Explore listing
    (tabs of type Chords, ordered from most to least total hits)

    input
        page: page number of the listing to request
    returns
        a dataframe of that page's tabs indexed by tab id,
        with the matching hit count joined on as a 'hits' column
    '''

    # build the request url: base endpoint + query params
    base_url = 'https://www.ultimate-guitar.com/explore'
    sort_order = 'hitstotal_desc'  # total hits, most popular first
    tab_type = 'Chords'  # only tabs with chord progression information (case sensitive)
    url = f'{base_url}?order={sort_order}&page={page}&type[]={tab_type}'

    listing = get_request_ug(url)['data']

    # hits are delivered separately from the tabs but share the same unique id;
    # cast id and hits to integer so they line up with the tabs dataset
    hit_rows = [[int(entry['id']), int(entry['hits'])] for entry in listing['hits']]
    hits_df = pd.DataFrame(hit_rows, columns=['id', 'hits']).set_index('id')

    # all tabs on this page with their attributes, indexed by the shared id
    tab_records = listing['tabs']
    tabs_df = pd.DataFrame(tab_records, columns=tab_records[0].keys()).set_index('id')

    # left join on the shared index (the default for DataFrame.join)
    return tabs_df.join(hits_df)

In [5]:
def get_tab_metadata(num_tabs):
    '''
    Scrapes enough Explore pages to cover num_tabs tabs and stacks them into one dataframe

    input
        num_tabs: number of tabs wanted (rounded up to whole pages of 50)
    returns
        a dataframe with the rows of every scraped page, in page order;
        an empty dataframe when num_tabs needs no pages
    '''

    # calc number of pages to scrape
    num_pages = math.ceil(num_tabs / 50)  # 50 tabs per page

    # collect the pages first and concatenate once: DataFrame.append no longer
    # exists in current pandas, and growing a frame inside a loop copies it every time
    pages = [scrape_page(p) for p in range(1, num_pages + 1)]

    if not pages:  # pd.concat refuses an empty list
        return pd.DataFrame()
    return pd.concat(pages)

In [6]:
def get_chords(song_data):
    '''
    get_chords takes the raw text of a tab and
    - finds every block that opens with one of the headers
      [Intro], [Verse], [Verse 1]..[Verse 9], [Chorus], [Outro], [Bridge]
      and is closed by an empty line (line endings are expected to be \\r\\n)
    - collects the chords wrapped in [ch]...[/ch] tags inside each block
    - returns
        a dataframe with two columns [section_name, chords], one row per block that has chords
        an empty dataframe when no such block is found
    output columns
    - section_name: header text without the brackets eg. Chorus (may be duplicated)
    - chords: list of chords in order of appearance eg. ['Eb', 'Gm6', 'Fsus4', 'Bb']
    '''

    # regex patterns
    section_header = r'\[[VCIOB][ehnur][roti][srd][euog][se ]?(\d?)\]'  # eg. [Chorus]
    section_block = section_header + r'\\r\\n(.*?)\\r\\n\\r\\n'  # header, content, closing empty line
    chord_tag = r'\[ch\](.*?)\[\/ch\]'  # a chord embedded in the lyrics

    # spell line breaks out as the literal text \r and \n so that '.' can run across lines
    escaped = song_data.replace('\r', '\\r').replace('\n', '\\n')

    rows = []
    for block in re.finditer(section_block, escaped):
        block_text = block.group(0)  # header plus content of this section
        found = re.findall(chord_tag, block_text)
        if not found:  # section without chords carries no information
            continue
        header = re.search(section_header, block_text)
        rows.append({'section_name': header.group(0)[1:-1], 'chords': found})  # [1:-1] strips the []

    return pd.DataFrame(rows)


In [7]:
def remove_duplicate_songs(df):
    '''
    drops every row that belongs to a less popular version of a song

    input
        df: dataframe with the columns song_id, id (tab version) and hits
    returns
        a new dataframe in which each song_id only keeps the rows of the
        version (id) holding that song's highest hits value; df itself is not modified
    '''

    for song in df['song_id'].unique():
        song_rows = df[df['song_id'] == song]
        tab_ids = song_rows['id'].unique().tolist()
        if len(tab_ids) > 1:  # several versions of the same song
            top_tab = song_rows.loc[song_rows['hits'].idxmax()]['id']  # version with the most hits
            tab_ids.remove(top_tab)  # whatever is left has to go
            for tab in tab_ids:
                losing_rows = song_rows[song_rows['id'] == tab].index
                df = df.drop(losing_rows, inplace=False)
    return df

#### Chord nomenclature reference
#TODO: Create and upload image explaining Ultimate Guitar chord structure

In [8]:
def is_flat(chord):
    '''True when the second character of the chord is the flat sign b (eg. Bb, Ebm)'''
    return len(chord) > 1 and chord[1] == 'b'


def is_sharp(chord):
    '''True when the second character of the chord is the sharp sign # (eg. F#, C#m)'''
    return len(chord) > 1 and chord[1] == '#'


def is_minor(chord):
    '''True when the chord contains an m that belongs to neither "dim" nor "maj" (eg. Am, F#m7)'''
    return 'm' in chord and 'dim' not in chord and 'maj' not in chord
    

def is_dim(chord):
    '''True when the chord is written as diminished, either with "dim" or with an o (eg. Bdim, Bo)'''
    return 'dim' in chord or 'o' in chord
    

def to_triad(c):
    '''
    convert a chord into a three-note chord (triad)
    eg. A7sus4 -> A
    eg. E#dim7 -> E#dim

    input
        c: chord as written in the tab (surrounding whitespace and letter case are normalised)
    returns
        root note (with # or b if present) followed by exactly one quality suffix:
        'dim' for diminished chords, 'm' for minor chords, nothing for everything else
    '''

    notes = ['A', 'B', 'C', 'D', 'E', 'F', 'G']
    incidentals = ['#', 'b']

    chord = c.strip().capitalize()  # remove surround whitespace and ensure only first char is capitalised

    dim = is_dim(chord)  # indicates a diminished chord
    minor = is_minor(chord)  # indicates a minor chord

    # archaic H notation is equivalent to a B in Western music
    if chord[0] == 'H':
        chord = 'B' + chord[1:]

    # cut the chord down to its root; diminished takes priority over minor
    if dim:
        chord = chord.split('dim')[0].split('o')[0]
    elif minor:
        chord = chord.split('m')[0]
    else:
        chord = chord[:2]

    # the loop walks the root as it was before any trimming; its variable has its
    # own name so that the parameter c is not overwritten
    for char in chord:
        if char not in notes + incidentals:
            chord = chord.split(char)[0]  # take everything to the left of unacceptable character

    # add the quality back with the same priority used above, so that a chord
    # flagged as both diminished and minor cannot end up with both suffixes
    if dim:
        chord = chord + 'dim'
    elif minor:
        chord = chord + 'm'

    return chord
   

def get_section_score(section, key_chords):
    '''
    returns number of matches between a sequence of chords and a set of chords

    input
        section: sequence of chords; a chord that occurs several times counts every time
        key_chords: iterable of the chords that belong to a key
    returns
        how many entries of section are found in key_chords
    '''

    chords_in_key = set(key_chords)  # built once instead of once per chord in the section
    return sum(1 for chord in section if chord in chords_in_key)


def get_key(section):
    '''
    get_key takes a list of triad chords
    returns
        key: the column of key_chord_mapping whose chords match the most entries of section
    several keys can reach the same score when a section has limited chords
    (eg. only 2 chords that are shared among multiple keys);
    in that case the key that comes first in key_chord_mapping is returned
    '''

    def match_count(candidate):
        # number of chords of the section that belong to the candidate key
        return get_section_score(section, key_chord_mapping[candidate])

    # max keeps the first of several equally good candidates
    return max(key_chord_mapping.columns, key=match_count)


def guitar_to_piano_chords(chords, capo):
    '''
    converts chords as fingered on a guitar with a capo into the chords that actually sound
    (every fret the capo is moved up raises the pitch by 1 semitone)
    input
        chords: list of chords as written in the tab
        capo: fret number the capo is on
    returns
        a new list with every chord raised by n=capo semitones
    '''

    def shift_up(chord):
        # raise one chord semitone by semitone
        for _ in range(capo):
            chord = adjust_pitch_by_semitone(chord=chord, pitch_up=True)
        return chord

    return [shift_up(chord) for chord in chords]
    

def change_notation(chord, switcher):
    '''
    input
        chord: triad such as 'A#', 'A#m' or 'A#dim'
        switcher: dict that maps a root note to its replacement
    returns
        corresponding value in switcher, retaining chord quality (eg. dim or m);
        a root that is not a key of switcher is kept as it is
    '''

    minor = is_minor(chord)
    dim = is_dim(chord)

    # strip the quality so that only the root is looked up
    if minor:
        chord = chord[:-1]  # chop off m (assumes the m is the last character, as in a triad)
    elif dim:  # use the flag computed above; the bare function name is always truthy
        chord = chord.split('dim')[0].split('o')[0]  # chop off dim

    chord = switcher.get(chord, chord)

    if minor:
        chord = chord + 'm'  # add m back
    elif dim:
        chord = chord + 'dim'  # add dim back

    return chord
    
    
def adjust_pitch_by_semitone(chord, pitch_up):
    '''
    moves a chord by one semitone while keeping its quality
    input
        chord: chord to move
        pitch_up: truthy to raise the chord, falsy to lower it
    returns
        pitch-adjusted chord (results are spelled with naturals or sharps)
    '''

    # root -> root one semitone higher
    one_up = {
        'A': 'A#', 'A#': 'B', 'Bb': 'B',
        'B': 'C', 'Cb': 'C', 'B#': 'C#',
        'C': 'C#', 'C#': 'D', 'Db': 'D',
        'D': 'D#', 'D#': 'E', 'Eb': 'E',
        'E': 'F', 'Fb': 'F', 'E#': 'F#',
        'F': 'F#', 'F#': 'G', 'Gb': 'G',
        'G': 'G#', 'G#': 'A', 'Ab': 'A',
    }

    # root -> root one semitone lower
    one_down = {
        'A#': 'A', 'Bb': 'A', 'B': 'A#',
        'Cb': 'A#', 'C': 'B', 'B#': 'B',
        'C#': 'C', 'Db': 'C', 'D': 'C#',
        'D#': 'D', 'Eb': 'D', 'E': 'D#',
        'Fb': 'D#', 'E#': 'E', 'F': 'E',
        'F#': 'F', 'Gb': 'F', 'G': 'F#',
        'G#': 'G', 'Ab': 'G', 'A': 'G#',
    }

    switcher = one_up if pitch_up else one_down
    return change_notation(chord, switcher)


def switch_notation(chord):
    """
    take a chord and switch its root to the equivalent (enharmonic) spelling,
    keeping the chord quality
    eg. A# -> Bb, Cb -> B, G#m -> Abm, C -> B#
    roots without an entry (A, D, G) are passed through
    """

    switcher = {
        # sharps -> flats / naturals
        'A#': 'Bb', 'C#': 'Db', 'D#': 'Eb', 'F#': 'Gb', 'G#': 'Ab',
        'B#': 'C', 'E#': 'F',
        # flats -> sharps / naturals
        'Bb': 'A#', 'Db': 'C#', 'Eb': 'D#', 'Gb': 'F#', 'Ab': 'G#',
        'Cb': 'B', 'Fb': 'E',
        # naturals that can also be written with a sharp or a flat
        'B': 'Cb', 'C': 'B#', 'E': 'Fb', 'F': 'E#',
    }

    return change_notation(chord, switcher)


def simplify_notation(chord):
    """
    take a chord whose root carries a sharp or a flat and switch the root to
    its other spelling, keeping the chord quality
    eg. A# -> Bb, Bb -> A#, Cb -> B, G#m -> Abm
    if the root is not flat or sharp, pass through (eg. A -> A)
    """

    switcher = {
        # sharps -> flats / naturals
        'A#': 'Bb', 'C#': 'Db', 'D#': 'Eb', 'F#': 'Gb', 'G#': 'Ab',
        'B#': 'C', 'E#': 'F',
        # flats -> sharps / naturals
        'Bb': 'A#', 'Db': 'C#', 'Eb': 'D#', 'Gb': 'F#', 'Ab': 'G#',
        'Cb': 'B', 'Fb': 'E',
    }

    return change_notation(chord, switcher)
    

def chords_to_degree(chords, key, map_to_closest=False):
    '''
    chords_to_degree(chords, key)
    - maps each chord to its scale degree in the key provided (index of key_chord_mapping)
    input
        chords: list of chords
        key: column of key_chord_mapping to read the degrees from
        map_to_closest: when True a chord outside the key is first replaced by
                        closest_chord_in_key, otherwise its degree is None
    returns
        list of degrees, one per chord
    '''

    key_column = key_chord_mapping[key]
    members = key_column.to_list()

    def degree_of(chord):
        # index label of the row that holds the chord in this key
        return key_chord_mapping[key_column == chord].index.to_numpy()[0]

    degrees = []
    for chord in chords:
        if chord in members:
            degrees.append(degree_of(chord))
        elif map_to_closest:
            degrees.append(degree_of(closest_chord_in_key(chord, key)))
        else:
            degrees.append(None)

    return degrees


def closest_chord_in_key(chord, key):
    '''
    returning a close chord in key of the same chord quality (minor, major or diminished)
    input
        chord: triad to place in the key
        key: column of key_chord_mapping
    returns
        the chord itself when it already belongs to the key,
        the chord on degree 7 of the key for any other diminished chord,
        otherwise the first chord of the key reached by moving the chord up or
        down one semitone at a time, the direction being picked at random on every step
    raises
        ValueError: the chord cannot be moved because its root is unknown
    '''

    # work on a plain list: `in` on a pandas Series looks at the index labels, not at the chords
    chords_in_key = key_chord_mapping[key].to_list()

    if chord in chords_in_key:
        return chord

    if is_dim(chord):  # chord degrees: 7
        # avoid while loop and retrieve correct diminished chord in key
        return key_chord_mapping.loc[7, key]

    # map to a closeby chord of same quality (minor or major)
    pitch_adj = [True, False]
    while chord not in chords_in_key and switch_notation(chord) not in chords_in_key:
        shifted = adjust_pitch_by_semitone(chord=chord, pitch_up=random.choice(pitch_adj))
        if shifted == chord:
            # an unknown root comes back unchanged, which would keep this loop running forever
            raise ValueError('cannot shift chord ' + repr(chord) + ' towards key ' + repr(key))
        chord = shifted

    # check whether sharp or flat representation was correct
    if chord in chords_in_key:
        return chord
    return switch_notation(chord)

# Begin scraping

In [9]:
# data folders, located relative to the folder the notebook is run from
DATA_DIR = os.path.join(os.getcwd(), '..', 'data')
DATA_RAW_DIR = os.path.join(DATA_DIR, 'raw')
DATA_LOOKUPS_DIR = os.path.join(DATA_DIR, 'lookups')

# lookup table: one column per musical key, one row per scale degree, cells hold the triad chord
key_chord_mapping = pd.read_csv(
    os.path.join(DATA_LOOKUPS_DIR, 'musical_key-triad_chord_mapping.csv'),
    index_col='Degree',
)

In [10]:
# scrape ultimate guitar 'explore' page for metadata (md) and discard the columns that are not used
md = get_tab_metadata(num_tabs).drop(
    columns=['recording', 'part', 'tab_access_type', 'version_description', 'tp_version']
)

In [11]:
chords = []  # one dataframe per song; concatenated in a later cell

# loop through tabs, fetch tab data and parse into chords
# (Series.items replaces iteritems, which current pandas no longer provides)
for index, tab_url in md['tab_url'].items():  # get index to use as foreign key

    # send get request and extract tab info
    tab_data = get_request_ug(tab_url)['tab_view']  # get tab data
    tab_content = tab_data['wiki_tab']['content']  # extract tab content

    # extract capo fret number to pitch up guitar chords to get the true chord
    try:
        # eg. capo on 2nd fret, a chord that is denoted as Em is actually F#m (Em -> Fm -> F#m)
        capo = tab_data['meta']['capo']
    except (KeyError, TypeError):
        # capo key not present in json response
        capo = 0

    # parse tab content to extract chords; move to next URL if nothing was found
    song = get_chords(tab_content)
    if song.empty:
        continue

    # create additional columns; list multiplication gives a column of correct length
    song['id'] = pd.Series([index] * len(song), dtype='int64')  # to be used to reference metadata (foreign key)
    song['capo'] = pd.Series([capo] * len(song), dtype='int64')  # save capo with chords
    # placeholder columns use object dtype because every cell is going to hold a python list
    song['key'] = pd.Series([None] * len(song), dtype='object')
    song['chords_simplified'] = pd.Series([None] * len(song), dtype='object')
    song['chords_simplified_pitch_corrected'] = pd.Series([None] * len(song), dtype='object')
    song['chords_numeric'] = pd.Series([None] * len(song), dtype='object')

    # get alternate chord representations
    song_chords = []
    for ids, section in song['chords'].items():
        simple_section = [to_triad(chord) for chord in section]

        # pitch correct guitar chords
        simple_pitch_corrected_section = guitar_to_piano_chords(simple_section, capo)

        # add alternate chords to dataframe
        song.at[ids, 'chords_simplified'] = simple_section
        song.at[ids, 'chords_simplified_pitch_corrected'] = simple_pitch_corrected_section

        song_chords.extend(simple_pitch_corrected_section)

    # get key of song - assumed 1 key for entire song
    key = get_key(song_chords)

    # these keys can be represented more simply; switch the key and re-spell the chords to match
    if key in ['A#', 'D#', 'G#', 'C#', 'F#', 'Cb']:
        key = switch_notation(key)
        for ids, section in song['chords_simplified_pitch_corrected'].items():
            song.at[ids, 'chords_simplified_pitch_corrected'] = [simplify_notation(chord) for chord in section]

    # add song's key to each of its sections in the dataframe
    song['key'] = [key] * len(song)

    # loop through sections and make numeric
    for ids, section in song['chords_simplified_pitch_corrected'].items():
        song.at[ids, 'chords_numeric'] = chords_to_degree(section, key, map_to_closest=True)

    # create chords: a list of song dataframes
    chords.append(song)

In [12]:
# `song` is the variable left over from the scraping loop, i.e. the last tab it fetched
song.head()  # an individual song

Unnamed: 0,section_name,chords,id,capo,key,chords_simplified,chords_simplified_pitch_corrected,chords_numeric
0,Intro,[Am],141670,0,C,[Am],[Am],[6]
1,Chorus,"[Dm, Am, Em, Am]",141670,0,C,"[Dm, Am, Em, Am]","[Dm, Am, Em, Am]","[2, 6, 3, 6]"
2,Verse 1,"[Am, G, F, G, E/G#]",141670,0,C,"[Am, G, F, G, E]","[Am, G, F, G, E]","[6, 5, 4, 5, 4]"
3,Bridge,[Am],141670,0,C,[Am],[Am],[6]
4,Chorus,"[Dm, Am, Em, Am]",141670,0,C,"[Dm, Am, Em, Am]","[Dm, Am, Em, Am]","[2, 6, 3, 6]"


In [13]:
# Stack the per-song dataframes into one table with a fresh 0..n-1 index.
# NOTE(review): this rebinds `chords` from a list to a dataframe, so the cell is
# meant to run exactly once after the scraping loop.
chords = pd.concat(chords, axis=0, ignore_index=True)
chords  # all songs concatenated

Unnamed: 0,section_name,chords,id,capo,key,chords_simplified,chords_simplified_pitch_corrected,chords_numeric
0,Intro,"[C, Am, C, Am]",198052,1,Db,"[C, Am, C, Am]","[Db, Bbm, Db, Bbm]","[1, 6, 1, 6]"
1,Verse 1,"[C, Am, C, Am, F, G, C, G, C, F, G, Am, F, G, ...",198052,1,Db,"[C, Am, C, Am, F, G, C, G, C, F, G, Am, F, G, ...","[Db, Bbm, Db, Bbm, Gb, Ab, Db, Ab, Db, Gb, Ab,...","[1, 6, 1, 6, 4, 5, 1, 5, 1, 4, 5, 6, 4, 5, 1, 6]"
2,Chorus,"[F, Am, F, C, G, C, Am, C, Am]",198052,1,Db,"[F, Am, F, C, G, C, Am, C, Am]","[Gb, Bbm, Gb, Db, Ab, Db, Bbm, Db, Bbm]","[4, 6, 4, 1, 5, 1, 6, 1, 6]"
3,Verse 2,"[C, Am, C, Am, F, G, C, G, C, F, G, Am, F, G, ...",198052,1,Db,"[C, Am, C, Am, F, G, C, G, C, F, G, Am, F, G, ...","[Db, Bbm, Db, Bbm, Gb, Ab, Db, Ab, Db, Gb, Ab,...","[1, 6, 1, 6, 4, 5, 1, 5, 1, 4, 5, 6, 4, 5, 4, 6]"
4,Chorus,"[F, Am, F, C, G, C, Am, C, Am]",198052,1,Db,"[F, Am, F, C, G, C, Am, C, Am]","[Gb, Bbm, Gb, Db, Ab, Db, Bbm, Db, Bbm]","[4, 6, 4, 1, 5, 1, 6, 1, 6]"
...,...,...,...,...,...,...,...,...
25352,Bridge,[Am],141670,0,C,[Am],[Am],[6]
25353,Chorus,"[Dm, Am, Em, Am]",141670,0,C,"[Dm, Am, Em, Am]","[Dm, Am, Em, Am]","[2, 6, 3, 6]"
25354,Verse 2,"[Am, G, F, E]",141670,0,C,"[Am, G, F, E]","[Am, G, F, E]","[6, 5, 4, 4]"
25355,Bridge,"[Am, G, F, E, Am, G, F, E]",141670,0,C,"[Am, G, F, E, Am, G, F, E]","[Am, G, F, E, Am, G, F, E]","[6, 5, 4, 4, 6, 5, 4, 4]"


## Sanity checks

#### count degrees of scales (0 indicates there is an element that is not 1-7 inclusive)


In [14]:
# Tally every scale degree found in the dataset in a single pass.
# Slot 0 collects anything that is not a valid degree (1-7), eg. None for an unmapped chord.
valid_degrees = [1, 2, 3, 4, 5, 6, 7]
degree_counts = np.zeros(8)
for section_degrees in chords['chords_numeric']:
    for degree in section_degrees:
        slot = valid_degrees.index(degree) + 1 if degree in valid_degrees else 0
        degree_counts[slot] += 1

degree_counts = pd.DataFrame(degree_counts)

# bar chart with the exact count written above each bar
fig, ax = plt.subplots()
degree_counts.plot.bar(ax=ax)
for position, count in enumerate(degree_counts[0]):
    ax.text(position - 0.25, count + 10, str(int(count)))

ax.set_title('Number of each chord degree present in chords dataset');


- all chords are represented with numbers 1-7
- there are no missed chords (represented by index 0)
- most popular chords are 1,4,5,6 - matches a well-known fact about pop music

#### check all chords (or their equivalent notations) exist within the key_chord_mapping table

In [15]:
# every chord of every key, in both of its spellings
all_chords = set()
for col in key_chord_mapping:
    key_chords = key_chord_mapping[col].to_list()
    all_chords.update(key_chords)
    all_chords.update(switch_notation(c) for c in key_chords)

# count chords of the dataset that are unknown to the lookup table
errors = sum(
    1
    for s in chords['chords_simplified_pitch_corrected']
    for c in s
    if c not in all_chords
)
print(f'Incomplete chord mappings: {errors}')

Incomplete chord mappings: 0


In [16]:
# lookup table: columns are keys, rows are scale degrees 1-7, cells are the triad on that degree
key_chord_mapping

Unnamed: 0_level_0,A,A#,Bb,B,C,C#,Db,D,D#,Eb,E,F,F#,Gb,G,G#,Ab
Degree,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1,A,A#,Bb,B,C,C#,Db,D,D#,Eb,E,F,F#,Gb,G,G#,Ab
2,Bm,Cm,Cm,C#m,Dm,D#m,Ebm,Em,Fm,Fm,F#m,Gm,G#m,Abm,Am,A#m,Bbm
3,C#m,Dm,Dm,D#m,Em,Fm,Fm,F#m,Gm,Gm,G#m,Am,A#m,Bbm,Bm,Cm,Cm
4,D,D#,Eb,E,F,F#,Gb,G,G#,Ab,A,Bb,B,B,C,C#,Db
5,E,F,F,F#,G,G#,Ab,A,A#,Bb,B,C,C#,Db,D,D#,Eb
6,F#m,Gm,Gm,G#m,Am,A#m,Bbm,Bm,Cm,Cm,C#m,Dm,D#m,Ebm,Em,Fm,Fm
7,G#dim,Adim,Adim,A#dim,Bdim,Cdim,Cdim,C#dim,Ddim,Ddim,D#dim,Edim,Fdim,Fdim,F#dim,Gdim,Gdim


### Investigate metadata structure and data quality

In [17]:
# one row per tab, indexed by the tab id that the chords table refers to
md.head()  # metadata (scraped from Explore page of Ultimate Guitar)

Unnamed: 0_level_0,song_id,song_name,artist_id,artist_name,type,version,votes,rating,date,status,preset_id,tonality_name,verified,artist_url,tab_url,type_name,hits
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
198052,135952,Hallelujah,9898,Jeff Buckley,Chords,2,44950,4.87507,1121385600,approved,125,Db,0,https://www.ultimate-guitar.com/artist/jeff_bu...,https://tabs.ultimate-guitar.com/tab/jeff-buck...,Chords,33697652
1956589,1710759,Perfect,30232,Ed Sheeran,Chords,1,36714,4.86017,1488453515,approved,18913,Ab,0,https://www.ultimate-guitar.com/artist/ed_shee...,https://tabs.ultimate-guitar.com/tab/ed-sheera...,Chords,29677861
1248578,346275,All Of Me,11714,John Legend,Chords,1,22113,4.83845,1369872001,approved,14737,Em,0,https://www.ultimate-guitar.com/artist/john_le...,https://tabs.ultimate-guitar.com/tab/john-lege...,Chords,26763698
1137467,298037,Let Her Go,21762,Passenger,Chords,1,19769,4.85244,1331596801,approved,2333,Em,0,https://www.ultimate-guitar.com/artist/passeng...,https://tabs.ultimate-guitar.com/tab/passenger...,Chords,26669082
1086983,152656,Cant Help Falling In Love,11125,Elvis Presley,Chords,1,23890,4.86575,1314230401,approved,2368,C,0,https://www.ultimate-guitar.com/artist/elvis_p...,https://tabs.ultimate-guitar.com/tab/elvis-pre...,Chords,23652859


In [18]:
# columns suspected of holding one single value for every tab; list their value counts to check
md_columns = ['type', 'status', 'verified', 'type_name']
for column in md_columns:
    print(md[column].value_counts())

Chords    5000
Name: type, dtype: int64
approved    5000
Name: status, dtype: int64
0    5000
Name: verified, dtype: int64
Chords    5000
Name: type_name, dtype: int64


In [19]:
# drop the columns inspected above together with 'votes'
md = md.drop(columns=md_columns + ['votes'])

In [20]:
# can use id from song table to read from md (metadata): look up the tab of the first section
md[md.index == chords.loc[0, 'id']]

Unnamed: 0_level_0,song_id,song_name,artist_id,artist_name,version,rating,date,preset_id,tonality_name,artist_url,tab_url,hits
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
198052,135952,Hallelujah,9898,Jeff Buckley,2,4.87507,1121385600,125,Db,https://www.ultimate-guitar.com/artist/jeff_bu...,https://tabs.ultimate-guitar.com/tab/jeff-buck...,33697652


# Investigate data quality, clean up

In [21]:
# attach each tab's metadata to its sections, then keep only the most popular version of every song
df = remove_duplicate_songs(chords.join(md, on='id'))

In [22]:
# column dtypes and non-null counts of the joined, de-duplicated table
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19002 entries, 0 to 25356
Data columns (total 20 columns):
section_name                         19002 non-null object
chords                               19002 non-null object
id                                   19002 non-null int64
capo                                 19002 non-null int64
key                                  19002 non-null object
chords_simplified                    19002 non-null object
chords_simplified_pitch_corrected    19002 non-null object
chords_numeric                       19002 non-null object
song_id                              19002 non-null int64
song_name                            19002 non-null object
artist_id                            19002 non-null int64
artist_name                          19002 non-null object
version                              19002 non-null int64
rating                               19002 non-null float64
date                                 19002 non-null object
preset_id 

In [23]:
# number of missing values per column
df.isna().sum()

section_name                         0
chords                               0
id                                   0
capo                                 0
key                                  0
chords_simplified                    0
chords_simplified_pitch_corrected    0
chords_numeric                       0
song_id                              0
song_name                            0
artist_id                            0
artist_name                          0
version                              0
rating                               0
date                                 0
preset_id                            0
tonality_name                        0
artist_url                           0
tab_url                              0
hits                                 0
dtype: int64

# Save extracted dataset

In [27]:
# Split the cleaned table back into its chord columns and its metadata columns.
# Both results keep df's row index, so they stay aligned row by row.
# NOTE(review): md used to be one row per tab indexed by tab id; after this line it is
# one row per section and the tab id is no longer part of it (id was the index, not a
# column). Confirm that consumers of metadata.csv expect this layout.
chords = df[chords.columns]
md = df[md.columns]

In [28]:
# make sure the output folder exists, to_csv does not create it
os.makedirs(DATA_RAW_DIR, exist_ok=True)

# save chords to csv
chords.to_csv(os.path.join(DATA_RAW_DIR, 'chords_by_section.csv'))

# save metadata to csv
md.to_csv(os.path.join(DATA_RAW_DIR, 'metadata.csv'))

# Limitations and areas for improvement

### - all chords are forced to be diatonic (only include notes of the scale)
This doesn't allow for a wide variety of chord structures that can greatly change the feel of the music, and thus its popularity
### - only triads are used, all chord extensions are suppressed
Similar to the point above: There are many variations on basic triads that contribute to the feel of the music. In particular, seventh chords are commonly used (even in pop music) to create tension and contribute to a more satisfying sense of resolution in the chord progression.
### - all chords are mapped to the one key
This decision was made to simplify the problem statement, and is a reasonable assumption for (recent) pop music. In more complex arrangements (eg. Queen - Bohemian Rhapsody), there are a number of key changes, which are not reflected in this dataset.
### - chords that are not in the key are randomly mapped to another chord (of the same quality) in the correct key
For example, this means that a progression that repeats itself twice, but has chord(s) not in the key, will not necessarily be mapped so that the chord progression is repeating anymore. Repetition is an important part of music as it gives the listener a sense of what to expect.
### - maximum of 5000 tabs
The URL used that links to the Ultimate Guitar Explore page has a maximum of 100 pages (50 tabs per page)
### - Each section is saved separately (even if repeated)
This may cause the resulting Machine Learning model to favour chord progressions that are used in songs with more sections, simply because they will occupy a larger percentage of the training set