### Data Collection - Top100 Chart data and Spotify Audio Feature acquisition

In [4]:
#Import and install the Spotipy library if necessary: https://spotipy.readthedocs.io/en/2.22.1/
!pip install spotipy
#The following packages may be required
!pip install urllib3
!pip install requests



In [7]:
#Retrieve Official UK Chart data and then retrieve Spotify Audio Analysis data for each track
import requests, bs4, pandas as pd
import csv
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import urllib.parse
import re
import time

#Spotify developer account credentials. Using Spotipy library to access Spotify API
#CREATE YOUR OWN SPOTIFY DEVELOPER ACCOUNT and expose its credentials through the
#SPOTIPY_CLIENT_ID and SPOTIPY_CLIENT_SECRET environment variables before running this cell.
#SpotifyClientCredentials() reads those variables when no explicit values are passed, so no
#secret has to be stored in the notebook (any credentials previously saved here should be revoked).
sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials(),
                     retries=0,status_retries=0,backoff_factor=5)
#A 'back-off' factor is configured in an attempt to prevent exceeding the Spotify API rate limit
#NOTE(review): with retries=0 and status_retries=0 the back-off is presumably never applied - confirm
#If the rate limit is triggered, it is necessary to wait for ~60mins+ to continue

#Create a CSV containing Year End Top 100 UK chart for chosen year
#Code adapted from source at https://medium.com/@caineosborne/analysing-uk-chart-history-1956-to-2017-6fec0ecc991b
def getchart(year):
    """Scrape one Official Charts year-end singles chart and append it to a CSV.

    Downloads the end-of-year singles chart page for ``year``, extracts the
    position, artist and title of every entry, and appends one
    ``[year, position, artist, title]`` row per entry to
    ``chartoutput_<year>.csv`` in the current working directory.

    Args:
        year: The year as a string (it is concatenated into both the URL and
            the output filename).

    Returns:
        ``None`` if the response status is not 200, otherwise an empty tuple
        (kept for backward compatibility with the original ``return()``).

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not answer within the timeout.
        IndexError: If the page has fewer artist or title elements than
            position elements, so the rows cannot be paired up.
    """
    url = 'http://www.officialcharts.com/charts/end-of-year-singles-chart/'+year+'0101/37501/'
    print('Getting Page %s ' %url)
    #a timeout stops the notebook hanging forever if the site never answers
    req = requests.get(url, timeout=30)
    req.raise_for_status()

    #raise_for_status() only covers 4xx/5xx; give up on any other non-200 status too
    if req.status_code != 200:
        return None

    soup = bs4.BeautifulSoup(req.text,"lxml")

    #retrieve track position, artist and track name
    positions = soup.find_all("span", class_="position")
    tracks = soup.find_all("div", class_="title")
    artists = soup.find_all("div", class_="artist")

    #rows are paired by index, so every position needs a matching artist and title
    if len(artists) < len(positions) or len(tracks) < len(positions):
        raise IndexError(
            'chart page for %s has %d positions but only %d artists and %d titles'
            % (year, len(positions), len(artists), len(tracks)))

    #create a list of each track, tidying the format
    #strip('\r\n') removes any mix of CR/LF; chaining strip('\n').strip('\r')
    #left a stray newline behind when the text was wrapped in '\r\n'
    alltracks = [
        [year, position.text, artist.text.strip('\r\n'), title.text.strip('\r\n')]
        for position, artist, title in zip(positions, artists, tracks)
    ]

    #write tracks to CSV, appending to existing file (the with-block closes it)
    filename = "chartoutput_" + year + ".csv"
    with open(filename,'a',newline='') as resultFile:
        csv.writer(resultFile).writerows(alltracks)

    return ()


#Retrieve Spotify track data & audio analysis metrics for a given track/artist
#Uses the Spotify search method to find track
#Code adapted from examples at: https://spotipy.readthedocs.io/en/latest/
def _first_spotify_match(result):
    """Return (name, first artist name, track id) of the first search hit, or None."""
    try:
        item = result['tracks']['items'][0]
        return item['name'], item['artists'][0]['name'], item['id']
    except (KeyError, IndexError, TypeError):
        #no hit, or a response that does not have the expected structure
        return None


def getspotifydata(track, artist, year, chartposition,outputcsv):
    """Look up one chart entry on Spotify and append its audio features to a CSV.

    Searches Spotify through the module-level ``sp`` client, first with a
    ``track:``/``artist:`` query and, only if that finds nothing, with a plain
    ``"<track> <artist>"`` query. For the first hit the audio features are
    fetched with a single API call and one row is appended to ``outputcsv``:
    year, chart position, Spotify track name, Spotify artist name,
    acousticness, danceability, energy, instrumentalness, liveness, valence,
    key, mode, tempo, loudness.

    Args:
        track: Track title as it appears in the chart.
        artist: Artist string as it appears in the chart.
        year: Chart year, written to the output row unchanged.
        chartposition: Chart position, written to the output row unchanged.
        outputcsv: Path of the CSV file to append to (UTF-8, '|' quote char).

    Returns:
        An empty tuple when the track is skipped (no search hit or no audio
        features available), otherwise ``None``.
    """
    #replace problem characters in track/artist string
    track = re.sub(r'[*/;]', ' ', track)
    artist = re.sub(r'[*/&;]', ' ', artist)

    #compose search string
    #this is complicated by Spotify including featured artists in title rather than artist
    #initially try search with only first two terms of the artist field (unless length of 1 etc.)
    artist_terms = artist.split()
    if not artist_terms:
        #nothing usable in the artist field: search on the track title alone
        searchstring = f'track:{track}'
    elif len(artist_terms) == 1 or artist_terms[1] == "FT":
        searchstring = f'track:{track} artist:{artist_terms[0]}'
    else:
        searchstring = f'track:{track} artist:{artist_terms[0]} {artist_terms[1]}'
    print('Track:',track,'Artist:',artist)

    #alternative searchstring when the track cannot be found using the previous search string format
    altsearchstring = f'{track} {artist}'
    print()

    #use Spotipy library to access Spotify API search function to return a single track
    #the alternative search is only issued when needed, to spare the rate-limited API
    match = _first_spotify_match(sp.search(q=searchstring,type='track', limit=1))
    if match is None:
        print('searchstring did not give a result, trying altsearchstring')
        match = _first_spotify_match(sp.search(q=altsearchstring,type='track', limit=1))
        if match is None:
            print('altsearchstring did not give a result, skipping this track')
            print()
            return ()
    name, artist, track_id = match

    #If result found then retrieve the audio features with one request (not one per metric)
    features = sp.audio_features(track_id)
    if not features or features[0] is None:
        #Spotify has no audio features for this track
        print('no audio features available, skipping this track')
        print()
        return ()
    features = features[0]

    metric_names = ('acousticness', 'danceability', 'energy', 'instrumentalness',
                    'liveness', 'valence', 'key', 'mode', 'tempo', 'loudness')
    row = [year, chartposition, name, artist] + [features[metric] for metric in metric_names]

    print(year,',',chartposition,',',name,',', artist)
    print()

    #Append track data to CSV file
    #Use UTF-8 encoding to accommodate non Latin characters
    with open(outputcsv, 'a', newline='', encoding='UTF-8') as csvfile:
        spamwriter = csv.writer(csvfile, delimiter=',',
                            quotechar='|', quoting=csv.QUOTE_MINIMAL)
        #no header row is written; the columns follow the order documented in the docstring
        spamwriter.writerow(row)
    

    
#Get Official chart data for specified period (and store in CSV)
def getchartdata(startyear,endyear):
    """Download the year-end chart for every year from startyear to endyear inclusive.

    Each year is converted to a string and handed to getchart(), which stores
    the chart in a CSV file.
    """
    for year_label in map(str, range(startyear, endyear + 1)):
        getchart(year_label)
        

        
#Retrieve Spotify data for all tracks present in provided CSV
def getspotifydatafortracks(inputcsv,outputcsv,delay=2):
    """Fetch Spotify data for every track listed in a chart CSV.

    Reads ``inputcsv`` row by row, expecting the columns year, chart position,
    artist, track (in that order, no header row), and calls getspotifydata()
    for each row so that the results are appended to ``outputcsv``.

    Args:
        inputcsv: Path of the chart CSV to read.
        outputcsv: Path of the CSV that getspotifydata() appends to.
        delay: Seconds to pause after each track to avoid hitting the Spotify
            API rate limit. Defaults to 2.
    """
    #the with-block closes the file even if a lookup raises part-way through
    with open(inputcsv,"r",newline='') as chartfile:
        charts = csv.reader(chartfile)
        #next(charts) # skip first line if there is a header
        for line in charts: # already split for us
            if len(line) < 4:
                #blank or incomplete row: nothing to look up
                continue
            year, chartposition, artist, track = line[:4]
            getspotifydata(track,artist,year,chartposition,outputcsv)
            time.sleep(delay) # pause between tracks - to avoid hitting Spotify API rate limit

In [8]:
#Download chart data for the chosen period with getchartdata(startyear, endyear); both years are inclusive.
#Each run appends to chartoutput_<year>.csv, so remove an existing file before re-running to avoid duplicate rows.
#getchartdata(2005,2022) # get chart data for all available years

getchartdata(2020,2020) # get chart data for only 2020 for testing

Getting Page http://www.officialcharts.com/charts/end-of-year-singles-chart/20200101/37501/ 


In [9]:
#Get Spotify data for the downloaded charts
#The Spotify API is rate-limited. As of March 2023, when the data was accessed, it was possible to download
#two years of Top100 chart data within a 30min window.
#The example below takes an input csv generated by the getchartdata() function and appends the Spotify audio feature
#data to the csv file given in the second parameter.
#No header row is written; each output row holds year, chart position, track, artist and then the audio features.
getspotifydatafortracks("chartoutput_2020.csv","chartspotifydata_2020.csv")