# Auto scraper

## Libraries

In [2]:
import ast
import csv
import os
from itertools import cycle
from random import randint
from time import sleep

import numpy as np
import pandas as pd
import requests
import seaborn as sns
import spotipy
from bs4 import BeautifulSoup
from spotipy.oauth2 import SpotifyClientCredentials

## Functions

### Rateyourmusic.com

In [3]:
def make_soup(url, timeout=30):
    """Download ``url`` and parse the response body as HTML.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before ``requests`` raises a
            timeout error. Without it a stalled connection would block the
            caller indefinitely.

    Returns:
        A ``BeautifulSoup`` tree built with the ``html.parser`` backend.
    """
    # Desktop Chrome User-Agent string sent with every request.
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36"}
    response = requests.get(url, headers=headers, timeout=timeout)
    return BeautifulSoup(response.content, "html.parser")

In [35]:
def _parse_chart_row(row):
    """Extract ``[track, artist, date, rating]`` from one chart ``<tr>``.

    Returns ``None`` when any of the four pieces is missing, so rows that do
    not describe a song can simply be skipped by the caller.
    """
    artist = row.find('a', attrs={'class': 'artist'})
    track = row.find('a', attrs={'class': 'album'})
    date = row.find('div', attrs={'class': 'chart_year'})
    stats = row.find('div', attrs={'class': 'chart_stats'})
    if artist is None or track is None or date is None or stats is None:
        return None

    # The rating is the text of the first <b> inside the first <a> of the
    # stats cell; either tag may be absent.
    rating_link = stats.a
    rating = rating_link.b if rating_link is not None else None
    if rating is None:
        return None

    # Names are kept as UTF-8 bytes: csv.writer renders them as b'...'
    # literals, which is the format get_spotify_uris expects to read back.
    return [track.text.encode('utf8'), artist.text.encode('utf8'), date.text, rating.text]


def get_rym_data(start_year, end_year, start_page, end_page, target_filename,
                 delay_range=(90, 120)):
    """Scrape rateyourmusic.com single charts and save the songs to a CSV file.

    Every combination of chart type ('top', 'esoteric', 'bottom'), year in
    ``start_year..end_year`` and page in ``start_page..end_page`` (both ranges
    inclusive) is downloaded with ``make_soup``.

    Args:
        start_year: First chart year to scrape.
        end_year: Last chart year to scrape (inclusive).
        start_page: First chart page to scrape.
        end_page: Last chart page to scrape (inclusive).
        target_filename: Path of the CSV file to write. It gets the header
            ``track, artist, date, rating`` followed by one row per song.
        delay_range: ``(min_seconds, max_seconds)`` bounds of the random pause
            taken before each request to avoid getting blocked.

    Returns:
        None. The result is the file written to ``target_filename``.
    """
    chart_types = ['top', 'esoteric', 'bottom']
    chart_dates = range(start_year, end_year + 1)
    chart_pages = range(start_page, end_page + 1)

    song_list = []

    for t in chart_types:
        for d in chart_dates:
            for p in chart_pages:

                # pause to avoid getting blocked
                sleep(randint(*delay_range))

                url = (
                    "https://rateyourmusic.com/customchart"
                    f"?page={p}&chart_type={t}&type=single&year={d}"
                    "&genre_include=1&include_child_genres=1&genres="
                    "&include_child_genres_chk=1&include=both"
                    "&origin_countries=&limit=none&countries="
                )
                soup = make_soup(url)

                # Rows without complete song data are skipped rather than
                # ending the page, so the songs after them are still collected.
                for row in soup.find_all('tr'):
                    song = _parse_chart_row(row)
                    if song is not None:
                        song_list.append(song)

                print(f'Page {p} from {t}-{d} finished scrapping.')

    # newline='' is what the csv module requires to avoid blank lines on
    # Windows; the with-block closes the file even if writing fails.
    with open(target_filename, 'w', newline='', encoding='utf-8') as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(['track', 'artist', 'date', 'rating'])
        csv_writer.writerows(song_list)
    

### Spotify API

In [5]:
def authenticate_spotify(client_id=None, client_secret=None):
    """Create a Spotify client authenticated with the client-credentials flow.

    Credentials are never stored in the notebook. They are taken from the
    arguments when given, otherwise from the ``SPOTIPY_CLIENT_ID`` and
    ``SPOTIPY_CLIENT_SECRET`` environment variables.

    Args:
        client_id: Spotify application client id (optional).
        client_secret: Spotify application client secret (optional).

    Returns:
        A ``spotipy.Spotify`` client.

    Raises:
        RuntimeError: If either credential cannot be found.
    """
    # NOTE: a secret that has ever been saved inside a shared notebook should
    # be rotated in the Spotify developer dashboard.
    client_id = client_id or os.environ.get('SPOTIPY_CLIENT_ID')
    client_secret = client_secret or os.environ.get('SPOTIPY_CLIENT_SECRET')
    if not client_id or not client_secret:
        raise RuntimeError(
            'Spotify credentials not found: set the SPOTIPY_CLIENT_ID and '
            'SPOTIPY_CLIENT_SECRET environment variables.'
        )

    client_credentials_manager = SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)
    return spotipy.Spotify(client_credentials_manager=client_credentials_manager)

In [6]:
def _decode_scraped_text(value):
    """Turn one scraped CSV cell back into readable text.

    get_rym_data stores names as UTF-8 ``bytes``, so the CSV holds their repr
    (``b'...'``). Evaluating that literal and decoding it restores non-ASCII
    characters and escaped quotes, which plain slicing would leave as literal
    backslash sequences. Values that are not a bytes literal are returned
    unchanged; missing values become an empty string.
    """
    if pd.isna(value):
        return ''
    text = str(value)
    is_bytes_literal = (
        len(text) >= 3
        and text[0] == 'b'
        and text[1] in ("'", '"')
        and text[-1] == text[1]
    )
    if not is_bytes_literal:
        return text
    try:
        raw = ast.literal_eval(text)
    except (ValueError, SyntaxError):
        # Malformed literal: fall back to just dropping the b'...' wrapper.
        return text[2:-1]
    if not isinstance(raw, bytes):
        return text[2:-1]
    return raw.decode('utf8', errors='replace')


def get_spotify_uris(csv_file, sp):
    """Look up a Spotify track URI for every song listed in a scraped CSV.

    The CSV should contain the columns ``track``, ``artist``, ``date`` and
    ``rating``.

    Args:
        csv_file: Path of the CSV file to read.
        sp: Client object exposing ``search(q=..., limit=..., type=...)``.

    Returns:
        The de-duplicated DataFrame with cleaned ``track`` / ``artist`` values
        and an added ``uri`` column (the string ``"NaN"`` where the search
        found nothing), or ``None`` when the CSV lacks a ``track`` or
        ``artist`` column.
    """
    df = pd.read_csv(csv_file)

    # check that it is the right CSV
    if 'track' not in df.columns or 'artist' not in df.columns:
        return None

    df = df.drop_duplicates()

    # For the track title keep only the part before the first '/' or '(',
    # and drop double quotes and surrounding whitespace.
    df['track'] = [
        _decode_scraped_text(track).split('/')[0].split('(')[0].replace('"', '').strip()
        for track in df['track']
    ]
    df['artist'] = [_decode_scraped_text(artist) for artist in df['artist']]

    saved_uris = []
    for artist, track in zip(df['artist'], df['track']):
        results = sp.search(q=f'artist:{artist} track:{track}', limit=1, type='track')
        # Rely on the returned items themselves so an empty list can never be indexed.
        items = results['tracks'].get('items') or []
        saved_uris.append(items[0]['uri'] if items else "NaN")

    print(f'Got {df.shape[0]} URIs.')

    df['uri'] = saved_uris
    return df

In [7]:
def get_audio_features(df, sp, target_filename):
    """Fetch Spotify audio features for every song that has a URI.

    Args:
        df: DataFrame with the columns ``uri``, ``track``, ``artist``,
            ``date`` and ``rating`` (as returned by get_spotify_uris).
        sp: Client object exposing ``audio_features(uri)``.
        target_filename: Path of the CSV file the result is saved to.

    Returns:
        A DataFrame with one row per song whose features were retrieved,
        holding the four kept columns followed by the audio features, or
        ``None`` when ``df`` is ``None`` or lacks a required column.
    """
    kept_columns = ['artist', 'track', 'date', 'rating']
    feature_keys = ['danceability', 'energy', 'key', 'loudness', 'mode',
                    'speechiness', 'acousticness', 'instrumentalness',
                    'liveness', 'valence', 'tempo', 'duration_ms']

    # check that it is the right data
    if df is None or any(col not in df.columns for col in ['uri'] + kept_columns):
        return None

    # Each song becomes one complete record, so a failure for a single song
    # can never leave the columns with different lengths.
    records = []
    for uri, track, artist, date, rating in df[['uri', 'track', 'artist', 'date', 'rating']].values:
        # 'NaN' is the sentinel written by get_spotify_uris; a real NaN shows
        # up when the frame was loaded back from a CSV file.
        if pd.isna(uri) or uri == 'NaN':
            continue

        try:
            response = sp.audio_features(uri)
        except Exception as error:
            # Best effort: report the failure and move on to the next song.
            print(f'Skipping {uri}: {error}')
            continue

        features = response[0] if response else None
        if not isinstance(features, dict) or any(k not in features for k in feature_keys):
            continue

        record = {'artist': artist, 'track': track, 'date': date, 'rating': rating}
        record.update({k: features[k] for k in feature_keys})
        records.append(record)

    # Passing the column list keeps the layout stable even when nothing was found.
    spotify_data = pd.DataFrame(records, columns=kept_columns + feature_keys)

    # save CSV file
    spotify_data.to_csv(target_filename)

    return spotify_data

In [None]:
# Scrape settings. The same intermediate CSV is overwritten for every year.
csv_file = 'music_features.csv'
start_year, end_year = 2007, 2017
start_page, end_page = 1, 25

# Handle one year at a time: scrape its charts, look up the Spotify URIs, then
# store the audio features in that year's own 'spotified_<year>...' file.
for year in range(start_year, end_year + 1):
    get_rym_data(year, year, start_page, end_page, csv_file)
    sp = authenticate_spotify()
    songs_with_uris = get_spotify_uris(csv_file, sp)
    get_audio_features(songs_with_uris, sp, f'spotified_{year}{csv_file}')

#df.head()

In [38]:
# Gather the per-year result files. The names are sorted because os.listdir()
# returns them in arbitrary order, which would otherwise make the row order of
# the merged frame (and the preview below) differ between runs.
csv_files = sorted(
    name for name in os.listdir()
    if name.startswith('spotified_') and name.endswith('.csv')
)

# index_col=0 absorbs the index column that DataFrame.to_csv writes by default.
dataframes = [pd.read_csv(path, index_col=0) for path in csv_files]
merged_df = pd.concat(dataframes).drop_duplicates()
merged_df.head()

Unnamed: 0,artist,track,date,rating,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms
0,Slint,Breadcrumb Trail,18 April 2020,4.77,0.231,0.44,7,-16.609,1,0.0408,0.0106,0.121,0.0947,0.195,180.681,355533
1,The Cure,Disintegration,1989,4.21,0.593,0.943,2,-5.948,1,0.0568,0.308,0.0284,0.366,0.125,122.02,499973
2,David Bowie,Heroes - Helden,1982,4.15,0.47,0.727,7,-8.193,1,0.0343,0.000606,0.644,0.223,0.198,113.453,365747
3,The Beatles,Golden Slumbers,1969,4.24,0.419,0.152,0,-12.126,1,0.0277,0.326,0.0034,0.113,0.239,80.608,91760
4,Ennio Morricone,The Ecstasy of Gold,28 June 2013,4.26,0.134,0.474,9,-13.864,0,0.0518,0.826,0.58,0.195,0.133,98.854,203227


In [40]:
# Save the merged dataset; index=False leaves the row index out of the file.
output_path = 'RYM_Spotify_data.csv'
merged_df.to_csv(output_path, index=False)