# Scraping

Code was developed on Google Colab. NOTE: the Spotify access token expires every hour.

Output is written to a folder called "output" (created in the current working directory if it does not exist).

In [0]:
# !pip install spotipy

In [0]:
import os
import spotipy
import spotipy.util as util
import requests
import pandas as pd
import numpy as np
from io import BytesIO
from spotipy import oauth2
from PIL import Image, ImageFile
from bs4 import BeautifulSoup

# Let Pillow load image files whose data is truncated instead of raising.
ImageFile.LOAD_TRUNCATED_IMAGES = True

# Index of the next album to process. main() advances it after every album,
# so re-running main() after an interruption resumes where it stopped.
album_ct = 0

# SPOTIFY DETAILS
# NOTE: TOKEN is the value `defaults` passes to api.genius.com as the bearer
# token, so despite the placeholder text it has to be a Genius API token.
TOKEN = 'ENTER_SPOTIFY_TOKEN'
username = 'ENTER_SPOTIFY_USERNAME'
# spotipy reads its client credentials from these environment variables.
os.environ.update({
    'SPOTIPY_CLIENT_ID': 'ENTER_SPOTIFY_ID',
    'SPOTIPY_CLIENT_SECRET': 'ENTER_SPOTIFY_SECRET',
    'SPOTIPY_REDIRECT_URI': 'https://www.google.com/',
})

In [0]:
# Ask spotipy for a user token (this may prompt for the redirect URL).
try:
    token = util.prompt_for_user_token(username)
except Exception:
    # Retry once without the per-user cache file, which may be stale.
    # Only remove the file if it exists, so a missing cache does not raise
    # FileNotFoundError and hide the original failure.
    cache_path = f".cache-{username}"
    if os.path.exists(cache_path):
        os.remove(cache_path)
    token = util.prompt_for_user_token(username)

# Shared settings: how to reach the Genius API and the messages shown to the user.
defaults = dict(
    request=dict(
        token=TOKEN,
        base_url='https://api.genius.com',
    ),
    message=dict(
        search_fail='Lyrics Not Found',
        wrong_input=(
            'Wrong number of arguments.\n'
            'Use two parameters to perform a custom search '
            'or none to get the song currently playing on Spotify.'
        ),
    ),
)

In [0]:
# Code derived from: 
# https://dev.to/willamesoares/how-to-integrate-spotify-and-genius-api-to-easily-crawl-song-lyrics-with-python-4o62

def main(img_dir=None):
    """Download cover art and lyrics for every album listed in the CSV.

    Progress is tracked in the module-level ``album_ct``. If a run is
    interrupted (for example by an exception from an expired token), calling
    ``main`` again continues with the album that was being processed.

    Args:
        img_dir: Unused; kept (now optional) so existing calls keep working.
    """
    global album_ct

    # exist_ok avoids the check-then-create race of isdir() + mkdir().
    os.makedirs(os.getcwd() + '/output', exist_ok=True)

    album_list = pd.read_csv('/content/wiki_albums_list.csv', encoding='ISO-8859-1')
    # Columns 1 and 2 are read as album and artist (column 0 is presumably
    # the index written by DataFrame.to_csv - confirm against the CSV).
    album_artist_list = list(zip(album_list.iloc[:, 1], album_list.iloc[:, 2]))

    total = len(album_artist_list)
    while album_ct < total:
        album_name, artist_name = album_artist_list[album_ct]
        status = output_album_info(album_name, artist_name)

        progress = str((album_ct / total) * 100) + '%'
        if status == 1:
            print('Album Completed: ' + album_name + ' - ' + progress)
        elif status == 0:
            print("Album Not Found: " + album_name + ' - ' + progress)

        # Only advance after the album finished, so a crash retries it.
        album_ct += 1
        
    
def output_album_info(album_name, artist_name):
    """Write one album's cover art and lyrics to ``<cwd>/output/<album>/``.

    The cover is saved as ``<album>.jpg`` and each song's lyrics as
    ``<n>.txt``, numbered from 0 in the order returned by ``get_album_info``.
    Any ``/`` in the album name is replaced by ``-`` for the file system.

    Returns:
        1 when the album was written; 0 when ``get_album_info`` found nothing
        or the album directory could not be created.
    """
    safe_name = album_name.replace('/', '-')
    path = os.getcwd() + '/output/' + safe_name

    album_lyrics, art = get_album_info(album_name, artist_name)
    if album_lyrics == 0:
        return 0

    try:
        os.makedirs(path, exist_ok=True)
    except OSError:
        print("Failed to create directory %s" % path)
        # Nothing can be written without the directory; skip this album
        # instead of failing in art.save() below.
        return 0

    # Output image
    art.save(path + '/' + safe_name + '.jpg')

    # Output lyrics. UTF-8 is explicit so non-ASCII lyrics do not depend on
    # the platform's default encoding.
    for song_ct, song_lyrics in enumerate(album_lyrics):
        song_path = path + '/' + str(song_ct) + '.txt'
        with open(song_path, "w", encoding="utf-8") as text_file:
            text_file.write(song_lyrics)

    return 1
        
        
def get_album_info(album_name, artist_name):
    """Collect the cover image and the lyrics of an album's tracks.

    Returns ``(0, 0)`` when ``get_tracks_art`` reports nothing for the album.
    Otherwise returns ``(album_lyrics, art)``: the lyrics of every track for
    which a Genius hit by a matching artist was found, and the cover opened
    as a PIL image. Tracks without a match are reported with ``print`` and
    left out of the list.
    """
    tracks, art_url = get_tracks_art(album_name, artist_name, token)
    if tracks == 0:
        return 0, 0

    # Cover art: download the bytes and decode them in memory.
    cover_bytes = requests.get(art_url).content
    art = Image.open(BytesIO(cover_bytes))

    album_lyrics = []
    for track in tracks['items']:
        # Only the part of the track name before the first ' - ' is searched.
        song_title = track['name'].split(' - ')[0]

        hits = request_song_info(song_title, artist_name).json()['response']['hits']
        # First hit whose primary artist name contains the requested artist.
        match = next(
            (hit for hit in hits
             if artist_name.lower() in hit['result']['primary_artist']['name'].lower()),
            None,
        )

        if match:
            album_lyrics.append(scrape_lyrics(match['result']['url']))
        else:
            print(defaults['message']['search_fail'] + ': (' + album_name + ", " + artist_name + ', ' + song_title + ')')

    return album_lyrics, art
            

def get_tracks_art(album_name, artist_name, token):
    """Look an album up on Spotify and return its tracks and a cover URL.

    Args:
        album_name: Album title used in the ``album:`` search filter.
        artist_name: Artist name used in the ``artist:`` search filter.
        token: Spotify access token passed to ``spotipy.Spotify``.

    Returns:
        ``(tracks, image_url)`` for the first search result, or ``(0, 0)``
        when the search has no results or no cover is at least 128px high.
    """
    sp = spotipy.Spotify(auth=token)

    # find album by name (the former replace(' ', ' ') calls were no-ops)
    results = sp.search(q="album:" + album_name + " artist:" + artist_name, type="album")
    albums = results['albums']['items']
    if not albums:
        return 0, 0
    first_album = albums[0]

    # get album art url
    # Spotify documents image height as nullable; treat a missing height as
    # 0 rather than comparing None with an int.
    large_enough = [
        image for image in first_album['images']
        if (image['height'] or 0) >= 128
    ]
    if not large_enough:
        return 0, 0
    # The last qualifying entry is used, as before. Spotify documents images
    # as ordered widest first, so this is the smallest acceptable cover.
    image_url = large_enough[-1]['url']

    # get album tracks for the first album's uri
    tracks = sp.album_tracks(first_album['uri'])

    return tracks, image_url

            
def request_song_info(song_title, artist_name):
    """Search the Genius API for a song and return the raw response.

    Args:
        song_title: Title of the song to look for.
        artist_name: Artist name appended to the search term.

    Returns:
        The ``requests`` response of ``GET <base_url>/search``.
    """
    search_url = defaults['request']['base_url'] + '/search'
    headers = {'Authorization': 'Bearer ' + defaults['request']['token']}
    # A GET request carries its arguments in the query string (?q=...), so
    # they go in params=. data= would put them in a request body instead.
    params = {'q': song_title + ' ' + artist_name}
    return requests.get(search_url, params=params, headers=headers)


def scrape_lyrics(url):
    """Fetch the page at *url* and return the text of its ``div.lyrics`` element."""
    response = requests.get(url)
    document = BeautifulSoup(response.text, 'html.parser')

    # Detach every <script> element before the text is extracted.
    for script_tag in document('script'):
        script_tag.extract()

    lyrics_div = document.find('div', class_='lyrics')
    return lyrics_div.get_text()


def write_lyrics_to_file(lyrics, song, artist, album):
    """Write a "<song> by <artist>" header followed by the lyrics to ``<album>.txt``.

    Args:
        lyrics: Lyrics text, written directly after the header.
        song: Song title used in the header.
        artist: Artist name used in the header.
        album: Output path without extension; ``.txt`` is appended.
    """
    out = album + '.txt'
    # Write to `out`: the original opened `album` and so dropped the suffix.
    with open(out, 'w', encoding='utf-8') as f:
        f.write('{} by {}'.format(song, artist))
        f.write(lyrics)

In [0]:
# `img_dir` is not defined anywhere in this notebook and main() never uses
# its argument, so pass None instead of raising NameError.
main(None)

In [0]:
# !zip -r /content/output.zip /content/output

# Data from Wiki

In [0]:
# https://en.wikipedia.org/wiki/List_of_X_albums, X=[2005 to 2018]
# Main

import requests
from bs4 import BeautifulSoup
import re 
import pandas as pd

# Scrape the "List of <year> albums" page for each year from 2005 to 2018 and
# save all album/artist rows to one CSV.
# scrape() is defined in the following cell; run that cell first.
col_names = ['Albums', 'Artists']
frames = []
for year in range(2005, 2019):
    url = 'https://en.wikipedia.org/wiki/List_of_' + str(year) + '_albums'
    website_url = requests.get(url).text
    soup = BeautifulSoup(website_url, 'lxml')
    frames.append(scrape(soup))

# DataFrame.append was removed in pandas 2.0, so concatenate once. Each
# frame keeps its own row index, as it did with append.
df = pd.concat(frames)[col_names]

# NOTE(review): main() reads '/content/wiki_albums_list.csv', not this file
# name - confirm which name is intended, or rename the file before scraping.
df.to_csv('/content/albums_list.csv')


  
  

In [0]:
# Matches one "(...)" or "[...]" group, e.g. the " (album)" suffix of a title.
_BRACKETED_TEXT = re.compile(r"[\(\[].*?[\)\]]")


def _row_artist_album(cols):
    """Return ``(artist, album)`` for one row's cells, or ``None`` if unreadable.

    The artist is the ``title`` of the link in the first cell. When that cell
    has no link, the lookup shifts one column to the right (presumably such
    rows start with an extra cell like the release date - TODO confirm).
    The album is the link title of the cell after the artist, or that cell's
    stripped text when it contains no link.
    """
    for first in (0, 1):
        try:
            artist = cols[first].find('a').get('title')
            album_cell = cols[first + 1]
        except (AttributeError, IndexError):
            # No link in the artist cell, or the row is too short.
            continue
        album_link = album_cell.find('a')
        if album_link is not None:
            album = album_link.get('title')
        else:
            album = album_cell.text.strip()
        return artist, album
    return None


def _strip_bracketed(value):
    """Remove bracketed groups from *value* if it contains ``(``; else return it as is."""
    text = str(value)
    if '(' not in text:
        return value
    # strip() removes the space left behind by a trailing " (...)" without
    # cutting a real character when the bracket was not at the end.
    return _BRACKETED_TEXT.sub('', text).strip()


def scrape(soup):
    """Extract album and artist names from the ``wikitable`` tables of a page.

    Args:
        soup: Parsed page; must provide ``find_all`` the way BeautifulSoup does.

    Returns:
        A DataFrame with the columns ``Albums`` and ``Artists``, one row per
        readable table row. Rows whose album or artist is an empty string are
        dropped, and bracketed text is removed from names containing ``(``.
    """
    pairs = []
    for table in soup.find_all('table', {'class': 'wikitable'}):
        body = table.find('tbody')
        if body is None:
            body = table
        # The first row of each table is skipped (header row).
        for row in body.find_all('tr')[1:]:
            pair = _row_artist_album(row.find_all('td'))
            # Skip unreadable rows instead of repeating the previous row.
            if pair is not None:
                pairs.append(pair)

    # Compare with == (not `is`) and drop each bad row exactly once.
    kept = [
        (artist, album) for artist, album in pairs
        if artist != '' and album != ''
    ]

    df = pd.DataFrame()
    df['Albums'] = [_strip_bracketed(album) for _, album in kept]
    df['Artists'] = [_strip_bracketed(artist) for artist, _ in kept]
    return df

