In [4]:
from lyricsgenius import Genius
import requests
import urllib.request
import urllib.parse
import json
from bs4 import BeautifulSoup

In [5]:
### a new client_access_token might be needed to be retrieved for successful scraping 
base = "https://api.genius.com"
client_access_token = "nNQLTdOiCABzphOY-f1Frpg7PPHbvXKagfoqL3y2_PgTqV_XTgDNur5EJcigeYoz"

In [37]:
def get_json(path, params=None, headers=None):
    '''Send request and get response in json format.'''
    # Generate request URL
    requrl = '/'.join([base, path])
    token = "Bearer {}".format(client_access_token)
    if headers:
        headers['Authorization'] = token
    else:
        headers = {"Authorization": token}

    # Get response object from querying genius api
    response = requests.get(url=requrl, params=params, headers=headers)
    response.raise_for_status()
    return response.json()

def get_artist_id_via_genius(artist_name):
    genius = Genius(client_access_token)
    artist = genius.search_artist(artist_name, max_songs=0, sort="title")
    try:
        if artist.name == artist_name:
            results = []
            results.append(artist.name)
            results.append(artist.id)
            return results
    except:
        print('no result found')

def get_songs_of_artist(artist_id, page_limit, songs_limit):
    current_page = 1
    next_page = True
    songs = []

    while next_page and current_page <= page_limit:

        path = 'artists/{}/songs?page={}&sort=popularity'.format(artist_id, current_page)
        page = get_json(path)['response']['songs']
        if page:
            for song in page:
                if song['primary_artist']['id'] == artist_id and song['lyrics_state'] == 'complete' and len(songs) <= songs_limit:
                    songs.append(song['id'])
        else:
            next_page = False
        current_page += 1
    songs = list(dict.fromkeys(songs)) # remove duplicates if exist   
    return songs

def get_songs_information(song_ids, artist_name):
    song_list = {}
    print("Scraping songs information of artist {}".format(artist_name))
    for i, song_id in enumerate(song_ids):
        path = "songs/{}".format(song_id)
        try:
            data = get_json(path)['response']['song']
        except:
            print("failed to fetch information for song id {}".format(song_id))
        
        song_list.update({
        i: {
            "title": data["title"] if data else "",
            "album": data["album"]["name"] if data["album"] else "<single>",
            "album_cover" : data["album"]["cover_art_url"] if data["album"] and data["album"]["cover_art_url"] else "",
            "genius_album_id": data["album"]["id"] if data["album"] else "none",
            "release_date": data["release_date"] if data["release_date"] else "unidentified",
            "featured_artists":
                [feat["name"] if data["featured_artists"] else "" for feat in data["featured_artists"]],
            "featured_artists_pics":
                [feat["image_url"] if data["featured_artists"] else "" for feat in data["featured_artists"]],
            "producer_artists":
                [feat["name"] if data["producer_artists"] else "" for feat in data["producer_artists"]],
            "writer_artists":
                [feat["name"] if data["writer_artists"] else "" for feat in data["writer_artists"]],
            "primary_artist_picture":
                data["primary_artist"]["image_url"] if data["primary_artist"]["image_url"] else "",
            "lyrics_path" : data['path'] if data['path'] else 'none',
            "lyrics_status" : data['lyrics_state'],
            "genius_track_id": song_id}
        })

    print("Scraped information for {} songs of artist {}".format(len(song_list), artist_name))
    
    return song_list

def get_lyrics(lyrics_path):
    if lyrics_path == 'none':
        return "none"
    URL = "http://genius.com" + lyrics_path
    page = requests.get(URL)

    # Extract the page's HTML as a string
    html = BeautifulSoup(page.text, "html.parser")

    # Scrape the song lyrics from the HTML
    output = ""
    for lyrics in html.select('div[class^="Lyrics__Container"]'):
        output += lyrics.get_text(strip=True, separator='\n')
    return output

## Pre-scraping processing:
### We load the artists list collected from Spotfiy and
### use geniuslyrics to find the artist id on genius.com

In [8]:
## Pre-scraping

##Finding the artist ids and saving them in a csv file

import csv
from typing import Set, Dict
# def get_ids_from_csv(filepath):
filepath = 'artist_data.csv'
file = open(filepath, encoding='utf-8')
csvreader = csv.reader(file)
artists_list = []
for artist in csvreader:
    artists_list.append(artist)
    
artists_id_list = []    
for artist in artists_list:
    result = get_artist_id_via_genius(artist[0])
    try:
        if result[0].lower() == artist[0].lower():
            artists_id_list.append(result)
            print("Found a match!")
    except:
        print("No Found")

    ################# save output
with open("artists_id_list.csv", "w", newline="", encoding='utf8') as f:
    writer = csv.writer(f)
    writer.writerows(artists_id_list)

Searching for songs by Beginner...

Done. Found 0 songs.
Found a match!
Searching for songs by Samy Deluxe...

Done. Found 0 songs.
Found a match!
Searching for songs by Fischmob...



KeyboardInterrupt: 

## Batch-Scraping using multithreading

In [7]:
######################################################
###### Scraping the lyrics using multithreading ######
######################################################

import concurrent.futures
import csv
from ipywidgets import IntProgress
from IPython.display import display
import time

##This function was originally written to batch scrap all artists
##Since we want concurrent multithreading, we must have changed it
##so it only processes one artist
def batch_get_songs(artist_name, artist_id, songs_per_artist = 15, pages_to_scrape = 5):
    result = {}
    song_ids = get_songs_of_artist(artist_id, pages_to_scrape, songs_per_artist)
    print("Fetched {} songs for {}, {}".format(len(song_ids), artist_name, artist_id))
    songs_information = get_songs_information(song_ids, artist_name)
    song_lyrics = [get_lyrics(song['lyrics_path']) for song in songs_information.values()]
    for i, lyrics in enumerate(song_lyrics):
        songs_information[i].update({"lyrics" : lyrics})

    result.update({artist_name : songs_information})
    return result

###########
##Reading the artists_id_list and dumping into a dictionary
artists_id_list = {}
with open('artists_id_list.csv') as csv_file:
    reader = csv.reader(csv_file)
    for row in reader:
        name = row.pop(0)
        artist_id = int(row.pop(0))
        artists_id_list.update({name: artist_id})

## getting an list for names & list for ids
names = []
ids = []
for key, value in artists_id_list.items():
    names.append(key)
    ids.append(value)

##Initializing final dictionary for the output
final = {}
##We can limit how many names & ids we want to scrap
names = names
ids = ids
##Initializing a bar to see progress
f = IntProgress(min=0, max=len(names)) # instantiate the bar
display(f) # display the bar
##initiaiting a ThreadPoolExecutor to multithread the scraping process
##Workers return their results to the final dictionary once done scraping the songs of an artist
with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
    for result in executor.map(batch_get_songs, names, ids):
        final.update(result)
        f.value += 1

IntProgress(value=0, max=782)

Fetched 3 songs for Hausmarke, 403521
Scraping songs information of artist Hausmarke
Fetched 16 songs for Fischmob, 37917
Scraping songs information of artist Fischmob
Fetched 16 songs for Kinderzimmer Productions, 47931
Scraping songs information of artist Kinderzimmer Productions
Scraped information for 3 songs of artist Hausmarke
Fetched 16 songs for Die Firma, 18832
Scraping songs information of artist Die Firma
Fetched 16 songs for Fettes Brot, 12800
Scraping songs information of artist Fettes Brot
Fetched 16 songs for Samy Deluxe, 11959
Scraping songs information of artist Samy Deluxe
Fetched 16 songs for Beginner, 25183
Scraping songs information of artist Beginner
Fetched 16 songs for Curse, 12202
Scraping songs information of artist Curse
Fetched 5 songs for Future Rock, 73604
Scraping songs information of artist Future Rock
Scraped information for 5 songs of artist Future Rock
Scraped information for 16 songs of artist Fischmob
Scraped information for 16 songs of artist Die F

### Saving into 3 different file types (txt, csv, json)
### Checking that the JSON file is loadable back into a dictionary
*When loading the JSON file into a dictionary, the indices of the songs are being interpreted as strings*

**TODO:** *find a solution for this issue that doesn't require iterating over all song's indices and convert them to int*

In [9]:
### Save the results into 3 different file types
### JSON seems to be the one we're actually looking
### CSV is a big-NONO, but saved anyway

import json
import csv

## saving as different file types, only JSON is really needed.
def export_to_3_types(filename, filename_save):
    with open(filename_save + '.txt', 'w', encoding='utf8') as f:
        f.write(str(filename))

    # Save the dictionary as .json
    with open(filename_save + '.json', 'w', encoding='utf8') as f:
        json.dump(filename, f)

    # Save the dictionary as .csv
    with open(filename_save + '.csv', 'w', encoding='utf8') as f:
        w = csv.DictWriter(f, filename.keys())
        w.writeheader()
        w.writerow(filename)

In [10]:
###save the results###
export_to_3_types(final, "final_new")

In [42]:
### checking that the json file was saved correctly and can be loaded into a dictionary ###
with open('final_new.json', 'r') as f:
    load_me = json.load(f)

# create a dictionary from the loaded json file
load_final = dict(load_me)

### Loading into dictionary works well

## deleting empty artists
i = 0
j = 0
empties = 0
to_be_deleted = []
for name, idd in load_final.items():
    for song_index in load_final[name]:
        i += 1
        j += 1
    if j == 0:
        to_be_deleted.append(name)
        empties += 1
    j = 0


print('Before:',len(load_final))
for artist in to_be_deleted:
    load_final.pop(artist)
print('After:',len(load_final))

Before: 782
After: 707


In [12]:
## After loading the JSON into the dictionary
## it seems like the indices of the songs is being read as a string
## following turns it back into int, need to find a better solution for that
for artist in load_final:
    load_final[artist] = {int(idx):value for idx, value in load_final[artist].items()}

load_final['Kollegah'][0]

{'title': 'Mondfinsternis',
 'album': 'Bossaura',
 'album_cover': 'https://images.genius.com/51e08c065f8aa5228164bdf2fb4e6955.1000x1000x1.jpg',
 'genius_album_id': 12261,
 'release_date': '2011-09-21',
 'featured_artists': [],
 'featured_artists_pics': [],
 'producer_artists': ['Sunset Mafia Jay-Ho'],
 'writer_artists': ['Sunset Mafia Jay-Ho', 'Kollegah'],
 'primary_artist_picture': 'https://images.genius.com/851cc7ad19da70bb8b380be671ef83d6.1000x1000x1.png',
 'lyrics_path': '/Kollegah-mondfinsternis-lyrics',
 'genius_track_id': 56142,
 'lyrics': "[Songtext zu „Mondfinsternis“]\n[Part 1]\nKid, ich bin der Mac in diesem Business und ticke dicke Packets an die Kids, stepp' mit der glitzernden Kette\nIn den Club, zerre deine Bitch auf Toilette und ficke der den arroganten Blick aus der Fresse\nDu holst dir mein Album und sagst, ich bin der Beste, doch ich dreh' mit deiner Mutter Porntapes\nKomm' in deine Villa mit der Basey, gebe deinem Body Punches, alle deine Rippen zerbersten wie Cornf

In [14]:
###save the cleaned results###
export_to_3_types(load_final, "final_new_cleaned")

In [22]:
# check how many songs we have
total_songs = 0
print(len(load_final))
for key in load_final:
    total_songs += len(load_final[key])
print(total_songs)

707
9361


In [28]:
load_final['Future Rock']

{0: {'title': 'Exodus (Kapitel 8)',
  'album': 'Produced By Future Rock',
  'album_cover': 'https://images.genius.com/605965a16c31d192794b6c12af3b6229.1000x1000x1.jpg',
  'genius_album_id': 740902,
  'release_date': 'unidentified',
  'featured_artists': ['Torch'],
  'featured_artists_pics': ['https://images.genius.com/9def38a982558b6f517c2cd03eaabd71.548x548x1.png'],
  'producer_artists': ['Future Rock'],
  'writer_artists': [],
  'primary_artist_picture': 'https://images.genius.com/94488ae0c8ebbf9e560afb31c0fb131d.237x237x1.jpg',
  'lyrics_path': '/Future-rock-exodus-kapitel-8-lyrics',
  'genius_track_id': 225389,
  'lyrics': 'Dies ist die Geschichte eines Nomaden vom Volk der Massai, namens Torch und die geht so:\nAls ich zur Schule ging, ging das ziemlich schief\n5 in Chemie, aber im rappen bin ich der Chief\nDer Pate, die Sonne in der Galaxis\nBlauer Samt Baby, Theorie und Praxis\nÜberleg´ nicht lang, schreib´s einfach hin\nEgal ob sich´s reimt, Hauptsache die Worte geben Sinn\nUnd

In [57]:
# find any songs that has no no lyrics
to_delete = []
for key in load_final:
    for idx in load_final[key]:
        if load_final[key][idx]['lyrics'] == '.':
            print(load_final[key][idx])
            to_delete.append({key : idx})

In [56]:
# delete the few songs with no lyrics
for entry in to_delete:
    for key, val in entry.items():
        del load_final[key][val]

In [58]:
###save the cleaned results again###
export_to_3_types(load_final, "final_new_clean")