In [1]:
%load_ext blackcellmagic

In [2]:
#!pip install python-decouple

In [3]:
from decouple import config
import json
import requests
import sys
import asyncio
from concurrent.futures import ThreadPoolExecutor
import nest_asyncio
from itertools import zip_longest
import aiohttp
from aiohttp import ClientSession

nest_asyncio.apply()

In [4]:
CLIENT_SECRET= config("CLIENT_SECRET")
CLIENT_ID = config("CLIENT_ID")
CLIENT_ACCESS_TOKEN = config("CLIENT_ACCESS_TOKEN")

In [5]:
#max_song = 2110000
test_song =100

## Step One: Get all the URL's for each song

In [6]:
import requests


def request_song_info(session, song_num, song_urls):
    base_url = "https://api.genius.com/songs/" + str(song_num)
    headers = {"Authorization": "Bearer " + CLIENT_ACCESS_TOKEN}
    response = requests.get(base_url, headers=headers)
    try:
        if response.json()["meta"]["status"] == 200:
            song_urls.append(
                (
                    song_num,
                    response.json()["response"]["song"]["title"],
                    response.json()["response"]["song"]["url"],
                )
            )
        else:
            pass
    except:
        pass

In [7]:
async def get_index_data_asynchronous(min_song, max_song, song_urls):
    """
    1. Establish an executor and number of workers
    2. Establish the session
    3. Establish the event loop
    4. Create the task by list comprenhensions
    5. Gather tasks.
    """
    with ThreadPoolExecutor(max_workers=40) as executor:
        with requests.Session() as session:
            loop = asyncio.get_event_loop()
            tasks = [
                loop.run_in_executor(
                    executor, request_song_info, *(session, song_num, song_urls)
                )
                for song_num in range(min_song, max_song)
            ]
            for response in await asyncio.gather(*tasks):
                pass


def execute_async_index_event_loop(min_song, max_song, song_urls):
    """
    This function does something analogous to compiling the get_data_asynchronously function,
    Then it executes loop.
    1. Call the get_data_function
    2. Get the event_loop
    3. Run the tasks (Much easier to understand in python 3.7, "ensure_future" was changed to "create_task")
    4. Edge_list and top_interactions will be passed to the next functions
    """
    future = asyncio.create_task(
        get_index_data_asynchronous(min_song, max_song, song_urls)
    )
    loop = asyncio.get_event_loop()
    loop.run_until_complete(future)
    return song_urls

## tests

In [10]:
%%time
song_urls = []
min_song=0
test_song_urls = execute_async_index_event_loop(min_song,test_song,song_urls)

Wall time: 955 ms


In [11]:
len(test_song_urls)

91

## Running this iteratively
It takes about ~15 minutes per 100,000 queries. This algorithm does have poor error handling(it just skips some errors, but some raise exception and causes it to break), so the total ends up less than is expected. However, from working on this project a lot I know that many entries have been deleted and should explain the missing values.

In [None]:
%%time
min_song = max([s[0] for s in song_urls])+1
max_song = min_song +100000
song_urls = execute_async_index_event_loop(min_song, max_song,song_urls)

In [None]:
max_song

In [None]:
len(song_urls)

## Pickling. 
Pickling the data after each loop.

In [None]:
import pickle
filename = "song_list_tuple"
outfile = open(filename,'wb')
pickle.dump(song_urls, outfile)
outfile.close()

In [8]:
import pickle
filename = "song_list_tuple"
infile = open(filename, "rb")
song_urls = pickle.load(infile)
infile.close()

FileNotFoundError: [Errno 2] No such file or directory: 'song_list_tuple'

In [9]:
song_urls

NameError: name 'song_urls' is not defined

### Putting it all together

In [19]:
from bs4 import BeautifulSoup

In [20]:
def get_links(x,i):
    return x[i]['href']
def get_table_row(side_list_table_row):
    a = side_list_table_row.find_all("span", {"class":"metadata_unit-label"})
    label = a[0].text
    links_list = side_list_table_row.find_all("a")
    links = [get_links(links_list,i) for i in range(len(links_list))]
    return label, links
def get_table_from_page(url,title):
    page = requests.get(url)
    soup = BeautifulSoup(page.text,'html.parser')
    side_list_table = soup.find_all("div", {"class":"metadata_unit metadata_unit--table_row"} )
    return [title, url, [get_table_row(side_list_table[i]) for i in range(0,len(side_list_table)) ]]

In [21]:
def get_song_info(i):
    r = request_song_info(i)
    return r



    

In [22]:
get_song_info(1)

['Killa Cam', 'https://genius.com/Camron-killa-cam-lyrics']

In [14]:
%%time
results = [get_song_info(i) for i in range(1,5) ]

AttributeError: 'list' object has no attribute 'json'

In [13]:
results

NameError: name 'results' is not defined

In [15]:
import numpy as np
def request_artist_id(i):

    base_url = 'https://api.genius.com/artists/'+str(i)
    headers = {'Authorization': 'Bearer ' + CLIENT_ACCESS_TOKEN}
    response = requests.get(base_url, headers=headers)
    if response.json()["meta"]["status"] == 200:
        name = response.json()["response"]["artist"]["name"]
        artist_id = response.json()["response"]["artist"]["id"]
        return (name, artist_id)
    else:
        return (np.nan,i)

In [16]:
r = request_artist_id(1)

In [17]:
r

("Cam'ron", 1)

In [27]:
results3 = [request_artist_id(i) for i in range(25000,35000) ]

In [29]:
import pickle

In [30]:
filename = 'artist_genius_id_3'
outfile = open(filename, 'wb')
pickle.dump(results3,outfile)
outfile.close()

In [9]:
with open('artist_genius_id', 'rb') as fp:
    artists_genius_id = pickle.load(fp)

EOFError: Ran out of input

In [None]:
with open(target, "rb") as f:
        unpickler = pickle.Unpickler(f)

In [None]:
def request_artist_songs_list(i):
    page = 1
    base_url = 'https://api.genius.com/artists/'+str(i)+'/songs?per_page=50&page='+str(page)+"&sort=popularity"
    headers = {'Authorization': 'Bearer ' + CLIENT_ACCESS_TOKEN}
    
    
    response = requests.get(base_url, headers=headers)
    return response

In [None]:
%%time
r = request_artist_info(1)

In [None]:
r.json()["response"]["songs"][1]["url"]

In [None]:
r.json()["response"]["next_page"]