#### Copyright 2019 Google LLC.

In [0]:
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Using APIs

In this module we will practice pulling and parsing data using a restricted public API.

## Web Crawling and Scraping

**Why do we need to crawl the web?  Aren't there datasets out there?**

There are existing datasets to learn from and conduct research on, but the best experiments and models are tested on new data.  Since new data is constantly being created on the internet, data collection is always in demand.  

**How are datasets created in the first place?**

The answer is through a process of manual labeling.  [Mechanical Turk](https://www.mturk.com/) is a service that allows you to crowdsource data labeling, amongst other tasks. Another way to obtain labeled data is through web crawling and scraping.



In [0]:
# Install spotipy library to access Spotify API
!pip install spotipy

## Set up your Spotify API key

1. Navigate to https://developer.spotify.com/dashboard/.

2. Create a ***free*** Spotify account if you don't have one.
3. Register an application under your account.
4. Copy the application's client ID and client secret into the cell below.



In [0]:
# Placeholders: replace with the values issued for your Spotify application.
SPOTIPY_CLIENT_ID = "your_id_here"  # your client ID from Spotify
SPOTIPY_CLIENT_SECRET = "your_secret_here"  # your client secret from Spotify
REDIRECT_URI = "http://localhost:8888/callback"
# Authorization scope strings, keyed by a short label.
SCOPE = dict(
    account="user-read-private",
    top="user-top-read",
    email="user-read-email",
)

In [0]:
from spotipy.oauth2 import SpotifyClientCredentials
from spotipy import Spotify

In [0]:
# Build a credentials manager from the client ID/secret defined above and use
# it to create the Spotify client `sp` that the later cells call.
client_credentials_manager = SpotifyClientCredentials(
    client_id=SPOTIPY_CLIENT_ID, client_secret=SPOTIPY_CLIENT_SECRET)
sp = Spotify(client_credentials_manager=client_credentials_manager)
# NOTE(review): presumably turns off spotipy's request tracing output -- confirm
# against the spotipy documentation.
sp.trace=False

In [0]:
#@title Download an artist image

from PIL import Image
from urllib.request import urlretrieve
from IPython.display import Image

def get_image_url(name):
  """Look up an artist on Spotify and return its name and first image URL.

  Args:
    name: Artist name to search for.

  Returns:
    A `(artist_name, image_url)` tuple built from the first search hit, or
    the string "no response" when the search returns no artists or the
    first hit carries no images.
  """
  results = sp.search(q='artist:' + name, type='artist')
  items = results['artists']['items']
  if not items:
    return "no response"

  artist = items[0]
  # Guard against an empty image list: indexing [0] on it would raise
  # IndexError instead of reporting that nothing usable was found.
  images = artist['images']
  if not images:
    return "no response"
  return artist['name'], images[0]['url']

artist_name = 'eminem' #@param {type:"string"}
# get_image_url returns a (name, url) tuple on success and the string
# "no response" otherwise. Indexing that string with [1] would yield 'o' and
# make urlretrieve fail with an unrelated-looking error, so check first.
result = get_image_url(artist_name)
if isinstance(result, str):
  raise ValueError('No artist image found for "%s".' % artist_name)
url = result[1]

# Save the image locally, then display the saved file as the cell output.
file_name = 'downloaded_image.jpg'
urlretrieve(url, file_name)
Image(filename=file_name)

## Find a track title from an album and download lyrics

1. Query artist of interest
2. Query albums from that artist
3. Query songs from an album

In [0]:
#@title Download track titles from an album

from PIL import Image
from urllib.request import urlretrieve
from IPython.display import Image

def get_artists(name):
  """Search Spotify for `name` and return the top artist match.

  Args:
    name: Artist name to search for.

  Returns:
    The first artist item from the search results, or the string
    "no response" when the search yields no artists.
  """
  matches = sp.search(q='artist:' + name, type='artist')['artists']['items']
  if not matches:
    return "no response"
  return matches[0]

# Look up the artist chosen in the form field; later cells read `artists`.
artist_name = 'billie eilish' #@param {type:"string"}
artists = get_artists(artist_name)

# NOTE(review): get_artists returns the string "no response" when nothing is
# found, in which case this subscript raises TypeError.
print(artists['uri'])

def get_albums(uri):
  """Return the result of `sp.artist_albums` for the given artist URI.

  Args:
    uri: Identifier of the artist, passed straight to `sp.artist_albums`.

  Returns:
    Whatever `sp.artist_albums(uri)` returns, unmodified.
  """
  return sp.artist_albums(uri)

In [0]:
# Fetch the first page of the artist's releases, restricted to
# album_type='album', and start the `albums` list from that page's items.
results = sp.artist_albums(artists['uri'], album_type='album')
albums = results['items']

In [0]:
# Follow the 'next' link page by page, appending each page's items to `albums`,
# until a page reports no further 'next' value.
while results['next']:
  results = sp.next(results)
  albums.extend(results['items'])

In [0]:
# Pair every collected album's name with its URI.
album_names = [(album['name'], album['uri']) for album in albums]

In [0]:
def get_tracks(album_id):
  """Return the result of `sp.album_tracks` for the given album.

  Args:
    album_id: Identifier of the album, passed straight to `sp.album_tracks`.

  Returns:
    Whatever `sp.album_tracks(album_id)` returns, unmodified.
  """
  return sp.album_tracks(album_id)

# Fetch the tracks of the first album collected above.
tracks = get_tracks(albums[0]['id'])

In [0]:
def get_track_names(tracks):
  """Collect the 'name' of every entry in `tracks['items']`.

  Args:
    tracks: Mapping with an 'items' sequence whose entries each have a
      'name' key.

  Returns:
    A list of the names, in the same order as `tracks['items']`.
  """
  return [track['name'] for track in tracks['items']]

In [0]:
# Extract the track titles and display them as the cell output.
track_names = get_track_names(tracks)
track_names

## Query Genius API for song lyrics

https://genius.com/developers

In [0]:
import requests

def request_song_info(song_title, artist_name, access_token=None):
  """Search the Genius API for a song.

  Args:
    song_title: Title of the song to search for.
    artist_name: Artist name; appended to the title to form the search query.
    access_token: Genius API access token. When omitted, the value of the
      GENIUS_ACCESS_TOKEN environment variable is used.

  Returns:
    The response object returned by `requests.get` for
    https://api.genius.com/search.

  Raises:
    ValueError: If no token is passed and GENIUS_ACCESS_TOKEN is not set.
  """
  import os  # Local import keeps this cell runnable on its own.

  # The token is read at call time rather than embedded in the notebook, so
  # sharing the notebook does not leak a credential.
  token = access_token or os.environ.get('GENIUS_ACCESS_TOKEN')
  if not token:
    raise ValueError(
        'No Genius access token found. Create one at '
        'https://genius.com/api-clients and set the GENIUS_ACCESS_TOKEN '
        'environment variable (or pass access_token=...).')

  base_url = 'https://api.genius.com'
  headers = {'Authorization': 'Bearer ' + token}
  search_url = base_url + '/search'
  # A GET query belongs in the URL query string (`params`), not in the
  # request body (`data`).
  params = {'q': song_title + ' ' + artist_name}
  # The timeout stops an unresponsive server from hanging the notebook.
  response = requests.get(
      search_url, params=params, headers=headers, timeout=10)

  return response

In [0]:
# Query the Genius API
artist_name = artists['name']
song_title = track_names[1] # play around with this number to change the song
response = request_song_info(song_title, artist_name)

# NOTE(review): this binds the name `json` to the decoded payload; it would
# shadow the standard-library `json` module if that were imported here.
json = response.json()
remote_song_info = None

# Keep the first search hit whose primary artist name contains the requested
# artist name (case-insensitive). remote_song_info stays None if none match.
for hit in json['response']['hits']:
  if artist_name.lower() in hit['result']['primary_artist']['name'].lower():
    remote_song_info = hit
    break

In [0]:
# Display the matched search hit (nothing is shown when it is None).
remote_song_info

In [0]:
# Extract the lyrics page URL if the song was found.
# Reset first so a failed search cannot silently reuse a URL left in the
# kernel by an earlier run of this cell.
song_url = None
if remote_song_info:
  song_url = remote_song_info['result']['url']
else:
  print('No Genius result matched "%s" by %s.' % (song_title, artist_name))

In [0]:
# Use bs4 to parse lyrics from html
from bs4 import BeautifulSoup

def scrape_song_url(url):
  """Download a song page and return the lyrics text found in it.

  Args:
    url: URL of the song page to fetch.

  Returns:
    The lyrics text extracted from the page.

  Raises:
    ValueError: If no lyrics container can be found in the page.
  """
  page = requests.get(url)
  html = BeautifulSoup(page.text, 'html.parser')

  # Original layout: a single div with class 'lyrics'.
  legacy = html.find('div', class_='lyrics')
  if legacy is not None:
    return legacy.get_text()

  # NOTE(review): newer Genius pages appear to split the lyrics across div
  # elements marked data-lyrics-container="true" -- confirm on a live page.
  containers = html.find_all('div', attrs={'data-lyrics-container': 'true'})
  if containers:
    return '\n'.join(c.get_text(separator='\n') for c in containers)

  # find() returns None when nothing matches; fail with a clear message
  # instead of an AttributeError on None.
  raise ValueError('Could not find lyrics in the page at %s' % url)

In [0]:
lyrics = scrape_song_url(song_url)
print(lyrics)

## Write the lyrics to CSV

In [0]:
import pandas as pd

# One record for the song; wrapping it in a list yields a single-row frame.
data = dict(
    artist_name=artist_name,
    song_title=song_title,
    lyrics=lyrics,
)

df = pd.DataFrame([data])
df

In [0]:
# Write the frame to '<artist_name>_<song_title>.csv' in the working directory.
# NOTE(review): header=0 presumably suppresses the column-header row -- confirm
# this is intended. The names are interpolated into the path unescaped, so a
# title containing a path separator would change where the file is written.
df.to_csv(f'{artist_name}_{song_title}.csv', header=0)

In [0]:
!ls

# Resources

* [BS4](https://www.crummy.com/software/BeautifulSoup/bs4/doc/): A Python library for parsing HTML documents.

* [Spotipy](https://spotipy.readthedocs.io/en/latest/#installation): A Python wrapper for the Spotify API.

* [Genius](https://genius.com/api-clients): An API library for music lyrics and other metadata.

# Exercises

Download all the lyrics from your favorite album.

### Student Solution

In [0]:
# Your answer goes here

### Answer Key

**Solution**

In [0]:
# Artist and album to search for; the album name is compared
# case-insensitively further down.
artist_name = 'the strokes'
album_name = 'room on fire'

In [0]:
# NOTE(review): this runs before `albums` is reassigned in the next cell, so it
# shows the first album from the earlier artist query, not from The Strokes.
albums[0]['name']

In [0]:
# Get all albums by The Strokes.
# NOTE(review): get_albums applies no album_type filter and only the 'items'
# of the page it returns are kept here -- presumably the first page only, so
# "all albums" may be incomplete; confirm.
artist = get_artists(artist_name)
albums = get_albums(artist['uri'])['items']

In [0]:
# Find the album we are interested in.
# my_favorite_album ends up holding the matching album's 'id', or None.
my_favorite_album = None

# Case-insensitive exact match on the album name; stop at the first match.
for album in albums:
  if album['name'].lower() == album_name.lower():
    my_favorite_album = album['id']
    print('We found your favorite album "%s".' % album_name)
    break

if my_favorite_album is None:
  print('We couldn\'t find your favorite album, sorry.')

In [0]:
# Fetch the chosen album's tracks and display their titles.
# NOTE(review): my_favorite_album is None when the search above found nothing.
tracks = get_tracks(my_favorite_album)
track_names = get_track_names(tracks)
track_names

In [0]:
# Query the Genius API for the song info for all songs and get their lyrics.
# Store the lyrics in a dict, keyed by song name.

# This cell may take a few minutes to run.
import time

lyrics_dict = {}

for song_title in track_names:
  response = request_song_info(song_title, artist_name)

  json = response.json()
  remote_song_info = None

  # Keep the first hit whose primary artist name contains the requested
  # artist name (case-insensitive).
  for hit in json['response']['hits']:
    if artist_name.lower() in hit['result']['primary_artist']['name'].lower():
      remote_song_info = hit
      break

  if remote_song_info:
    song_url = remote_song_info['result']['url']
    lyrics_dict[song_title] = scrape_song_url(song_url)
  else:
    # Only scrape when this song was actually found. Otherwise the URL left
    # over from the previous song would be scraped again and its lyrics
    # stored under the wrong title.
    print('No Genius match for "%s"; skipping.' % song_title)

  # Let Genius and BS4 rest for 2 seconds!
  time.sleep(2)

In [0]:
# Print the lyrics stored under the title 'Reptilia'.
print(lyrics_dict['Reptilia'])