#### Copyright 2019 Google LLC.

In [0]:
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Web Crawling and Scraping

In this module you will learn how to pull and parse data from a restricted public API.

## Overview

### Learning Objectives

* Authentication with the Spotify and Genius APIs
* Authentication with Google Cloud Storage
* Crawling the Spotify index
* Crawling the Genius index
* Parsing data from a JSON response
* Scraping data from an HTML response

### Prerequisites

* Intermediate Python
* Cloud Access

### Estimated Duration

90 minutes

### Grading Criteria

Each exercise is worth 3 points. The rubric for calculating those points is:

| Points | Description |
|--------|-------------|
| 0      | No attempt at exercise |
| 1      | Attempted exercise, but code does not run |
| 2      | Attempted exercise, code runs, but produces incorrect answer |
| 3      | Exercise completed successfully |

There is 1 exercise in this Colab, so there are 3 points available. The grading scale will be 3 points.

## Web Crawling and Scraping

Why do we need to crawl the web?  Aren't there datasets out there?

There are existing datasets to learn from and conduct research on, but the best experiments and models are tested on new data. Since new data is constantly being created on the internet, data collection is always in demand.

How are datasets created in the first place?  The answer is through a tedious process of manual labeling.  [Mechanical Turk](https://www.mturk.com/) is a service that allows you to crowdsource data labeling, amongst other tasks.

Another way we can obtain labeled data is through web crawling and scraping.



In [0]:
# Install spotipy, the Python client library this notebook uses to call the
# Spotify Web API. The leading "!" runs the line as a shell command.
!pip install spotipy

## Set up your Spotify API key

1. Navigate to https://developer.spotify.com/dashboard/

2. Create a ***free*** Spotify account if you don't have one.
3. Register an application under your account.



In [0]:
# Spotify API settings.
#
# Credentials must never be pasted into the notebook itself: everything saved
# here is visible to anyone who can read the file. Export SPOTIPY_CLIENT_ID and
# SPOTIPY_CLIENT_SECRET before starting the kernel, or type the values from
# your Spotify developer dashboard into the hidden prompts below.
# NOTE(review): any credentials that were previously saved in this cell should
# be rotated in the Spotify dashboard.
import os
from getpass import getpass

SPOTIPY_CLIENT_ID = os.environ.get("SPOTIPY_CLIENT_ID") or getpass("Spotify client ID: ")
SPOTIPY_CLIENT_SECRET = os.environ.get("SPOTIPY_CLIENT_SECRET") or getpass("Spotify client secret: ")
REDIRECT_URI = "http://localhost:8888/callback"
SCOPE = {"account": "user-read-private", "top": "user-top-read", "email": "user-read-email"}

In [0]:
from spotipy.oauth2 import SpotifyClientCredentials
from spotipy import Spotify

In [0]:
# Build the shared Spotify client (`sp`) used by every later cell, authorised
# with the application's client ID and secret.
client_credentials_manager = SpotifyClientCredentials(
    client_id=SPOTIPY_CLIENT_ID,
    client_secret=SPOTIPY_CLIENT_SECRET,
)
sp = Spotify(client_credentials_manager=client_credentials_manager)
sp.trace = False

In [0]:
#@title Download an artist image

from PIL import Image
from urllib.request import urlretrieve
from IPython.display import Image

def get_image_url(name):
  """Look up an artist on Spotify and return their name and first image URL.

  Args:
    name: Artist name to search for.

  Returns:
    A ``(artist_name, image_url)`` tuple for the first search hit, or the
    string ``"no response"`` when the search has no hits or the first hit
    carries no images.
  """
  results = sp.search(q='artist:' + name, type='artist')
  items = results['artists']['items']
  if not items:
    return "no response"

  artist = items[0]
  # An artist entry can come back with an empty image list; treat that like a
  # missing artist instead of failing with IndexError.
  images = artist.get('images') or []
  if not images:
    return "no response"

  return artist['name'], images[0]['url']

artist_name = 'eminem' #@param {type:"string"}

# get_image_url returns a (name, image_url) tuple on success and the string
# "no response" otherwise. Indexing that string with [1] would silently yield
# the letter "o", so reject the sentinel before taking the URL.
image_lookup = get_image_url(artist_name)
if isinstance(image_lookup, str):
  raise ValueError(f'No Spotify artist image found for {artist_name!r}')
url = image_lookup[1]

# Save the image locally, then display the saved file as the cell output.
file_name = 'downloaded_image.jpg'
urlretrieve(url, file_name)
Image(filename=file_name)

## Send image to google storage

First authenticate your account.

In [0]:
from google.colab import auth
# Sign in to Google from Colab before the Cloud Storage cells below run.
# NOTE(review): presumably this makes the user's credentials available to the
# Google API client built later - confirm against the google.colab docs.
auth.authenticate_user()

In [0]:
# Google Cloud project that the bucket will be created in.
project_id = 'spotify-amli'

# Append a UUID so the bucket name does not clash with an existing one.
import uuid
bucket_name = f'colab-sample-bucket-{uuid.uuid1()}'

### Upload using Python

In [0]:
from googleapiclient.discovery import build
# Client for version 'v1' of the 'storage' API; the cells below use it for
# every bucket and object request.
gcs_service = build('storage', 'v1')

### Create Bucket

In [0]:
# Generate a fresh, globally-unique bucket name for this run.
import uuid
bucket_name = f'colab-sample-bucket-{uuid.uuid1()}'

# For a full list of locations, see:
# https://cloud.google.com/storage/docs/bucket-locations
body = {'name': bucket_name, 'location': 'us'}

# Create the bucket inside the configured project.
gcs_service.buckets().insert(
    project=project_id,
    body=body,
).execute()
print('Done')

### Upload File to new bucket

In [0]:
from googleapiclient.http import MediaFileUpload

# Wrap the image saved earlier so it can be sent as a resumable upload.
media = MediaFileUpload(
    'downloaded_image.jpg', mimetype='image/jpeg', resumable=True)

# Store it in the new bucket under the object name 'uploaded.jpg'.
request = gcs_service.objects().insert(
    bucket=bucket_name, name='uploaded.jpg', media_body=media)

# Send chunks until next_chunk() hands back a response. The first element of
# each result is a progress object, ignored here because the file is small.
response = None
while True:
  _, response = request.next_chunk()
  if response is not None:
    break

print('Upload complete')

### Download file from new bucket

In [0]:
# Import from `googleapiclient`, the package name used by the other cells in
# this notebook, rather than the legacy `apiclient` alias.
from googleapiclient.http import MediaIoBaseDownload

# Stream the object 'uploaded.jpg' from the bucket into a local file.
with open('downloaded.jpg', 'wb') as f:
  request = gcs_service.objects().get_media(bucket=bucket_name,
                                            object='uploaded.jpg')
  media = MediaIoBaseDownload(f, request)

  done = False
  while not done:
    # _ is a placeholder for a progress object that we ignore.
    # (Our file is small, so we skip reporting progress.)
    _, done = media.next_chunk()

print('Download complete')

### Finally Inspect Image

In [0]:
# Display the copy that was fetched back from the bucket. `Image` here is the
# name most recently imported above (IPython.display.Image).
Image(filename='downloaded.jpg')

## Find a track title from an album and download lyrics

1. Query artist of interest
2. Query albums from that artist
3. Query songs from an album

In [0]:
#@title Download track titles from an album

from PIL import Image
from urllib.request import urlretrieve
from IPython.display import Image

def get_artists(name):
  """Return the first Spotify artist matching `name`.

  Args:
    name: Artist name to search for.

  Returns:
    The first artist entry from the search results, or the string
    "no response" when the search has no hits.
  """
  matches = sp.search(q='artist:' + name, type='artist')['artists']['items']
  if not matches:
    return "no response"
  return matches[0]

artist_name = 'john legend' #@param {type:"string"}
artists = get_artists(artist_name)

# get_artists returns the string "no response" when nothing matched.
# Subscripting that string with 'uri' fails with an unhelpful TypeError, so
# stop here with a clear message instead.
if isinstance(artists, str):
  raise ValueError(f'No Spotify artist found for {artist_name!r}')

print(artists['uri'])

def get_albums(uri):
  """Return the result of `sp.artist_albums` for the artist identified by `uri`."""
  return sp.artist_albums(uri)

## Get albums

In [0]:
# Fetch the first page of this artist's releases, restricted to
# album_type='album'.
results = sp.artist_albums(
    artists['uri'],
    album_type='album',
)

In [0]:
# Start the album list from the first page of results. `albums` refers to the
# same list object as results['items'] (no copy is made).
albums = results['items']

### Crawl the remaining pages (20 albums per request)

In [0]:
# Keep requesting the following page for as long as the current page
# advertises one, collecting every page's items into `albums`.
while results['next']:
    next_page = sp.next(results)
    albums += next_page['items']
    results = next_page

In [0]:
# Reduce every album entry to a (name, uri) pair.
album_names = [
    (album['name'], album['uri'])
    for album in albums
]

## Get tracks

In [0]:
def get_tracks(album_id):
  """Return the result of `sp.album_tracks` for the album `album_id`."""
  return sp.album_tracks(album_id)

# Choose which album to list tracks for. `album_names` holds (name, uri)
# pairs, so take the URI of the first album; change the index to inspect a
# different album.
if not album_names:
  raise ValueError('No albums were found for this artist')
album_id = album_names[0][1]

tracks = get_tracks(album_id)

In [0]:
# Collect the title of every track on the returned page, then display the list.
track_names = [track['name'] for track in tracks['items']]
track_names

## Create a Genius API key

Create one at https://genius.com/developers

In [0]:
import requests

def request_song_info(song_title, artist_name, access_token=None):
    """Search the Genius API for a song by title and artist.

    The access token is never stored in the notebook. It is taken from
    `access_token`, then from the GENIUS_ACCESS_TOKEN environment variable,
    and finally from a hidden prompt.

    Args:
        song_title: Title of the song to search for.
        artist_name: Name of the artist, appended to the search query.
        access_token: Optional Genius API access token.

    Returns:
        The `requests` response object for GET https://api.genius.com/search.
    """
    import os
    from getpass import getpass

    token = access_token or os.environ.get('GENIUS_ACCESS_TOKEN')
    if not token:
        token = getpass('Genius API access token: ')
        # Remember the token for this session so later calls do not prompt again.
        os.environ['GENIUS_ACCESS_TOKEN'] = token

    base_url = 'https://api.genius.com'
    search_url = base_url + '/search'
    headers = {'Authorization': 'Bearer ' + token}
    # The search term belongs in the query string (`params`), not in the body
    # of the GET request (`data`).
    params = {'q': song_title + ' ' + artist_name}

    # A timeout keeps the cell from hanging forever if the API is unreachable.
    return requests.get(search_url, params=params, headers=headers, timeout=30)

## Query Genius API

In [0]:
artist_name = artists['name']
song_title = track_names[0]

# Issue the search once and fail early, with the HTTP status, if Genius
# rejected the request (for example because of a bad access token).
response = request_song_info(song_title, artist_name)
response.raise_for_status()

json = response.json()
remote_song_info = None

# Keep the first hit whose primary artist contains the artist name we asked
# for (case-insensitive); remote_song_info stays None when nothing matches.
for hit in json['response']['hits']:
    if artist_name.lower() in hit['result']['primary_artist']['name'].lower():
        remote_song_info = hit
        break

In [0]:
# Display the matching search hit; nothing is shown when it is None.
remote_song_info

## Get Genius song url

In [0]:
# Take the Genius page URL from the matching hit. When no hit matched, stop
# here: otherwise song_url would be undefined (or left over from an earlier
# run) when the scraping cell below uses it.
if not remote_song_info:
    raise ValueError(f'No Genius result found for {song_title!r} by {artist_name!r}')
song_url = remote_song_info['result']['url']

## Use bs4 to parse lyrics from html

In [0]:
from bs4 import BeautifulSoup

def scrap_song_url(url):
    """Download a Genius song page and return its lyrics as plain text.

    Two page layouts are handled: one that keeps the lyrics in a single
    ``<div class="lyrics">`` and one that splits them across several
    ``<div data-lyrics-container="true">`` elements.
    NOTE(review): the second selector reflects the Genius markup at the time
    of writing - confirm against a live page.

    Args:
        url: Address of the Genius song page.

    Returns:
        The lyrics text.

    Raises:
        ValueError: if neither lyrics element is present in the page.
    """
    page = requests.get(url, timeout=30)
    # Surface HTTP errors here rather than parsing an error page.
    page.raise_for_status()
    html = BeautifulSoup(page.text, 'html.parser')

    legacy = html.find('div', class_='lyrics')
    if legacy is not None:
        return legacy.get_text()

    containers = html.find_all('div', attrs={'data-lyrics-container': 'true'})
    if not containers:
        # find() returned None and there are no containers either; report it
        # explicitly instead of failing with AttributeError on None.
        raise ValueError('No lyrics found at ' + str(url))

    # Line breaks inside a container are tags, so ask for a newline separator.
    return '\n'.join(block.get_text(separator='\n') for block in containers)

In [0]:
# Scrape the lyrics from the Genius page found above and print them.
lyrics = scrap_song_url(song_url)
print(lyrics)

## Create a data object

In [0]:
import pandas as pd

# One record describing the scraped song.
data = dict(
    artist_name=artist_name,
    song_title=song_title,
    lyrics=lyrics,
)

# Wrap the record in a list so it becomes a single-row DataFrame, then show it.
df = pd.DataFrame([data])
df

## Create a csv and upload to Google Storage

In [0]:
# Write the row to "<artist>_<title>.csv" without a header line.
df.to_csv(
    f'{artist_name}_{song_title}.csv',
    header=False,
)

In [0]:
# List the working directory to check that the CSV file was written.
!ls

In [0]:
from googleapiclient.http import MediaFileUpload
import uuid

# Google Cloud project that will own the new bucket.
project_id = 'spotify-amli'

# A UUID suffix keeps the bucket name from clashing with an existing one.
bucket_name = f'lyrics-bucket-{uuid.uuid1()}'

# For a full list of locations, see:
# https://cloud.google.com/storage/docs/bucket-locations
body = {'name': bucket_name, 'location': 'us'}
gcs_service.buckets().insert(
    project=project_id,
    body=body,
).execute()
print('Done')

# Wrap the CSV written earlier so it can be sent as a resumable upload.
media = MediaFileUpload(
    f'{artist_name}_{song_title}.csv', mimetype='text/csv', resumable=True)

# Store it in the new bucket under the object name 'sample_lyrics.csv'.
request = gcs_service.objects().insert(
    bucket=bucket_name, name='sample_lyrics.csv', media_body=media)

# Send chunks until next_chunk() hands back a response. The first element of
# each result is a progress object, ignored here because the file is small.
response = None
while True:
  _, response = request.next_chunk()
  if response is not None:
    break

print('Upload complete')

# Resources

* [BS4](https://www.crummy.com/software/BeautifulSoup/bs4/doc/): A Python library for parsing HTML documents.

* [Spotipy](https://spotipy.readthedocs.io/en/latest/#installation): A Python wrapper for the Spotify API.

* [Genius](https://genius.com/api-clients): An API for music lyrics and other metadata.

# Exercises

## Exercise 1

Download all the lyrics from your favorite artist.

### Student Solution

In [0]:
# Your answer goes here

### Answer Key

**Solution**

In [0]:
# Put the recommended solution here; if there is more than one "good" solution
# that you think students should know put those solutions in subsequent code
# boxes with "# Solution" in the first line.

**Validation**

In [0]:
# If the solution can be auto-graded, perform the autograding here.