# Beautiful soup for scraping song names

Main goal: get a list of songs on a website describing a particular music book. Goal was to automate playlist construction from this list, but alas - the apple music api doesn't allow this.

In [2]:
from bs4 import BeautifulSoup
from requests import get
from requests.exceptions import RequestException
from contextlib import closing

In [3]:
def simple_get(url):
    try:
        with closing(get(url, stream=True)) as resp:
            if is_good_response(resp):
                return resp.content
            else:
                print('{0} doesnt look like html'.format(url))
                return None

    except RequestException as e:
        print('Error during requests to {0} : {1}'.format(url, str(e)))
        return None

In [4]:
def is_good_response(resp):
    content_type = resp.headers['Content-Type'].lower()
    return (resp.status_code == 200 and content_type is not None and content_type.find('html') > -1)

# Test with boardgamegeek stuff 

I got this part from another [tutorial](https://towardsdatascience.com/web-scraping-boardgamegeek-com-using-selenium-beautifulsoup-requests-lxml-and-scrapy-1902d478ecde).

In [5]:
 url = 'https://boardgamegeek.com/browse/boardgamemechanic'
html =simple_get(url)
soup = BeautifulSoup(html,'html.parser')

In [6]:
 print(soup.prettify())

<!DOCTYPE html>
<html lang="en-US" ng-app="GeekApp" ng-cloak="">
 <head>
  <meta charset="utf-8"/>
  <meta content="width=device-width, initial-scale=1.0" id="vp" name="viewport"/>
  <script>
   window.addEventListener( 'DOMContentLoaded',  function() {
				var width = document.documentElement.clientWidth || window.innerWidth;
				if (width < 960) {
					var mvp = document.getElementById('vp');
					// android debugging
					mvp.setAttribute('content','width=960');
				}
			});
  </script>
  <meta content="yes" name="apple-mobile-web-app-capable"/>
  <meta content="IE=edge,chrome=1" http-equiv="X-UA-Compatible"/>
  <title>
   Browse Board Game Mechanics | BoardGameGeek
  </title>
  <link href="https://cf.geekdo-static.com/icons/touch-icon180.png" rel="apple-touch-icon"/>
  <link href="https://cf.geekdo-static.com/icons/favicon2.ico" rel="shortcut icon" type="image/ico"/>
  <link href="https://cf.geekdo-static.com/icons/favicon2.ico" rel="icon" type="image/ico"/>
  <link href="/game-ope

In [7]:
Table_element = soup.find('table',{'class':'forum_table'})
table_items = Table_element.findAll('td')
Mech_dict = {}
for elt in table_items:
    if len(elt.contents)>0:
        mechanic = elt.getText().strip()
        url = elt.find('a').get('href')
        Mech_dict[mechanic] = url

In [8]:
print(Mech_dict)

{'Acting': '/boardgamemechanic/2073/acting', 'Action Drafting': '/boardgamemechanic/2838/action-drafting', 'Action Points': '/boardgamemechanic/2001/action-points', 'Action Queue': '/boardgamemechanic/2689/action-queue', 'Action Retrieval': '/boardgamemechanic/2839/action-retrieval', 'Action Timer': '/boardgamemechanic/2834/action-timer', 'Action/Event': '/boardgamemechanic/2840/actionevent', 'Advantage Token': '/boardgamemechanic/2847/advantage-token', 'Alliances': '/boardgamemechanic/2916/alliances', 'Area Majority / Influence': '/boardgamemechanic/2080/area-majority-influence', 'Area Movement': '/boardgamemechanic/2046/area-movement', 'Area-Impulse': '/boardgamemechanic/2021/area-impulse', 'Auction/Bidding': '/boardgamemechanic/2012/auctionbidding', 'Auction: Dexterity': '/boardgamemechanic/2930/auction-dexterity', 'Auction: Dutch': '/boardgamemechanic/2924/auction-dutch', 'Auction: Dutch Priority': '/boardgamemechanic/2932/auction-dutch-priority', 'Auction: English': '/boardgamemec

# Now try it for the fakebook 

I downloaded the html file I wanted and named it "fakebook.html".

In [9]:
fakebook = open("fakebook.html")
booksoup = BeautifulSoup(fakebook,'html.parser')

In [10]:
print(booksoup.prettify)

<bound method Tag.prettify of 
<!DOCTYPE html>

<html lang="en">
<head>
<meta content="" name="description"/>
<meta content="UB Libraries,University at Buffalo,books,e-journals,library,Buffalo,Buffalo Library,Databases,Best Basic Resources,Librarian,Articles+,worldcat,UB,University at Buffalo Libraries" name="Keywords">
<meta content="University at Buffalo Libraries provide resources for students, faculty and the public" name="Description"/>
<meta charset="utf-8"/>
<meta content="97fa1944b5c9c4056145ef8d430743a4" name="p:domain_verify">
<meta content="IE=Edge" http-equiv="X-UA-Compatible"/>
<meta content="width=device-width, initial-scale=1, minimum-scale=1, maximum-scale=1" name="viewport"/>
<script async="" src="//rum-static.pingdom.net/pa-5bb658c5cea07b001600020e.js"></script>
<script src="https://ajax.googleapis.com/ajax/libs/jquery/3.2.1/jquery.min.js"></script>
<script src="/js/search-all.js"></script>
<link href="/css/style.css" rel="stylesheet"/>
<!-- Google Tag Manager - Unive

## get the titles!

In [11]:
div_element = booksoup.find('div',{'id':'content'})
#div_items = div_element.findAll('hr')
for hr in div_element.findAll('h2'):
    print(hr)

<h2>32 Feet and 8 Little Tails</h2>
<h2>96 Tears</h2>
<h2>99 Red Balloons</h2>
<h2>Aba Daba Honeymoon, The</h2>
<h2>Absence Of Malice</h2>
<h2>Adam</h2>
<h2>Adios</h2>
<h2>Adios Mariquita Linda</h2>
<h2>After The Lovin'</h2>
<h2>Against All Odds</h2>
<h2>Ain't It Kind Of Wonderful</h2>
<h2>Ain't No Mountain High Enough</h2>
<h2>Ain't No Sunshine</h2>
<h2>Ain't Nothing Like The Real Thing</h2>
<h2>Ain't That A Shame</h2>
<h2>Ain't Too Proud To Beg</h2>
<h2>Air That I Breathe, The</h2>
<h2>Alfie</h2>
<h2>Ali Bombaye I &amp; II</h2>
<h2>All I Do Is Dream Of Yu</h2>
<h2>All In Love Is Fair</h2>
<h2>All My Hard Times</h2>
<h2>All Out Of Love</h2>
<h2>All Over the World</h2>
<h2>Alma Llanera</h2>
<h2>Almost In Your Arms</h2>
<h2>Almost Paradise</h2>
<h2>Along Comes Mary</h2>
<h2>Always And Forever</h2>
<h2>Always In My Heart</h2>
<h2>Am I In Love</h2>
<h2>American Pie</h2>
<h2>Amor</h2>
<h2>Amyable</h2>
<h2>An Old Fashioned Love Song</h2>
<h2>Anchors Aweigh</h2>
<h2>And I Love You So</h2>
<h

Next: just make a table with all the info in it; I can use excel or some such to restrict it down. Sad times: cannot get real playlist without apple developer account :/