# Day 4: IMSLP Download Stats

An easy one for today that I'm going to use in a project at some point in the future: getting download statistics from IMSLP with Beautiful Soup 4. I scrape the list [here](http://imslp.org/wiki/Special:IMSLPPopularFiles), parse it for links to the IMSLP work pages, then extract the name of the composer and the work from each page's header. I tried to use the infoboxes at the bottom of the page, but it didn't work out: the infoboxes contain different information from page to page.

In [1]:
import bs4
import requests
import re

In [2]:
# Listing pages are addressed by a numeric offset appended to the base URL;
# the offsets requested here are 0 and 100.
base_url = 'http://imslp.org/wiki/Special:IMSLPPopularFiles/{}'
urls = [base_url.format(offset) for offset in range(0, 200, 100)]
# Raw HTML of each listing page; starts empty and is filled by the download step.
page_texts = []

In [3]:
# Download every listing page. The timeout keeps a stalled connection from
# hanging the notebook forever, and raise_for_status() stops on an HTTP error
# so that an error page is never stored as if it were a listing.
for url in urls:
    page = requests.get(url, timeout=30)
    page.raise_for_status()
    page_texts.append(page.text)

In [4]:
def parse_table(page_text):
    """Extract the data rows of the first table found in a listing page.

    Args:
        page_text: HTML source of one popular-files listing page.

    Returns:
        A list with one entry per data row, each of the form
        ``[rank, index_number, index_url, file_name, file_url,
        download_count]``. The list is empty when the page has no table.
        Rows with fewer than four cells are skipped.
    """
    # Name the parser explicitly: otherwise Beautiful Soup picks whichever
    # parser happens to be installed (and warns), so results vary by machine.
    bs = bs4.BeautifulSoup(page_text, 'html.parser')
    table = bs.find('table')
    if table is None:
        return []

    rows = []
    # The first <tr> is skipped (presumably the header row).
    for row in table.find_all('tr')[1:]:
        cols = row.find_all('td')
        if len(cols) < 4:
            continue

        # get_text() yields plain strings. The .string attribute would return
        # NavigableString objects that keep the whole parse tree alive, and
        # None for cells with more than one child node.
        rank, index_label, file_name, download_count = [
            col.get_text() for col in cols[:4]
        ]

        index_number = index_label.replace('#', '')
        index_url = ('http://imslp.org/index.php?title=Special:ReverseLookup&'
                     'action=submit&indexsearch={}').format(index_number)
        file_url = u'http://imslp.org/wiki/File:{}'.format(file_name)

        rows.append([rank, index_number, index_url,
                     file_name, file_url, download_count])

    return rows

In [5]:
# Flatten the rows parsed from every downloaded page into one list,
# keeping the pages in download order.
master = [
    row
    for page_text in page_texts
    for row in parse_table(page_text)
]

In [6]:
# Sanity check on the number of parsed rows across the two fetched pages.
len(master)

200

In [7]:
def get_information(page_text):
    """Extract the work title and the composer from a work page's heading.

    The heading is expected to look like ``Title (Composer)``. The composer
    is the content of the last parenthesised group and the title is
    everything before that group, so parentheses inside the title itself
    are preserved.

    Args:
        page_text: HTML source of the work page.

    Returns:
        ``[title, composer]``. The composer is an empty string when the
        heading has no parenthesised group; both are empty strings when the
        page has no ``h1.firstHeading`` element.
    """
    # Name the parser explicitly so the result does not depend on which
    # optional parsers are installed.
    bs = bs4.BeautifulSoup(page_text, 'html.parser')
    heading_tag = bs.find('h1', class_='firstHeading')
    if heading_tag is None:
        return [u'', u'']

    heading = heading_tag.text
    # Raw string: '\(' in a normal string literal is an invalid escape.
    groups = list(re.finditer(r'\((.*?)\)', heading))
    if not groups:
        return [heading.strip(), u'']

    last_group = groups[-1]
    title = heading[:last_group.start()].strip()
    composer = last_group.group(1)
    return [title, composer]

In [8]:
# Fetch the lookup URL stored in each row (column 2) and append the
# [title, composer] pair parsed from the returned page to that row.
# A single Session is shared so connections to the host can be reused
# across the requests; the timeout and raise_for_status() make a stalled or
# failed request stop the loop instead of hanging or storing data parsed
# from an error page.
with requests.Session() as session:
    for item in master:
        page = session.get(item[2], timeout=30)
        page.raise_for_status()
        item.extend(get_information(page.text))

In [9]:
# Preview the first ten rows. Column order: rank, index number, lookup URL,
# file name, file URL, download count, title, composer.
master[:10]

[[u'1.',
  u'86710',
  'http://imslp.org/index.php?title=Special:ReverseLookup&action=submit&indexsearch=86710',
  u'PMLP03848-nocturne C- minor B.49.pdf',
  u'http://imslp.org/wiki/File:PMLP03848-nocturne C- minor B.49.pdf',
  u'147213',
  u'Nocturne in C-sharp minor, B.49',
  u'Chopin, Fr\xe9d\xe9ric'],
 [u'2.',
  u'81759',
  'http://imslp.org/index.php?title=Special:ReverseLookup&action=submit&indexsearch=81759',
  u'PMLP05948-BWV 846.pdf',
  u'http://imslp.org/wiki/File:PMLP05948-BWV 846.pdf',
  u'144970',
  u'Das wohltemperierte Klavier I, BWV 846-869',
  u'Bach, Johann Sebastian'],
 [u'3.',
  u'01307',
  'http://imslp.org/index.php?title=Special:ReverseLookup&action=submit&indexsearch=01307',
  u'BWV1004.pdf',
  u'http://imslp.org/wiki/File:BWV1004.pdf',
  u'141121',
  u'Violin Partita No.2 in D minor, BWV 1004',
  u'Bach, Johann Sebastian'],
 [u'4.',
  u'16156',
  'http://imslp.org/index.php?title=Special:ReverseLookup&action=submit&indexsearch=16156',
  u'Debussy - Clair de Lun