In [1]:
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import pandas as pd
import time
# pandas display options: wider text output and up to 100 columns shown
# before pandas starts eliding them.
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
# Use the HTML table representation for DataFrames in the notebook.
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
# Global seaborn styling for every figure drawn after this cell:
# "whitegrid" axes style and the "poster" scaling context.
sns.set_style("whitegrid")
sns.set_context("poster")

In [2]:
########
# The example below illustrates scraping a table from a Wikipedia page
########

from bs4 import BeautifulSoup
import requests

# Download the 1970 Billboard Year-End Hot 100 page and turn its chart table
# into a list of dicts (one per chart row).
wiki_url = "http://en.wikipedia.org/wiki/Billboard_Year-End_Hot_100_singles_of_1970"
# A timeout keeps the cell from hanging forever on a stalled connection.
web_page = requests.get(wiki_url, timeout=30)
# Fail here, with the HTTP status in the message, rather than parsing an error
# page and hitting an unrelated IndexError at `tables[0]` below.
web_page.raise_for_status()

soup = BeautifulSoup(web_page.content, 'lxml')
table_classes = {"class": ["sortable", "plainrowheaders"]}
# `wikitables` is not read anywhere in this cell; it is still computed so the
# name stays defined for any other cell that may rely on it.
wikitables = soup.find_all("table", table_classes)

# The first table carrying the "wikitable" class is taken to be the chart.
tables = soup.find_all("table", {"class": "wikitable"})
table = tables[0]

songs = []
rows = table.find_all('tr')
# The first row is skipped (treated as the header); at most 100 rows are read.
for row in rows[1:101]:
    cols = [ele.text.strip() for ele in row.find_all('td')]
    links = row.find_all('a')
    # The last link in the row is used as the artist url; a row without any
    # link yields None instead of raising IndexError.
    url = links[-1].get('href') if links else None
    songs.append({'band_singer': cols[2], 'ranking': cols[0], 'title': cols[1], 'url': url})
print(songs[2:4])

[{'band_singer': 'The Guess Who', 'ranking': '3', 'title': '"American Woman"', 'url': '/wiki/The_Guess_Who'}, {'band_singer': 'B.J. Thomas', 'ranking': '4', 'title': '"Raindrops Keep Fallin\' on My Head"', 'url': '/wiki/B.J._Thomas'}]


In [5]:
# Download the raw Year-End Hot 100 page for each year in the range and keep
# the response body in `yearstext`, keyed by the year as a string ('1992').
yearstext = {}
for yr in range(1992, 1993):
    wiki_url = "http://en.wikipedia.org/wiki/Billboard_Year-End_Hot_100_singles_of_" + str(yr)
    print(wiki_url)
    # A timeout keeps one stalled request from blocking the whole loop.
    web_page = requests.get(wiki_url, timeout=30)
    # Do not cache an HTTP error page as if it were the chart for that year.
    web_page.raise_for_status()
    # Pause between requests to go easy on the server.
    time.sleep(1)
    yearstext[str(yr)] = web_page.content

http://en.wikipedia.org/wiki/Billboard_Year-End_Hot_100_singles_of_1992


In [3]:
########
# The example below illustrates cleaning of data
########

"""
Function
--------
parse_year

Inputs
------
the_year: the year you want the singles for
yeartext_dict: a dictionary keyed by the year as a string (e.g. '1992'), whose values are
    the downloaded Wikipedia page contents for that year.
   
Returns
-------

a list of dictionaries, each of which corresponds to a single and has the
following data:

Eg (the surrounding double quotes of a title are kept when the title cell holds a single link):

{'band_singer': ['Brandy', 'Monica'],
  'ranking': 2,
  'song': ['"The Boy Is Mine"'],
  'songurl': ['/wiki/The_Boy_Is_Mine_(song)'],
  'titletext': '"The Boy Is Mine"',
  'url': ['/wiki/Brandy_Norwood', '/wiki/Monica_(entertainer)']}
  
Each dictionary has the following keys:
    band_singer: a list of the bands/singers who made this single
    song: a list of the titles of the songs on this single
    songurl: a list of urls for the songs on the single, intended to be the same size
        as song (None when the title is not linked)
    ranking: ranking of the single, as an integer
    titletext: the stripped text of the title table cell
    url: a list of wikipedia singer/band urls on this single: only the part
        of the url from /wiki onwards (None when the artist cell has no link)
    

Notes
-----
See description and example above.
"""
import re
def _split_titles(titletext, hrefs):
    """Return parallel lists (songs, songurls) for one title cell."""
    if len(hrefs) == 2 and '/' in titletext:
        # Double-sided single written as '"A" / "B"': one title per link, with
        # the surrounding double quotes removed from each part.
        return [part.strip('"') for part in titletext.split(' / ')], list(hrefs)
    # Every other layout is kept as one title paired with the first link;
    # a cell without any link gets None instead of raising IndexError.
    return [titletext], [hrefs[0] if hrefs else None]


def _split_artists(singertext, hrefs):
    """Return parallel lists (singers, singerurls) for one artist cell."""
    if len(hrefs) <= 1:
        # Zero or one link: the whole cell text is a single act.
        return [singertext], (list(hrefs) or [None])
    # Several links: split on "featuring ", " and " and ",". Empty pieces
    # (e.g. between ", " and "and " in "A, B, and C") are dropped so they do
    # not displace a real name, and the names are capped at the link count.
    pieces = (piece.strip() for piece in re.split('featuring | and |,', singertext))
    names = [piece for piece in pieces if piece]
    # When nothing splits (e.g. "A & B") the text stays whole, and all hrefs
    # are kept rather than being replaced by a single None.
    return (names[:len(hrefs)] or [singertext]), list(hrefs)


def parse_year(the_year, yeartext_dict):
    """Parse one year's Billboard Year-End Hot 100 page into a list of singles.

    Parameters
    ----------
    the_year : int or str
        Year to parse. The page is looked up under ``str(the_year)`` first
        and, only if that key is absent, under ``the_year`` itself.
    yeartext_dict : dict
        Maps a year to the downloaded page content for that year.

    Returns
    -------
    list of dict
        One dict per table row (the first row is skipped and at most 100
        rows are read) with the keys, in this order:
        ``band_singer`` (list of str), ``ranking`` (int), ``song`` (list of
        str), ``songurl`` (list of href or None), ``titletext`` (str) and
        ``url`` (list of href or None).

    Raises
    ------
    KeyError
        If the year is not present in ``yeartext_dict``.
    IndexError
        If the page has no table with class "wikitable", or a row has fewer
        than three cells.
    ValueError
        If the text of a row's first cell is not an integer.
    """
    year_key = str(the_year)
    if year_key not in yeartext_dict and the_year in yeartext_dict:
        # Also accept dictionaries keyed by the year exactly as passed in.
        year_key = the_year
    web_page_content = yeartext_dict[year_key]

    soup = BeautifulSoup(web_page_content, 'lxml')
    tables = soup.find_all("table", {"class": "wikitable"})
    if not tables:
        raise IndexError("no table with class 'wikitable' found for year %s" % (the_year,))
    # The first "wikitable" on the page is taken to be the chart.
    table = tables[0]

    stat = []
    for row in table.find_all('tr')[1:101]:
        # Cells may be <td> or <th>; the first three are read as rank, title, artist.
        cols = row.find_all(['td', 'th'])
        rank = cols[0].text.strip()
        titletext = cols[1].text.strip()
        title_hrefs = [link.get('href') for link in cols[1].find_all('a')]
        songs, songurls = _split_titles(titletext, title_hrefs)

        singertext = cols[2].text.strip()
        singer_hrefs = [link.get('href') for link in cols[2].find_all('a')]
        singers, singerurls = _split_artists(singertext, singer_hrefs)

        stat.append({
            'band_singer': singers,
            'ranking': int(rank),
            'song': songs,
            'songurl': songurls,
            'titletext': titletext,
            'url': singerurls,
        })
    return stat


In [6]:
# Smoke test: parse the downloaded 1992 page and show its first five singles.
first_five_1992 = parse_year(1992, yearstext)[:5]
print(first_five_1992)

[{'band_singer': ['Boyz II Men'], 'ranking': 1, 'song': ['"End of the Road"'], 'songurl': ['/wiki/End_of_the_Road'], 'titletext': '"End of the Road"', 'url': ['/wiki/Boyz_II_Men']}, {'band_singer': ['Sir Mix-a-Lot'], 'ranking': 2, 'song': ['"Baby Got Back"'], 'songurl': ['/wiki/Baby_Got_Back'], 'titletext': '"Baby Got Back"', 'url': ['/wiki/Sir_Mix-a-Lot']}, {'band_singer': ['Kris Kross'], 'ranking': 3, 'song': ['"Jump"'], 'songurl': ['/wiki/Jump_(Kris_Kross_song)'], 'titletext': '"Jump"', 'url': ['/wiki/Kris_Kross']}, {'band_singer': ['Vanessa Williams'], 'ranking': 4, 'song': ['"Save the Best for Last"'], 'songurl': ['/wiki/Save_the_Best_for_Last'], 'titletext': '"Save the Best for Last"', 'url': ['/wiki/Vanessa_L._Williams']}, {'band_singer': ['TLC'], 'ranking': 5, 'song': ['"Baby-Baby-Baby"'], 'songurl': ['/wiki/Baby-Baby-Baby'], 'titletext': '"Baby-Baby-Baby"', 'url': ['/wiki/TLC_(band)']}]
