# Scrape Wikipedia tables with Eurovision finals information

Scrape a table from wikipedia with eurovision final songs information
Based on: https://gist.github.com/wassname/5b10774dfcd61cdd3f28


In [37]:
from bs4 import BeautifulSoup
import urllib
import os
import codecs
from pprint import pprint

In [163]:
def scrape_results_table(table, year):

    """
    Processes the html of the table and extracts the results
    """

    rows=table.findAll("tr")    
    results = []
    
    for row in rows:
        cells = row.findAll(["td","th"])
        
        if cells[0].text == 'Draw':
            continue;
        else:
            results.append({
                'Year' : year,
                'Country' : cells[1].text.replace('\xa0',''),
                'Artist' : cells[2].text,
                'Song' : cells[3].text.replace('"',''),
                'Language' : cells[4].text,
                'Rank' : cells[5].text,
                'Points' : cells[6].text if len(cells)>6 else ''
            })
            
    return results

In [175]:
all_results = []

for year in range(2013,2018):
    
    print('Processing year',year)

    wikipage = 'https://en.wikipedia.org/wiki/Eurovision_Song_Contest_{}'.format(year)

    header = {'User-Agent': 'Mozilla/5.0'} # header needed to prevent 403 error on Wikipedia
    req = urllib.request.Request(wikipage) #, headers = header)
    res = urllib.request.urlopen(req)
    soup = BeautifulSoup(res, "lxml")

    # get tables
    tables = soup.findAll("table", { "class" : ["sortable wikitable", "wikitable sortable"] })

    print('  Found {} table(s) in entire html'.format(len(tables)))

    # loop tables
    for t in tables:
        
        h2_to_search = 'Results'
        if year in (2006, 2007, 2009, 2010, 2011, 2012, 2016, 2017):
            h2_to_search = 'Participating countries'
        if year in (2015):
            h2_to_search = 'Participants'
        
        # get the tables in the h2 'Results'
        previous_h2 = t.findPreviousSiblings(u'h2')[0]
        if previous_h2.next_element.text == h2_to_search:
            table_results = scrape_results_table(t, year)

            # check country X is in the list (sanity check that the right table was read)
            country = 'Germany'
            if not any(r['Country'] == country for r in table_results):
                print('  {} MISSING! Processing next table'.format(country))
                continue
            else:
                all_results.extend(table_results)
                print('  Processed table with {} rows'.format(len(table_results)))
                break


Processing year 2013
  Found 7 table(s) in entire html


TypeError: argument of type 'int' is not iterable

In [128]:
all_results

[{'Artist': 'Song',
  'Country': 'Country',
  'Language': 'Final result',
  'Points': '',
  'Rank': 'Points',
  'Song': 'Performer(s)',
  'Year': 2002},
 {'Artist': '"Never Let It Go"',
  'Country': 'Sweden',
  'Language': '8th',
  'Points': '',
  'Rank': '72',
  'Song': 'Afro-dite',
  'Year': 2002},
 {'Artist': '"Addicted to You"',
  'Country': 'Finland',
  'Language': '20th',
  'Points': '',
  'Rank': '24',
  'Song': 'Laura Voutilainen',
  'Year': 2002},
 {'Artist': '"Il faut du temps"',
  'Country': 'France',
  'Language': '5th',
  'Points': '',
  'Rank': '104',
  'Song': 'Sandrine François',
  'Year': 2002}]

In [124]:
# save to csv
import csv
keys = all_results[0].keys()
with open('finals_results.csv', 'w') as output_file:
    w = csv.DictWriter(output_file, keys)
    w.writeheader()
    w.writerows(all_results)