You can use this notebook to scrape votes of the Sight and Sound 2012 Poll. 

All data are stored in JSON files. The films are saved in FILMS_PATH as an array of film objects, each with the fields name, year, director, country and url.

The voters are saved in VOTERS_PATH as an array of voter objects, each with the fields name, role, country, gender and url.

The votes are saved in VOTES_PATH as an array of [voter index, film index] pairs, where the indices are positions in the voters and films arrays. Each pair denotes one vote.

The comments are saved in COMMENTS_PATH as an array of strings, in the same order as the voters. If a voter does not provide a comment, an empty string is saved.

In [1]:
import requests
import os.path
from bs4 import BeautifulSoup
import json
import time
import sys
import re

In [2]:
# Pages that list every film and every voter of the poll
FILMS_URL = 'https://www.bfi.org.uk/films-tv-people/sightandsoundpoll2012/films'
VOTERS_URL = 'https://www.bfi.org.uk/films-tv-people/sightandsoundpoll2012/voters'
# Misspelled original name, kept as an alias because scrape_voters() reads it
VOTEFS_URL = VOTERS_URL

# List of films is saved in FILMS_PATH
# List of voters is saved in VOTERS_PATH
# List of voter-film pairs is saved in VOTES_PATH
# List of voters' comments is saved in COMMENTS_PATH
FOLDER_PATH = 'data'
FILMS_PATH = '{}/films.json'.format(FOLDER_PATH)
VOTERS_PATH = '{}/voters.json'.format(FOLDER_PATH)
VOTES_PATH = '{}/votes.json'.format(FOLDER_PATH)
COMMENTS_PATH = '{}/comments.json'.format(FOLDER_PATH)
# Pause after each page request, in seconds (the value is handed to time.sleep)
SCRAPE_DELAY = 10

In [3]:
def scrape_films(timeout=30):
    """Scrape the poll's film list and save it to FILMS_PATH as JSON.

    Does nothing when FILMS_PATH already exists. Otherwise every ``<tr>`` of
    the page at FILMS_URL becomes one dict with the keys ``name``, ``year``,
    ``director``, ``country`` and ``url``; the list of dicts is written to
    FILMS_PATH and the function then sleeps for SCRAPE_DELAY seconds.

    Args:
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.exceptions.HTTPError: If the server answers with an error status.
        requests.exceptions.Timeout: If the server does not answer within ``timeout``.
    """
    if os.path.exists(FILMS_PATH):
        print('Films are parsed already')
        return
    films_page = requests.get(FILMS_URL, timeout=timeout)
    # Fail loudly on an error page: parsing it would cache a bogus film list
    # that the existence check above would then never refresh.
    films_page.raise_for_status()
    soup = BeautifulSoup(films_page.content, 'html.parser')
    films = []
    for row in soup.find_all('tr'):
        cells = row.get_text().split('\n')
        title = cells[1]
        # A title may end in "(YYYY)"; capture the four digits if it does
        year_re = re.findall(r'(?<=\()\d{4}(?=\)$)', title)
        if year_re:
            year = int(year_re[0])
            # Drop the last 7 characters: "(YYYY)" plus the single character
            # in front of it (presumably a space -- confirm against the page)
            name = title[0:len(title) - 7]
        else:
            year = None
            name = title
        link = row.find('a', href=True)
        films.append({
            'name': name,
            'year': year,
            'director': cells[2],
            'country': cells[3],
            'url': None if link is None else link['href'],
        })
    with open(FILMS_PATH, 'w') as outfile:
        json.dump(films, outfile)
    print('Scraped films')
    print('Saved films in {}'.format(FILMS_PATH))
    time.sleep(SCRAPE_DELAY)

def scrape_voters(timeout=30):
    """Scrape the poll's voter list and save it to VOTERS_PATH as JSON.

    Does nothing when VOTERS_PATH already exists. Otherwise every ``<tr>`` of
    the voters page becomes one dict with the keys ``name``, ``role``,
    ``country``, ``gender`` and ``url`` (role and gender are lower-cased);
    the list of dicts is written to VOTERS_PATH and the function then sleeps
    for SCRAPE_DELAY seconds.

    Args:
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.exceptions.HTTPError: If the server answers with an error status.
        requests.exceptions.Timeout: If the server does not answer within ``timeout``.
        TypeError: If a table row contains no link.
    """
    # The guard must look at the voters file; checking the films file made this
    # function a no-op whenever scrape_films() had already run.
    if os.path.exists(VOTERS_PATH):
        print('Voters are parsed already')
        return
    voters_page = requests.get(VOTEFS_URL, timeout=timeout)
    # Fail loudly on an error page instead of caching a bogus voter list
    voters_page.raise_for_status()
    soup = BeautifulSoup(voters_page.content, 'html.parser')
    voters = []
    for row in soup.find_all('tr'):
        cells = row.get_text().split('\n')
        voters.append({
            'name': cells[1],
            'role': cells[2].lower(),
            'country': cells[3],
            'gender': cells[4].lower(),
            # The link is mandatory here: scrape_votes() requests this URL
            'url': row.find('a', href=True)['href'],
        })
    with open(VOTERS_PATH, 'w') as outfile:
        json.dump(voters, outfile)
    print('Scraped voters')
    print('Saved voters in {}'.format(VOTERS_PATH))
    time.sleep(SCRAPE_DELAY)

def _load_saved_list(path):
    """Return the JSON content stored at ``path``, or an empty list if the file is absent."""
    if not os.path.exists(path):
        return []
    with open(path) as json_file:
        return json.load(json_file)


def _find_film_index(films, film_name, year, director_name):
    """Return the index in ``films`` of the one film matching a vote row.

    Matches on name, year and director first; if that finds nothing, falls
    back to name and year only. Raises RuntimeError unless exactly one film
    remains.
    """
    same_title = [i for i, film in enumerate(films)
                  if film['name'] == film_name and film['year'] == year]
    exact = [i for i in same_title if films[i]['director'] == director_name]
    col_range = exact if exact else same_title
    if len(col_range) != 1:
        raise RuntimeError('Find {} films with name {}'.format(len(col_range), film_name))
    return col_range[0]


def scrape_votes(films, voters, timeout=30):
    """Scrape every voter's ballot page, appending to VOTES_PATH and COMMENTS_PATH.

    Previously saved votes and comments are loaded first and scraping resumes
    with the voter after the last one that has a saved vote. Both files are
    rewritten after each voter page, followed by a SCRAPE_DELAY-second pause.

    Args:
        films: List of film dicts with ``name``, ``year`` and ``director`` keys.
        voters: List of voter dicts with a ``url`` key.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        RuntimeError: If the saved votes and comments disagree about how many
            voters are done, if a ballot row has fewer than three ``<p>``
            elements, or if a row does not match exactly one film.
        requests.exceptions.HTTPError: If the server answers with an error status.
        requests.exceptions.Timeout: If the server does not answer within ``timeout``.
    """
    print('Votes are saved in {}'.format(VOTES_PATH))
    print('Comments are saved in {}'.format(COMMENTS_PATH))
    # Represent votes with a list of row-col index pairs
    votes_data = _load_saved_list(VOTES_PATH)
    comments = _load_saved_list(COMMENTS_PATH)
    row_idx_start = 0 if len(votes_data) == 0 else votes_data[-1][0] + 1
    # NOTE: a voter whose page has no rows adds a comment but no vote, so a
    # resume directly after such a voter trips this check.
    if len(comments) != row_idx_start:
        raise RuntimeError('votes data and comments have different lengths')
    print('Start scraping from voter id {}'.format(row_idx_start))
    # Scrape votes and append to stored data
    for row_idx in range(row_idx_start, len(voters)):
        voter = voters[row_idx]
        vote_page = requests.get(voter['url'], timeout=timeout)
        # Without this, an error page would be stored as "no votes, no comment"
        vote_page.raise_for_status()
        soup = BeautifulSoup(vote_page.content, 'html.parser')
        vote_soup = soup.find_all('tr')
        # Pages with a row count other than 10 are reported but still processed
        if len(vote_soup) != 10:
            print(voter['url'])
            print('Find {} votes of voter id {}'.format(len(vote_soup), row_idx))
        for v in vote_soup:
            v_soup = v.find_all('p')
            # Three paragraphs are read below: film name, year, director
            if len(v_soup) < 3:
                print(voter['url'])
                raise RuntimeError('Cannot find votes of voter ' + str(row_idx))
            film_name = v_soup[0].get_text()
            year_text = v_soup[1].get_text()
            year = int(year_text) if year_text.isdigit() else None
            director_name = v_soup[2].get_text()
            col_idx = _find_film_index(films, film_name, year, director_name)
            votes_data.append([row_idx, col_idx])
        comment = soup.find('div', {"class": "wysiwyg"})
        # Exactly one comment entry per voter keeps comments aligned with voters
        comments.append('' if comment is None else comment.get_text())
        # Save the result after every vote page visit
        with open(VOTES_PATH, 'w') as outfile:
            json.dump(votes_data, outfile)
        with open(COMMENTS_PATH, 'w') as outfile:
            json.dump(comments, outfile)
        # Delay between parsing
        print('Scraped votes of voter {}/{}'.format(row_idx + 1, len(voters)))
        sys.stdout.flush()
        time.sleep(SCRAPE_DELAY)
    print('Scraped all votes')

In [4]:
# Create the output folder up front so the JSON files built from FOLDER_PATH can be written;
# exist_ok=True makes re-running this cell harmless when the folder is already there
os.makedirs(FOLDER_PATH, exist_ok=True)

In [5]:
# Scrape the film and voter lists; both functions return early when their
# on-disk cache check succeeds, so re-running this cell is cheap
scrape_films()
scrape_voters()
# Read both lists back from disk so that cached and freshly scraped runs
# continue from the same data
with open(FILMS_PATH) as json_file:
    films = json.load(json_file)
with open(VOTERS_PATH) as json_file:
    voters = json.load(json_file)
print('Found {} films'.format(len(films)))
print('Found {} voters'.format(len(voters)))
# Visit each voter's page in turn; progress is saved after every voter, so an
# interrupted run picks up where it stopped
scrape_votes(films, voters)

Films are parsed already
Films are parsed already
Found 2567 films
Found 1205 voters
Votes are saved in data/votes.json
Comments are saved in data/comments.json
Start scraping from voter id 1205
Scraped all votes
