# Web scraping using `BeautifulSoup` and accessing APIs

In [1]:
import json
import numpy as np
import pandas as pd
import re
import requests

from bs4 import BeautifulSoup
from time import sleep

## Web scraping using `BeautifulSoup`

### Read HTML

In [2]:
# A timeout stops the cell from hanging indefinitely if the server never answers.
response = requests.get('http://www.imdb.com/title/tt2084970/', timeout=10)
# Fail here on a 4xx/5xx reply so later cells never try to parse an error page.
response.raise_for_status()
html = response.text

### Parse HTML into a `BeautifulSoup` object

In [3]:
# Parse the downloaded markup using the 'lxml' parser backend.
soup = BeautifulSoup(html, 'lxml')

### Retrieve the title

In [4]:
# First <h1> whose itemprop attribute equals "name"; the result is the whole Tag, markup included.
soup.find(name='h1', attrs={'itemprop': 'name'})

<h1 class="" itemprop="name">The Imitation Game <span id="titleYear">(<a href="/year/2014/?ref_=tt_ov_inf">2014</a>)</span> </h1>

In [5]:
# string=True matches text nodes; recursive=False restricts the search to direct
# children of the <h1>, so the year inside the nested <span> is skipped.
# (`string=` is the current name of the keyword formerly spelled `text=`.)
soup.find(name='h1', attrs={'itemprop': 'name'}).find(string=True, recursive=False)

'The Imitation Game\xa0'

In [6]:
# Same lookup as the previous cell, with surrounding whitespace (including the
# trailing non-breaking space) stripped.
# (`string=` is the current name of the keyword formerly spelled `text=`.)
soup.find(name='h1', attrs={'itemprop': 'name'}).find(string=True, recursive=False).strip()

'The Imitation Game'

### Retrieve the genre(s)

In [7]:
# find_all returns every matching <span>, not just the first one.
soup.find_all(name='span', attrs={'itemprop': 'genre'})

[<span class="itemprop" itemprop="genre">Biography</span>,
 <span class="itemprop" itemprop="genre">Drama</span>,
 <span class="itemprop" itemprop="genre">Thriller</span>]

In [8]:
# Same search written with the keyword shortcut, keeping only each tag's text.
[genre_tag.text for genre_tag in soup.find_all('span', itemprop='genre')]

['Biography', 'Drama', 'Thriller']

### Retrieve the description

In [9]:
# Text of the description <div>, with surrounding whitespace stripped.
soup.find('div', itemprop='description').text.strip()

'During World War II, mathematician Alan Turing tries to crack the enigma code with help from fellow mathematicians.'

### Retrieve the duration (in minutes)

In [10]:
# Take the first run of digits in the <time> tag's datetime attribute and convert it to int.
# NOTE(review): the attribute is presumably an ISO 8601 duration such as 'PT114M' — confirm.
int(re.findall(r'(\d+)', soup.find('time', itemprop='duration')['datetime'])[0])

114

### Retrieve the content rating

In [11]:
# The value is read from the `content` attribute of the <meta> tag, not from its text.
soup.find('meta', itemprop='contentRating')['content']

'12A'

### Retrieve the rating

In [12]:
# The tag's text is a string, so convert it to a float.
float(soup.find('span', itemprop='ratingValue').text)

8.1

### Retrieve the rating and number of user ratings

In [13]:
# class_ (with a trailing underscore) filters by CSS class; the summary sentence
# is read from the title attribute of the <strong> nested inside that <div>.
soup.find('div', class_='ratingValue').strong['title']

'8.1 based on 533,052 user ratings'

In [14]:
# A string passed as the second positional argument is also treated as a CSS class filter.
soup.find('div', 'ratingValue').strong['title']

'8.1 based on 533,052 user ratings'

In [15]:
# Pattern: group 1 = leading digits/dots (the rating); `.+?` lazily skips the
# words in between; group 2 = the next run of digits/commas (the count).
rating, n = re.findall(r'^([\d\.]+).+?([\d,]+)', soup.find('div', 'ratingValue').strong['title'])[0]
# Both captures are strings: convert them, dropping thousands separators from the count.
rating, n = float(rating), int(n.replace(',', ''))
rating, n

(8.1, 533052)

### Define a function to do all of the above given an IMDb ID

In [16]:
def scrape_film_info(imdb_id):
    """Scrape summary information about one film from its IMDb title page.

    Parameters
    ----------
    imdb_id : str
        IMDb title identifier, e.g. ``'tt2084970'``. It is appended to
        ``http://www.imdb.com/title/`` to build the page URL.

    Returns
    -------
    dict
        ``'title'`` (str), ``'genres'`` (list of str), ``'description'`` (str),
        ``'duration'`` (int, minutes), ``'content_rating'`` (str),
        ``'rating'`` (float) and ``'n'`` (int, number of user ratings).

    Raises
    ------
    requests.exceptions.RequestException
        If the request times out, fails, or returns a 4xx/5xx status.
    ValueError
        If an expected element or text pattern is absent from the page.
    KeyError
        If a located tag lacks the attribute that is read from it.
    """
    # A timeout keeps one stalled connection from hanging a whole scraping loop.
    response = requests.get('http://www.imdb.com/title/' + imdb_id, timeout=10)
    # Fail on 4xx/5xx here rather than parsing an error page further down.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'lxml')

    def require(found, what):
        """Return `found`, or raise a descriptive error when the lookup gave None."""
        if found is None:
            raise ValueError(
                '{} not found on IMDb page for {!r}'.format(what, imdb_id))
        return found

    heading = require(soup.find('h1', itemprop='name'), 'title heading')
    # Direct text children only, so the year in the nested <span> is excluded.
    title = require(heading.find(string=True, recursive=False), 'title text')
    description = require(soup.find('div', itemprop='description'), 'description')
    duration_tag = require(soup.find('time', itemprop='duration'), 'duration')
    # First run of digits in the datetime attribute.
    minutes = require(re.search(r'(\d+)', duration_tag['datetime']), 'duration value')
    content_rating = require(
        soup.find('meta', itemprop='contentRating'), 'content rating')
    rating_div = require(soup.find('div', 'ratingValue'), 'rating block')
    rating_strong = require(rating_div.strong, 'rating summary')
    # The title attribute reads like '8.1 based on 533,052 user ratings':
    # group 1 is the rating, group 2 the comma-separated count.
    summary = require(
        re.search(r'^([\d\.]+).+?([\d,]+)', rating_strong['title']),
        'rating summary pattern')
    rating, n = summary.groups()

    return {
        'title': title.strip(),
        'genres': [span.text for span in soup.find_all('span', itemprop='genre')],
        'description': description.text.strip(),
        'duration': int(minutes.group(1)),
        'content_rating': content_rating['content'],
        'rating': float(rating),
        'n': int(n.replace(',', '')),
    }

In [17]:
# Try the function on the same film ID used in the cells above.
scrape_film_info('tt2084970')

{'content_rating': '12A',
 'description': 'During World War II, mathematician Alan Turing tries to crack the enigma code with help from fellow mathematicians.',
 'duration': 114,
 'genres': ['Biography', 'Drama', 'Thriller'],
 'n': 533052,
 'rating': 8.1,
 'title': 'The Imitation Game'}

### Get the *Top 250 as rated by IMDb Users* list

In [18]:
# A timeout stops the cell from hanging indefinitely if the server never answers.
chart_response = requests.get('http://www.imdb.com/chart/top', timeout=10)
# Fail here on a 4xx/5xx reply instead of parsing an error page.
chart_response.raise_for_status()
# NOTE: this rebinds `soup`, which until now held the single-film page.
soup = BeautifulSoup(chart_response.text, 'lxml')

### Retrieve the list of IMDb IDs

In [19]:
N_FILMS = 10  # Keep only the top 10 films
title_cells = soup.find_all(name='td', attrs={'class': 'titleColumn'})
# Each cell's first link has an href containing '/tt<digits>/'; capture that ID.
# Slicing first means only the rows that are kept get parsed.
imdb_ids = [re.findall(r'/(tt[0-9]+)/', cell.a['href'])[0]
            for cell in title_cells[:N_FILMS]]
imdb_ids

['tt0111161',
 'tt0068646',
 'tt0071562',
 'tt0468569',
 'tt0050083',
 'tt0108052',
 'tt0110912',
 'tt0167260',
 'tt0060196',
 'tt0137523']

### Call `scrape_film_info` for each ID

In [20]:
# Scrape each film in turn, collecting one info dict per IMDb ID.
films = []
for imdb_id in imdb_ids:
    films.append(scrape_film_info(imdb_id))
    sleep(1)  # Wait one second after each request (presumably to go easy on the server).

In [21]:
# Peek at the first two scraped records.
films[:2]

[{'content_rating': '15',
  'description': 'Two imprisoned men bond over a number of years, finding solace and eventual redemption through acts of common decency.',
  'duration': 142,
  'genres': ['Crime', 'Drama'],
  'n': 1813827,
  'rating': 9.3,
  'title': 'The Shawshank Redemption'},
 {'content_rating': 'X',
  'description': 'The aging patriarch of an organized crime dynasty transfers control of his clandestine empire to his reluctant son.',
  'duration': 175,
  'genres': ['Crime', 'Drama'],
  'n': 1238791,
  'rating': 9.2,
  'title': 'The Godfather'}]

In [22]:
# NOTE: this rebinds `films` from a list of dicts to a DataFrame indexed by IMDb ID,
# so cells that expect the list must be re-run from the scraping loop first.
films = pd.DataFrame(films, index=imdb_ids)  # Convert to a DataFrame
films

Unnamed: 0,content_rating,description,duration,genres,n,rating,title
tt0111161,15,Two imprisoned men bond over a number of years...,142,"[Crime, Drama]",1813827,9.3,The Shawshank Redemption
tt0068646,X,The aging patriarch of an organized crime dyna...,175,"[Crime, Drama]",1238791,9.2,The Godfather
tt0071562,X,The early life and career of Vito Corleone in ...,202,"[Crime, Drama]",852634,9.0,The Godfather: Part II
tt0468569,12,When the menace known as the Joker wreaks havo...,152,"[Action, Crime, Drama]",1794091,9.0,The Dark Knight
tt0050083,U,A jury holdout attempts to prevent a miscarria...,96,"[Crime, Drama]",491670,8.9,12 Angry Men
tt0108052,15,"In German-occupied Poland during World War II,...",195,"[Biography, Drama, History]",931146,8.9,Schindler's List
tt0110912,18,"The lives of two mob hit men, a boxer, a gangs...",154,"[Crime, Drama]",1420439,8.9,Pulp Fiction
tt0167260,12A,Gandalf and Aragorn lead the World of Men agai...,201,"[Adventure, Drama, Fantasy]",1301174,8.9,The Lord of the Rings: The Return of the King
tt0060196,X,A bounty hunting scam joins two men in an unea...,148,[Western],538443,8.9,"The Good, the Bad and the Ugly"
tt0137523,18,"An insomniac office worker, looking for a way ...",139,[Drama],1453391,8.8,Fight Club


## Accessing APIs

### Send API request

In [23]:
# `json=` serialises the payload as a JSON request body. Despite its name, `req`
# holds the Response object returned by the server.
# A timeout stops the cell from hanging indefinitely if the API never answers.
req = requests.post(
    'http://api.postcodes.io/postcodes',
    json={'postcodes': ['E1 7PT', 'EC2M 7PP']},
    timeout=10,
)

### Check HTTP status code (2xx = success, 4xx = client error, 5xx = server error)

In [24]:
# Integer HTTP status code of the response.
req.status_code

200

### Get the raw response

In [25]:
# Response body as a str (here, JSON text).
req.text

'{"status":200,"result":[{"query":"E1 7PT","result":{"postcode":"E1 7PT","quality":1,"eastings":533842,"northings":181367,"country":"England","nhs_ha":"London","longitude":-0.0725132699729764,"latitude":51.5153793466949,"parliamentary_constituency":"Bethnal Green and Bow","european_electoral_region":"London","primary_care_trust":"Tower Hamlets","region":"London","lsoa":"Tower Hamlets 015D","msoa":"Tower Hamlets 015","incode":"7PT","outcode":"E1","admin_district":"Tower Hamlets","parish":"Tower Hamlets, unparished area","admin_county":null,"admin_ward":"Spitalfields & Banglatown","ccg":"NHS Tower Hamlets","nuts":"Tower Hamlets","codes":{"admin_district":"E09000030","admin_county":"E99999999","admin_ward":"E05009333","parish":"E43000220","ccg":"E38000186","nuts":"UKI42"}}},{"query":"EC2M 7PP","result":{"postcode":"EC2M 7PP","quality":1,"eastings":533190,"northings":181545,"country":"England","nhs_ha":"London","longitude":-0.0818367652850008,"latitude":51.5171328904314,"parliamentary_cons

### Decode the JSON response into a dictionary

In [26]:
# Parse the JSON body into Python objects (dicts, lists, None for JSON null).
req.json()

{'result': [{'query': 'E1 7PT',
   'result': {'admin_county': None,
    'admin_district': 'Tower Hamlets',
    'admin_ward': 'Spitalfields & Banglatown',
    'ccg': 'NHS Tower Hamlets',
    'codes': {'admin_county': 'E99999999',
     'admin_district': 'E09000030',
     'admin_ward': 'E05009333',
     'ccg': 'E38000186',
     'nuts': 'UKI42',
     'parish': 'E43000220'},
    'country': 'England',
    'eastings': 533842,
    'european_electoral_region': 'London',
    'incode': '7PT',
    'latitude': 51.5153793466949,
    'longitude': -0.0725132699729764,
    'lsoa': 'Tower Hamlets 015D',
    'msoa': 'Tower Hamlets 015',
    'nhs_ha': 'London',
    'northings': 181367,
    'nuts': 'Tower Hamlets',
    'outcode': 'E1',
    'parish': 'Tower Hamlets, unparished area',
    'parliamentary_constituency': 'Bethnal Green and Bow',
    'postcode': 'E1 7PT',
    'primary_care_trust': 'Tower Hamlets',
    'quality': 1,
    'region': 'London'}},
  {'query': 'EC2M 7PP',
   'result': {'admin_county': N