# Retrieving Data from the Web

## Screen Scraping, Regular Expressions, requests

In [1]:
import re
import requests
import pandas as pd
from bs4 import BeautifulSoup

In [2]:
# get initial page for cast list of F.R.I.E.N.D.S
base_url = 'https://www.imdb.com'
friends_url = f'{base_url}/title/tt0108778/fullcredits?ref_=tt_cl_sm#cast'

res = requests.get(friends_url)
# stop here on an HTTP error (4xx / 5xx) rather than scraping an error page
# and silently ending up with an empty list of links
res.raise_for_status()

# name the parser explicitly: without it, bs4 picks whichever parser happens
# to be installed (and warns), so results can differ between machines
dom = BeautifulSoup(res.text, 'html.parser')

# find anchor tag nested underneath td with class primary_photo
# (which in turn, is nested underneath a table with class,
# .cast_list containing a tr)
selector = '.cast_list tr td.primary_photo a'
links = dom.select(selector)

In [3]:
# maps performer name -> number of credits (None when no count was found)
d = {}

# match (NNN credits), but make sure to capture digits; a raw string keeps
# the backslashes for the regex engine instead of relying on invalid
# string escapes such as '\(' and '\d'
credits_regex = re.compile(r'\((\d+) credits\)')

# just the first 15 actors / actresses for now...
for link in links[:15]:
    # join with exactly one slash, whether or not the href starts with one
    profile_url = f'{base_url}/{link["href"].lstrip("/")}'
    # print(profile_url)

    # separate names so the cast-list response / dom are not overwritten
    profile_res = requests.get(profile_url)
    profile_dom = BeautifulSoup(profile_res.text, 'html.parser')

    # get the actor/actress name from title
    name = profile_dom.select('title')[0].get_text().replace(' - IMDb', '')

    # find the part of the page that has NNN credits
    heading = profile_dom.select('#filmo-head-actress, #filmo-head-actor')

    # the heading or the "(NNN credits)" text may be missing; record None
    # in that case instead of failing on heading[0] or m.group(1)
    m = credits_regex.search(heading[0].get_text()) if heading else None
    d[name] = int(m.group(1)) if m else None



In [4]:
# rank the scraped performers by number of credits, highest first;
# the bare expression is the cell's displayed result
(
    pd.Series(d)
    .sort_values(ascending=False)
)

Elliott Gould          188
Paul Rudd              120
Jessica Hecht           95
Christina Pickles       91
Lisa Kudrow             83
Maggie Wheeler          72
Courteney Cox           67
Jennifer Aniston        64
Matthew Perry           58
David Schwimmer         56
Jane Sibbett            52
Helen Baxendale         46
Matt LeBlanc            32
June Gable              22
James Michael Tyler     17
dtype: int64

## Parsing json, Pagination, API Usage

Assuming the API described is paginated with 20 results per page:

* the `info` key holds meta information about the response
    * ... this meta info includes `next`, the url of the next page
* the `results` key holds the actual data
* note that the api base url must be filled in manually


In [9]:
import json
import pandas as pd
import urllib                             # make http requests
from pandas.io.json import json_normalize # flatten nested json

In [13]:
def api_gen(start_url):
    """
    generator for retrieving paginated data from api

    Fetches ``start_url``, yields the value stored under that page's
    ``results`` key, then follows the url stored under ``info`` -> ``next``
    and repeats until no further page is advertised.

    :param start_url: api end point (url of the first page)
    :yield: parsed json data from api, i.e. the ``results`` value of each
        page (``None`` when a page has no ``results`` key)
    :raises urllib.error.URLError: if a page cannot be retrieved
    :raises json.JSONDecodeError: if a response body is not valid json
    """
    # a plain `import urllib` does not load the `request` submodule, so
    # import it explicitly instead of relying on another library having
    # done so already
    import urllib.request

    next_url = start_url
    while next_url is not None:
        # print('retrieving', next_url)

        # GET url and parse resulting json; the context manager closes the
        # response even when reading or parsing fails
        with urllib.request.urlopen(next_url) as res:
            json_obj = json.loads(res.read())

        # based on api documentation, payload is in
        # results key
        data = json_obj.get('results')

        # next is in the meta information under info and next;
        # a missing or null `info` simply means there is no next page
        info = json_obj.get('info') or {}
        next_url = info.get('next')
        # an empty or non-url `next` value also ends the pagination
        next_url = next_url if next_url and 'http' in next_url else None

        yield data

In [17]:
# fill in api url with trailing slash
base = 'base url with trailing slash goes here'
url = f'{base}character'

# loop over generator and create one DataFrame per page with json_normalize;
# the frames are collected in a list and combined once, because
# DataFrame.append no longer exists in pandas 2.x and re-copying the
# accumulator on every page is needlessly slow
frames = [json_normalize(data) for data in api_gen(url)]

# pd.concat raises on an empty list, so fall back to an empty DataFrame
df = pd.concat(frames) if frames else pd.DataFrame()

# fix row numbers: rows get a fresh 0..n-1 index, and the previous
# per-page row labels are kept in a column named `index`
df = df.reset_index()