# Rotten Tomatoes Scraper

In [1]:
import sys
import os
import json
import re
import pprint
import multiprocessing as mp
import time

In [2]:
# Directory (relative to the working directory) that holds the scraper's
# input and output files.
data_dir = 'data'

# JSON file with the movie page URLs to scrape; its parsed content is kept in
# rt_movie_urls and sliced into batches later on.
rt_movie_urls_file = os.path.join(data_dir, 'rt_movie_urls.json')
with open(rt_movie_urls_file) as urls_fh:
    rt_movie_urls = json.loads(urls_fh.read())

In [3]:
from bs4 import BeautifulSoup

from web_fetcher import get_cache

def rt_scrape(url):
    """Scrape one movie page and return its metadata as a flat dict.

    The page is obtained through ``get_cache(url)``. Most fields come from
    the ``<script type="application/ld+json" id="jsonLdSchema">`` element;
    the runtime and (when present) the "In Theaters" date are read from the
    surrounding HTML.

    This function is mapped over many URLs in a worker pool, so it never
    raises for a malformed page: any page that cannot be parsed yields
    ``None`` instead.

    Args:
        url: Address of the movie page to scrape.

    Returns:
        A dict with keys ``actors``, ``rating``, ``mpaa_rating``, ``genre``,
        ``release_date`` (first 10 characters only), ``directors``,
        ``writers``, ``title`` and ``runtime`` (``int`` or ``None``), or
        ``None`` when the page is unavailable, is not a ``Movie`` entry, or
        lacks one of the required JSON-LD fields.
    """
    page = get_cache(url)
    if not page:
        return None
    soup = BeautifulSoup(page, "lxml")
    e = soup.find("script", type="application/ld+json", id="jsonLdSchema")
    if not e:
        return None

    # Everything that depends on the shape of the JSON-LD payload lives in a
    # single try block: invalid JSON (a ValueError subclass), a missing key
    # or an unexpected type all mean "skip this page".
    try:
        info = json.loads(e.text)
        if info['@type'] != 'Movie':
            return None
        actors = [a['name'] for a in info['actors'] if a['@type'] == 'Person']
        agg_rating = info['aggregateRating']
        # NOTE(review): the fixed divisor assumes ratingValue is a number on
        # a 0-100 scale -- confirm against the page schema.
        rating = agg_rating['ratingValue'] / 100
        mpaa_rating = info['contentRating']
        genre = info['genre']
        release_date = info['dateCreated']
        directors = [d['name'] for d in info['director']]
        writers = [w['name'] for w in info['author']]
        title = info['name']
    except (KeyError, ValueError, TypeError):
        return None

    # Runtime is optional; it stays None when the markup is not as expected.
    runtime = None
    try:
        e = soup.find("div", string=re.compile("(?i)Runtime:")).next_sibling.next_sibling
        e = e.find("time")
        runtime = int(e.text.split()[0])
    except (ValueError, TypeError, AttributeError, IndexError):
        # IndexError: the <time> element exists but has no text.
        pass

    # Prefer the "In Theaters" date over the JSON-LD dateCreated when the
    # page provides one.
    try:
        e = soup.find("div", string=re.compile("(?i)In Theaters:")).next_sibling.next_sibling
        e = e.find("time")
        release_date = e['datetime']
    except (ValueError, TypeError, AttributeError, KeyError):
        # KeyError: the <time> element has no datetime attribute.
        pass

    # Keep only the first 10 characters of the date. A non-sliceable value
    # (e.g. a JSON null) is handled like any other unusable required field.
    try:
        release_date = release_date[:10]
    except TypeError:
        return None

    return dict(
        actors=actors,
        rating=rating,
        mpaa_rating=mpaa_rating,
        genre=genre,
        release_date=release_date,
        directors=directors,
        writers=writers,
        title=title,
        runtime=runtime
    )

In [4]:
# Number of URLs taken from rt_movie_urls for each batch.
batch_size = 1000
# Checkpoint file storing a batch number as a single line of text, so that a
# later run can pick up where the previous one stopped.
crt_batch_file = os.path.join(data_dir, 'rt_scraped.crt_batch_num')

In [5]:
def get_crt_batch_num():
    """Return the batch number stored in the checkpoint file.

    Reads the first line of ``crt_batch_file`` and parses it as an integer.
    Falls back to 0 when the file cannot be opened or its first line is not
    a valid integer.
    """
    try:
        with open(crt_batch_file) as checkpoint:
            first_line = checkpoint.readline()
        return int(first_line.strip())
    except (IOError, ValueError, TypeError):
        return 0

def save_crt_batch_num(crt_batch_num):
    """Overwrite the checkpoint file with ``crt_batch_num`` on a single line."""
    checkpoint_line = str(crt_batch_num) + '\n'
    with open(crt_batch_file, 'w') as checkpoint:
        checkpoint.write(checkpoint_line)

In [6]:
def process_one_batch(crt_batch_num):
    """Scrape one batch of URLs and append the results to ``rt_scraped.py``.

    The batch covers ``rt_movie_urls[crt_batch_num * batch_size :
    (crt_batch_num + 1) * batch_size]``. Each URL is passed to ``rt_scrape``
    in a multiprocessing pool; falsy results (pages that could not be
    scraped) are dropped.

    All batches are appended to the same output file, which is itself Python
    source: a ``rt_movies = []`` header written once, followed by one
    ``rt_movies.extend([...])`` statement per batch.

    Args:
        crt_batch_num: Zero-based index of the batch to process.

    Returns:
        The number of URLs in the batch (not the number of movies actually
        scraped); 0 when the batch lies past the end of the URL list, in
        which case nothing is written.
    """
    t_start = time.time()
    i_start = crt_batch_num * batch_size
    i_end = i_start + batch_size
    crt_batch_url_list = rt_movie_urls[i_start:i_end]
    if not crt_batch_url_list:
        return 0

    with mp.Pool() as pool:
        scraped_data = [m for m in pool.map(rt_scrape, crt_batch_url_list) if m]
    t_end = time.time()

    # One shared output file for every batch; the name does not depend on the
    # batch number.
    o_file = os.path.join(data_dir, 'rt_scraped.py')
    first_time = not os.path.isfile(o_file)
    # The output is Python source, whose default encoding is UTF-8, so write
    # it as UTF-8 regardless of the platform's locale encoding.
    with open(o_file, 'a', encoding='utf-8') as f:
        if first_time:
            f.write("# RT Scraped Batch\n\n"
                    + "rt_movies = []\n\n")
        f.write("# Batch# {}, time={:.06f}\n".format(crt_batch_num, t_end - t_start))
        f.write("rt_movies.extend(\n")
        pprint.pprint(scraped_data, stream=f)
        f.write(")\n\n")
    return len(crt_batch_url_list)

In [7]:
%%time
# Resume from the stored checkpoint and process batches until one comes back
# empty. The checkpoint is advanced only after a non-empty batch, so running
# this cell again once everything is done does not push the stored batch
# number further past the end of the URL list.
while True:
    crt_batch_num = get_crt_batch_num()
    n = process_one_batch(crt_batch_num)
    if not n:
        break
    save_crt_batch_num(crt_batch_num + 1)

CPU times: user 7.78 s, sys: 1.43 s, total: 9.21 s
Wall time: 15min 59s


In [None]:
n