In [1]:
import sys
import os
import requests
from http import HTTPStatus
from urllib.parse import \
    urlparse, urljoin
import json

In [2]:
# Install:       conda install lxml
# Doc:           http://lxml.de/api.html
from lxml import etree as ET

In [3]:
# Site root and the path of its top-level sitemap index (combined with urljoin).
RT_TOP_URL = 'https://www.rottentomatoes.com'
RT_SITEMAP_ROOT = 'sitemap.xml'

# Local directory that holds cached crawl results.
data_dir = 'data'
# exist_ok=True keeps the cell re-runnable and avoids the check-then-create
# race of a separate os.path.isdir() test.
os.makedirs(data_dir, exist_ok=True)

# JSON file used to cache the movie URLs gleaned from the sitemaps.
movie_urls_file = os.path.join(data_dir, 'rt_movie_urls.json')

In [4]:
def get_rt_sitemap_urls():
    """Return the list of sitemap URLs for Rotten Tomatoes.

    Fetches the sitemap index found at RT_SITEMAP_ROOT under RT_TOP_URL and
    collects the text of every <sitemap>/<loc> element.

    Returns:
        A list of sitemap URL strings in document order, or None when the
        HTTP response status is not 200 OK.
    """

    #<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
    # <sitemap>
    #  <loc>https://www.rottentomatoes.com/sitemap_0.xml</loc>
    #  <lastmod>2018-01-27</lastmod>
    # </sitemap>
    # ...

    response = requests.get(urljoin(RT_TOP_URL, RT_SITEMAP_ROOT))
    if response.status_code != HTTPStatus.OK:
        return None
    # Hand lxml the raw bytes: it rejects str input that carries an XML
    # encoding declaration, and bytes let the parser honor the document's own
    # encoding instead of the charset requests guessed for .text.
    root = ET.fromstring(response.content)

    # The elements live in the sitemap default namespace (see the sample
    # above), so the root's namespace map is passed along to findall().
    url_list = []
    for loc_node in root.findall('./sitemap/loc', namespaces=root.nsmap):
        url = (loc_node.text or '').strip()
        # Skip blank <loc> entries; they cannot be fetched later.
        if url:
            url_list.append(url)
    return url_list


def get_rt_movie_urls(sitemap_url):
    """Glean movie URLs from a specific sitemap.

    Args:
        sitemap_url: URL of one sitemap file (an entry of the sitemap index).

    Returns:
        A list of the <url>/<loc> values whose path starts with '/m/' and
        that do not end with '/', in document order; or None when the HTTP
        response status is not 200 OK.
    """

    # <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" 
    #         xmlns:image="http://www.google.com/schemas/sitemap-image/1.1" 
    #         xmlns:video="http://www.google.com/schemas/sitemap-video/1.1">
    #  <url>
    #   <loc>https://www.rottentomatoes.com/m/two_moon_junction</loc>
    #   <image:image>
    #    <image:loc> 
    #      http://resizing.flixster.com/36-DqC-BG2XY44jZyoEqWT3zMXY=/fit-in/205x305/v1.bTsxMTI5MDk2OTtqOzE3NDU1OzEyMDA7MTIwMDsxNjAw
    #    </image:loc>
    #   </image:image>
    #  </url>

    response = requests.get(sitemap_url)
    if response.status_code != HTTPStatus.OK:
        return None
    # Hand lxml the raw bytes: it rejects str input that carries an XML
    # encoding declaration, and bytes let the parser honor the document's own
    # encoding instead of the charset requests guessed for .text.
    root = ET.fromstring(response.content)

    url_list = []
    for loc_node in root.findall('./url/loc', namespaces=root.nsmap):
        # An empty <loc/> has text None; normalize so the checks below cannot
        # fail, and drop any whitespace surrounding the URL.
        url = (loc_node.text or '').strip()
        # Ignore blank entries and /pictures/, /trailers/, i.e. anything that
        # ends with '/':
        if not url or url.endswith('/'):
            continue
        # Keep only /m/, i.e. the Movies section:
        if urlparse(url).path.startswith('/m/'):
            url_list.append(url)
    return url_list

def _rt_get_all_movie_urls():
    """Get all movie URLs from the site.

    Walks every sitemap listed in the sitemap index and concatenates the
    movie URLs found in each one.

    Returns:
        A list of movie URL strings, in sitemap order.

    Raises:
        RuntimeError: if the sitemap index or any individual sitemap could
            not be fetched (the helpers return None in that case). Failing
            loudly keeps an incomplete list from being returned to callers.
    """

    sitemap_urls = get_rt_sitemap_urls()
    if sitemap_urls is None:
        raise RuntimeError('Failed to fetch the Rotten Tomatoes sitemap index')

    all_movie_urls = []
    for sitemap_url in sitemap_urls:
        movie_urls = get_rt_movie_urls(sitemap_url)
        if movie_urls is None:
            raise RuntimeError(
                'Failed to fetch sitemap: {}'.format(sitemap_url))
        all_movie_urls.extend(movie_urls)
    return all_movie_urls

def rt_get_all_movie_urls(force_refresh=False):
    """Get all movie URLs from cache; refresh/seed it as needed.

    Args:
        force_refresh: when True, re-crawl the site even if the cache file
            (movie_urls_file) already exists.

    Returns:
        The list of movie URLs, either freshly crawled (and written to the
        cache file as JSON) or loaded from the cache file.
    """

    if force_refresh or not os.path.isfile(movie_urls_file):
        all_movie_urls = _rt_get_all_movie_urls()
        # Write to a sibling temp file and swap it in, so an interrupted or
        # failed dump never leaves a truncated cache that later runs would
        # mistake for a valid one.
        tmp_file = movie_urls_file + '.tmp'
        try:
            with open(tmp_file, 'w') as f:
                json.dump(all_movie_urls, f, indent=2)
            os.replace(tmp_file, movie_urls_file)
        finally:
            # Only present if the dump or the swap failed.
            if os.path.exists(tmp_file):
                os.remove(tmp_file)
        return all_movie_urls

    with open(movie_urls_file) as f:
        return json.load(f)

In [5]:
# Load the movie URL list; with the default force_refresh=False the site is
# crawled only when the cache file does not exist yet.
all_movie_urls = rt_get_all_movie_urls()

In [6]:
# Number of movie URLs collected.
len(all_movie_urls)

40976