## Resources
* Web Scraping Tutorial: https://youtu.be/52wxGESwQSA
    * Associated GitHub repo: https://github.com/paulproteus/python-scraping-code-samples
* MechanicalSoup Tutorial: http://mechanicalsoup.readthedocs.io/en/stable/tutorial.html
* BeautifulSoup4 Documentation: https://www.crummy.com/software/BeautifulSoup/bs4/doc/
* Selenium tutorial (scraping a JavaScript-rendered page with Python): http://stanford.edu/~mgorkove/cgi-bin/rpython_tutorials/Scraping_a_Webpage_Rendered_by_Javascript_Using_Python.php
* Add chromedriver to path: https://youtu.be/dz59GsdvUF8
* Fandango API: https://developer.fandango.com/docs/read/Fandango

In [433]:
import re
import os
import json
from time import time, sleep
from datetime import datetime
import requests

from selenium import webdriver
import mechanicalsoup as msoup
from bs4 import BeautifulSoup
import lxml

## Selenium Solution (slow, but effective)

In [567]:
def params_to_url_string(params, url_base):
    '''
    Build "<url_base>?<key>=<value>&..." from a mapping of query parameters.

    Keys and values are interpolated exactly as given (no percent-encoding),
    in the iteration order of ``params``.
    '''
    pairs = []
    for name in params:
        pairs.append("%s=%s" % (name, params[name]))

    return url_base + "?" + "&".join(pairs)

def get_movie_times(url_base, params, print_confirmation=False):
    '''
    Get movie times for selected theater and movie.

    Requests ``url_base`` with the "tid", "mid" and "from" entries of
    ``params`` and parses the JSON object assigned to ``var dropdowndates``
    in the returned page.

    Returns a tuple ``(movie_times, dropdowndates)``:
      * ``movie_times``   - dict mapping each date key to the "Times" list of
                            the FIRST entry listed for that date (further
                            entries for the same date are ignored).
      * ``dropdowndates`` - the full parsed object, or None when the page
                            contains no ``dropdowndates`` variable (in which
                            case ``movie_times`` is an empty dict).

    Raises KeyError if ``params`` lacks "tid", "mid" or "from".
    '''

    showing_params = {
        "tid": params["tid"],
        "mid": params["mid"],
        "from": params["from"]
    }

    tic = time()
    response_0 = requests.get(url=url_base, params=showing_params)
    content = response_0.content.decode("utf-8")
    match = re.search(r"var dropdowndates = ({[\s\S]+]})\/\/\]\]", content)

    if match is None:
        # No showtimes embedded in the page: report an empty result instead of
        # failing later with a TypeError while iterating None.
        if print_confirmation:
            print("No showing times found.")
        return {}, None

    dropdowndates = json.loads(match.group(1))
    movie_times = {date: entries[0]["Times"] for date, entries in dropdowndates.items()}

    if print_confirmation:
        # Parenthesised on purpose: "%" binds tighter than "-", so the
        # unparenthesised form tries to subtract a float from a str.
        print("Movie times retrieved in %s s." % (time() - tic))

    return movie_times, dropdowndates
    

def click_through(url, movie_date, movie_time, op_sys="win", browser=None):
    '''
    Drive the ticketing page up to the step after "NewCustomerCheckoutButton".

    url        - ticketing page to load when a new browser has to be started.
    movie_date - value of the date <option> to select.
    movie_time - value of the time <option> to select.
    op_sys     - "win" or "linux"; picks the chromedriver binary.
    browser    - an already-open driver to reuse, or None to start Chrome.

    Returns ``(browser, page_source, error_text)``. On success ``error_text``
    is None and ``page_source`` is the page shown after the checkout button is
    clicked. If one ticket cannot be selected, ``page_source`` is None and
    ``error_text`` holds the page's error header (or a generic message when no
    such header exists).

    Raises KeyError for an unknown ``op_sys`` when a browser must be started.
    '''

    page_source = None
    error_text = None

    chromedriver_path = {
        "win": "../webdriver/chromedriver.exe",
        "linux": "../webdriver/chromedriver_linux64"
    }

    if browser:
        try:
            # Click "Select new showtime"
            browser.find_element_by_xpath("//*[@id='movieTicketSummary']/div[3]/ul[2]/li[2]/p/a").click()
        except Exception:
            # The reused page cannot be rewound to the showtime picker, so
            # discard this session. quit() (not close()) also ends the driver
            # session; it is best-effort because the session may already be dead.
            try:
                browser.quit()
            except Exception:
                pass
            browser = None

    if not browser:
        browser = webdriver.Chrome(chromedriver_path[op_sys])
        browser.get(url=url)

    browser.find_element_by_xpath("//*[@id='ShowDateDropDownList']/option[@value='%s']" % movie_date).click()
    browser.find_element_by_xpath("//*[@id='ShowTimeDropDownList']/optgroup/option[@value='%s']" % movie_time).click()

    try:
        browser.find_element_by_xpath("//select[@name='AreaRepeater$ctl00$TicketRepeater$ctl00$quantityddl']/option[@value='1']").click()
    except Exception:
        # The quantity selector is unavailable; prefer the message shown by the
        # page, but do not fail if the page shows none.
        try:
            error_text = browser.find_element_by_class_name("errorHeaderMessage").text + "\n"
        except Exception:
            error_text = "Ticket quantity selector not found.\n"
        return browser, page_source, error_text

    browser.find_element_by_id("NewCustomerCheckoutButton").click()
    page_source = browser.page_source

    return browser, page_source, error_text

def find_and_parse_seat_tags(source):
    '''
    Extract the auditorium name and the seat list from seat-picker HTML.

    Returns ``(seats, auditorium, error)``:
      * ``seats``      - list of ``[seat_id, first_class, status]`` where
                         ``status`` is "A" when the seat's second CSS class is
                         "availableSeat" and "R" otherwise; None on error.
      * ``auditorium`` - stripped text of ``<h2 id="auditoriumInfo">``; None
                         when that heading is missing.
      * ``error``      - None on success, otherwise a message ending in "\\n".
    '''

    soup = BeautifulSoup(source, "html.parser")

    auditorium_tag = soup.find("h2", id="auditoriumInfo")
    if auditorium_tag is None:
        # Report through the error channel instead of raising AttributeError
        # on None, so callers can skip this showing like any other failure.
        return None, None, "Auditorium info not found.\n"
    auditorium = auditorium_tag.text.strip()

    # Seat elements: <div> tags that have an id and a class ending in "Seat".
    seat_tags = soup.find_all(name="div", attrs={"class": re.compile(".*Seat"), "id": re.compile(".*")})
    if not seat_tags:
        return None, auditorium, "Seating chart not found.\n"

    # NOTE(review): assumes every seat <div> carries at least two classes,
    # with the availability class second - confirm against the live markup.
    seats = [
        [seat["id"], seat["class"][0], "A" if seat["class"][1] == "availableSeat" else "R"]
        for seat in seat_tags
    ]

    return seats, auditorium, None

def get_paths(params, movie_datetime):
    '''
    Return ``(data_dir, data_file, log_file)`` for one showing.

    The directory is "../data/<mid>/<tid>"; the data and log files live in it
    and are named "<movie_datetime>.txt" and "<movie_datetime>.log".
    '''
    movie_id = params["mid"]
    theater_id = params["tid"]
    data_dir = "../data/%s/%s" % (movie_id, theater_id)

    data_file, log_file = (
        os.path.join(data_dir, "%s.%s" % (movie_datetime, extension))
        for extension in ("txt", "log")
    )

    return data_dir, data_file, log_file

def write_log(event, log_time, log_text_dict, log_file):
    '''
    Append one entry to ``log_file`` and return the text that was written.

    The entry is "<log_time> -<message>", where the message is
    ``log_text_dict[event]``.
    '''
    with open(log_file, "a+") as log_handle:
        entry = "{} -{}".format(log_time, log_text_dict[event])
        log_handle.write(entry)

    return entry

def write_data(browser, data, data_dir, data_file, log_text_dict, log_file, finish=False):
    '''
    Persist one seating snapshot and log the outcome.

    browser       - driver showing the seat picker; only used when no
                    screenshot for the auditorium exists yet.
    data          - dict with "seat_config" (must contain "auditorium") and
                    "seat_reserv".
    data_dir      - directory holding the screenshot, data file and log.
    data_file     - file whose first line is the seat configuration and whose
                    following lines are ``[timestamp, seat_reserv]`` snapshots.
    log_text_dict - messages for the "start", "append", "finish" and
                    "error_seat_config" events.
    log_file      - file the event is appended to.
    finish        - log "finish" instead of "append" for a non-first snapshot.

    Returns the log text produced by ``write_log``. When the stored seat
    configuration differs from the current one, nothing is appended to the
    data file and the "error_seat_config" event is logged.
    '''

    log_time = str(datetime.now())

    seat_config = data["seat_config"]
    seat_reserv = data["seat_reserv"]
    snapshot_line = str([log_time, seat_reserv]) + "\n"

    # One race-free call replaces the repeated "check, then create" blocks.
    os.makedirs(data_dir, exist_ok=True)

    img_path = "%s/%s.png" % (data_dir, seat_config["auditorium"])

    if not os.path.exists(img_path):
        # Capture the seat picker once per auditorium.
        browser.set_window_size(1080, 1080)
        element = browser.find_element_by_xpath("//*[@id='seatpickerPage']")
        browser.execute_script("return arguments[0].scrollIntoView(true);", element)
        browser.save_screenshot(img_path)

    if not os.path.exists(data_file):
        # First snapshot: the seat configuration goes on the first line.
        with open(data_file, "w") as f:
            f.write(str(seat_config) + "\n")
            f.write(snapshot_line)
        return write_log("start", log_time, log_text_dict, log_file)

    with open(data_file, "r") as f:
        saved_seat_config = f.readline().rstrip("\n")

    if saved_seat_config != str(seat_config):
        # The layout changed since the file was started; do not mix snapshots.
        return write_log("error_seat_config", log_time, log_text_dict, log_file)

    with open(data_file, "a") as f:
        f.write(snapshot_line)

    return write_log("finish" if finish else "append", log_time, log_text_dict, log_file)

In [578]:
def get_seating_chart_and_write(movie_date, 
                                movie_time_full,
                                params,
                                url_base,
                                log_text_dict,
                                op_sys="win",
                                browser=None):
    '''
    Fetch the seating chart of one showing and store it on disk.

    movie_date      - date string in "%m/%d/%Y" form.
    movie_time_full - "<time>_<suffix>" string; the part before "_" must
                      parse as "%I:%M %p".
    params, url_base, op_sys, browser - forwarded to the URL builder and to
                      click_through().
    log_text_dict   - log messages forwarded to write_data().

    Returns ``(skip, browser)``: ``skip`` is True when the page or the seating
    chart could not be read (the reason is printed), False once the data has
    been written. ``browser`` is the driver returned by click_through().
    '''

    showtime = datetime.strptime(
        "%s %s" % (movie_date, movie_time_full.split("_")[0]),
        "%m/%d/%Y %I:%M %p"
    )
    file_stem = showtime.strftime("%Y-%m-%d_%H%M")

    # Snapshots taken less than 8 minutes before the showing are marked final.
    finish = (showtime - datetime.now()).total_seconds() < 60 * 8

    browser, source, source_error = click_through(
        params_to_url_string(params, url_base),
        movie_date,
        movie_time_full,
        op_sys,
        browser
    )
    if source_error:
        print("[", showtime, "]", source_error)
        return True, browser

    seats, auditorium, seat_error = find_and_parse_seat_tags(source)
    if seat_error:
        print("[", showtime, "]", seat_error)
        return True, browser

    # Split the seat rows into columns; reserved seats are kept as indices.
    seat_ids = []
    seat_types = []
    reserved = []
    for index, seat in enumerate(seats):
        seat_ids.append(seat[0])
        seat_types.append(seat[1])
        if seat[2] == "R":
            reserved.append(index)

    data = {
        "seat_config": {
            "auditorium": auditorium,
            "seats": seat_ids,
            "seat_types": seat_types
        },
        "seat_reserv": {"R": reserved}
    }

    data_dir, data_file, log_file = get_paths(params, file_stem)
    log_text = write_data(browser, data, data_dir, data_file, log_text_dict, log_file, finish)

    print("[", showtime, "]", log_text)

    return False, browser

# get_seating_chart_and_write("7/12/2018", "6:15 pm_6819668")

In [579]:
# Scrape every listed showing of one movie at one theater and record its seats.
op_sys = "win"

url_base = "https://tickets.fandango.com/transaction/ticketing/express/ticketboxoffice.aspx"

params = {
    "tid": "AABFB",                # theater id
    "mid": "185805",               # movie id
    "from": "mov_det_showtimes"    # page navigated from
}

log_text_dict = {
    "start": " START: Successfully retrieved seating data.\n",
    "append": " Successfully appended seating data.\n",
    "finish": " FINISH: Successfully appended seating data.\n",
    "error_seat_config": "----- ERROR: Inconsistent seating configuration.\n"
}


movie_times, _ = get_movie_times(url_base, params)

browser = None
tic = time()

try:
    # Iterate through movie times
    for i in range(1):

        short_dashes = 20*"-"
        print("\n##### %s Iteration %d %s #####\n" % (short_dashes, i, short_dashes))

        for date in movie_times:

            for t in movie_times[date]:

                # A skipped showing needs no extra handling: the reason has
                # already been printed and the loop simply moves on.
                skip, browser = get_seating_chart_and_write(date, t, params, url_base, log_text_dict, op_sys, browser)

finally:
    # Always release the browser, even after an error; it is still None when
    # no showing was processed. quit() also ends the driver session.
    if browser is not None:
        browser.quit()

dt = time() - tic

long_dashes = 50*"-"
print("\n##### %s #####\n" % long_dashes)
print("Finished iterating (%s s elapsed)." % dt)


##### -------------------- Iteration 0 -------------------- #####

[ 2018-07-06 10:45:00 ] 2018-07-06 02:04:22.649568 - Successfully appended seating data.

[ 2018-07-06 11:30:00 ] 2018-07-06 02:04:28.439810 - Successfully appended seating data.

[ 2018-07-06 15:00:00 ] 2018-07-06 02:04:46.276812 - Successfully appended seating data.

[ 2018-07-06 18:15:00 ] 2018-07-06 02:04:52.320548 - Successfully appended seating data.

[ 2018-07-06 21:00:00 ] 2018-07-06 02:04:58.436882 - Successfully appended seating data.

[ 2018-07-07 10:45:00 ] 2018-07-06 02:05:04.537439 - Successfully appended seating data.

[ 2018-07-07 11:30:00 ] 2018-07-06 02:05:10.629047 - Successfully appended seating data.

[ 2018-07-07 15:00:00 ] 2018-07-06 02:05:17.083078 - Successfully appended seating data.

[ 2018-07-07 18:15:00 ] 2018-07-06 02:05:23.358934 - Successfully appended seating data.

[ 2018-07-07 21:00:00 ] 2018-07-06 02:05:29.500066 - Successfully appended seating data.

[ 2018-07-08 11:30:00 ] 2018-07-

---
# Other
---

In [377]:
# Parse a page source and pretty-print it for manual inspection.
# NOTE(review): `source` is not assigned in this cell; presumably it is a page
# source left in the kernel by an earlier run - confirm before re-running.
soup = BeautifulSoup(source, "html.parser")
print(soup.decode(pretty_print = True))

<!DOCTYPE html>
<html class=" js no-flexbox flexbox-legacy canvas canvastext touch geolocation postmessage websqldatabase indexeddb hashchange history draganddrop websockets rgba hsla multiplebgs backgroundsize borderimage borderradius boxshadow textshadow opacity cssanimations csscolumns cssgradients cssreflections csstransforms csstransforms3d csstransitions fontface generatedcontent audio localstorage sessionstorage webworkers applicationcache" xmlns="http://www.w3.org/1999/xhtml">
 <head>
  <script async="" crossorigin="anonymous" src="https://cdnssl.clicktale.net/www/ChangeMonitor-latest.js" type="text/javascript">
  </script>
  <script async="" crossorigin="anonymous" src="https://cdnssl.clicktale.net/www12/ptc/104ef80c-087c-4d87-958a-be928764850a.js" type="text/javascript">
  </script>
  <script async="" src="https://sb.scorecardresearch.com/beacon.js">
  </script>
  <script async="" src="https://connect.facebook.net/en_US/fbevents.js">
  </script>
  <script async="" src="//nerv

In [229]:
# Scratch cell: set up a MechanicalSoup browser and the query parameters for
# one showing, then display the decoded body of `response`.
tic = time()

browser = msoup.StatefulBrowser()
# Candidate URLs tried earlier (kept for reference):
# my_url = "https://www.fandango.com/92612_movietimes"
# my_url = "https://www.fandango.com/incredibles-2-185805/movie-times"
# my_url = "https://tickets.fandango.com/transaction/ticketing/express/ticketboxoffice.aspx?row_count=231470826&tid=AABFB&sdate=2018-07-05+21:30&mid=185805&from=mov_det_showtimes"

base_url = "https://tickets.fandango.com/transaction/ticketing/express/ticketboxoffice.aspx"

showing_params = {
    "row_count": "231470826",      # purpose unknown (copied from a ticketing URL) - TODO confirm
    "tid": "AABFB",                # theater id
    "sdate": "2018-07-05+21:30",   # showing date
    "mid": "185805",               # movie id
    "from": "mov_det_showtimes"    # page navigated from
}

# The request itself is disabled below, so this cell does not assign `response`.
# response = browser.open(base_url, params = showing_params)
# soup = BeautifulSoup(response.content, "html.parser")
# dt = time() - tic
# print("Time elapsed: %s s" % dt)
# NOTE(review): `response` must already exist in the kernel (assigned by
# another cell) for the next line to work - confirm the intended run order.
response.content.decode("utf-8") 

'<!doctype html>\r\n\r\n\r\n\r\n <html class="no-js"> \r\n    <head>\r\n        <meta http-equiv="X-UA-Compatible" content="IE=Edge"/>\r\n        <title>Fandango Secure Checkout</title>\r\n        \r\n\r\n\r\n<meta http-equiv="X-UA-Compatible" content="IE=Edge;chrome=1" > \r\n<meta name="viewport" content="width=980"> \r\n<script type="text/javascript" src="https://ajax.googleapis.com/ajax/libs/jquery/1.9.1/jquery.min.js"></script>\r\n\r\n\r\n    \r\n    <!--[mps:Load]-->\r\n    <script>\r\n        var mpscall = {\r\n            "site":"fandango-web",\r\n"path":"/transaction/ticketing/express/ticketboxoffice.aspx",\r\n"adunits":"Top Banner",\r\n"field[env]":"production",\r\n"cat":"purchase",\r\n"is_content":"1",\r\n"qs":"tid=AABFB&mid=185805&from=mov_det_showtimes",\r\n"title":"Fandango Secure Checkout",\r\n"type":"ecommerce",\r\n            };\r\n        var mpsopts = {\r\n            \'host\': \'mps.nbcuni.com\'\r\n        };\r\n        var mps = mps || {}; mps._ext = mps._ext || {};

In [231]:
# Request the ticketing page with only the theater, movie and referrer
# parameters (row_count and sdate are deliberately left out), then display
# the response object.
showing_params = {
#     "row_count": "231470826",      # ???
    "tid": "AABFB",                # theater id
#     "sdate": "2018-07-05+21:30",   # showing date
    "mid": "185805",               # movie id
    "from": "mov_det_showtimes"    # page navigated from
}

url_base = "https://tickets.fandango.com/transaction/ticketing/express/ticketboxoffice.aspx"

response = requests.get(url = url_base, params = showing_params)
response

<Response [200]>

In [232]:
# Display the final URL of the request, including the encoded query string.
response.url

'https://tickets.fandango.com/transaction/ticketing/express/ticketboxoffice.aspx?tid=AABFB&mid=185805&from=mov_det_showtimes'

In [251]:
# Extract the JSON object assigned to `var dropdowndates` in the page's inline
# script and display it.
content = response.content.decode("utf-8")
match = re.search(r"var dropdowndates = ({[\s\S]+]})\/\/\]\]", content)
if match:
    dropdowndates = json.loads(match.group(1))
else:
    # Reset explicitly so the display below cannot raise NameError or show a
    # stale value left in the kernel by an earlier run.
    dropdowndates = None
    print("No showing times found.")

dropdowndates

{'7/10/2018': [{'Key': 'Reserved seating, Closed caption, Accessibility devices available, Recliner Seats',
   'Times': ['10:45 am_6819668',
    '11:30 am_6819668',
    '3:00 pm_6819668',
    '6:15 pm_6819668',
    '9:15 pm_6819668']}],
 '7/11/2018': [{'Key': 'Reserved seating, Closed caption, Accessibility devices available, Recliner Seats',
   'Times': ['10:45 am_6819668',
    '11:30 am_6819668',
    '3:00 pm_6819668',
    '6:15 pm_6819668',
    '9:15 pm_6819668']}],
 '7/12/2018': [{'Key': 'Reserved seating, Closed caption, Accessibility devices available, Recliner Seats',
   'Times': ['10:45 am_6819668',
    '11:30 am_6819668',
    '3:00 pm_6819668',
    '6:15 pm_6819668',
    '9:15 pm_6819668']}],
 '7/4/2018': [{'Key': 'Reserved seating, Closed caption, Accessibility devices available, Recliner Seats',
   'Times': ['7:00 pm_6819668', '9:15 pm_6819668']}],
 '7/5/2018': [{'Key': 'Reserved seating, Closed caption, Accessibility devices available, Recliner Seats',
   'Times': ['11:45 a

In [280]:
# Scratch cell: try to pick a date and a time through the page's form using
# MechanicalSoup instead of a real browser.
# NOTE(review): `showing_params` is defined here but not used below; the page
# is opened from `response.url`, which must already exist in the kernel.
showing_params = {
#     "row_count": "231470826",      # ???
    "tid": "AABFB",                # theater id
#     "sdate": "7/5/2018+21:30",   # showing date
    "mid": "185805",               # movie id
    "from": "mov_det_showtimes"    # page navigated from
}

browser = msoup.StatefulBrowser()
msoup_response = browser.open(response.url)
browser.select_form()
# browser.select_form().print_summary()
browser["ShowDateDropDownList"] = "7/12/2018"
# Submit after choosing the date, then set the time field on the current form.
browser.submit_selected()
browser["ShowTimeDropDownList"] = "9:15 pm_6819668"
# browser.get_current_form().print_summary()

In [282]:
# Pretty-print the page returned by the initial browser.open() call
# (`msoup_response`), not the result of the later form submission.
soup = BeautifulSoup(msoup_response.content, "html.parser")
print(soup.decode(pretty_print = True))

<!DOCTYPE doctype html>
<html class="no-js">
 <head>
  <meta content="IE=Edge" http-equiv="X-UA-Compatible"/>
  <title>
   Fandango Secure Checkout
  </title>
  <meta content="IE=Edge;chrome=1" http-equiv="X-UA-Compatible">
   <meta content="width=980" name="viewport">
    <script src="https://ajax.googleapis.com/ajax/libs/jquery/1.9.1/jquery.min.js" type="text/javascript">
    </script>
    <!--[mps:Load]-->
    <script>
     var mpscall = {
            "site":"fandango-web",
"path":"/transaction/ticketing/express/ticketboxoffice.aspx",
"adunits":"Top Banner",
"field[env]":"production",
"cat":"purchase",
"is_content":"1",
"qs":"tid=AABFB&mid=185805&sdate=7/11/2018&from=mov_det_showtimes",
"title":"Fandango Secure Checkout",
"type":"ecommerce",
            };
        var mpsopts = {
            'host': 'mps.nbcuni.com'
        };
        var mps = mps || {}; mps._ext = mps._ext || {}; mps._adsheld = []; mps._queue = mps._queue || {}; mps._queue.mpsloaded = mps._queue.mpsl

# Select 1 ticket to view seat selection
* `Adult: 1`
* `Senior: 0`
* `Child: 0`

(Assume that we already have a ticket selection URL, e.g. "https://tickets.fandango.com/transaction/ticketing/express/ticketboxoffice.aspx?row_count=231470826&tid=AABFB&sdate=2018-07-05+21:30&mid=185805&from=mov_det_showtimes")

In [336]:
# Ticket-selection URL for one specific showing; the query string carries
# row_count, tid, sdate, mid and from.
# NOTE(review): tid here is "AABTB", while the other cells use "AABFB" -
# confirm whether this is a different theater or a typo.
ticket_selection_url = "https://tickets.fandango.com/transaction/ticketing/express/ticketboxoffice.aspx?row_count=232344705&tid=AABTB&sdate=2018-07-12+21:15&mid=185805&from=mov_det_showtimes"

In [337]:
# Open the ticket-selection page with MechanicalSoup, set the first ticket
# quantity dropdown to "1" and submit the form.
# Requires `ticket_selection_url` to be defined already.
browser = msoup.StatefulBrowser()
msoup_response = browser.open(ticket_selection_url)
form = browser.select_form()
browser["AreaRepeater$ctl00$TicketRepeater$ctl00$quantityddl"] = "1"
# soup = BeautifulSoup(t_response.content, "html.parser")
# form.choose_submit("inputTotal")
response_1 = browser.submit_selected()

In [305]:
# Display the first <button> tag of the page currently held in `soup`.
soup.button

<button class="button primary medium" id="NewCustomerCheckoutButton" onclick="__doPostBack('NewCustomerCheckoutButton','')" type="button">Continue to Seat Selection</button>

In [308]:
# Pretty-print the page returned by the form submission (`response_1`).
soup = BeautifulSoup(response_1.content, "html.parser")
print(soup.decode(pretty_print = True))

<!DOCTYPE doctype html>
<html class="no-js">
 <head>
  <meta content="IE=Edge" http-equiv="X-UA-Compatible"/>
  <title>
   Fandango Secure Checkout
  </title>
  <meta content="IE=Edge;chrome=1" http-equiv="X-UA-Compatible">
   <meta content="width=980" name="viewport">
    <script src="https://ajax.googleapis.com/ajax/libs/jquery/1.9.1/jquery.min.js" type="text/javascript">
    </script>
    <!--[mps:Load]-->
    <script>
     var mpscall = {
            "site":"fandango-web",
"path":"/transaction/ticketing/express/ticketboxoffice.aspx",
"adunits":"Top Banner",
"cag[genre]":"Action_Adventure|Animated",
"cag[rt]":"PG",
"cag[mvnm]":"Incredibles 2",
"cag[mv]":"185805",
"field[env]":"production",
"field[cid]":"REGL",
"field[tid]":"AABFB",
"cat":"purchase",
"is_content":"1",
"qs":"row_count=231470826&tid=AABFB&sdate=2018-07-05+21:30&mid=185805&from=mov_det_showtimes",
"title":"Fandango Secure Checkout",
"type":"ecommerce",
            };
        var mpsopts = {
           

In [343]:
# Time the same steps with a real Chrome browser driven by Selenium: open the
# ticket-selection page, choose one ticket and continue to seat selection.
from selenium import webdriver

tic = time()

browser = webdriver.Chrome("../webdriver/chromedriver.exe")
# NOTE(review): this rebinds `response`, which other cells read as a requests
# response; WebDriver.get() presumably returns None - confirm before relying
# on `response` after running this cell.
response = browser.get(ticket_selection_url)
browser.find_element_by_xpath("//select[@name='AreaRepeater$ctl00$TicketRepeater$ctl00$quantityddl']/option[@value='1']").click()
browser.find_element_by_id("NewCustomerCheckoutButton").click()
# NOTE(review): close() closes the window; quit() is presumably needed to end
# the chromedriver session as well - confirm against the Selenium docs.
browser.close()

# Elapsed wall-clock seconds, displayed as the cell result.
time() - tic

9.860759973526001

In [129]:
# Look up the element with class "theater__btn-list" on the page currently
# loaded in `browser`.
# NOTE(review): which page is loaded is not shown here, and the call only
# works when `browser` is a Selenium driver - confirm the kernel state.
element = browser.find_element_by_class_name("theater__btn-list")