In [1]:
# import packages
import random
import requests
from selenium import webdriver
import time
from bs4 import BeautifulSoup

# Selenium can keep trying to click the load-more button after the end of a page is reached;
# the click then raises StaleElementReferenceException, which the scraping functions catch
from selenium.common.exceptions import StaleElementReferenceException

# import proxy drivers
from selenium.webdriver.common.proxy import Proxy, ProxyType
from selenium.webdriver.firefox.options import Options

# import async packages
import asyncio
from proxybroker import Broker
import nest_asyncio
nest_asyncio.apply()

In [None]:
# function to get "N_PROXIES" proxies
def get_proxies(N_PROXIES):
    """Collect up to N_PROXIES HTTPS proxies with proxybroker.

    A Broker is asked to find proxies and push them onto an asyncio queue,
    while a second coroutine drains that queue into a list. Both run together
    on the current event loop until they finish.

    Parameters
    ----------
    N_PROXIES : passed to ``Broker.find`` as ``limit``.

    Returns
    -------
    list : the proxy objects taken off the queue, in arrival order.
    """
    found = []

    async def _drain(queue):
        # a None item on the queue ends the collection
        item = await queue.get()
        while item is not None:
            print('Found proxy: %s' % item)
            found.append(item)
            item = await queue.get()

    # queue shared between the broker (producer) and _drain (consumer)
    proxies = asyncio.Queue()
    broker = Broker(proxies)
    search_and_collect = asyncio.gather(
        broker.find(types=['HTTPS'], limit=N_PROXIES),
        _drain(proxies),
    )

    # block until both the search and the drain have completed
    asyncio.get_event_loop().run_until_complete(search_and_collect)

    return found

In [None]:
# function to test a proxy "TESTS" times
def test_proxy(HOST, PORT, TESTS):
    url = "https://www.rottentomatoes.com"
    
    # initialize proxy settings
    PROXY = str(HOST) + ":" + str(PORT)
    webdriver.DesiredCapabilities.FIREFOX['proxy'] = {
        "httpProxy": PROXY,
        "ftpProxy": PROXY,
        "sslProxy": PROXY,
        "proxyType": "MANUAL",
    }
    
    # define iter variable, using a flag technique
    ITER = 1
    
    # try to get url, if fail, increase iter
    while ITER <= TESTS:
        try:
            # get the url
            driver = webdriver.Firefox()
            driver.install_addon('/home/nathanael/Downloads/testdir/noscript/noscript.xpi', temporary=True)
            driver.set_page_load_timeout(10)
            driver.get(url)
            
            # let the page load
            time.sleep(2)
            
            # look for ratings to double check if loaded
            ratings = driver.find_elements_by_class_name('dynamic-text-list__tomatometer-group')
            driver.quit()
            
            # if successful, set flag to 10, otherwise try again
            if ratings:
                ITER = 10
            else:
                ITER += 1
        
        # failure, increase iter
        except:
            print("failed " + str(ITER) + " time(s)")
            ITER += 1
            driver.quit()
    
    # if success, return True, otherwise false
    if ITER == 10:
        return True
    else:
        return False

In [None]:
def cycle_proxy(proxies):
    """Pick a random working proxy from the pool, discarding dead ones.

    Proxies are drawn at random and checked with ``test_proxy`` (two attempts
    each). A proxy that fails the check is dropped from the working pool and
    another one is drawn.

    Parameters
    ----------
    proxies : list
        Proxy objects exposing ``.host`` and ``.port``. The list passed in is
        not modified.

    Returns
    -------
    tuple
        ``(proxy, remaining)`` where ``proxy`` passed the check and
        ``remaining`` is the pool with the failed proxies removed (the
        returned proxy is still in it).

    Raises
    ------
    ValueError
        If the pool is empty or every proxy in it failed the check.
    """
    # make a real copy of the list so the caller's pool is left untouched
    proxies_copy = list(proxies)

    while proxies_copy:
        # choose random index
        rand_ind = random.randrange(len(proxies_copy))
        proxy = proxies_copy[rand_ind]

        # test this proxy; if successful, return it
        if test_proxy(proxy.host, proxy.port, 2):
            return (proxy, proxies_copy)

        # otherwise, remove it and draw again
        del proxies_copy[rand_ind]

    raise ValueError("no working proxy left in the pool")

In [None]:
# get_links gets all links for movies of genre "genre" with score between "score_min" and "score_max"
# input "score_min", "score_max", and "genre" as integers
def get_links(score_min, score_max, genre, HOST, PORT):
    """Scrape movie links from the Rotten Tomatoes browse page through a proxy.

    Builds the browse URL for the given tomatometer range and genre, opens it
    in Firefox routed through HOST:PORT, clicks the load-more button until it
    is gone (or goes stale), then collects the ``href`` of every anchor inside
    the ``movie_info`` elements.

    Parameters
    ----------
    score_min, score_max : inserted as ``minTomato`` / ``maxTomato``.
    genre : inserted as the ``genres`` query value.
    HOST, PORT : proxy address; each is converted with ``str()``.

    Returns
    -------
    list
        The ``href`` values found, in page order.

    The browser is closed even if loading or scraping raises.
    """
    # create url to scrape links from
    url = "https://www.rottentomatoes.com/browse/dvd-streaming-all?" + \
        "minTomato=" + str(score_min) + "&maxTomato=" + str(score_max) + \
        "&services=amazon;hbo_go;itunes;netflix_iw;vudu;amazon_prime;fandango_now&genres=" + \
        str(genre) + "&sortBy=release"

    # initialize empty list to put URLs in
    endings = []

    # initialize proxy settings (mutates selenium's module-level capabilities dict)
    PROXY = str(HOST) + ":" + str(PORT)
    webdriver.DesiredCapabilities.FIREFOX['proxy'] = {
        "httpProxy": PROXY,
        "ftpProxy": PROXY,
        "sslProxy": PROXY,
        "proxyType": "MANUAL",
    }

    # open the browser; everything after this runs under try/finally so the
    # Firefox process is not leaked when a page load times out or an addon fails
    driver = webdriver.Firefox()
    try:
        driver.install_addon('/home/nathanael/Documents/ublock.xpi', temporary=True)
        driver.install_addon('/home/nathanael/Downloads/testdir/noscript/noscript.xpi', temporary=True)
        driver.set_page_load_timeout(180)
        driver.get(url)

        # look for button to load all the movies
        # NOTE(review): find_elements_by_class_name was removed in Selenium 4.3 -
        # confirm the installed Selenium version
        buttons = driver.find_elements_by_class_name('btn.btn-secondary-rt.mb-load-btn')

        # wait five seconds for the page to load
        time.sleep(5)

        # while there are buttons to click, keep clicking them
        while buttons:
            try:
                buttons[0].click()

                # look for more buttons to click
                buttons = driver.find_elements_by_class_name('btn.btn-secondary-rt.mb-load-btn')
                time.sleep(2)
            except StaleElementReferenceException:
                # once we reach bottom of page, break
                print("Reached bottom of page, scraping links")
                break

        # look for the boxes containing info about the movies
        infos = driver.find_elements_by_class_name('movie_info')
        for info in infos:
            # convert to beautiful soup objects
            soup = BeautifulSoup(info.get_attribute('innerHTML'), "html.parser")
            for link in soup.findAll('a'):
                # extract hrefs
                endings.append(link.get('href'))
    finally:
        # close the browser once done, whether or not the scrape succeeded
        driver.quit()

    return endings

In [2]:
# same as get_links, but doesn't use proxies and always queries a fixed set of genres
def get_links_no_proxies(score_min, score_max):
    """Scrape movie links from the Rotten Tomatoes browse page without a proxy.

    Builds the browse URL for the given tomatometer range (with a fixed genre
    list), opens it in Firefox, clicks the load-more button until it is gone
    (or goes stale), then collects the ``href`` of every anchor inside the
    ``movie_info`` elements.

    Parameters
    ----------
    score_min, score_max : inserted as ``minTomato`` / ``maxTomato``.

    Returns
    -------
    list
        The ``href`` values found, in page order.

    The browser is closed even if loading or scraping raises.
    """
    # create url to scrape links from
    url = "https://www.rottentomatoes.com/browse/dvd-streaming-all?" + \
            "minTomato=" + str(score_min) + "&maxTomato=" + str(score_max) + \
            "&services=amazon;hbo_go;itunes;netflix_iw;vudu;amazon_prime;fandango_now" + \
            "&genres=1;2;4;5;6;8;9;10;11;13;18;14&sortBy=release"

    # initialize empty list to put URLs in
    endings = []

    # open the browser; everything after this runs under try/finally so the
    # Firefox process is not leaked when a page load times out or an addon fails
    driver = webdriver.Firefox()
    try:
        driver.install_addon('/home/nathanael/Documents/ublock.xpi', temporary=True)
        driver.install_addon('/home/nathanael/Downloads/testdir/noscript/noscript.xpi', temporary=True)
        driver.set_page_load_timeout(180)
        driver.get(url)

        # look for button to load all the movies
        # NOTE(review): find_elements_by_class_name was removed in Selenium 4.3 -
        # confirm the installed Selenium version
        buttons = driver.find_elements_by_class_name('btn.btn-secondary-rt.mb-load-btn')

        # wait five seconds for the page to load
        time.sleep(5)

        # while there are buttons to click, keep clicking them
        while buttons:
            try:
                buttons[0].click()

                # look for more buttons to click
                buttons = driver.find_elements_by_class_name('btn.btn-secondary-rt.mb-load-btn')
                time.sleep(5)
            except StaleElementReferenceException:
                # once we reach bottom of page, break
                print("Reached bottom of page, scraping links")
                break

        # look for the boxes containing info about the movies
        infos = driver.find_elements_by_class_name('movie_info')
        for info in infos:
            # convert to beautiful soup objects
            soup = BeautifulSoup(info.get_attribute('innerHTML'), "html.parser")
            for link in soup.findAll('a'):
                # extract hrefs
                endings.append(link.get('href'))
    finally:
        # close the browser once done, whether or not the scrape succeeded
        driver.quit()

    return endings

  and should_run_async(code)


In [None]:
# configuration for the proxy-based scrape
N_PROXIES = 20          # how many proxies to ask get_proxies for
genres = [              # values substituted into the "genres" query parameter
    1, 2, 4, 5, 6, 8,
    9, 10, 11, 13, 18, 14,
]

# accumulator for the links scraped by genre
links_list = []

# build the proxy pool
proxies = get_proxies(N_PROXIES)

In [None]:
# script to scrape by genre, using proxies
for genre in genres:
    # cycle proxy every genre; the returned pool no longer contains proxies that failed
    (proxy, proxies) = cycle_proxy(proxies)
    for i in range(5):
        # scrape URLs in batches of 21 scores (0-20, 21-41, 42-62, 63-83, 84-100);
        # requires 5 total batches, with the last one capped at 100
        score_min = i * 21
        score_max = min((i + 1) * 21 - 1, 100)
        new_links = get_links(score_min, score_max, genre, proxy.host, proxy.port)

        # check if the scrape worked
        if new_links:
            # store in links_list (initialised in the configuration cell); the
            # previous name links_list_2 is never defined in this notebook
            links_list.append(new_links)
        else:
            print("failed " + str(score_min) + ":" + str(score_max) + " of " + str(genre))

In [None]:
# there's a glitch on the website that doesn't let you see 0 percent movies
# initialize links by getting 0 and 1 percent movies
links = get_links_no_proxies(0, 1)

# each score gets this many chances to scrape
MAX_ATTEMPTS = 3

# script to scrape by score, without proxies (scores 2 through 100)
for score in range(2, 101):
    # reset per score so a failed attempt can never reuse the previous score's links
    new_links = []

    for attempt in range(1, MAX_ATTEMPTS + 1):
        # try scraping the URLs; a raised error counts as one failed attempt
        try:
            new_links = get_links_no_proxies(score, score)
        except Exception:
            new_links = []
            print("score " + str(score) + " failed " + str(attempt) + " times")

        # if there are new links, stop retrying; otherwise use the next attempt
        if new_links:
            break

    # if successful, carry on; otherwise, stop the whole scrape
    if not new_links:
        print("giving up at score " + str(score) + " after " + str(MAX_ATTEMPTS) + " attempts")
        break

    links += new_links
    print("scraped links of score " + str(score))

  and should_run_async(code)


Reached bottom of page, scraping links
scraped links of score 0
scraped links of score 1
Reached bottom of page, scraping links
scraped links of score 2
Reached bottom of page, scraping links
scraped links of score 3
score 6 failed 1 times
scraped links of score 4
Reached bottom of page, scraping links
scraped links of score 5
Reached bottom of page, scraping links
scraped links of score 6
Reached bottom of page, scraping links
scraped links of score 7
Reached bottom of page, scraping links
scraped links of score 8
Reached bottom of page, scraping links
scraped links of score 9
Reached bottom of page, scraping links
scraped links of score 10
Reached bottom of page, scraping links
scraped links of score 11
scraped links of score 12
Reached bottom of page, scraping links
scraped links of score 13
Reached bottom of page, scraping links
scraped links of score 14
Reached bottom of page, scraping links
scraped links of score 15
Reached bottom of page, scraping links
scraped links of score 16

In [None]:
import pickle

# persist the scraped links to disk as a pickle file
with open('links.pkl', 'wb') as links_file:
    serialized_links = pickle.dumps(links)
    links_file.write(serialized_links)