In [1]:
import csv
import os
import pickle
import re, string
from itertools import islice
from urllib.parse import quote_plus
from urllib.request import urlretrieve, urlopen

from bs4 import BeautifulSoup

In [2]:
def prepare_datasets():
    """
    Loads the pickled movie and review datasets from the working directory.

    Reads "movies.pickle" and "reviews.pickle" and returns their contents as
    the two-element list [movies, all_movie_reviews].

    NOTE: unpickling can execute arbitrary code, so only load pickle files
    that you created yourself.
    """
    # `with` closes each file as soon as it has been read, even when
    # unpickling fails; `del(f)` only removed the name and left the actual
    # close to the garbage collector.
    with open("movies.pickle", 'rb') as f:
        movies = pickle.load(f)

    with open("reviews.pickle", 'rb') as f:
        all_movie_reviews = pickle.load(f)

    return [movies, all_movie_reviews]

# Load both datasets once; the two sizes are printed as a quick sanity check.
movies, all_movie_reviews = prepare_datasets()
print(len(movies))
print(len(all_movie_reviews))

30075
63


In [3]:
print(movies[2])

['Jumanji', 1995, ['Adventure', 'Children', 'Fantasy'], 'http://www.rottentomatoes.com/m/1068044-jumanji/reviews/']


In [4]:
# Subset of `movies` (same keys, same records) limited to recent release
# years; it is filled in by the filtering loop in the following cell.
recent_movies = {}

In [5]:
# Release years that count as "recent".
recent_years = {2015, 2014, 2013, 2012}

# NOTE(review): `count` is never read in the visible cells.
count = 0
# Iterating `movies` yields its keys; element 1 of each record is the
# release year (e.g. ['Jumanji', 1995, [...genres...], url]).
for m in movies:
    if movies[m][1] in recent_years:        
        recent_movies[m] = movies[m]
        

In [6]:
# Number of movies that passed the recent-year filter.
len(recent_movies)

3862

In [81]:
def chunks(data, SIZE):
    """
    Lazily splits `data` into consecutive groups of at most SIZE items.

    Each group is yielded as a set. When `data` is a dict the groups hold its
    keys, because iterating a dict yields keys. `data` must support len().
    """
    remaining = iter(data)
    # One pass of the loop per group; the shared iterator remembers where the
    # previous group stopped.
    for _ in range(0, len(data), SIZE):
        yield set(islice(remaining, SIZE))

# Materialise the generator so the chunks can be indexed and revisited.
recent_movies_chunks = list(chunks(recent_movies, SIZE=5))

# Sanity check: the chunk sizes must add up to the number of recent movies.
print(sum(len(chunk) for chunk in recent_movies_chunks))

# Each chunk is a set of movie ids.
type(recent_movies_chunks[-1])

3862


set

In [8]:
# Base address of the site and the prefix of its search endpoint; the search
# terms are appended directly after "search=".
rt_url = "http://www.rottentomatoes.com"
rt_search_url = rt_url + "/search/?search="

In [9]:
# Punctuation stripped from titles before searching. Parentheses and
# apostrophes are taken out of the set, so they survive in the title.
exclude = set(string.punctuation.replace('(','').replace(')', '').replace("'",''))

def get_search_url(movie):
    """
    Builds the rotten tomatoes search URL for the given movie.

    movie: a movie record whose first element is the title.

    Returns rt_search_url followed by the lower-cased title with the excluded
    punctuation removed, its words joined by '+', and any remaining
    characters that are not URL-safe percent-encoded.
    """
    title = ''.join(ch for ch in movie[0] if ch not in exclude)

    # split() without an argument collapses runs of whitespace. Removing
    # punctuation from e.g. "Mission: Impossible - Rogue Nation" leaves a
    # double space, which split(' ') turned into "++" in the query.
    words = title.lower().split()

    # quote_plus turns spaces into '+' and percent-encodes non-ASCII letters,
    # which cannot be placed in a request line as-is. "()'" are kept literal
    # so the URLs for plain ASCII titles are exactly what they were before.
    return rt_search_url + quote_plus(' '.join(words), safe="()'")

# Spot-check the generated search URLs for a handful of movies.
for i in range(100,105):
    print(get_search_url(movies[i]))

http://www.rottentomatoes.com/search/?search=city+hall
http://www.rottentomatoes.com/search/?search=bottle+rocket
http://www.rottentomatoes.com/search/?search=mr+wrong
http://www.rottentomatoes.com/search/?search=unforgettable
http://www.rottentomatoes.com/search/?search=happy+gilmore


In [10]:
def convert_letter_grade(score):
    """
    Converts letter grade to score.

    score: a letter grade such as 'A', 'b+' or 'C-' (case-insensitive). Only
    the first two characters are used: the letter and an optional sign.

    Returns 1 for a plain 'A'. Otherwise returns the base value of the letter
    (a=.8, b=.6, c=.4, d=.2, f=0) moved up or down by .1 for a '+' or '-'
    sign and limited to the range 0..1. Returns None when the text is empty
    or does not start with a known grade letter.
    """
    score = score.strip().lower()
    if not score:
        return None

    # NOTE(review): a bare 'A' maps to 1 while 'A+' maps to .9. Left
    # unchanged so results stay comparable with scores computed earlier.
    if score == 'a':
        return 1

    # Work in whole tenths so results are exact (.3 rather than
    # .30000000000000004 from adding binary floats).
    base_tenths = {'a': 8, 'b': 6, 'c': 4, 'd': 2, 'f': 0}
    tenths = base_tenths.get(score[0])
    if tenths is None:
        # Unknown letter. The old code crashed with a TypeError here as soon
        # as a sign followed the letter (None += .1).
        return None

    sign = score[1] if len(score) > 1 else None
    if sign == '+':
        tenths += 1
    elif sign == '-':
        tenths -= 1

    # Clamp so that 'F-' does not produce a negative score.
    return max(0, min(10, tenths)) / 10

def compute_score(score):
    """
    Takes a score from rotten tomatoes and converts it to a fraction of the maximum score,
    0 being terrible and 1 being perfect.

    Text containing '/' is read as "value/maximum" and divided out (the result
    is not clamped). Anything else is treated as a letter grade and handed to
    convert_letter_grade.

    Returns None if there isn't a score, or if a fraction cannot be evaluated
    (for example 'N/A', '3/0' or '1/2/3').
    """
    if not score:
        return None

    if '/' in score:
        try:
            num, denom = score.split('/')
            return float(num)/float(denom)
        except (ValueError, ZeroDivisionError):
            # Malformed fraction: report "no score" instead of raising, so a
            # single odd review cannot abort the processing of a whole movie.
            return None

    return convert_letter_grade(score)

In [45]:
def process_review(review):
    """
    Pulls the interesting fields out of one review row.

    review: a parsed element exposing find(); it is searched for the
    "critic_name" div and the "small subtle" div.

    Returns [reviewer, source, score, review_link]. The source name doubles
    as the reviewer when the critic block has no link, the score is whatever
    compute_score makes of the text after 'Original Score:' (None is passed
    when that marker is absent), and review_link is None when there is no
    link in the second div.
    """
    critic_block = review.find("div", {"class" : "critic_name"})
    meta_block = review.find("div", {"class" : "small subtle"})

    source = critic_block.find('em').getText()
    critic_anchor = critic_block.find('a')
    reviewer = critic_anchor.getText() if critic_anchor else source

    review_link = meta_block.find('a')
    if review_link:
        review_link = review_link['href']

    # Everything after the marker is the raw score text.
    pieces = meta_block.getText().split('Original Score:')
    raw_score = pieces[1].strip() if len(pieces) > 1 else None

    return [reviewer, source, compute_score(raw_score), review_link]
    
    
page_suffix = "?page=%d&sort="
def get_reviews(url, max_pages=19):
    """
    Given an RT reviews page, this grabs all of the reviews listed on it and
    on the pages that follow it.

    url: address of the reviews listing; a trailing '/' is added if missing.
    max_pages: highest page number to request (pages 1..max_pages).

    Returns a dict mapping reviewer name to the list built by process_review.
    A reviewer who appears more than once keeps only the last entry seen.
    Returns {} when url is empty or None. Stops at the first page that fails
    to open and returns whatever was collected up to that point.
    """
    if not url:
        return {}

    if not url.endswith('/'):
        url += '/'

    all_reviews = {}
    for i in range(1, max_pages + 1):
        page = url + (page_suffix % i)

        try:
            response = urlopen(page)
        except Exception:
            # Best effort: a page that cannot be fetched ends the crawl.
            # Catching Exception instead of using a bare except lets
            # Ctrl-C (KeyboardInterrupt) and SystemExit stop the notebook.
            break

        # Close the connection as soon as the page has been parsed.
        # NOTE(review): no parser is named, so BeautifulSoup uses whichever
        # one is installed -- consider pinning one for reproducible results.
        with response:
            soup = BeautifulSoup(response)
        reviews = soup.findAll("div", { "class" : "row review_table_row" })

        for review in reviews:
            data = process_review(review)
            all_reviews[data[0]] = data
    return all_reviews

# Live smoke test against a known reviews page (needs network access).
toy_story_reviews = get_reviews("http://www.rottentomatoes.com/m/toy_story/reviews")
print(len(toy_story_reviews))

77


In [46]:
def get_movie_url(movie):
    """
    Given a movie, determines the rotten tomatoes URL for the critics' reviews of that movie.

    movie: a movie record with the title at index 0 and the release year at index 1.

    Runs a title search. When the response is a search-results page, the
    first result whose year equals the movie's year is used; otherwise the
    retrieved page is taken to be the movie page itself.

    Returns the movie URL with "reviews" appended, or None when the search
    has no results or no result matches the year.
    """
    search_url = get_search_url(movie)

    # Close the connection once the page is parsed. response.url is the URL
    # that was actually retrieved, which is what the non-search branch needs.
    with urlopen(search_url) as response:
        soup = BeautifulSoup(response)
        final_url = response.url

    # No results tag
    if (soup.find("h1", {"class" : "center noresults"})):
        return None

    if "search results" in soup.find("title").text.lower():
        # Find the movie with the correct year
        year = movie[1]
        results = soup.findAll("li", {"class" : "media bottom_divider clearfix"})

        url = None
        for result in results:
            year_tag = result.find("span", {"class" : "movie_year"})
            link = result.find("a")
            if year_tag is None or link is None:
                # An incomplete entry must not abort the whole lookup.
                continue

            try:
                # Presumably formatted like "(2014)": characters 1-4 are read
                # as the year.
                result_year = int(year_tag.getText().strip()[1:5])
            except ValueError:
                continue

            if year == result_year:
                url = rt_url + link["href"]
                break

        if url is None:
            # No result from the right year. This used to fall through to
            # `None + "reviews"` and raise a TypeError.
            return None

    else:
        url = final_url.split('?')[0]

    # Guard against a movie URL without a trailing slash ("/m/foo" + "reviews").
    if not url.endswith('/'):
        url += '/'

    return url + "reviews"

# Live checks (need network access).
# Expected: the matching movie is the 3rd entry on the search-results page.
print(get_movie_url(recent_movies[131258]))

# Expected: None, because the search finds nothing for this title.
print(get_movie_url(movies[131072]))

http://www.rottentomatoes.com/m/the_pirates_2014/reviews
None


In [71]:
# Accumulators used by the scraping loop. Re-running this cell resets them.
# Failures are recorded as [movie id, movie record, exception].
error_url = []  # looking up the reviews URL failed
error_rev = []  # fetching the reviews failed
# movie id -> dict of reviews returned by get_reviews
recent_reviews = {}

In [67]:
!ls

Final Project.ipynb          movies.pickle                [34mrecent_reviews[m[m               reviews.pickle
Recent Movies Analysis.ipynb recent_movies.pickle         recent_reviews.pickle


In [68]:
folder = "recent_reviews/"

def pickle_reviews(mid, rs):
    """
    dump pickle of reviews to disk. Super inefficient but I really don't want to lose this data. 
    Would be better to setup SQL database but I'm cheap.

    mid: integer movie id; the file is written to "<folder><mid>.pickle".
    rs: the reviews object to store.
    """
    # On a fresh checkout the output folder does not exist yet; without this
    # the open() below fails and stops the scraping loop.
    if folder:
        os.makedirs(folder, exist_ok=True)

    # `with` flushes and closes the file before returning, so the data really
    # is on disk instead of sitting in a buffer of a handle that was never
    # closed.
    with open("%s%d.pickle" % (folder, mid), 'wb') as f:
        pickle.dump(rs, f)

In [86]:
count = 0

# Scrape the reviews one chunk of movie ids at a time. Widen the range to
# process more chunks; everything is checkpointed after each chunk.
for i in range(0,1):
    print(i)
    cur = recent_movies_chunks[i]

    for mid in cur:

        # Already scraped in this session.
        if mid in recent_reviews:
            continue

        movie = recent_movies[mid]
        print(movie)

        if len(movie) > 3:
            # The reviews URL is already stored on the record.
            url = movie[3]
        else:
            try:
                url = get_movie_url(movie)
                # Remember the URL on the record so it is not looked up again.
                movie.append(url)
            except Exception as ex:
                error_url.append([mid, movie, ex])
                continue

        try:
            rs = get_reviews(url)
        except Exception as ex:
            error_rev.append([mid, movie, ex])
            continue

        recent_reviews[mid] = rs
        pickle_reviews(mid, rs)

    # Checkpoint after every chunk. `with` closes (and therefore flushes) each
    # file; before, the second handle stayed open, so its contents were not
    # guaranteed to be on disk while the kernel kept running.
    with open("recent_movies.pickle", 'wb') as f:
        pickle.dump(recent_movies, f)
    with open("recent_reviews.pickle", 'wb') as f:
        pickle.dump(recent_reviews, f)

0
['Jesus liebt mich', 2012, ['Comedy']]
['I Want You', 2012, ['Drama', 'Romance']]
['Mad Max: Fury Road', 2015, ['Action', 'Adventure']]
['Overnighters, The', 2014, ['Documentary', 'Drama']]
["Dyatlov Pass Incident, The (Devil's Pass)", 2013, ['Mystery', 'Thriller']]


In [87]:
# Inspect what was scraped: movie id -> {reviewer: [reviewer, source, score, link]}.
# NOTE(review): this prints the whole dict; the output grows with every scraped movie.
recent_reviews

{106508: {'Anton Bitel': ['Anton Bitel',
   'Grolsch Film Works',
   None,
   'http://grolschfilmworks.com/ca/reviews/frightfest-2013-the-dyatlov-pass-incident'],
  'Brian Orndorf': ['Brian Orndorf',
   'Blu-ray.com',
   0.30000000000000004,
   'http://www.blu-ray.com/Devils-Pass/256702/?show=preview'],
  'Drew Hunt': ['Drew Hunt',
   'Slant Magazine',
   0.5,
   'http://www.slantmagazine.com/film/review/devils-pass'],
  'Dustin Putman': ['Dustin Putman',
   'TheFilmFile.com',
   0.625,
   'http://www.dustinputman.com/reviews/d/13_devilspass.htm'],
  'Ed Whitfield': ['Ed Whitfield',
   'The Ooh Tray',
   None,
   'http://www.theoohtray.com/2013/08/28/frightfest-film-review-the-dyatlov-pass-incident/'],
  'Martyn Conterio': ['Martyn Conterio',
   'Little White Lies',
   0.6,
   'http://www.littlewhitelies.co.uk/theatrical-reviews/the-dyatlov-pass-incident-24579'],
  'MaryAnn Johanson': ['MaryAnn Johanson',
   'Flick Filosopher',
   None,
   'http://www.flickfilosopher.com/2013/08/the-dy

In [84]:
# Movie ids contained in the first chunk.
recent_movies_chunks[0]

{106508, 114692, 122880, 122882, 131072}

In [85]:
# Trim every record back to its first three elements, which removes a stored
# reviews URL (4th element) so the scraping loop looks it up again. Each
# trimmed record is printed.
for i in recent_movies:
    if len(recent_movies[i]) > 3:
        recent_movies[i] = recent_movies[i][0:3]
        print(recent_movies[i])

['Jesus liebt mich', 2012, ['Comedy']]
