In [3]:
import csv
import pickle
import re, string
from itertools import islice
from urllib.parse import quote_plus
from urllib.request import urlretrieve, urlopen

from bs4 import BeautifulSoup

### Get a Complete List of Movies We Have Ratings For

In [14]:
# Directory that the CSV files below are read from.
# NOTE(review): absolute path to a local volume — adjust before running on another machine.
fileloc = "/Volumes/Eriks HD/MLC Final/ml-latest/"

In [15]:
# Raw text lines of movies.csv. The context manager closes the file handle
# as soon as the lines are read instead of leaving it open.
with open(fileloc + "movies.csv") as movies_fh:
    movie_file = movies_fh.readlines()

In [4]:
# Parse movies.csv into a list of rows (each row is a list of column strings).
# The file is opened with newline='' as the csv module expects, and the
# `with` block guarantees the handle is closed once parsing is done.
with open(fileloc + "movies.csv", newline='') as movies_csv:
    lines = list(csv.reader(movies_csv, delimiter=',', quotechar='"'))

In [5]:
def split_title(title):
    """
    Splits a raw title of the form "Name (YYYY)" into its name and year.

    Surrounding whitespace is ignored. Returns a two-element list
    [name, year] where year is an int.

    Raises ValueError when the title does not end in a space followed by a
    parenthesised four-digit year. Validating the suffix (rather than slicing
    at fixed offsets) keeps titles such as "Blade Runner 2049" from being
    silently parsed into a truncated name and a bogus year.
    """
    match = re.fullmatch(r'(.*) \((\d{4})\)', title.strip())
    if match is None:
        raise ValueError("title does not end in ' (YYYY)': %r" % (title,))
    return [match.group(1), int(match.group(2))]

In [6]:
# Build {movie_id: [title, year, genres]} from the parsed CSV rows.
movies = {}
for row in lines[1:]:  # lines[0] is skipped — presumably the CSV header row
    try:
        idnum = int(row[0])
        title, year = split_title(row[1])
        genres = row[2].split('|')
    except (ValueError, IndexError):
        # Malformed row (non-numeric id, title without a usable year, or
        # missing columns): show it and move on. Only these data errors are
        # caught so that genuine bugs still surface instead of being printed
        # as if they were bad rows.
        print(row)
    else:
        movies[idnum] = [title, year, genres]
        
    

['40697', 'Babylon 5', 'Sci-Fi']
['79607', 'Millions Game, The (Das Millionenspiel)', 'Action|Drama|Sci-Fi|Thriller']
['87442', 'Bicycle, Spoon, Apple (Bicicleta, cullera, poma)', 'Documentary']
['98063', 'Mona and the Time of Burning Love (Mona ja palavan rakkauden aika) (1983))', 'Drama']
['107434', 'Diplomatic Immunity (2009– )', 'Comedy']
['108548', 'Big Bang Theory, The (2007-)', 'Comedy']
['112406', 'Brazil: In the Shadow of the Stadiums', 'Documentary']
['113190', 'Slaying the Badger', 'Documentary']
['115133', 'Tatort: Im Schmerz geboren', 'Crime']
['115685', 'National Theatre Live: Frankenstein', 'Drama|Fantasy']
['125571', 'The Court-Martial of Jackie Robinson', '(no genres listed)']
['125632', 'In Our Garden', '(no genres listed)']
['125958', 'Stephen Fry In America - New World', '(no genres listed)']
['126438', 'Two: The Story of Roman & Nyro', 'Documentary|Drama']
['126929', "Li'l Quinquin", '(no genres listed)']
['127005', 'A Year Along the Abandoned Road', '(no genres li

In [7]:
# Spot-check a handful of entries: ids 1, 6, 11, 16, 21 and 26.
for movie_id in range(1, 30, 5):
    print(movies[movie_id])

['Toy Story', 1995, ['Adventure', 'Animation', 'Children', 'Comedy', 'Fantasy']]
['Heat', 1995, ['Action', 'Crime', 'Thriller']]
['American President, The', 1995, ['Comedy', 'Drama', 'Romance']]
['Casino', 1995, ['Crime', 'Drama']]
['Get Shorty', 1995, ['Comedy', 'Crime', 'Thriller']]
['Othello', 1995, ['Drama']]


In [5]:
len(movies)

30075

### Query Rotten Tomatoes for Each Movie

This scrapes the site directly rather than using their public API, so hopefully we'll only have to do it once. I don't think people take kindly to scrapers.

In [18]:
def prepare_datasets():
    """
    Loads the cached datasets from the current working directory.

    Reads "movies.pickle" and "reviews.pickle" and returns them as the
    two-element list [movies, all_movie_reviews].

    Raises whatever open() / pickle.load() raise if a file is missing or
    cannot be unpickled.
    """
    # NOTE: pickle.load can execute arbitrary code; only load pickle files
    # that this notebook produced itself.
    with open("movies.pickle", 'rb') as f:
        movies = pickle.load(f)

    with open("reviews.pickle", 'rb') as f:
        all_movie_reviews = pickle.load(f)

    return [movies, all_movie_reviews]

# Rebind `movies` and `all_movie_reviews` from the pickled caches and print
# how many entries each one holds.
movies, all_movie_reviews = prepare_datasets()
print(len(movies))
print(len(all_movie_reviews))

30075
63


In [17]:
# TODO: sort movies by date order. For now, just inspect a single entry.
movies[1]

['Toy Story',
 1995,
 ['Adventure', 'Animation', 'Children', 'Comedy', 'Fantasy'],
 'http://www.rottentomatoes.com/m/toy_story/reviews/']

In [16]:
def chunks(data, SIZE=500):
    """
    Lazily yields consecutive sub-dicts of `data`, each holding at most SIZE
    of its keys together with their values.
    """
    key_stream = iter(data)
    # range() only counts how many slices are needed; the shared iterator
    # keeps track of where the previous slice stopped.
    for _ in range(0, len(data), SIZE):
        batch_keys = islice(key_stream, SIZE)
        yield dict((key, data[key]) for key in batch_keys)
        
# Demonstrate the chunking on a small dict.
for demo_chunk in chunks({n: n for n in range(10)}, SIZE=3):
    print(demo_chunk)

# Materialise the movie chunks so they can be indexed later on.
movies_chunks = list(chunks(movies, SIZE=500))

print([len(chunk) for chunk in movies_chunks])

type(movies_chunks[-1])

{0: 0, 1: 1, 2: 2}
{3: 3, 4: 4, 5: 5}
{8: 8, 6: 6, 7: 7}
{9: 9}
[500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 75]


dict

In [18]:
# Base URL of the Rotten Tomatoes site, and the search prefix that the
# query terms are appended to.
rt_url = "http://www.rottentomatoes.com"
rt_search_url = rt_url + "/search/?search="

In [24]:
# Punctuation removed from titles before searching; parentheses and
# apostrophes are deliberately kept.
exclude = set(string.punctuation.replace('(','').replace(')', '').replace("'",''))

def get_search_url(movie):
    """
    Builds the Rotten Tomatoes search URL for the given movie.

    movie: a movie record whose first element is the title string.
    Returns the search URL as a string.
    """
    title = ''.join(ch for ch in movie[0] if ch not in exclude)
    # quote_plus turns spaces into '+' exactly as the manual join did, and
    # additionally percent-encodes everything that is not URL-safe, so titles
    # with accented characters, parentheses or apostrophes yield a valid URL.
    search_url = rt_search_url + quote_plus(title.lower())

    return search_url

# Show the generated search URL for a few sample movie ids.
for movie_id in range(100, 105):
    sample_url = get_search_url(movies[movie_id])
    print(sample_url)

http://www.rottentomatoes.com/search/?search=city+hall
http://www.rottentomatoes.com/search/?search=bottle+rocket
http://www.rottentomatoes.com/search/?search=mr+wrong
http://www.rottentomatoes.com/search/?search=unforgettable
http://www.rottentomatoes.com/search/?search=happy+gilmore


In [25]:
def convert_letter_grade(score):
    """
    Converts a letter grade such as "B+" to a numeric score.

    A bare "A" maps to 1. Otherwise the first letter selects a base value
    (a = .80, b = .60, c = .40, d = .20, f = 0) and a '+' or '-' in second
    position shifts it by .1. The result is clamped at 0 so that "F-" cannot
    produce a negative score.

    Returns None when the grade is empty or does not start with one of the
    letters a, b, c, d or f (instead of raising on such input).
    """
    base_scores = {'a': .80, 'b': .60, 'c': .40, 'd': .20, 'f': 0}

    grade = score.strip().lower()

    if grade == 'a':
        return 1

    if not grade or grade[0] not in base_scores:
        return None

    s = base_scores[grade[0]]
    sign = grade[1] if len(grade) > 1 else None

    if sign == '+':
        s += .1
    elif sign == '-':
        s -= .1

    return max(0, s)

def compute_score(score):
    """
    Takes a score from rotten tomatoes and converts it to a number where 0 is
    terrible and 1 is perfect.

    Fractions such as "3/4" are divided out; anything else is treated as a
    letter grade and passed to convert_letter_grade.

    Returns None when there is no score, or when a fractional score cannot be
    parsed (non-numeric parts, more than one '/', or a zero denominator).
    """
    if not score:
        return None

    if '/' in score:
        try:
            num, denom = score.split('/')
            return float(num)/float(denom)
        except (ValueError, ZeroDivisionError):
            # One malformed score should not abort processing of the whole
            # review page; treat it as "no score".
            return None

    return convert_letter_grade(score)

def process_review(review):
    """
    Extracts the fields of a single review row.

    Returns the list [reviewer, source, score, review_link]: the reviewer
    falls back to the source name when the row has no reviewer link, the
    score is whatever compute_score makes of the text after
    'Original Score:' (None if that marker is absent), and review_link is
    the href of the row's link, or None when there is no link.
    """
    critic_block = review.find("div", {"class" : "critic_name"})
    details_block = review.find("div", {"class" : "small subtle"})

    source = critic_block.find('em').getText()
    reviewer_tag = critic_block.find('a')
    reviewer = reviewer_tag.getText() if reviewer_tag else source

    link_tag = details_block.find('a')
    review_link = link_tag['href'] if link_tag else link_tag

    # Everything after the marker is the critic's own rating, if one was given.
    pieces = details_block.getText().split('Original Score:')
    raw_score = pieces[1].strip() if len(pieces) > 1 else None

    return [reviewer, source, compute_score(raw_score), review_link]
    
    
page_suffix = "?page=%d&sort="
def get_reviews(url, max_pages=19):
    """
    Given an RT reviews page URL, collects the reviews from its result pages.

    url: the reviews URL of a movie; a trailing '/' is added if missing.
    max_pages: highest page number to request (pages 1..max_pages).

    Returns a dict mapping reviewer name to the list produced by
    process_review. A later review by the same reviewer replaces an earlier
    one. Paging stops at the first page that cannot be fetched.
    """

    if not url.endswith('/'):
        url += '/'

    all_reviews = {}
    for page_number in range(1, max_pages + 1):
        page = url + (page_suffix % page_number)

        try:
            response = urlopen(page)
        except Exception:
            # Best effort: a failed fetch ends paging. Catching Exception
            # rather than everything lets Ctrl-C / kernel interrupts through.
            break

        # Parse while the connection is open, then release it.
        with response:
            soup = BeautifulSoup(response)
        reviews = soup.findAll("div", { "class" : "row review_table_row" })

        for review in reviews:
            data = process_review(review)
            all_reviews[data[0]] = data
    return all_reviews

# Smoke test: fetch the reviews for Toy Story (this calls get_reviews, which requests pages via urlopen).
toy_story_review = get_reviews("http://www.rottentomatoes.com/m/toy_story/reviews")

In [2]:
def get_movie_url(movie):
    """
    Given a movie, determines the rotten tomatoes URL for the critics' reviews of that movie.

    movie: a movie record with the title at index 0 and the year at index 1.

    If the search lands on a results page, the first result whose year equals
    the movie's year is assumed to be the right one. Otherwise the URL the
    search ended up at (minus its query string) is used.

    Returns the URL with "reviews" appended.
    Raises LookupError when the results page has no entry for the movie's
    year; errors from urlopen propagate unchanged.
    """
    search_url = get_search_url(movie)

    with urlopen(search_url) as response:
        soup = BeautifulSoup(response)
        final_url = response.url

    if "search results" in soup.find("title").text.lower():
        # Find the first movie with the correct year and assume it's correct
        year = movie[1]
        results = soup.findAll("li", {"class" : "media bottom_divider clearfix"})

        url = None
        for result in results:
            try:
                result_year = int(result.find("span", {"class" : "movie_year"}).getText().strip()[1:5])
            except (AttributeError, ValueError):
                # A result without a parsable year cannot be matched; skip it
                # rather than abandoning the remaining results.
                continue

            if year == result_year:
                url = rt_url + result.find("a")["href"]
                break

        if url is None:
            raise LookupError("no search result from %r for %r" % (year, movie[0]))

    else:
        url = final_url.split('?')[0]

    # Guarantee a separator so the result never ends in "...titlereviews".
    if not url.endswith('/'):
        url += '/'

    return url + "reviews"

# Sanity check: the right match should be the 3rd search result.
# Needs `movies` to be defined first, otherwise this raises NameError.
get_movie_url(movies[131258])

NameError: name 'movies' is not defined

In [1]:
!ls

Final Project.ipynb          Recent Movies Analysis.ipynb movies.pickle                [34mrecent_reviews[m[m               reviews.pickle


In [2]:
# Collects [movie, exception] pairs for movies whose URL lookup failed in the scraping loop.
errored_movies = []

In [49]:
def dump_data():
    """
    Saves the global `movies` and `all_movie_reviews` to "movies.pickle" and
    "reviews.pickle" in the current working directory.

    Each file is written inside a `with` block so it is flushed and closed
    before the function returns.
    """
    with open("movies.pickle", 'wb') as f:
        pickle.dump(movies, f)

    with open("reviews.pickle", 'wb') as f:
        pickle.dump(all_movie_reviews, f)
    
dump_data()

In [None]:
# Scrape reviews for movie chunks 2..19. Movies that already carry a URL
# (a 4th element) reuse it; otherwise the URL is looked up and appended to
# the movie record.
# NOTE: results are keyed by title (movie[0]), so two movies sharing a title
# overwrite each other's reviews.
for chunk_index in range(2,20):
    selection = movies_chunks[chunk_index]

    print(len(selection))

    for movie in selection.values():
        if len(movie) > 3:
            url = movie[3]

        else:
            try:
                url = get_movie_url(movie)
            except Exception as ex:
                errored_movies.append([movie, ex])
                continue
            movie.append(url)

        try:
            rs = get_reviews(url)
        except Exception as ex:
            # Record the failure instead of dropping it silently. Catching
            # Exception (not everything) keeps Ctrl-C / kernel interrupts
            # able to stop the scrape.
            errored_movies.append([movie, ex])
            continue

        all_movie_reviews[movie[0]] = rs

500
500
500
500
500

In [None]:
len(all_movie_reviews)

In [48]:
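# This is funny: Rotten Tomatoes files some reviews under the wrong title.
# Two of the three review links below appear to be for "Children of God", not Stargate SG-1.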
all_movie_reviews["Stargate SG-1 Children of the Gods - Final Cut"]

{'Kevin Carr': ['Kevin Carr',
  '7M Pictures',
  0.7,
  'http://www.7mpictures.com/inside/reviews/stargatesg1childrenofthegodsdvd_review.htm'],
 'New York Times': ['New York Times',
  'New York Times',
  None,
  'http://movies.nytimes.com/2011/05/20/movies/children-of-god-review.html'],
 'Peter Keough': ['Peter Keough',
  'Boston Phoenix',
  None,
  'http://www.thephoenix.com/Boston/movies/101666-children-of-god/']}