In [1]:
import json
import urllib
import urllib2
import requests
import zipfile
import csv
import sys
import difflib
import string
import time
import re
import tmdbsimple as tmdb  # pip install tmdbsimple
import rtsimple as rt  # pip install rtsimple

from IPython.display import display, clear_output
from pymongo import MongoClient
from functools import partial

## Utilities

In [2]:
def readcsv(path):
    f = open(path, "rb")
    rawdata = csv.reader(f, delimiter="\t")
    data = []
    keys = rawdata.next()
    for row in rawdata:
        item = dict()
        for i in range(len(keys)):
            if len(row) > i:
                item[keys[i]] = row[i].decode("utf-8", "ignore").strip()
            else:
                item[keys[i]] = None
        data.append(item)
    return data

## Constants

In [3]:
omdbFile     = "omdb/omdbMovies.txt"
rawOmdbFile  = "raw_omdb.json"
rawTmdbFile  = "raw_tmdb.json"
rawRtFile    = "raw_rt.json"
movieFile    = "movie_data.json"
meteorPath   = "../meteorApp/private/movie_data.json"
uri          = 'mongodb://localhost/appdb'

tmdb.API_KEY = '2a50430216cd531275198faae3531bb4'
tmdbImages = tmdb.Configuration().info()['images']
rt.API_KEY = 'dbbykgj6pbm3wnfrnvhdbdym'

## Run update

In [None]:
loadData(0)

Downloading the movies from OMDB...
OMDB movies downloaded and extracted to omdb/omdbMovies.txt.
Loading and filtering the movies from OMDB...
(55339, 'Movies loaded from OMDB.')
Saving the OMDB filtered movies to raw_omdb.json ...
OMDB movies saved.
Loading TMDB movies, hardReset:0 ...
(58327, ' movies loaded from TMDB.')
Saving the TMDB movies to raw_tmdb.json ...
TMDB Movies saved.
Loading Rotten Tomatoes movies, hardReset:0 ...
Rotten Tomatoes 0043666
success
Rotten Tomatoes 0044036
error
Rotten Tomatoes 0044155
success
Rotten Tomatoes 0044755
error
Rotten Tomatoes 0044963
error
Rotten Tomatoes 0045331
error
Rotten Tomatoes 0045498
success
Rotten Tomatoes 0045763
success
Rotten Tomatoes 0046057
success
Rotten Tomatoes 0046194
success
Rotten Tomatoes 0046443
error
Rotten Tomatoes 0046728
success
Rotten Tomatoes 0046942
success
Rotten Tomatoes 0047192
success
Rotten Tomatoes 0044963
error
Rotten Tomatoes 0047325
error
Rotten Tomatoes 0047467
success
Rotten Tomatoes 0048047
error
Rott

## Load Existing Data

In [27]:
# We first get the latest data from OMDb
def loadData(hardReset):

    fromOmdb = json.load(open(rawOmdbFile, 'r'))
    
    # OMDB
    print("Downloading the movies from OMDB...")
    #downloadOmdbData()
    print("OMDB movies downloaded and extracted to "+omdbFile+".")
    print("Loading and filtering the movies from OMDB...")
    #fromOmdb = loadFromOmdb(omdbFile)
    print(len(fromOmdb), "Movies loaded from OMDB.")
    print("Saving the OMDB filtered movies to "+rawOmdbFile+" ...")
    #saveOmdb(fromOmdb, rawOmdbFile)
    print("OMDB movies saved.")

    # Tmdb
    print("Loading TMDB movies, hardReset:"+str(hardReset)+" ...")
    fromTmdb = loadFromTmdb(rawTmdbFile, hardReset, fromOmdb)
    print(len(fromTmdb), " movies loaded from TMDB.")
    print("Saving the TMDB movies to "+rawTmdbFile+" ...")
    saveTmdb(fromTmdb, rawTmdbFile)
    print("TMDB Movies saved.")

    # Rt
    print("Loading Rotten Tomatoes movies, hardReset:"+str(hardReset)+" ...")
    fromRt = loadFromRt(rawRtFile, hardReset, fromOmdb)
    print(len(fromRt), " movies loaded from Rotten Tomatoes.")
    print("Saving the Rotten Tomatoes movies to "+rawRtFile+" ...")
    saveRt(fromRt, rawRtFile)
    print("Rotten Tomatoes movies saved.")
    
    # Netflix
    
    # Amazon

    # Saving and updating the movies
    print("Combining data from OMDB, TMDB, RT...")
    movies = sanitize(fromOmdb, fromTmdb, fromRt)
    print(len(movies)," movies combined.")
    print("Saving movies to "+movieFile+" and "+meteorPath+" ...")
    saveMovies(movies, movieFile, meteorPath)
    print("Movies saved.")
    print("Updating the mongo db with the latest information...")
    updateMovies(movies)
    print("Mongodb updated.")


##  OMDb Data

http://www.omdbapi.com

In [23]:
def downloadOmdbData():
    urllib.urlretrieve("http://beforethecode.com/projects/omdb/download.aspx?e=philipp@legner.com&tsv=movies", "omdb.zip")
    with zipfile.ZipFile('omdb.zip', "r") as z:
        z.extractall("omdb")

def loadFromOmdb(omdbFile):
    return [
        dict({
            "Awards":     m["Awards"] or None,
            "Cast":       m["Awards"].split(", "),
            "Country":    m["Country"] or None,
            "Directors":  m["Director"].split(", "),
            "FullPlot":   m["FullPlot"] or None,
            "Genres":     m["Genre"].split(", "),
            "Languages":  m["Language"].split(", "),
            "Metacritic": int(m["Metacritic"] or 0),
            "Plot":       m["Plot"] or None,
            "Poster":     m["Poster"] or None,
            "Rating":     m["Rating"].replace("NOT RATED", "") or None,
            "Released":   m["Released"] or None,
            "Runtime":    m["Runtime"] or None,
            "Title":      m["Title"] or None,
            "Writers":    m["Writer"].split(", "),
            "Year":       int(m["Year"]),
            "ImdbID":     m["imdbID"].replace("tt", ""),
            "ImdbRating": float(m["imdbRating"] or 0),
            "ImdbVotes":  int(m["imdbVotes"] or 0)
        })
        for m in readcsv(omdbFile)
        if m["imdbID"] and m["Year"] and int(m["Year"]) > 1950 and m["Poster"]
            and m['imdbVotes'] and int(m['imdbVotes']) > 100
    ]

def saveOmdb(fromOmdb, filePath):
    json.dump(fromOmdb, open(filePath, 'w'), indent=1)
    

##  TMDb Data

In [24]:
def loadFromTmdb(filePath, hardReset, omdbMovies):
    
    if not hardReset:
        tmdbMovies = json.load(open(filePath, 'r'))
    else:
        tmdbMovies = dict()
        
    
    i = 0
    for m in omdbMovies:
        i += 1
   
        imdbID = m["ImdbID"]
        if (imdbID in tmdbMovies): continue

        print("tmdb" + str(i))
        sys.stdout.flush()

        data = keywords = videos = None
        try:
            data = tmdb.Movies("tt" + imdbID).info()
            keywords = tmdb.Movies(data['id']).keywords()['keywords']
            videos = tmdb.Movies(data['id']).videos()['results']
            # titles = tmdb.Movies(data['id']).alternative_titles()
            # cast = tmdb.Movies(fromTmdb['id']).credits()['cast']
            # crew = tmdb.Movies(fromTmdb['id']).credits()['crew']
            # images = tmdb.Movies(fromTmdb['id']).images()
        except requests.HTTPError as e:
            if e.message == '404 Client Error: Not Found':
                tmdbMovies[imdbID] = False
                continue
            else:
                print('HTTP Error')
                return

        tmdbMovies[imdbID] = dict({
            "Adult":      data["adult"] or None,
            "Backdrop":   tmdbImages['base_url'] + 'original' + str(data['backdrop_path']) if data["backdrop_path"] else None,
            "Budget":     data["budget"] or None,
            "Genres":     [g["name"] for g in data["genres"]],
            "Homepage":   data["homepage"] or None,
            "Languages":  [g["name"] for g in data["spoken_languages"]],
            "Tagline":    data["tagline"] or None,
            "Title":      data["title"] or None,
            "Poster":     tmdbImages['base_url'] + 'w185' + data["poster_path"] if data["poster_path"] else None,
                          # can use 'original', u'w92', 'w154', 'w185', 'w342', 'w500', 'w780'
            "Studios":    [g["name"] for g in data["production_companies"]],
            "Overview":   data["overview"] or None,
            "Popularity": data["popularity"] or None,
            "Released":   data["release_date"] or None,
            "Revenue":    data["revenue"] or None,
            "Runtime":    data["runtime"] or None,
            "TmdbRating": data["vote_average"] or None,
            "TmdbVotes":  data["vote_count"] or None,
            "Keywords":   [k["name"] for k in keywords],
            "Videos":     [dict({ "key": v["key"], "type": v["type"], "site": v["site"] }) for v in videos]
        });
    return tmdbMovies

def saveTmdb(tmdbMovies, filePath):
    json.dump(tmdbMovies, open(filePath, 'w'), indent=1)

## Rotten Tomatoes Data

http://www.programmableweb.com/news/52-movies-apis-rovi-rotten-tomatoes-and-internet-video-archive/2013/01/22
https://pypi.python.org/pypi/rtsimple

In [29]:
def loadFromRt(filePath, hardReset, omdbMovies):
    
    if not hardReset:
        rtMovies = json.load(open(filePath, 'r'))
    else:
        rtMovies = dict()
        
    i = 0
    errorCount = 0
    for m in omdbMovies:
        i += 1
        
        if errorCount >= 10:
            return rtMovies
        
        imdbID = m["ImdbID"]
        if (imdbID in rtMovies): continue
        print("Rotten Tomatoes " + imdbID)
        sys.stdout.flush()
        
        try:
            data = rt.Alias().movie(type='imdb', id=imdbID)
        except requests.HTTPError as e:
            rtMovies[imdbID] = False
            print("HTTPError")
            errorCount = errorCount + 1
            continue
            
        if ('error' in data):
            print('error')
            continue;
        else:
            print('success')
        rtID = data['id'];
        
        # videos = rt.Movies(rtID).clips()['clips']
        # reviews = rt.Movies(fromRt['id']).reviews()
        # cast = rt.Movies(fromRt['id']).cast()['cast']
        
        rtMovies[imdbID] = dict({
            "Cast":        [dict({ "name": c["name"], "character": c["characters"] if "characters" in c else None })
                            for c in data["abridged_cast"]],
            "Directors":   [g["name"] for g in data["abridged_directors"]] if "abridged_directors" in data else [],
            "Genres":      data["genres"] or [],
            "RtID":        data["id"],
            "Rating":      data["mpaa_rating"].replace("N/A", "") or None,
            "Consensus":   (data["critics_consensus"] or None) if "critics_consensus" in data else None,
            "Posters":     data["posters"],
            "LinkRt":      data['links']['alternate'] if 'alternate' in data['links'] else None,

            "RtAudience":  data["ratings"]["audience_score"] if data["ratings"]["audience_score"] > 0 else None,
            "RtCritics":   data["ratings"]["critics_score"] if data["ratings"]["critics_score"] > 0 else None,

            "ReleasedDVD": data["release_dates"]["dvd"] if "dvd" in data["release_dates"] else None,
            "Released":    data["release_dates"]["theater"] if "theater" in data["release_dates"] else None,

            "Runtime":     data["runtime"] or None,
            "Studio":      data["studio"] if "studio" in data else None,
            "Synopsis":    data["synopsis"] or None,
            "Title":       data["title"] or None,
            "Year":        data["year"] or None
        });
    rtMovies['0088199'] = False #automagic
    return rtMovies

def saveRt(fromRt, filePath):
    json.dump(fromRt, open(filePath, 'w'), indent=1)

## Sanitize Data

In [26]:
def sanitize(fromOmdb, fromTmdb, fromRt):
    movies = []
    for m in fromOmdb:
        imdbID = m['ImdbID']
        if imdbID in fromTmdb and imdbID in fromRt and fromRt[imdbID] and fromTmdb[imdbID]:
            movies.append(formatMovie(imdbID, m, fromRt[imdbID], fromTmdb[imdbID]))
    return movies        

def saveMovies(movies, filePath, meteorPath):
    json.dump(movies, open(filePath, 'w'), indent=1)
    json.dump(movies, open(meteorPath, 'w'), indent=1)
    
def updateMovies(movies):
    #Don't run this, use Meteor instead!
    client = MongoClient(uri)
    
    for movie in movies:
        cursor = client.appdb.movies.find({"link_imdb": movie['link_imdb']})
        if cursor.count() == 0:
            movie['firstInserted'] = time.time()
            movie['lastUpdated'] = time.time()
            client.appdb.movies.insert(movie)
        else:
            movie['lastUpdated'] = time.time()
            client.appdb.movies.replace_one({"link_imdb" : movie['link_imdb']}, movie)
        
def formatMovie(imdbID, fromOmdb, fromRt, fromTmdb):
    result = dict()
    result['title'] = fromTmdb['Title']
    result['plot'] = fromTmdb['Overview'] or fromOmdb['Plot'] or fromRt['Synopsis']
    result['keywords'] = fromTmdb['Keywords']
    result['runtime'] = fromTmdb['Runtime'] or fromRt['Runtime']
    result['budget'] = fromTmdb['Budget']
    result['revenue'] = fromTmdb['Revenue']
    result['tagline'] = fromTmdb['Tagline']
    result['year'] = fromOmdb['Year']
    result['released'] = fromTmdb['Released']
    result['languages'] = fromTmdb['Languages']
    result['country'] = fromOmdb['Country']
    fromTmdb['Studios'].append(None)
    result['studio'] = fromRt['Studio'] or fromTmdb['Studios'][0]  # TODO
    result['cast'] = fromOmdb['Cast'] or fromOmdb['Cast']
    result['directors'] = fromOmdb['Directors'] or fromOmdb['Directors']
    result['homepage'] = fromTmdb['Homepage']
    
    result['awards'] = fromOmdb['Awards']
    result['rating'] = fromRt['Rating'] or fromOmdb['Rating']

    result['score_rtaudience'] = fromRt['RtAudience']
    result['score_rtcritics'] = fromRt['RtCritics']
    result['score_metacritic'] = fromOmdb['Metacritic']
    result['score_imdb'] = fromOmdb['ImdbRating']
    result['votes_imdb'] = fromOmdb['ImdbVotes']
    result['score'] = (fromRt['RtCritics'] or 0) + (fromRt['RtAudience'] or 0) + (fromOmdb['ImdbRating'] or 0)*10

    result['poster'] = fromOmdb['Poster'] or fromTmdb['Poster']
    result['background'] = fromTmdb['Backdrop']

    result['link_rt'] = fromRt['LinkRt']
    result['link_imdb'] = 'tt' + imdbID

    # Genres
    genres = set(fromTmdb['Genres'])
    if 'Kids' in genres:
        genres.remove('Kids')
        genres.add('Family')
    if 'TV Movie' in genres:
        genres.remove('TV Movie')
    result['genres'] = list(genres)

    # Trailer
    trailers = [a for a in fromTmdb['Videos'] if a['type'] == 'Trailer']
    if len(trailers) and trailers[-1]['site'] == 'YouTube':
        result['trailer_youtube'] = trailers[-1]['key']
    else :
        result['trailer_youtube'] = None

    # Search String
    result['search_str'] = (result['title'] + ' ' + ' '.join(result['cast']) + ' '.join(result['directors']) + ' '.join(result['keywords'])).lower()
        
    # Streaming
    result['streaming'] = dict()
    
    return result
    

## Download Netflix and Hulu Data

In [16]:
netflixUrl = {
    'us': 'http://www.allflicks.net/wp-content/themes/responsive/processing/processing_us.php',
    'uk': 'http://www.allflicks.net/wp-content/themes/responsive/processing/processing_uk.php',
    'fr': 'http://www.allflicks.fr/wp-content/themes/responsive/processing/processing.php',
    'de': 'http://www.allflicks.de/wp-content/themes/responsive/processing/processing.php',
    'ca': 'http://www.allflicks.net/wp-content/themes/responsive/processing/processing_ca.php',
    'au': 'http://au.allflicks.net/wp-content/themes/responsive/processing/processing_au.php',
    'nz': 'http://nz.allflicks.net/wp-content/themes/responsive/processing/processing_nz.php',
    'hulu': 'http://somethingtostream.com/hulu/wp-content/themes/responsive/processing/processing_movies.php'
}
    
netflixDetail = {
    'us': 'http://www.allflicks.net/movies/',
    'uk': 'http://www.allflicks.net/movies-uk/',
    'fr': 'http://www.allflicks.fr/film/',
    'de': 'http://www.allflicks.de/film/',
    'ca': 'http://www.allflicks.net/movies-canada/',
    'au': 'http://au.allflicks.net/movies/',
    'nz': 'http://nz.allflicks.net/movies/',
    'hulu': 'http://somethingtostream.com/hulu/movie/'
}

netflixCountries = ['us', 'uk', 'fr', 'de', 'ca', 'au', 'nz', 'hulu']

In [59]:
netflixMovies = dict();

In [76]:
def loadNetflixForCountry(country):
    response = urllib2.urlopen(netflixUrl[country] + "?shows=false&movies=true&length=1&start=0")
    length = json.load(response)['recordsTotal']
    response = urllib2.urlopen(netflixUrl[country] + "?shows=false&movies=true&length=" + str(length+1) + "&start=0")
    return json.load(response)['data']

def addNetflixMovie(cache, country, movie):
    if movie['id'] in cache:
        cache[movie['id']]['Countries'].append(country)
    else:
        cache[movie['id']] = dict({
            'Title': movie['title'],
            'Year': int(movie['year'].replace('-','0') or 0) or None,
            'Countries': [country]
        })

def getImdbIdForNetflix(allflixID, movie, omdbTitleCache):
    
    # title and year match
    title = re.sub('[^A-Za-z0-9\s]+', '', movie['Title']).lower()
    if title in omdbTitleCache:
        canidates = [x for x in omdbTitleCache[title] if x['Year'] == movie['Year']]
        if len(canidates) == 1:
            return canidates[0]['ImdbID']
    
    # tmdb lookup
    return None
    tmdb.Search().movie(query = movie['title'])
    
    # allflicks lookup
    html = urllib2.urlopen(netflixDetail[movie['Countries'][0]] + str(allflixID)).read()
    start = string.find(html, 'imdb')
    if start >= 0:
        html = html[start+15:]
        return html[:string.find(html, '/')]
            
    return None

In [66]:
def loadFromNetflix():
    netflixIdCache = dict({});
    omdbTitleCache = dict({});

    for c in netflixCountries:
        movies = loadNetflixForCountry(c);
        for m in movies:
            addNetflixMovie(netflixIdCache, c, m)

    for m in omdbMovies:
        title = re.sub('[^A-Za-z0-9\s]+', '', m['Title']).lower()
        if (title not in omdbTitleCache): omdbTitleCache[title] = [];
        omdbTitleCache[title].append(m);

    for m in netflixIdCache:
        imdb = getImdbIdForNetflix(m, netflixIdCache[m], omdbTitleCache)
        if imdb: netflixMovies[imdb] = netflixIdCache[m]

In [81]:
    netflixIdCache = dict({});

    for c in netflixCountries:
        movies = loadNetflixForCountry(c);
        for m in movies:
            addNetflixMovie(netflixIdCache, c, m)

In [82]:
len(netflixIdCache)

19492

In [77]:
loadFromNetflix()
len(netflixMovies)

8037

In [78]:
json.dump(netflixMovies, open('raw_netflix.json', 'w'))

In [79]:
netflixMovies = json.load(open('raw_netflix.json', 'r'))