# Rotten Tomatoes Pre-Processor

In [1]:
import sys
import os
import json
import re
import pprint
import time
import re
from bisect import bisect_right
import csv
import gzip

In [2]:
from data.rt_scraped import rt_movies

In [3]:
# Directory name that is joined with the output file name when the CSV is
# written further down in this file.
data_dir = 'data'

In [4]:
def count_occurences(movie, selector, collector):
    """Tally the names credited on one movie into ``collector``, per date.

    Args:
        movie: Mapping describing one movie. ``movie[selector]`` must be an
            iterable of names. ``release_date`` is read with ``.get``, so a
            movie without that key is tallied under ``None``.
        selector: Key of the name list to read from ``movie``.
        collector: Dict updated in place; it maps
            ``name -> {release_date -> number of occurrences}``.
    """
    release_date = movie.get('release_date')
    for person in movie[selector]:
        per_date = collector.setdefault(person, {})
        per_date[release_date] = per_date.get(release_date, 0) + 1
        
# Per-person tallies shaped as name -> {release_date -> count}, one dict per
# credit type, plus the set of every raw genre label seen in the dataset.
actor_counts, director_counts, writer_counts = {}, {}, {}
genres = set()

# One pass over the scraped movies fills all four collectors.
# NOTE: ``movie`` stays bound to the last element after this loop finishes.
for movie in rt_movies:
    count_occurences(movie, 'actors', actor_counts)
    count_occurences(movie, 'directors', director_counts)
    count_occurences(movie, 'writers', writer_counts)
    genres |= set(movie['genre'])

In [5]:
def normalize_genre(genre):
    """Convert a raw genre label into a ``genre_``-prefixed identifier.

    The label is stripped and lower-cased, every ``&`` becomes ``and``, and
    each run of whitespace collapses to a single underscore.

    Args:
        genre: Raw genre string.

    Returns:
        The normalized name, e.g. ``'genre_action_and_adventure'``.
    """
    cleaned = genre.strip().lower()
    cleaned = re.sub(r'\&', 'and', cleaned)
    cleaned = re.sub(r'\s+', '_', cleaned)
    return 'genre_' + cleaned
    
# Distinct normalized genre names across the dataset, and the same names as a
# sorted list so the genre columns have one fixed order.
all_genres = {normalize_genre(raw_genre) for raw_genre in genres}
all_genres_list = sorted(all_genres)

def format_genres(genre_list):
    """Encode a movie's genres as 0/1 flags, one per known genre column.

    Args:
        genre_list: Iterable of raw genre labels for one movie; each label is
            passed through ``normalize_genre`` before matching.

    Returns:
        A list of ints with the same length and order as ``all_genres_list``:
        1 where the movie has that genre, 0 otherwise.
    """
    present = {normalize_genre(genre) for genre in genre_list}
    # Walk the sorted list rather than the ``all_genres`` set: the CSV header
    # is built from ``all_genres_list``, and a set's iteration order is
    # arbitrary, so iterating the set could put flags under the wrong header.
    return [1 if column in present else 0 for column in all_genres_list]

In [6]:
def counts_to_cumsums(counts):
    """Turn per-date counts into a date-sorted list of running totals.

    Args:
        counts: Mapping of ``date -> count``.

    Returns:
        A list of ``(date, running_total)`` tuples in ascending date order,
        where ``running_total`` is the sum of all counts up to and including
        that date. An empty mapping yields an empty list.
    """
    running_total = 0
    totals = []
    for date in sorted(counts):
        running_total += counts[date]
        totals.append((date, running_total))
    return totals

def make_cumsums(x_counts):
    """Apply ``counts_to_cumsums`` to every entry of a name-keyed mapping.

    Args:
        x_counts: Mapping of ``name -> {date -> count}``.

    Returns:
        A new dict mapping each name to the list returned by
        ``counts_to_cumsums`` for that name's counts.
    """
    return {
        name: counts_to_cumsums(per_date)
        for name, per_date in x_counts.items()
    }

# Running movie counts per person, one table for each credit type.
actor_cumsums, director_cumsums, writer_cumsums = map(
    make_cumsums, (actor_counts, director_counts, writer_counts)
)

def get_cumsum(cumsums, date):
    """Look up the running total in effect on ``date``.

    Args:
        cumsums: List of ``(date, running_total)`` tuples; the bisect lookup
            expects it to be sorted ascending by date.
        date: Date to look up; must be comparable with the dates in
            ``cumsums``.

    Returns:
        The running total of the last entry whose date is ``<= date``, or 0
        when ``cumsums`` is empty or ``date`` precedes the first entry.
    """
    if not cumsums:
        return 0
    earliest = cumsums[0][0]
    if date < earliest:
        return 0
    # bisect_right lands just past the last entry dated <= ``date``.
    position = bisect_right([entry[0] for entry in cumsums], date)
    return cumsums[position - 1][1]

In [7]:
# Column names for the output CSV: identifying fields, one flag column per
# genre (in ``all_genres_list`` order), the engineered features, then the
# rating column last.
csv_header = [
    'release_date',
    'title',
    *all_genres_list,
    'mpaa_rating',
    'runtime',
    'cast_size',
    'cast_num_movies_tod',
    'dir_num_movies_tod',
    'writer_num_movies_tod',
    'cast_aa_win_tod',
    'cast_aa_nom_tod',
    'dir_aa_win_tod',
    'dir_aa_nom_tod',
    'writer_aa_win_tod',
    'writer_aa_nom_tod',
    'rt_rating',
]

In [8]:
def _make_num_movies_tod(movie, selector, top_n=5):
    """Sum the ``top_n`` largest running movie counts among a movie's credits.

    Args:
        movie: Mapping with ``release_date`` and a name list under
            ``selector``.
        selector: ``'actors'`` or ``'directors'`` pick the matching table;
            any other value falls back to the writers table.
        top_n: Number of highest per-person counts to add up.

    Returns:
        The sum of the ``top_n`` largest per-person running counts, each
        looked up at the movie's release date.

    Raises:
        KeyError: If a credited name is missing from the selected table.
    """
    if selector == 'actors':
        table = actor_cumsums
    elif selector == 'directors':
        table = director_cumsums
    else:
        table = writer_cumsums

    released = movie['release_date']
    per_person = []
    for person in movie[selector]:
        per_person.append(get_cumsum(table[person], released))
    per_person.sort(reverse=True)
    return sum(per_person[:top_n])
    
def make_cast_num_movies_tod(movie, top_n=5):
    """Return ``_make_num_movies_tod`` for the movie's ``'actors'`` list."""
    return _make_num_movies_tod(movie, selector='actors', top_n=top_n)

def make_dir_num_movies_tod(movie, top_n=5):
    """Return ``_make_num_movies_tod`` for the movie's ``'directors'`` list."""
    return _make_num_movies_tod(movie, selector='directors', top_n=top_n)

def make_writer_num_movies_tod(movie, top_n=5):
    """Return ``_make_num_movies_tod`` for the movie's ``'writers'`` list."""
    return _make_num_movies_tod(movie, selector='writers', top_n=top_n)

In [9]:
from data.aa_scraped import \
    aa_actors_win_counts, aa_actors_nom_counts, \
    aa_directors_win_counts, aa_directors_nom_counts, \
    aa_writers_win_counts, aa_writers_nom_counts

# Running award win/nomination totals per person, built from the imported
# ``aa_*`` count tables (presumably Academy Awards -- confirm against the
# ``data.aa_scraped`` module).
aa_actor_win_cumsums, aa_actor_nom_cumsums = (
    make_cumsums(aa_actors_win_counts),
    make_cumsums(aa_actors_nom_counts),
)
aa_director_win_cumsums, aa_director_nom_cumsums = (
    make_cumsums(aa_directors_win_counts),
    make_cumsums(aa_directors_nom_counts),
)
aa_writer_win_cumsums, aa_writer_nom_cumsums = (
    make_cumsums(aa_writers_win_counts),
    make_cumsums(aa_writers_nom_counts),
)

def _make_aa_tod(movie, selector):
    """Total the award wins and nominations of a movie's credited people.

    Args:
        movie: Mapping with ``release_date`` and a name list under
            ``selector``.
        selector: ``'actors'`` or ``'directors'`` pick the matching pair of
            tables; any other value falls back to the writers tables.

    Returns:
        A ``(n_win, n_nom)`` tuple: the summed running win and nomination
        totals, each looked up at the movie's release date. Names absent
        from a table contribute 0.
    """
    if selector == 'actors':
        wins_by_name, noms_by_name = aa_actor_win_cumsums, aa_actor_nom_cumsums
    elif selector == 'directors':
        wins_by_name, noms_by_name = aa_director_win_cumsums, aa_director_nom_cumsums
    else:
        wins_by_name, noms_by_name = aa_writer_win_cumsums, aa_writer_nom_cumsums

    released = movie['release_date']
    n_win = sum(
        get_cumsum(wins_by_name.get(person, []), released)
        for person in movie[selector]
    )
    n_nom = sum(
        get_cumsum(noms_by_name.get(person, []), released)
        for person in movie[selector]
    )
    return n_win, n_nom

def make_cast_aa_tod(movie):
    """Return the ``(wins, nominations)`` pair for the movie's ``'actors'``."""
    return _make_aa_tod(movie, selector='actors')

def make_dir_aa_tod(movie):
    """Return the ``(wins, nominations)`` pair for the movie's ``'directors'``."""
    return _make_aa_tod(movie, selector='directors')

def make_writer_aa_tod(movie):
    """Return the ``(wins, nominations)`` pair for the movie's ``'writers'``."""
    return _make_aa_tod(movie, selector='writers')

# Scratch check of the six award features. ``movie`` here is whatever value
# the earlier ``for movie in rt_movies`` loop left bound.
[*make_cast_aa_tod(movie), *make_dir_aa_tod(movie), *make_writer_aa_tod(movie)]

[0, 0, 0, 0, 0, 0]

In [10]:
# Copy of the scraped movies ordered by release date, with title as the
# tie-breaker, so the output rows come out in a stable order.
rt_movies_sorted = list(rt_movies)
rt_movies_sorted.sort(key=lambda film: (film['release_date'], film['title']))

In [11]:
# Write one feature row per movie to a gzip-compressed CSV.
csv_file = os.path.join(data_dir, 'rt_movies.csv.gz')
# Movies whose release_date compares below this string are left out of the
# file. NOTE(review): this is a plain string comparison, so it assumes
# release_date values start with a four-digit year -- confirm against the data.
cutover_date = '1998'
# newline='' is what the csv module asks for on files it writes to; without it
# the text layer may translate the writer's '\r\n' row endings again on some
# platforms. The encoding is pinned to UTF-8 so titles with non-ASCII
# characters are written the same way regardless of the machine's locale.
with gzip.open(csv_file, 'wt', newline='', encoding='utf-8') as f:
    csv_writer = csv.writer(f)
    csv_writer.writerow(csv_header)
    for movie in rt_movies_sorted:
        if movie['release_date'] < cutover_date:
            continue
        # Value order must mirror csv_header exactly.
        row = [
            movie['release_date'],
            movie['title'],
            *format_genres(movie['genre']),
            movie['mpaa_rating'],
            movie['runtime'],
            len(movie['actors']),
            make_cast_num_movies_tod(movie),
            make_dir_num_movies_tod(movie),
            make_writer_num_movies_tod(movie),
            *make_cast_aa_tod(movie),
            *make_dir_aa_tod(movie),
            *make_writer_aa_tod(movie),
            movie['rating'],
        ]
        csv_writer.writerow(row)