# Cross-Reference Names between Rotten Tomatoes and Awards Files

In [2]:
import sys
import os
import json
import re
import pprint
import time
import re
import csv
import gzip

In [3]:
from data.rt_scraped import rt_movies

In [4]:
# Directory, relative to the working directory, that the data files are
# joined onto with os.path.join.
data_dir = "data"

In [6]:
# Gather every distinct name found under the 'actors', 'directors' and
# 'writers' keys of the Rotten Tomatoes movie records.
rt_actors, rt_directors, rt_writers = set(), set(), set()
for movie in rt_movies:
    for _name_set, _credit_key in (
        (rt_actors, 'actors'),
        (rt_directors, 'directors'),
        (rt_writers, 'writers'),
    ):
        _name_set.update(movie[_credit_key])

In [96]:
# Empty name sets for the Academy Awards side, one per credit type.
# NOTE(review): nothing in the visible code adds to these sets.
aa_actors, aa_directors, aa_writers = set(), set(), set()

# Location of the Academy Awards CSV inside the data directory.
aa_file = os.path.join(data_dir, 'academy_awards.csv')

# Names as they are spelled in the Academy Awards data (keys) mapped to the
# spelling to use instead (values). Names without an entry are left as-is by
# aa_name_map().
aa_to_rt_name_mapping = {
    'Francis Coppola': 'Francis Ford Coppola',
    'Francois Truffaut': 'François Truffaut',
    'Lasse Hallström': 'Lasse Hallstrom',
    'Daniel Day Lewis': 'Daniel Day-Lewis',
    'Louis Gossett Jr.': 'Louis Gossett Jr',
    'Dame Edith Evans': 'Edith Evans',
    'Anouk Aimee': 'Anouk Aimée',
    'Genevieve Bujold': 'Geneviève Bujold',
    'Gerard Depardieu': 'Gérard Depardieu',
    'Max Von Sydow': 'Max von Sydow',
    'Penélope Cruz': 'Penelope Cruz',
    'Ed Begley': 'Ed Begley Jr.',
    'Michael Dunn': 'Michael Dunn (I)',
}


def aa_name_map(name):
    """Return the mapped spelling of ``name``, or ``name`` itself.

    Args:
        name: A name to look up in ``aa_to_rt_name_mapping``.

    Returns:
        The mapped value when ``name`` is a key of the mapping; otherwise
        ``name`` unchanged.
    """
    return aa_to_rt_name_mapping.get(name, name)

def normalize_aa_actor(actor):
    """Normalize an actor name taken from the Academy Awards data.

    Every comma, along with any whitespace on either side of it, is
    replaced by a single space; the result is then passed through
    ``aa_name_map``.
    """
    pieces = re.split(r'\s*,\s*', actor)
    return aa_name_map(' '.join(pieces))

def normalize_aa_directors(directors):
    """Split a director credit string into a list of individual names.

    The string is stripped, split on commas, and each comma-separated
    fragment is split again on the word "and" or "&" (case-insensitive,
    with whitespace on both sides). Fragments that consist only of the
    suffix "Jr." or "III" (any case) are dropped, as are empty fragments.
    Every remaining name is passed through ``aa_name_map``.

    Args:
        directors: Raw credit string.

    Returns:
        A list of names in the order they appear in ``directors``.
    """
    names = []
    for fragment in re.split(r'\s*,\s*', directors.strip()):
        # An empty fragment comes from empty input or a trailing comma; a
        # bare suffix comes from credits such as "Name, Jr.".
        if not fragment or fragment.lower() in ('jr.', 'iii'):
            continue
        # Raw string: '\s' and '\&' are not valid str escape sequences.
        for name in re.split(r'(?i)\s+(?:and|\&)\s+', fragment):
            names.append(aa_name_map(name))
    return names

def normalize_aa_writers(writers):
    """Split a writing credit string into a list of individual names.

    Everything from the first ";" onward is discarded, then everything up
    to and including the last whitespace-delimited "by" is discarded. The
    remainder is stripped, split on commas, and each fragment is split
    again on the word "and" or "&" (case-insensitive, with whitespace on
    both sides). Fragments that consist only of the suffix "Jr." or "III"
    (any case) are dropped, as are empty fragments. Every remaining name is
    passed through ``aa_name_map``.

    Args:
        writers: Raw credit string.

    Returns:
        A list of names in the order they appear in ``writers``.
    """
    credit = re.sub(r'\s*;.*', '', writers)
    # Greedy '.*' removes through the LAST " by " on the line.
    credit = re.sub(r'^.*\s+by\s+', '', credit)

    names = []
    for fragment in re.split(r'\s*,\s*', credit.strip()):
        # An empty fragment comes from empty input or a trailing comma; a
        # bare suffix comes from credits such as "Name, Jr.".
        if not fragment or fragment.lower() in ('jr.', 'iii'):
            continue
        # Raw string: '\s' and '\&' are not valid str escape sequences.
        for name in re.split(r'(?i)\s+(?:and|\&)\s+', fragment):
            names.append(aa_name_map(name))
    return names

# Tallies filled in by aa_update_counts(). Each maps
# name -> {date: count}; the "nom" dicts also include the wins.
aa_actors_win_counts = {}
aa_actors_nom_counts = {}
aa_directors_win_counts = {}
aa_directors_nom_counts = {}
aa_writers_win_counts = {}
aa_writers_nom_counts = {}


def aa_update_counts(category, names, date, won):
    """Add one nomination (and one win, if ``won``) for each name.

    Args:
        category: 'actors' or 'directors'; any other value selects the
            writers tallies.
        names: A single name, or a list of names.
        date: Key under which the count is stored for each name (the
            visible caller passes the award year as a string).
        won: When truthy, both the win tally and the nomination tally are
            incremented; otherwise only the nomination tally is.
    """
    if not isinstance(names, list):
        names = [names]

    if category == 'actors':
        win_counts, nom_counts = aa_actors_win_counts, aa_actors_nom_counts
    elif category == 'directors':
        win_counts, nom_counts = aa_directors_win_counts, aa_directors_nom_counts
    else:
        win_counts, nom_counts = aa_writers_win_counts, aa_writers_nom_counts

    collectors = [win_counts, nom_counts] if won else [nom_counts]
    for collector in collectors:
        for name in names:
            # Use the ``date`` argument rather than a module-level variable
            # so the function does not depend on the caller's globals.
            per_date = collector.setdefault(name, {})
            per_date[date] = per_date.get(date, 0) + 1

# Read the Academy Awards CSV and tally acting, directing and writing
# nominations/wins for years from 1960 onward.
# newline='' lets the csv module handle line endings inside quoted fields.
with open(aa_file, encoding="ISO-8859-1", newline='') as f:
    csv_reader = csv.reader(f)
    # Discard the first row (presumably the column header); the default
    # keeps an empty file from raising StopIteration.
    next(csv_reader, None)
    for row in csv_reader:
        if not row:
            # csv.reader yields an empty list for a blank line.
            continue
        # Keep only the leading digits of the first column.
        year = re.sub(r'\s*\D.*', '', row[0])
        # String comparison: four-digit years order the same as numbers,
        # and an empty string also sorts below '1960' and is skipped.
        if year < '1960':
            continue
        category_words = row[1].lower().split()
        if not category_words:
            continue
        category = category_words[0]
        won = row[4].lower().startswith('y')
        if category.startswith(('actor', 'actress')):
            aa_update_counts('actors', normalize_aa_actor(row[2]), year, won)
        elif category.startswith('directing'):
            aa_update_counts('directors', normalize_aa_directors(row[3]), year, won)
        elif category.startswith('writing'):
            aa_update_counts('writers', normalize_aa_writers(row[3]), year, won)

In [103]:
# Write the six tallies out as a Python module of pretty-printed dict
# assignments.
aa_file = os.path.join(data_dir, 'aa_scraped.py')
# The names can contain non-ASCII characters and Python reads source files
# as UTF-8 by default, so write UTF-8 explicitly instead of relying on the
# platform's locale encoding.
with open(aa_file, 'w', encoding='utf-8') as f:
    f.write("# Academy Awards Scraped File\n\n")
    for _var_name, _counts in (
        ('aa_actors_win_counts', aa_actors_win_counts),
        ('aa_actors_nom_counts', aa_actors_nom_counts),
        ('aa_directors_win_counts', aa_directors_win_counts),
        ('aa_directors_nom_counts', aa_directors_nom_counts),
        ('aa_writers_win_counts', aa_writers_win_counts),
        ('aa_writers_nom_counts', aa_writers_nom_counts),
    ):
        f.write(_var_name + " = ")
        pprint.pprint(_counts, stream=f)
        f.write("\n")

    