In [None]:
import requests
import re
import pandas as pd
import time
import os
import sys
import pickle
import numpy as np
from bs4 import BeautifulSoup
from calendar import month_name

In [None]:
# NOTE(review): machine-specific path to a Python 2.7 site-packages directory.
# It is only added when it actually exists, and only once, so re-running this
# cell does not keep growing sys.path. Prefer installing selenium/fuzzywuzzy
# into the kernel's own environment and deleting this cell.
_LEGACY_SITE_PACKAGES = "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages"
if os.path.isdir(_LEGACY_SITE_PACKAGES) and _LEGACY_SITE_PACKAGES not in sys.path:
    sys.path.append(_LEGACY_SITE_PACKAGES)

In [None]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from fuzzywuzzy import fuzz 

In [None]:
# Location of the ChromeDriver binary. Set the CHROMEDRIVER_PATH environment
# variable to override it; the default is the original hard-coded location.
chromedriver = os.environ.get("CHROMEDRIVER_PATH", "/Applications/chromedriver")
os.environ["webdriver.chrome.driver"] = chromedriver

# Browser session shared (as a global) by the scraping functions below.
driver = webdriver.Chrome(chromedriver)

# Collect sample - scrape a list of movies from Oscars.org and Metacritic

Systematically sample high-quality movies by starting with movies that were nominated for a major Oscar category, and subsequently adding movies that had high Metacritic scores 

## Functions

Create 2 functions to collect different categories of films from Oscars.org (different functions are needed for different categories because Oscars.org pages are formatted differently):  
* Best Picture, Director, Adapted Screenplay, Original Screenplay
* Best Actor, Actress, Supporting Actor, Supporting Actress

In [None]:
def bp_nominees(year, awards):
    """Collect film titles from selected categories of one Oscars.org ceremony page.

    Loads ``https://www.oscars.org/oscars/ceremonies/<year>`` in the global
    ``driver`` and reads every ``view-grouping`` block. A block is used when
    its first line equals one of ``awards``. Within a used block, every
    all-uppercase line other than the category name, 'WINNER' and 'NOMINEES'
    is kept as a title (lower-cased).

    Titles found under 'BEST PICTURE' are placed at the front of the result;
    titles from any other category are appended if not already present.
    Ordering is deterministic: it follows the order on the page.

    Parameters
    ----------
    year : int or str
        Ceremony year used to build the page URL.
    awards : list of str
        Category headings to collect, e.g. ``['BEST PICTURE', 'DIRECTING']``.

    Returns
    -------
    list of str
        Lower-cased titles.

    Side effects
    ------------
    Rebinds the globals ``nom_list`` (lines of every grouping on the page)
    and ``final_bpnoms`` (the returned list); other cells read these.
    """
    global nom_list
    global final_bpnoms

    driver.get('https://www.oscars.org/oscars/ceremonies/' + str(year))
    groupings = driver.find_elements_by_xpath('//div[@class="view-grouping"]')

    nom_list = [grouping.text.split('\n') for grouping in groupings]
    final_bpnoms = []

    for lines in nom_list:
        category = lines[0]
        if category not in awards:
            continue

        not_titles = (category, 'WINNER', 'NOMINEES')
        movies = [line.lower() for line in lines
                  if line.isupper() and line not in not_titles]

        # Best Picture titles lead the list; every other category trails it.
        if category == 'BEST PICTURE':
            leading, trailing = movies, final_bpnoms
        else:
            leading, trailing = final_bpnoms, movies

        # Order-preserving replacement for the former set difference, whose
        # iteration order varied between kernel sessions.
        already_listed = set(leading)
        final_bpnoms = leading + [title for title in dict.fromkeys(trailing)
                                  if title not in already_listed]

    return final_bpnoms

In [None]:
def other_nominees(year, awards=('ACTOR IN A LEADING ROLE', 'ACTRESS IN A LEADING ROLE',
                                 'ACTOR IN A SUPPORTING ROLE', 'ACTRESS IN A SUPPORTING ROLE')):
    """Collect film titles from the acting categories of one Oscars.org ceremony page.

    Loads ``https://www.oscars.org/oscars/ceremonies/<year>`` in the global
    ``driver`` and reads every ``view-grouping`` block. A block is used when
    its first line equals one of ``awards``. Within a used block, every line
    that is NOT all-uppercase is kept as a title (lower-cased).
    NOTE(review): presumably performer names and headings are the uppercase
    lines on these pages, which is why they are dropped - confirm.

    Parameters
    ----------
    year : int or str
        Ceremony year used to build the page URL.
    awards : iterable of str, optional
        Category headings to collect. Defaults to the four acting categories.

    Returns
    -------
    list of str
        Lower-cased titles without duplicates, in page order.

    Side effects
    ------------
    Rebinds the globals ``nom_list`` (lines of every grouping on the page)
    and ``cat_list`` (the returned list); other cells read these.
    """
    global cat_list
    global nom_list

    driver.get('https://www.oscars.org/oscars/ceremonies/' + str(year))
    groupings = driver.find_elements_by_xpath('//div[@class="view-grouping"]')

    nom_list = [grouping.text.split('\n') for grouping in groupings]
    cat_list = []
    seen = set()

    for lines in nom_list:
        if lines[0] not in awards:
            continue
        for line in lines:
            if line.isupper():
                continue
            title = line.lower()
            # Order-preserving de-duplication replaces the former set
            # difference, whose iteration order varied between sessions.
            if title not in seen:
                seen.add(title)
                cat_list.append(title)

    return cat_list

## Scrape movies from Oscars.org

In [None]:
# Best Picture nominees only: one page scrape per ceremony, 1993 through 2017.
master_list = []
for ceremony in range(1993, 2018):
    oscars = bp_nominees(ceremony, ['BEST PICTURE'])
    master_list += oscars

In [None]:
# One row per scraped title, plus a constant indicator column for later merging.
df = pd.DataFrame(master_list, columns=['oscar_name']).assign(bp_nominee=1.0)

In [None]:
# Persist the Best Picture frame. The context manager closes (and flushes)
# the file even if pickling fails; previously the handle was never closed.
file = 'df_bp_nominees_1992-2017'
with open(file, 'wb') as fileobj:
    pickle.dump(df, fileobj)

In [None]:
# All Oscar nominees: film-titled categories first, then acting categories.
master_list = []
for i in range(1993, 2018):
    # Use the return values rather than the globals the functions also set.
    film_titles = bp_nominees(i, ['BEST PICTURE', 'DIRECTING',
                                  'WRITING (ADAPTED SCREENPLAY)', 'WRITING (ORIGINAL SCREENPLAY)'])
    acting_titles = other_nominees(i)

    # Order-preserving union; the former set difference produced an
    # arbitrary order that changed between kernel sessions.
    already_listed = set(film_titles)
    oscars = film_titles + [title for title in acting_titles if title not in already_listed]
    master_list.extend(oscars)

In [None]:
# One row per scraped title, plus a constant indicator column for later merging.
df = pd.DataFrame(master_list, columns=['oscar_name']).assign(nominee=1.0)

In [None]:
# Persist the all-nominees frame. The context manager closes (and flushes)
# the file even if pickling fails; previously the handle was never closed.
file = 'df_all_nominees_1992-2017'
with open(file, 'wb') as fileobj:
    pickle.dump(df, fileobj)

## Cast a wider net - scrape movies from both Oscars.org and Metacritic

This generates 25 lists of movies (1 list per year, 1992-2016) that meet one of these criteria:  
* Nominated for one of the major 8 Oscar categories  
* One of the highest rated movies on Metacritic (NOTE: threshold is much lower in earlier years due to the relatively low number of movies)

In each list, the first movie is the one that won Best Picture.

In [None]:
# Build one list per ceremony: Oscar-nominated titles first, then Metacritic
# titles not already present, then the ceremony year as the final element.
master_list = []
for i in range(1993, 2018):
    # Use the return values rather than the globals the functions also set.
    film_titles = bp_nominees(i, ['BEST PICTURE', 'DIRECTING',
                                  'WRITING (ADAPTED SCREENPLAY)', 'WRITING (ORIGINAL SCREENPLAY)'])
    acting_titles = other_nominees(i)

    # Order-preserving union (the former set difference gave an arbitrary order).
    already_listed = set(film_titles)
    oscars = film_titles + [title for title in acting_titles if title not in already_listed]

    # Metacritic is queried for the year before the ceremony (i - 1).
    # The former `if i != 2018` guard was always true inside this range, and
    # its `else` branch would have reused the previous iteration's list.
    url = 'http://www.metacritic.com/browse/movies/score/metascore/year/filtered?year_selected=' + str(i-1)
    driver.get(url)
    search = driver.find_elements_by_xpath('//table[@class="list score"]')

    metacritic = []
    for table in search:
        # NOTE(review): takes every 5th text line starting at index 2 as the
        # title - this depends on the page's row layout; confirm if it changes.
        metacritic.extend(table.text.split('\n')[2::5])
    metacritic = [x.lower() for x in metacritic]

    oscar_titles = set(oscars)
    films_for_year = oscars + [x for x in metacritic if x not in oscar_titles]
    films_for_year.append(i)
    master_list.append(films_for_year)

In [None]:
# Export the list. The context manager closes (and flushes) the file even if
# pickling fails; previously the handle was never closed.
file = 'oscars+metacritic_1992-2017'
with open(file, 'wb') as fileobj:
    pickle.dump(master_list, fileobj)

# Search IMDB for movies in Oscar/Metacritic list

Create functions to search for movies on IMDB. Title and year are used to find the correct match. Errors often occur, though, due to IMDB quirks (conducting the exact same search twice can lead to two different results). As a result, it's useful to repeat the search multiple times - hence, the "take2" function.

## Functions

In [None]:
# Close the browser session used for the earlier scraping (if any) so its
# Chrome process is not left running, then start a fresh session on IMDB.
try:
    driver.quit()
except Exception:
    # Best-effort cleanup: no earlier session in this kernel, or already closed.
    pass

driver = webdriver.Chrome(chromedriver)
driver.get('http://www.imdb.com/')

In [None]:
def imdb_search(movie, year):
    """Type "<movie> <year>" into the IMDB search box and take the first suggestion.

    Uses the global ``driver``, which must already be on a page containing
    the ``navbar-query`` search box. The suggestion dropdown's HTML is parsed
    and the first suggestion is stored as a 4-tuple
    ``(title, extra, detail, href)`` built from the ``span.title``,
    ``span.extra``, ``div.detail`` and ``a`` elements respectively.

    Parameters
    ----------
    movie : str
        Title to search for.
    year : int or str
        Appended to the query after a space.

    Returns
    -------
    tuple
        The first suggestion, or ``()`` when no suggestion could be parsed.

    Side effects
    ------------
    Rebinds the globals ``final_choice`` (same value as the return) and
    ``soup`` (the parsed dropdown); prints ``final_choice``.
    """
    global final_choice
    global soup

    searchbox = driver.find_element_by_id('navbar-query')
    searchbox.click()
    searchbox.clear()
    searchbox.send_keys(movie + ' ' + str(year))
    time.sleep(2)  # wait for the suggestion dropdown to populate

    sug_html = driver.find_element_by_id('navbar-suggestionsearch').get_attribute('innerHTML')
    soup = BeautifulSoup(sug_html, 'lxml')

    try:
        # Local names no longer shadow the `year` parameter.
        titles = [x.text for x in soup.find_all('span', { "class" : "title" })]
        years = [x.text for x in soup.find_all('span', { "class" : "extra" })]
        details = [x.text for x in soup.find_all('div', { "class" : "detail" })]
        links = [x['href'] for x in soup.find_all('a')]

        # NOTE(review): the last anchor is dropped - presumably it is not a
        # suggestion (e.g. a "see all results" link); confirm.
        suggested_movies = list(zip(titles, years, details, links[:-1]))
        final_choice = suggested_movies[0]
    except Exception:
        # Best effort: an empty dropdown or unexpected markup means "no match".
        # Unlike a bare `except`, this lets Ctrl-C interrupt a long run.
        final_choice = ()

    print(final_choice)
    return final_choice

In [None]:
# Helper function to change df as movies are matched
def corrections(num):
    """Copy the current ``final_choice`` into row ``num`` of ``df`` and mark it matched.

    Reads the globals ``final_choice`` (a 4-tuple of name, year text, detail
    text and link), ``df`` and ``mismatch_list``.

    Parameters
    ----------
    num : index label
        Row of ``df`` to overwrite.

    Returns
    -------
    list
        The global ``mismatch_list`` with ``num`` removed (if it was present).

    Raises
    ------
    IndexError
        If ``final_choice`` has fewer than four elements; ``df`` is left
        untouched in that case.
    """
    # Unpack first so a short/empty final_choice fails before df is modified.
    name, year, actors, link = (final_choice[0], final_choice[1],
                                final_choice[2], final_choice[3])

    # Single-step .loc assignment: chained `df[col][num] = ...` can end up
    # writing to a temporary copy instead of df.
    df.loc[num, 'imdb_name'] = name
    df.loc[num, 'imdb_year'] = year
    df.loc[num, 'imdb_actors'] = actors
    df.loc[num, 'imdb_link'] = link
    df.loc[num, 'match'] = 'match'

    # Tolerate manual fixes for rows that are not (or no longer) in the list.
    if num in mismatch_list:
        mismatch_list.remove(num)
    return mismatch_list

In [None]:
# Conduct repeated searches on movies that haven't been found yet
def take2(tries):
    """Re-run the IMDB search for every row still listed in ``mismatch_list``.

    For each listed row the title and ceremony year are read from
    ``df.oscars_name_year``; the search uses ``year - 1``. An exact
    (lower-cased) title match is written to ``df`` through ``corrections``;
    a fuzzy ratio above 70 only relabels the row as 'unclear'. The pause
    after each search starts at one second and grows by one second per pass.

    Parameters
    ----------
    tries : int
        Maximum number of passes over the mismatch list.

    Side effects
    ------------
    Reads and mutates the globals ``df`` and ``mismatch_list`` and relies on
    ``imdb_search`` setting the global ``final_choice``. Prints the
    ``match`` value counts after each pass.
    """
    count = 0
    sleep = 1

    while len(mismatch_list) > 0 and count < tries:
        # Iterate over a snapshot: corrections() removes entries from
        # mismatch_list, and removing from a list while iterating over it
        # silently skips the element after each removal.
        for i in list(mismatch_list):
            name_year = df.loc[i, 'oscars_name_year']
            search = name_year[0]
            imdb_search(search, int(name_year[1]) - 1)
            time.sleep(sleep)

            if len(final_choice) < 3:
                continue

            found = final_choice[0].lower()
            if found == search:
                corrections(i)
            elif fuzz.ratio(found, search) > 70:
                # .loc instead of chained `df['match'][i] = ...`, which can
                # write to a temporary copy.
                df.loc[i, 'match'] = 'unclear'

        count += 1
        sleep += 1
        print(df.match.value_counts())

## Search IMDB for movies in 'master_list' of movies that were scraped from Oscars.org and Metacritic

In [None]:
# Each per-year list ends with the ceremony year; everything before it is a title.
# A result row is the IMDB suggestion (possibly empty) followed by [title, year].
search_results = []

for films_for_year in master_list:
    ceremony_year = films_for_year[-1]
    for title in films_for_year[:-1]:
        imdb_search(title, int(ceremony_year) - 1)
        search_results.append(list(final_choice) + [[title, ceremony_year]])

In [None]:
# Check whether the IMDB search matched the Oscar/Metacritic title.
# Every row is labelled; the former hard-coded range(2000, 2455) covered only
# one slice of the results and raised IndexError on shorter lists.
for row in search_results:
    # An unlabelled row ends with its [title, year] list; a labelled row ends
    # with the label string. Skipping labelled rows makes re-running safe.
    if isinstance(row[-1], str):
        continue

    if len(row) < 4:
        # The IMDB search returned nothing: only [title, year] is present.
        label = 'mismatch'
    elif row[0].lower() == row[-1][0]:
        label = 'match'
    elif row[0][:7].lower() == row[-1][0][:7]:
        # Same first seven characters: needs a manual look.
        label = 'unclear'
    else:
        label = 'mismatch'
    row.append(label)

In [None]:
# Rows from failed searches are shorter than six items, so their values land
# in the leftmost columns; a later cell repairs those rows.
result_columns = ['imdb_name', 'imdb_year', 'imdb_actors', 'imdb_link',
                  'oscars_name_year', 'match']
df = pd.DataFrame(search_results, columns=result_columns)

In [None]:
# Convert empty matches into 'mismatch'.
# A row whose 'oscars_name_year' is None holds its [title, year] list in
# 'imdb_name' instead; relabel it and move the list to the right column.
df['flag'] = df['oscars_name_year'].map(str)
unmatched = df['flag'] == 'None'
df.loc[unmatched, 'match'] = 'mismatch'

# .at writes one cell in a single step and accepts a list as the cell value;
# the former chained `df[col][i] = ...` can write to a temporary copy.
for row_label in df.index[unmatched]:
    df.at[row_label, 'oscars_name_year'] = df.at[row_label, 'imdb_name']

Erroneous "matches" often occur because the title of the matched film and target film are similar (e.g., "The Godfather" and "The Godfather, Part II"). Use release year to double-check that the match is correct.

In [None]:
# Strip the surrounding parentheses from the IMDB year text; missing values
# pass through instead of raising.
df['imdb_year'] = df['imdb_year'].str.strip('()')

# Take the first 4-digit run as the year. Text without one (such as the
# 'mismatch' placeholder) or with extra trailing text becomes NaN or still
# parses, instead of crashing int().
df['imdb_year_int'] = pd.to_numeric(
    df['imdb_year'].str.extract(r'(\d{4})', expand=False), errors='coerce')

# The ceremony year is the last element of [title, year]; rows currently
# labelled 'mismatch' are left as NaN.
df['oscar_year_int'] = df.loc[df['match'] != 'mismatch', 'oscars_name_year'].map(lambda x: x[-1])

# A gap of more than two years between ceremony and IMDB year means the
# search most likely found a different film with a similar title.
df['year_diff'] = df['oscar_year_int'] - df['imdb_year_int']
df['flag'] = df['year_diff'].apply(lambda x: 'mismatch' if abs(x) > 2 else None)
df.loc[df['flag'] == 'mismatch', 'match'] = 'mismatch'

In [None]:
# Number of rows per match label.
df['match'].value_counts()

In [None]:
# Rows that need a manual decision.
df.loc[df['match'] == 'unclear']

In [None]:
# Manually change 'unclear' to 'match',  as appropriate:
# for i in []:
#     df['match'][i] = 'match'

In [None]:
# Index labels of rows still labelled 'mismatch'; take2() works through this
# list. Re-run this cell before each new round of take2().
still_mismatched = df['match'] == 'mismatch'
mismatch_list = df.loc[still_mismatched].index.tolist()
len(mismatch_list)

In [None]:
# One more pass over the remaining mismatches.
take2(tries=1)

In [None]:
# Rows that are still unmatched after the repeated searches.
df.loc[df['match'] == 'mismatch']

**NOTE**  
Some films can't be matched due to IMDb errors (e.g., 'Crash' is incorrectly labeled as being released in 2004). The code below is used, as needed, to fix those manually.

In [None]:
# mismatch_list = df.index[df['match'] == 'mismatch'].tolist()
# imdb_search('tt0375679', 2005)
# corrections(1196)

**Flag the Best Picture winner**

Since the movies are sorted by year, with the first movie in each year being the Best Picture, use "diff" to identify the Best Picture.

In [None]:
# The ceremony year is the last element of every [title, year] pair, so it is
# available for all rows. 'oscar_year_int' was filled only for rows that were
# not 'mismatch' when it was built, which left NaN gaps in the diff.
ceremony_year = df['oscars_name_year'].map(lambda name_year: name_year[-1])

# A row starts a new ceremony year when the year rises by 1 from the row above.
# The very first row has no predecessor (diff is NaN) but also starts a year,
# so it is flagged as well.
df['winner'] = ceremony_year.diff().fillna(1.0)
df['winner'].value_counts()

In [None]:
# Rows flagged as the first film of their ceremony year.
df.loc[df['winner'] == 1.0]

# Scrape data from IMDB

Now that the 'master_list' has been matched to IMDB, scrape specific features for each movie:  
* Genre  
* Metacritic and popularity scores
* Actors, Director, Writers  
* Other specs - budget, gross, release date, country, runtime
* Number of awards
* Specific awards

In [None]:
# Drop helper columns. errors='ignore' keeps the cell runnable on a fresh
# kernel and on re-runs: 'test' is not created anywhere in this notebook, and
# the other two are gone after the first run.
df = df.drop(columns=['year_diff', 'test', 'flag'], errors='ignore')

In [None]:
# Add one empty, object-typed column per feature scraped below. Columns are
# created explicitly rather than by concatenating an empty frame, and existing
# columns are left alone so re-running this cell never wipes scraped data.
scraped_columns = ['metacritic', 'popularity', 'country', 'gross', 'budget',
                   'runtime', 'month', 'actors', 'director', 'writers', 'genre',
                   'total_wins', 'total_noms']
for column in scraped_columns:
    if column not in df.columns:
        df[column] = None

In [None]:
# Number of rows (films) about to be scraped.
df.shape[0]

In [None]:
# Lines of IMDB's "titleDetails" block that are kept, identified by the text
# before the first colon.
SPEC_LABELS = ['Country', 'Budget', 'Gross', 'Opening Weekend', 'Release Date', 'Runtime']
MONTH_PATTERN = "|".join(month_name[1:])


def _first_spec(specs, label):
    """Return the first kept details line that contains ``label``, or None."""
    return next((line for line in specs if label in line), None)


def _spec_value(specs, label):
    """Return the text after the first ': ' on the ``label`` line, or None."""
    line = _first_spec(specs, label)
    if line is None:
        return None
    parts = line.split(': ')
    return parts[1] if len(parts) > 1 else None


# Visit the IMDB page of every matched film and fill in its feature columns.
# Each feature is best-effort: if it cannot be read, the cell stays empty.
# df.at writes a single cell in one step (and accepts list values); chained
# `df[col][i] = ...` can write to a temporary copy instead of df.
for i in range(len(df)):
    if df.at[i, 'match'] != 'match':
        continue

    url = 'http://www.imdb.com' + df.at[i, 'imdb_link']
    driver.get(url)

    # Genre
    try:
        df.at[i, 'genre'] = driver.find_element_by_xpath('//div[@itemprop="genre"]').text.split(': ')[-1]
    except Exception:
        pass

    # Metacritic score and popularity
    search = driver.find_elements_by_xpath('//div[@class="titleReviewBarItem"]')
    try:
        df.at[i, 'metacritic'] = int(search[0].text.split('\n')[0])
        df.at[i, 'popularity'] = int(search[1].text.split('\n')[1].split()[0].replace(',', ''))
    except Exception:
        pass

    # Stars, director, writers (same read order as before: actors first, so
    # a page with fewer than three credit items sets none of them)
    search = driver.find_elements_by_xpath('//div[@class="credit_summary_item"]')
    try:
        df.at[i, 'actors'] = search[2].text.split('|')[0].split(':')[-1]
        df.at[i, 'director'] = search[0].text.split(':')
        df.at[i, 'writers'] = search[1].text.split(':')
    except Exception:
        pass

    # Other specs. Reset first so a page without a details block never
    # inherits the previous film's values (or hits an undefined name).
    specs = []
    try:
        details = driver.find_element_by_xpath('//div[@id="titleDetails"]').text.split('\n')
        specs = [line for line in details if line.split(':')[0] in SPEC_LABELS]
    except Exception:
        pass

    # Month of release
    release_line = _first_spec(specs, 'Release Date')
    if release_line is not None:
        month_found = re.search(MONTH_PATTERN, release_line, re.IGNORECASE)
        if month_found:
            df.at[i, 'month'] = month_found.group(0)

    # Country (a missing line used to raise IndexError and stop the loop)
    country_line = _first_spec(specs, 'Country')
    if country_line is not None:
        df.at[i, 'country'] = country_line.split(':')[-1]

    # Gross, runtime and budget are read independently, so one missing value
    # no longer discards the others. Gross and budget keep only the first
    # whitespace-separated token of their value.
    for column, label, first_token_only in (('gross', 'Gross', True),
                                            ('runtime', 'Runtime', False),
                                            ('budget', 'Budget', True)):
        value = _spec_value(specs, label)
        if value is None:
            continue
        if first_token_only:
            tokens = value.split()
            if not tokens:
                continue
            value = tokens[0]
        df.at[i, column] = value

    # Award counts: follow the film's awards link, then parse the summary line.
    url_new = url.replace('http://www.imdb.com', '')
    url_new = url_new.replace('?ref_=nv_sr_1', '')
    link = '//a[@href="' + url_new + 'awards?ref_=tt_awd' + '"]'
    search = driver.find_elements_by_xpath(link)
    try:
        search[0].click()
        awards_quant = driver.find_elements_by_xpath('//div[@class="desc"]')[0].text
    except Exception:
        continue

    # Capture groups replace `.strip(' wins')`, which strips a set of
    # characters rather than a suffix. The singular forms are accepted too,
    # and the two counts are parsed independently.
    wins_found = re.search(r'([0-9]+) wins?\b', awards_quant)
    if wins_found:
        df.at[i, 'total_wins'] = int(wins_found.group(1))
    noms_found = re.search(r'([0-9]+) nominations?\b', awards_quant)
    if noms_found:
        df.at[i, 'total_noms'] = int(noms_found.group(1))

In [None]:
# Non-null count per column, i.e. how many values the scrape above filled in.
df.count()

**Create dummy variables for country and genre**

In [None]:
# 0/1 indicator per country: 1 when the code appears anywhere in the row's
# country text (missing values are compared through their str() form).
country_text = df['country'].map(str)
for country_code in ('USA', 'UK'):
    df[country_code] = [1 if country_code in text else 0 for text in country_text]

In [None]:
# 0/1 indicator per genre, stored under the lower-cased genre name: 1 when the
# genre appears anywhere in the row's genre text.
GENRES = ('Drama', 'Comedy', 'Action', 'Crime', 'Romance', 'Sport', 'Biography', 'Mystery',
          'Musical', 'Thriller', 'Horror', 'Adventure', 'Sci-Fi', 'Family', 'History')
genre_text = df['genre'].map(str)
for genre_name in GENRES:
    df[genre_name.lower()] = [1 if genre_name in text else 0 for text in genre_text]

In [None]:
# DUMP
# Save the scraped frame. This used to dump `df_all`, which is not defined
# until the file written here is loaded back in the next section, so a fresh
# kernel raised NameError. The frame built above is `df`.
file = 'df_all_movies_1992-2017'
with open(file, 'wb') as fileobj:
    pickle.dump(df, fileobj)

# Merge everything

In [None]:
# Load the three frames saved earlier in this notebook.
# NOTE(review): pickle.load can execute arbitrary code - only load files that
# this notebook (or another trusted source) produced.
def _load_pickle(path):
    """Return the object pickled at ``path``; the file is closed even on error."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


df_all = _load_pickle('df_all_movies_1992-2017')
df_noms = _load_pickle('df_all_nominees_1992-2017')
df_bp = _load_pickle('df_bp_nominees_1992-2017')

In [None]:
# Split each [title, year] pair into its own two columns.
df_all['oscar_name'] = [name_year[0] for name_year in df_all['oscars_name_year']]
df_all['oscar_year'] = [name_year[1] for name_year in df_all['oscars_name_year']]

In [None]:
# Outer join on title: keeps every scraped film and every nominee, adding the
# 'nominee' indicator where the titles agree.
df = df_all.merge(df_noms, how='outer', on='oscar_name')

In [None]:
# Same outer join for the Best Picture indicator.
df = df.merge(df_bp, how='outer', on='oscar_name')

In [None]:
# Rows without a partner in the outer merges have no indicator value: set it to 0.
df = df.fillna({'nominee': 0, 'bp_nominee': 0})

In [None]:
# Dump into pickle file. The context manager closes (and flushes) the file
# even if pickling fails; previously the handle was never closed.
file = 'df_master_1992-2017'
with open(file, 'wb') as fileobj:
    pickle.dump(df, fileobj)