# Import

In [1]:
import pandas as pd
import json

In [2]:
# Numbers of the movie-detail export files to load (the sequence has gaps).
import_numbers = [1,2,3,5,6,7,10,11,12,15,16,17,20,21,22,25,26,27,30,31,32,35,36,37,40,41,42,45,46,47]
# One relative JSON path per export number, in the same order.
imports = [
    'movie_details_exports/movie_details_export_%s.json' % (str(number),)
    for number in import_numbers
]

In [3]:
# Start from an empty frame; the per-file loading step below adds its rows to it.
details_df = pd.DataFrame()

In [4]:
# Read every export file (JSON keyed by row, hence orient='index') and stack
# the frames onto details_df in a single concat. DataFrame.append was removed
# in pandas 2.0, and calling it in a loop re-copied the growing frame on every
# iteration; pd.concat keeps the same row order and index labels.
details_df = pd.concat(
    [details_df] + [pd.read_json(imp, orient='index') for imp in imports]
)

In [5]:
# len(details_df)

In [6]:
# details_df.head()

In [7]:
# details_df.columns

# Get List of Unique Directors and URLs

In [8]:
# Keep only the two director columns; .copy() makes the result independent of details_df.
directors_df = details_df[['Director_URL','Director']].copy()

In [9]:
# Discard rows that have no director URL.
directors_df = directors_df.dropna(subset = ['Director_URL'])

In [10]:
# directors_df.head()

In [11]:
# len(directors_df) --> 29711

In [12]:
# Keep a single row per director URL (the first occurrence wins).
directors_df = directors_df.drop_duplicates(subset='Director_URL',keep='first')

In [13]:
# len(directors_df) --> 11939

In [14]:
# Plain Python list of the de-duplicated director URLs.
director_urls = list(directors_df['Director_URL'])

In [15]:
# Sort the URLs in descending order.
director_urls = sorted(director_urls, reverse=True)

# Scrape Director Details

In [16]:
# Director Details
import re
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import time

# Module-level scraper state; the functions below update it through `global`.
# All three counters start at zero.
directors_scraped = director_index = load_attempts = 0
# Scraped records for the batch currently being collected, keyed by director URL.
director_details = {}
# NOTE(review): `prefix` is not referenced by any function visible in this file.
prefix = 'http://www.imdb.com'
# Number of the next export file, kept both as int and as its string form.
export_number = 1
export_no = str(export_number)

def director_scraper(stop=2000):
    """Scrape the director pages for ``director_urls[director_index:stop]``.

    The global ``director_index`` is advanced after every URL, so calling the
    function again resumes at the first URL that has not been handled yet.
    After the loop, whatever is currently in ``director_details`` is exported.

    Args:
        stop: Exclusive upper index into ``director_urls``. Defaults to 2000,
            the bound that used to be hard-coded.
    """
    global director_index

    print(str(datetime.now()),': working on export ' + export_no)

    for url in director_urls[director_index:stop]:
        # Every 20th URL: pause briefly and print a progress line.
        if (director_index+1) % 20 == 0:
            time.sleep(1.5)
            print('%d: %s' %(director_index+1, url))
        page = request_page(url)
        # request_page yields None when no page could be obtained (e.g. a
        # 404); parsing None would raise, so such URLs are skipped.
        if page is not None:
            get_director_details(url, page)
        director_index += 1

    export_to_json(director_details)
    print("Done")
    
def request_page(url):
    """Fetch ``url`` and return the response body, retrying on server errors.

    A 200 response returns its text and resets the global ``load_attempts``
    counter. A 404 is reported and gives ``None``. Any other status is
    reported and retried after a pause: 10 seconds normally, 60 seconds on
    every tenth consecutive failure.

    The original recursive version dropped the result of its retry (so a
    successful retry still returned ``None``), reset the counter on an
    unreachable line, and could recurse without bound; the loop fixes all
    three while keeping the same call signature.

    Args:
        url: Address passed unchanged to ``requests.get``.

    Returns:
        The page text, or ``None`` for a 404.
    """
    global load_attempts

    while True:
        response = requests.get(url)

        if response.status_code == 200:
            load_attempts = 0
            return response.text

        print(response.status_code,'for %s' %(url))
        if response.status_code == 404:
            # The page does not exist; retrying cannot help.
            return None

        load_attempts += 1
        # Back off longer once failures keep piling up.
        time.sleep(60 if load_attempts % 10 == 0 else 10)

# Output key -> id of the filmography header element whose "(N credits)" count is read.
_FILMOGRAPHY_SECTIONS = (
    ('Writing Roles', 'filmo-head-writer'),
    ('Editing Roles', 'filmo-head-editor'),
    ('Acting Roles', 'filmo-head-actor'),
    ('Directing Roles', 'filmo-head-director'),
    ('Producing Roles', 'filmo-head-producer'),
    ('Cinematographic Roles', 'filmo-head-cinematographer'),
    ('TV Appearances', 'filmo-head-self'),
)


def _digits_to_int(text):
    """Return the digits contained in ``text`` as one int, or None if there are none."""
    digits = re.sub('[^0-9]+', '', text)
    return int(digits) if digits else None


def _infer_gender(soup):
    """Return 'Male', 'Female' or None from job categories, else from bio pronouns."""
    categories = soup.find(id='name-job-categories')
    links = categories.find_all('a') if categories else None
    if not links:
        # NOTE(review): as in the original, the biography fallback is only
        # tried when job-category links exist.
        return None

    jobs = [re.sub('[^a-z]+', '', link['href']) for link in links]
    if 'actor' in jobs:
        return 'Male'
    if 'actress' in jobs:
        return 'Female'

    # Neither category present: compare pronoun counts in the biography text.
    bio = soup.find(id='name-bio-text')
    text = bio.text if bio else None
    text = re.sub('\n', '', text) if text else None
    if not text:
        return None
    words = re.sub('  +', ' ', text).split(' ')
    male_count = sum(words.count(word) for word in ('His', 'his', 'He', 'he'))
    female_count = sum(words.count(word) for word in ('Her', 'her', 'She', 'she'))
    if male_count > female_count:
        return 'Male'
    if female_count > male_count:
        return 'Female'
    return None


def _award_summary(soup):
    """Return the highlighted award text as one string, or None when absent."""
    box = soup.find('div', class_='article highlighted')
    spans = box.find_all('span') if box else None
    if not spans:
        return None
    # The last span is dropped, as in the original.
    pieces = [''.join(span.text.strip().split('\n')) for span in spans[:-1]]
    if not pieces:
        return None
    return ' '.join(re.sub("  +", " ", piece) for piece in pieces)


def _filmography_count(soup, section_id):
    """Return the number after the last '(' of a filmography header, or None."""
    header = soup.find(id=section_id)
    if not header:
        return None
    return _digits_to_int(header.text.strip().split('(')[-1])


def get_director_details(url, page):
    """Parse one director page and store the result in ``director_details[url]``.

    The record is a dict with the keys Gender, Deathyear, Name, Birthdate,
    Birthyear, Age at Death, Birthplace, Birth Country, Award Info, the seven
    role counts, Height in m, Publicity Listings and Star Sign. Any value
    that cannot be found on the page is stored as None. Afterwards
    ``go_to_next_and_export()`` is called.

    Changes from the original: a page without an ``<h1>`` no longer raises
    ``TypeError`` (name and death year become None), a ``None`` page is
    ignored instead of crashing the parser, and the seven copy-pasted
    role-count parsers share one helper.

    Args:
        url: Key under which the record is stored.
        page: HTML text of the director page, or None when it could not be
            fetched (in which case nothing is recorded).
    """
    if page is None:
        return

    soup = BeautifulSoup(page, 'html.parser')

    # Registered up front so that the insertion order of the keys (and thus
    # the exported JSON) stays the same as before.
    details = director_details[url] = {}

    details['Gender'] = _infer_gender(soup)

    # The heading is split on newlines: first line is the name; when there
    # are at least two lines, the last one is parsed for the death year.
    heading = soup.find('h1')
    heading_text = heading.text.strip() if heading else ''
    heading_lines = heading_text.split('\n') if heading_text else []

    lifespan = heading_lines[-1] if len(heading_lines) >= 2 else None
    # Only the part after the last en dash is considered.
    deathyear = _digits_to_int(lifespan.split('–')[-1]) if lifespan else None
    details['Deathyear'] = deathyear

    details['Name'] = heading_lines[0] if heading_lines else None

    born_info = soup.find(id='name-born-info')

    time_tag = born_info.find('time') if born_info else None
    if time_tag is not None and time_tag.has_attr('datetime'):
        birthdate = time_tag['datetime']
    else:
        birthdate = None
    details['Birthdate'] = birthdate

    # The year is the first four characters, taken only from values of length >= 8.
    birthyear = birthdate[0:4] if birthdate and len(birthdate)>=8 else None
    birthyear = int(birthyear) if birthyear else None
    details['Birthyear'] = birthyear

    details['Age at Death'] = deathyear-birthyear if deathyear and birthyear else None

    # The last link inside the born-info element is used as the birthplace.
    place_links = born_info.find_all('a') if born_info else None
    birthplace = place_links[-1].text if place_links else None
    details['Birthplace'] = birthplace

    # Country = last comma-separated part, only when there is more than one part.
    place_parts = birthplace.split(', ') if birthplace else None
    details['Birth Country'] = place_parts[-1] if place_parts and len(place_parts)>1 else None

    details['Award Info'] = _award_summary(soup)

    for key, section_id in _FILMOGRAPHY_SECTIONS:
        details[key] = _filmography_count(soup, section_id)

    # Height and publicity listings are both looked up inside the parent of
    # the "Personal Details" heading.
    personal_heading = soup.find('h2', string=re.compile('Personal Details'))
    personal = personal_heading.parent if personal_heading else None

    height_tag = personal.find(id='details-height') if personal else None
    height = height_tag.text if height_tag else None
    height = height.split('(')[-1] if height else None
    # Stored as a string of digits and dots, not converted to a number.
    details['Height in m'] = re.sub('[^0-9.]+','',height) if height else None

    listings_tag = personal.find(id='details-publicity-listings') if personal else None
    listings = listings_tag.text if listings_tag else None
    listings = re.sub('[^0-9]+',' ',listings).strip() if listings else None
    # Every number found in the text is added up.
    details['Publicity Listings'] = (
        sum(int(number) for number in listings.split(' ')) if listings else None
    )

    sign_heading = soup.find('h4', string=re.compile('Star Sign'))
    star_sign = sign_heading.parent.text if sign_heading else None
    star_sign = star_sign.strip() if star_sign else None
    details['Star Sign'] = star_sign.split('\n')[-1] if star_sign else None

    go_to_next_and_export()

def go_to_next_and_export():
    """Count one scraped director and flush the batch once it holds exactly 1000 records."""
    global directors_scraped, director_details

    directors_scraped += 1

    if len(director_details) != 1000:
        return

    # Full batch: write it out, then start over with an empty one.
    export_to_json(director_details)
    directors_scraped, director_details = 0, {}

def export_to_json(director_details):
    """Write ``director_details`` to the current export file and advance the export number.

    The file name is built from the global ``export_no``; afterwards
    ``export_number`` is incremented, ``export_no`` is refreshed from it, and
    a timestamped line naming the next export is printed.
    """
    global export_no, export_number

    target_path = 'director_details_exports/director_details_export_%s.json' %(export_no)
    with open(target_path, 'w') as out_file:
        json.dump(director_details, out_file)

    export_number = export_number + 1
    export_no = str(export_number)
    print(str(datetime.now()),': working on export ' + export_no)

# Run the scraper and print the elapsed wall-clock time in seconds, rounded to 3 decimals.
start = time.time()
director_scraper()
end = time.time()
print(round(end-start,3))