# Importing and Dropping Duplicates

In [1]:
import pandas as pd
import json

In [2]:
# Relative paths of the 40 numbered genre export files (export_1 .. export_40).
imports = ['genre_exports/export_%s.json' %(str(number)) for number in range(1, 41)]

In [3]:
movies_df = pd.DataFrame()

In [4]:
# Read every export (rows keyed by the JSON's top-level keys) and concatenate
# them in one step. DataFrame.append was removed in pandas 2.0 and copied the
# whole accumulated frame on each call; a single pd.concat gives the same
# rows, in the same order, with the original index preserved.
movies_df = pd.concat([pd.read_json(imp, orient='index') for imp in imports])

In [5]:
len(movies_df)

## Drop Duplicates

In [9]:
# Keep the first row seen for each 'ID' value and discard later repeats.
movies = movies_df.drop_duplicates(subset='ID',keep='first')

In [10]:
len(movies)

## Drop NaN Ratings

In [11]:
movies['Rating'].isnull().sum()

29609

In [12]:
# Discard rows that have no 'Rating' value.
movies = movies.dropna(subset = ['Rating'])

In [13]:
len(movies)

77297

## Keep Movies with 500 Votes or More

In [14]:
# Remove the comma separators from the 'Votes' strings so they can be
# converted to integers.
movies['Votes'] = movies['Votes'].str.replace(',','')

In [15]:
# movies['Votes']

In [16]:
# Convert the comma-free 'Votes' strings to integers for numeric comparison.
movies['Votes'] = movies['Votes'].astype(int)

In [17]:
len(movies[movies['Votes'] >= 500])

29717

In [18]:
# Keep only the movies that have at least 500 votes.
movies = movies[movies['Votes'] >= 500]

# Scrape Details with Movie URL

In [19]:
len(movies.index)

29717

In [20]:
movies.index

Index(['http://www.imdb.com/title/tt0006206/',
       'http://www.imdb.com/title/tt0009682/',
       'http://www.imdb.com/title/tt0015163/',
       'http://www.imdb.com/title/tt0015324/',
       'http://www.imdb.com/title/tt0017925/',
       'http://www.imdb.com/title/tt0018578/',
       'http://www.imdb.com/title/tt0019421/',
       'http://www.imdb.com/title/tt0020815/',
       'http://www.imdb.com/title/tt0021079/',
       'http://www.imdb.com/title/tt0022753/',
       ...
       'http://www.imdb.com/title/tt0439504/',
       'http://www.imdb.com/title/tt0457530/',
       'http://www.imdb.com/title/tt0495747/',
       'http://www.imdb.com/title/tt1384925/',
       'http://www.imdb.com/title/tt1388402/',
       'http://www.imdb.com/title/tt1555110/',
       'http://www.imdb.com/title/tt2014202/',
       'http://www.imdb.com/title/tt2140381/',
       'http://www.imdb.com/title/tt4893452/',
       'http://www.imdb.com/title/tt5143890/'],
      dtype='object', length=29717)

In [21]:
# The DataFrame index holds each movie's IMDb title URL; copy it to a list.
movie_urls = list(movies.index)

In [22]:
len(movie_urls)

29717

In [23]:
# Put the URL strings in descending (reverse-sorted) order.
movie_urls = sorted(movie_urls,reverse = True)

In [24]:
# movies.columns

In [25]:
# Get Movie Details
import re
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import time

# Module-level state shared by the scraper functions below.

# Counter incremented once per scraped movie; reset to 0 after each
# 1000-movie export in go_to_next_and_export().
movies_scraped = 0
# Maps movie URL -> dict of scraped fields for the batch not yet exported.
movie_details = {}
# Position in movie_urls of the next URL to scrape; advanced by movie_scraper().
movie_index = 0
# Prepended to the relative hrefs found on the page to build absolute URLs.
prefix = 'http://www.imdb.com'
# Number used in the next export's file name, plus its string form.
export_number = 1
export_no = str(export_number)
# Retry counter read and updated by request_page().
load_attempts = 0

def movie_scraper(stop=3000):
    """Scrape the detail page of each URL in ``movie_urls[movie_index:stop]``.

    The module-level ``movie_index`` is advanced after every URL, so calling
    this again resumes where the previous run stopped. Whatever is left in
    ``movie_details`` when the loop ends is written out with
    ``export_to_json``.

    Args:
        stop: Exclusive end position in ``movie_urls`` for this run. Defaults
            to 3000, the limit the original code hard-coded.
    """
    global movie_index

    print(str(datetime.now()),': working on export ' + export_no)

    for url in movie_urls[movie_index:stop]:
        # Every 20th URL: pause briefly and print a progress line.
        if (movie_index+1) % 20 == 0:
            time.sleep(1.5)
            print('%d: %s' %(movie_index+1, url))
        page = request_page(url)
        if page is None:
            # request_page gives None when it has no page text (e.g. a 404);
            # passing None on to the parser would raise, so skip this URL.
            print('skipping %s: no page content' %(url))
        else:
            get_movie_details(url, page)
        movie_index += 1

    export_to_json(movie_details)
    print("Done")
    
def request_page(url, max_attempts=30):
    """Fetch *url* and return the response body, retrying on failure.

    A 200 response returns its text. A 404 is reported and returns ``None``
    immediately. Any other status, or a network-level error, is reported and
    retried after a pause: 10 seconds normally, 60 seconds on every tenth
    consecutive failure. The module-level ``load_attempts`` holds the number
    of consecutive failures for the current URL.

    Args:
        url: Address of the page to download.
        max_attempts: Number of failed attempts after which the function
            gives up and returns ``None``.

    Returns:
        The page text, or ``None`` if the page is missing (404) or could not
        be loaded within ``max_attempts`` tries.
    """
    global load_attempts
    load_attempts = 0

    # Iterate rather than recurse: the recursive form discarded the retried
    # page (nothing was returned from the retry) and had no upper bound.
    while True:
        try:
            # Without a timeout a stalled connection would block forever.
            response = requests.get(url, timeout=30)
        except requests.exceptions.RequestException as error:
            print('request failed for %s: %s' %(url, error))
        else:
            if response.status_code == 200:
                load_attempts = 0
                return response.text
            print(response.status_code,'for %s' %(url))
            if response.status_code == 404:
                return None

        load_attempts += 1
        if load_attempts >= max_attempts:
            return None
        # Back off longer on every tenth consecutive failure.
        time.sleep(60 if load_attempts % 10 == 0 else 10)

# (output key, label regex) pairs whose value is the text of the first link
# in the labelled row of the 'titleDetails' section.
_LINK_TEXT_FIELDS = (
    ('Production Company', 'Production Co:'),
    ('Country', 'Country:'),
    ('Language', 'Language:'),
    ('Primary Filming Location', 'Filming Locations:'),
)

# (output key, label regex) pairs whose value is the labelled row's text
# reduced to its digits.
# TODO (carried over from the original): rename gross to gross USA in the
# listing scrape.
_DIGIT_FIELDS = (
    ('Budget', 'Budget:'),
    ('Opening Weekend USA', 'Opening Weekend USA:'),
    ('Cumulative Worldwide Gross', 'Cumulative Worldwide Gross:'),
    ('Gross USA', 'Gross USA:'),
)


def _label_row(container, *labels):
    """Return the grandparent element of the first label text found, or None.

    Labels are regex patterns tried in order; *container* may be None.
    """
    if not container:
        return None
    for label in labels:
        node = container.find(text=re.compile(label))
        if node:
            return node.parent.parent
    return None


def _first_link_text(row):
    """Return the text of the first ``<a>`` inside *row*, or None."""
    link = row.find('a') if row else None
    return link.text if link else None


def _digits_only(row):
    """Return *row*'s stripped text with every non-digit removed, or None."""
    text = row.text.strip() if row else None
    return re.sub('[^0-9]+','', text) if text else None


def _absolute_url(href):
    """Return ``prefix`` + *href* without its query string, or None if empty."""
    return prefix+href.split("?")[0] if href else None


def get_movie_details(url, page):
    """Parse one movie page and store its fields in ``movie_details[url]``.

    Fields that cannot be found on the page are stored as ``None`` instead of
    raising, so a page with an unexpected layout still yields a record.
    Finishes by calling ``go_to_next_and_export()``.

    Args:
        url: Movie URL, used as the key in ``movie_details``.
        page: HTML text of the movie page.
    """
    soup = BeautifulSoup(page, 'html.parser')

    details = {}
    movie_details[url] = details

    plot_wrapper = soup.find('div', class_='plot_summary_wrapper')

    # Director: first link in the 'Director:' (or 'Directors:') row.
    # _label_row copes with a missing plot wrapper, which previously left the
    # lookup variable unassigned and raised UnboundLocalError.
    director_row = _label_row(plot_wrapper, 'Director:', 'Directors:')
    director_link = director_row.find('a') if director_row else None
    details['Director'] = director_link.text if director_link else None
    details['Director_URL'] = _absolute_url(
        director_link.get('href') if director_link else None)

    # Stars: every link in the 'Stars:' (or 'Star:') row.
    star_row = _label_row(plot_wrapper, 'Stars:', 'Star:')
    star_links = star_row.findAll('a') if star_row else []
    if len(star_links) == 4:
        # With exactly four links the last one is dropped, as before;
        # presumably it is not a cast member (e.g. a "full cast" link).
        star_links = star_links[:-1]
    for position, link in enumerate(star_links, start=1):
        name = link.text
        details['Star%s' %str(position)] = name.split('\n')[0].strip() if name else None
        details['Star%s_URL' %str(position)] = _absolute_url(link.get('href'))

    # Title: first child of the <h1 itemprop="name"> heading, with
    # non-breaking spaces removed.
    title_wrapper = soup.find('div', class_='title_wrapper')
    heading = title_wrapper.find('h1',itemprop='name') if title_wrapper else None
    if heading and heading.contents:
        details['Title'] = heading.contents[0].replace(u'\xa0', u'')
    else:
        details['Title'] = None

    # Release date: the <meta> tag inside the title wrapper's last link,
    # reduced to digits and hyphens.
    wrapper_links = title_wrapper.findAll('a') if title_wrapper else None
    last_link = wrapper_links[-1] if wrapper_links else None
    date_meta = last_link.find('meta') if last_link else None
    details['Release Date'] = re.sub('[^0-9-]+','', str(date_meta)) if date_meta else None

    title_details = soup.find(id='titleDetails')

    for key, label in _LINK_TEXT_FIELDS:
        details[key] = _first_link_text(_label_row(title_details, label))

    for key, label in _DIGIT_FIELDS:
        details[key] = _digits_only(_label_row(title_details, label))

    go_to_next_and_export()

def go_to_next_and_export():
    """Count one scraped movie and flush the batch once it holds 1000 entries.

    Increments the module-level ``movies_scraped``. When ``movie_details``
    contains exactly 1000 entries it is written out with ``export_to_json``,
    after which the counter is reset and a fresh, empty dict is started.
    """
    global movies_scraped, movie_details

    movies_scraped = movies_scraped + 1

    # Nothing more to do until the batch is full.
    if len(movie_details) != 1000:
        return

    export_to_json(movie_details)
    movies_scraped, movie_details = 0, {}

def export_to_json(movie_details):
    """Write *movie_details* to the current numbered export file.

    The file name is built from the module-level ``export_no``. After the
    write, ``export_number`` is incremented, ``export_no`` is refreshed to
    match, and a timestamped message naming the next export is printed.
    """
    global export_no, export_number

    target = 'movie_details_export_%s.json' %(export_no)
    with open(target,'w') as out_file:
        json.dump(movie_details,out_file)

    # Advance to the next export number and keep its string form in sync.
    export_number = export_number + 1
    export_no = str(export_number)
    timestamp = str(datetime.now())
    print(timestamp,': working on export ' + export_no)

# Start the scrape as soon as this cell runs.
movie_scraper()