# Scrape Movies per Genre

In [1]:
import os
import sys 

import requests

from bs4 import BeautifulSoup

import selenium
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

import time

In [2]:
# Start a Chrome session through chromedriver and open the IMDb genre index.
# The `driver` created here is the browser session that genre_scraper() reads
# pages from and navigates.
chromedriver = "/Applications/chromedriver"  # local path to the chromedriver executable
# Also publish the path through the environment.
os.environ["webdriver.chrome.driver"] = chromedriver

# Genre index page; genre_scraper() parses whatever page the driver has loaded.
url = 'http://www.imdb.com/feature/genre/'

# NOTE(review): passing the executable path positionally looks like the
# Selenium 3 calling convention -- confirm against the installed version.
driver = webdriver.Chrome(chromedriver)
driver.get(url)

In [3]:
# Scrape Categories and then Scrape Movies from those Categories
#
# Module-level state shared with genre_scraper(), which reads and rebinds
# these names through `global` declarations.
import re

pages_scraped = 0  # result pages scraped since the last JSON export
movies = {}  # movie URL -> dict of scraped fields, accumulated between exports
category_urls = []  # one search URL per genre, appended to by genre_scraper()
category_index = 0  # position in category_urls of the genre being scraped
prefix = 'http://www.imdb.com'  # prepended to the relative hrefs found in listings
# Fetched once, here, with `requests` for the page the driver currently shows;
# it is not refreshed afterwards.
response = requests.get(driver.current_url)
pages_scraped_current_genre = 0  # successful "next page" clicks within the current genre
# String snapshot of the counter above taken at definition time (always '0');
# not referenced anywhere else in the visible code.
genre_scrapes = str(pages_scraped_current_genre)
export_number = 1  # sequence number of the next export file
export_no = str(export_number)  # string form of export_number, used in file names and log lines

# Not referenced anywhere else in the visible code.
csv_columns = []

def genre_scraper():
    """Scrape IMDb feature-film listings for every genre and export them as JSON.

    The genre names are read from the page currently loaded in the
    module-level ``driver``. For each genre the matching search page is opened
    and followed through its "next" links; every listing becomes one entry in
    the module-level ``movies`` dict, keyed by the movie URL.

    After every 100 scraped pages the accumulated records are written to
    ``genre_export_<n>.json`` and the buffer is cleared. Once all genres are
    exhausted the remaining records are written to one final export file.

    Module-level names read: ``driver``, ``prefix``, ``export_no``.
    Module-level names updated: ``category_urls`` (appended to),
    ``category_index``, ``pages_scraped``, ``pages_scraped_current_genre``,
    ``movies``, ``export_number``, ``export_no``.

    Returns:
        None.
    """
    # Imported locally so the function is self-contained: ``json`` is needed
    # by the export step and is not imported at the top of the file.
    import json

    from selenium.common.exceptions import WebDriverException
    from selenium.webdriver.common.by import By

    global category_index
    global pages_scraped_current_genre

    max_categories = 24      # upper bound on the number of genres visited
    pages_per_export = 100   # flush the buffer to disk after this many pages
    page_delay = 0.2         # seconds to pause after moving to the next page

    print('Working on export ' + export_no)

    def scrape_categories():
        """Append one search URL per genre link to ``category_urls``."""
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        # The genre links are taken from the sixth 'article' div of the page.
        movie_section = soup.findAll('div', class_='article')[5]
        genre_items = movie_section.findAll('a', href=True)

        for item in genre_items:
            # "Film Noir (123)" -> "film-noir"
            name = item.text.split('(')[0].strip().lower().replace(' ', '-')
            category_urls.append(
                'http://www.imdb.com/search/title?genres=%s&title_type=feature&explore=genres' % (name)
            )

    def scrape_current_category():
        """Scrape every result page of the genre at ``category_index``."""
        global pages_scraped
        global pages_scraped_current_genre
        global movies

        driver.get(category_urls[category_index])

        # Iterate instead of recursing once per page, so a long crawl cannot
        # run into Python's recursion limit.
        while True:
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            for listing in soup.find_all("div", class_='lister-item-content'):
                scrape_details(listing)

            pages_scraped += 1
            if pages_scraped == pages_per_export:
                export_to_json(movies)
                pages_scraped = 0
                movies = {}

            try:
                driver.find_element(By.CLASS_NAME, "lister-page-next").click()
            except WebDriverException:
                # No usable "next" link: this genre is finished.
                return

            pages_scraped_current_genre += 1
            time.sleep(page_delay)

    def scrape_details(listing):
        """Extract one listing's fields into ``movies``; missing fields become None."""
        link = listing.find('a')
        if link is None or not link.get('href'):
            # Without a title link there is no URL to key the record on.
            return

        url = prefix + link['href'].split("?")[0]
        record = {}
        movies[url] = record

        record['Title'] = link.text

        year = listing.find('span', class_='lister-item-year text-muted unbold')
        # Only run the clean-up when a year is present; re.sub rejects None.
        record['Year'] = re.sub('[^a-za-z0-9]+', '', year.text) if year else None

        rating = listing.find('div', class_='inline-block ratings-imdb-rating')
        record['Rating'] = rating.text.split('(')[0].strip() if rating else None

        # Votes and gross are read from the spans of the same paragraph.
        numbers = listing.find('p', class_='sort-num_votes-visible')
        number_spans = numbers.findAll('span') if numbers else []
        record['Votes'] = number_spans[1].text if len(number_spans) > 1 else None

        metascore = listing.find('div', class_='inline-block ratings-metascore')
        record['Metascore'] = (
            metascore.text[0:4].split('(')[0].strip() if metascore else None
        )

        if len(number_spans) >= 5:
            record['Gross'] = number_spans[4].text.replace('$', '').replace('M', '')
        else:
            record['Gross'] = None

        genre = listing.find('span', class_='genre')
        record['Genre'] = genre.text.split('(')[0].strip() if genre else None

        certificate = listing.find('span', class_='certificate')
        record['Certificate'] = certificate.text if certificate else None

        runtime = listing.find('span', class_='runtime')
        record['Runtime'] = runtime.text if runtime else None

        # The third paragraph holds the people links: the first anchor is
        # stored as the director, the next four as stars.
        paragraphs = listing.findAll('p')
        people_links = paragraphs[2].findAll('a') if len(paragraphs) > 2 else []

        record['Director'] = people_links[0].text if people_links else None

        for position, star in enumerate(people_links[1:5], start=1):
            record['Star%s' % position] = star.text
            star_href = star.get('href')
            if not star_href:
                # As before, stop at the first star without a link.
                break
            record['Star%s_URL' % position] = prefix + star_href

    def export_to_json(movies):
        """Write ``movies`` to the next numbered export file and advance the counter."""
        global export_no
        global export_number

        with open('genre_export_%s.json' % (export_no), 'w') as f:
            json.dump(movies, f)

        export_number += 1
        export_no = str(export_number)
        print('Working on export ' + export_no)

    scrape_categories()

    # Never index past the genres actually found on the page.
    category_limit = min(len(category_urls), max_categories)
    while category_index < category_limit:
        scrape_current_category()
        category_index += 1
        pages_scraped_current_genre = 0

    # Flush whatever was collected since the last periodic export.
    export_to_json(movies)
    print("Done")

# Run the scrape against the page currently loaded in `driver`.
genre_scraper()