In [1]:
import re, os
import urllib
import urllib.request
from datetime import datetime
from random import randint
from time import sleep, time
from warnings import warn

import pandas as pd
from bs4 import BeautifulSoup
from IPython.core.display import clear_output
from requests import get

In [2]:
# Fresh, empty accumulators -- one independent list per scraped attribute.
# The scraping loop appends to them side by side, one entry per movie.
movie_ids, names, years = [], [], []
imdb_ratings, metascores, votes = [], [], []
genres, run_times = [], []
directors, actors = [], []

In [3]:
# Directory that receives the exported CSV files. The original location stays
# the default, but it can now be overridden without editing the notebook by
# setting the IMDB_OUTPUT_PATH environment variable.
output_path = os.environ.get('IMDB_OUTPUT_PATH', 'D:\\Data\\IMDB\\Scrapped')
# Timestamp of this run (minute resolution); it is embedded in export file names.
date = datetime.today().strftime('%Y-%m-%d %H-%M')

## Get basic movie information using Advanced Title Search

In [None]:
page_size = 250  # number of results requested per search page

# Preparing the monitoring of the loop
start_time = time()
requests = 0  # running count of search requests (a plain counter, not the `requests` library)
year_urls = [str(i) for i in range(2000, 2019)]  # release years 2000 through 2018

# Ids collected so far, mirrored in a set so the duplicate check below is a
# constant-time lookup instead of a scan over the ever-growing movie_ids list.
seen_movie_ids = set(movie_ids)

for year_url in year_urls:
    crawl_complete = False
    page = 0
    while not crawl_complete:
        search_url = ('https://www.imdb.com/search/title'
                      '?title_type=feature'
                      '&release_date=' + year_url +
                      '&countries=us&languages=en'
                      '&count=' + str(page_size) +
                      '&start=' + str(page * page_size + 1))
        response = get(search_url)
        page = page + 1
        print(search_url)

        # Pause the loop
        sleep(randint(2, 4))

        # Monitor the requests
        requests += 1
        elapsed_time = time() - start_time
        print('Request:{}; Elapsed:{}; Frequency: {} requests/s'.format(requests, elapsed_time, requests/elapsed_time))
        clear_output(wait = True)

        # Throw a warning for non-200 status codes
        if response.status_code != 200:
            warn('Request: {}; Status code: {}'.format(requests, response.status_code))

        # Parse the content of the request with BeautifulSoup
        page_html = BeautifulSoup(response.text, 'html.parser')

        # Select all the movie containers of this page (up to page_size of them)
        mv_containers = page_html.find_all('div', class_ = 'lister-item mode-advanced')

        # An empty page means the result list for this year is exhausted.
        if len(mv_containers) == 0:
            print("No container. Crawl Complete for year: " + str(year_url))
            crawl_complete = True
            break

        # Scrape some attributes for each movie
        for container in mv_containers:
            # Scrape the movie id
            try:
                result = re.search('/title/(.*)/(.*)', container.h3.a['href'])
                movie_id = result.group(1)
            except Exception:
                movie_id = None

            # A repeated id is treated as the end of this year's results.
            # An id that could not be parsed (None) is not a repeat: it must
            # not end the crawl just because an earlier id also failed to parse.
            if movie_id is not None and movie_id in seen_movie_ids:
                print("Movie already exists. Crawl Complete for year: " + str(year_url))
                crawl_complete = True
                break
            seen_movie_ids.add(movie_id)
            movie_ids.append(movie_id)

            # Scrape the name
            try:
                name = container.h3.a.text
            except Exception:
                name = None
            names.append(name)

            # Scrape the year
            try:
                year_text = container.h3.find('span', class_ = 'lister-item-year').text
                # Raw string: the pattern relies on regex escapes such as \( and \d.
                result = re.search(r'\((\d{4})\)$', year_text)
                year = result.group(1)
            except Exception:
                year = None
            years.append(year)

            # Scrape the IMDB rating
            try:
                imdb = float(container.strong.text)
            except Exception:
                imdb = None
            imdb_ratings.append(imdb)

            # Scrape the Metascore
            # NOTE: a missing Metascore is stored as 0 (not None), unlike the
            # other fields; kept as-is so previously exported files stay comparable.
            try:
                m_score = int(container.find('span', class_ = 'metascore').text)
            except Exception:
                m_score = 0
            metascores.append(m_score)

            # Scrape the number of votes
            try:
                vote = int(container.find('span', attrs = {'name':'nv'})['data-value'])
            except Exception:
                vote = None
            votes.append(vote)

            # Scrape the genres
            try:
                genre = container.find('span', class_ = 'genre').text.strip()
            except Exception:
                genre = None
            genres.append(genre)

            # Scrape the run time
            try:
                runtime = container.find('span', class_ = 'runtime').text
                runtime = int(runtime.replace('min', '').strip())
            except Exception:
                runtime = None
            run_times.append(runtime)

            # Scrape director and actors
            # Reset both for every movie so a value can never leak over from
            # the previous container (or be unbound on the very first one).
            director = None
            actor = None
            try:
                people_div = container.find("div", class_="ratings-bar").find_next_sibling("p").find_next_sibling("p")
                people_texts = people_div.text.replace('\n','').strip().split('|')

                # Each '|'-separated part is checked against both labels, so a
                # credits line holding only one of them is still captured.
                for people_text in people_texts:
                    people_text = people_text.strip()
                    director_match = re.search('Director(s?):(.*)', people_text)
                    if director_match is not None:
                        if director is None:
                            director = director_match.group(2)
                        continue
                    actor_match = re.search('Star(s?):(.*)', people_text)
                    if actor_match is not None and actor is None:
                        actor = actor_match.group(2)
            except Exception:
                # Credits paragraph not found where expected: record both as missing.
                director = None
                actor = None

            directors.append(director)
            actors.append(actor)

In [None]:
# Assemble the per-attribute lists into one table, one row per scraped movie.
movie_ratings = pd.DataFrame({'movie_id': movie_ids,
                              'name': names,
                              'year': years,
                              'rating': imdb_ratings,
                              'metascore': metascores,
                              'votes': votes,
                              'genre': genres,
                              'runtime': run_times,
                              'director': directors,
                              'actor' : actors})
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() only added a stray "None" line to the output.
movie_ratings.info()
movie_ratings.head(5)

Export data frames to csv

In [None]:
# Create the target directory if needed, so the result of a long scrape is not
# lost to a missing folder at the very last step.
os.makedirs(output_path, exist_ok=True)
# Semicolon-separated, UTF-8, without the index column; the file name carries the run timestamp.
movie_ratings.to_csv(os.path.join(output_path, 'movie_ratings_' + date + '.csv'), sep=';', encoding= 'utf8', index=False)

## Get budget and revenue info by scraping individual movie pages

In [4]:
# Sent with every title-page request to prevent localization in results.
headers = {"Accept-Language": "en-US, en;q=0.5"}

def make_page_request(url):
    """Open ``url`` with the module-level ``headers`` and return the response.

    The object returned is whatever ``urllib.request.urlopen`` yields; it is
    handed back still open, so reading and closing it is up to the caller.
    """
    return urllib.request.urlopen(urllib.request.Request(url, headers=headers))

def get_page_html(page):
    """Parse ``page`` with BeautifulSoup's 'html.parser' and return the result."""
    soup = BeautifulSoup(page, 'html.parser')
    return soup

def get_release_date(html):
    """Return the text that follows the "Release Date:" label, or None.

    Args:
        html: parsed page; only its ``find`` method is used here.

    Returns:
        The stripped text located two ``.next`` steps after the matching
        ``<h4>`` element, or ``None`` when the label is absent or the lookup
        fails for any ordinary reason.
    """
    try:
        # `string=` is the current name of the filter formerly spelled `text=`.
        label = html.find("h4", string="Release Date:")
        # Presumably the first .next is the label's own text and the second
        # one the value that follows it -- confirm against the page layout.
        return label.next.next.strip()
    except Exception:
        # Best effort: an unexpected layout yields None. Unlike a bare
        # `except:`, KeyboardInterrupt still propagates so a long scrape
        # can be stopped by the user.
        return None

def get_revenue(html):
    """Return the amount that follows the "Gross USA:" label as an int, or None.

    Args:
        html: parsed page; only its ``find`` method is used here.

    Returns:
        The integer obtained by dropping the first character of the value
        text (the currency symbol, for values such as "$1,234") and removing
        thousands separators, or ``None`` when the label is absent or the
        value cannot be parsed.
    """
    try:
        # `string=` is the current name of the filter formerly spelled `text=`.
        label = html.find("h4", string="Gross USA:")
        # Surrounding whitespace and a trailing comma are removed first.
        amount_text = label.next.next.strip().strip(',')
        return int(amount_text[1:].replace(',', ''))
    except Exception:
        # Best effort: missing or unparsable values yield None. Unlike a bare
        # `except:`, KeyboardInterrupt still propagates.
        return None
    
def get_revenue_date(html):
    """Return the text of the element after the "Gross USA:" value, or None.

    Args:
        html: parsed page; only its ``find`` method is used here.

    Returns:
        ``.text`` of the node reached via ``next_sibling.next`` from the
        matching ``<h4>`` element (presumably the date attached to the gross
        figure -- confirm against the page layout), or ``None`` when the
        label is absent or the lookup fails.
    """
    try:
        # `string=` is the current name of the filter formerly spelled `text=`.
        label = html.find("h4", string="Gross USA:")
        return label.next_sibling.next.text
    except Exception:
        # Best effort: an unexpected layout yields None. Unlike a bare
        # `except:`, KeyboardInterrupt still propagates.
        return None

def get_budget(html):
    """Return the amount that follows the "Budget:" label as an int, or None.

    Args:
        html: parsed page; only its ``find`` method is used here.

    Returns:
        The integer obtained by dropping the first character of the value
        text (the currency symbol, for values such as "$1,234") and removing
        thousands separators, or ``None`` when the label is absent or the
        value cannot be parsed (for example a multi-letter currency prefix).
    """
    try:
        # `string=` is the current name of the filter formerly spelled `text=`.
        label = html.find("h4", string="Budget:")
        # Surrounding whitespace and a trailing comma are removed first.
        amount_text = label.next.next.strip().strip(',')
        return int(amount_text[1:].replace(',', ''))
    except Exception:
        # Best effort: missing or unparsable values yield None. Unlike a bare
        # `except:`, KeyboardInterrupt still propagates.
        return None

Get the list of movie ids from the previously scraped movie data

In [52]:
# Load the consolidated ratings export (semicolon-separated, UTF-8, header in
# the first row); its movie_id and year columns drive the per-movie scrape.
m = pd.read_csv(
    os.path.join(output_path, 'movie_ratings_full.csv'),
    sep=';',
    header=0,
    encoding='utf8',
    engine='python',
)

In [53]:
base_url = 'https://www.imdb.com/title/'
# Release years to process. NOTE: this rebinds `years`, which the search-page
# scrape also uses as one of its result lists.
years = [2016]

for year in years:
    m_year = m[m.year==year]
    # Rows without an id (the search scrape stores None when an id cannot be
    # parsed) cannot be turned into a URL, so they are dropped up front.
    movie_ids = m_year['movie_id'].dropna().unique()

    # Per-year result lists; all of them grow together, one entry per page
    # that was fetched and parsed successfully.
    scraped_ids = []
    release_dates = []
    revenues = []
    revenue_dates = []
    budgets = []
    # Ids whose page could not be fetched or parsed.
    skipped_ids = []

    requests = 0  # number of successfully processed pages for this year
    start_time = time()
    for m_id in movie_ids:
        url = base_url + m_id + '/'
        try:
            # The response is closed as soon as it has been parsed.
            with make_page_request(url) as page:
                html = get_page_html(page)
        except Exception:
            # Best effort: remember the id and move on. KeyboardInterrupt is
            # not caught, so the loop can still be stopped by hand.
            skipped_ids.append(m_id)
            continue

        scraped_ids.append(m_id)

        release_date = get_release_date(html)
        release_dates.append(release_date)

        revenue = get_revenue(html)
        revenues.append(revenue)

        revenue_date = get_revenue_date(html)
        revenue_dates.append(revenue_date)

        budget = get_budget(html)
        budgets.append(budget)

        requests +=1
        if requests % 25 == 0:
            # Pause the loop
            sleep(randint(1, 2))

            # Monitor the requests
            elapsed_time = time() - start_time
            print('Request:{}; Elapsed:{}; Frequency: {} requests/s'.format(requests, elapsed_time, requests/elapsed_time))
            clear_output(wait = True)

    # scraped_ids lines up with the value lists by construction, so no
    # after-the-fact filtering of movie_ids against skipped_ids is needed.
    movie_finance = pd.DataFrame({'movie_id': scraped_ids,
                                  'release_date': release_dates,
                                  'budget': budgets,
                                  'revenue': revenues,
                                  'revenue_date': revenue_dates})

    # Create the target directory if needed so the scraped data is not lost
    # to a missing folder.
    os.makedirs(output_path, exist_ok=True)
    movie_finance.to_csv(os.path.join(output_path, 'movie_finance-' + str(year) + '_' + date + '.csv'),
                         sep=';', encoding= 'utf8', index=False)

Request:3525; Elapsed:5538.783479690552; Frequency: 0.6364213392571431 requests/s


In [None]:
# Rebuild the finance table from the lists left behind by the last processed
# year. Ids whose page was skipped have no entry in the value lists, so they
# are filtered out here as well; otherwise the columns differ in length and
# the DataFrame constructor raises.
movie_finance = pd.DataFrame({'movie_id': [item for item in movie_ids if item not in skipped_ids],
                              'release_date': release_dates,
                              'budget': budgets,
                              'revenue': revenues,
                              'revenue_date': revenue_dates})
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() only added a stray "None" line to the output.
movie_finance.info()
movie_finance.head(5)

Export data frames to csv