# Peaky Blinders IMDb Ratings, Part III

This notebook follows this <a href="https://www.dataquest.io/blog/web-scraping-beautifulsoup/">tutorial</a> on scraping IMDb with Beautiful Soup.

In [3]:
# Scraping goal: pull the following fields
# tconst value in URL
# Parent tconst 
# Season #
# Episode #
# Average vote out of 10

In [5]:
from requests import get

# First results page for titles with release_date=2017, sorted by number of votes (descending)
url = 'http://www.imdb.com/search/title?release_date=2017&sort=num_votes,desc&page=1'

# Without a timeout, requests waits indefinitely when the server never answers,
# which would hang the notebook; 10 seconds is generous for a single page
response = get(url, timeout=10)

# Show the first 500 characters of the returned HTML as a quick sanity check
print(response.text[:500])




<!DOCTYPE html>
<html
    xmlns:og="http://ogp.me/ns#"
    xmlns:fb="http://www.facebook.com/2008/fbml">
    <head>
         
        <meta charset="utf-8">
        <meta http-equiv="X-UA-Compatible" content="IE=edge">

    <meta name="apple-itunes-app" content="app-id=342792525, app-argument=imdb:///?src=mdot">



        <script type="text/javascript">var IMDbTimer={starttime: new Date().getTime(),pt:'java'};</script>

<script>
    if (typeof uet == 'function') {
      uet("bb", "LoadTitle"


In [6]:
from bs4 import BeautifulSoup

# Build a parse tree from the downloaded HTML with the 'html.parser' backend
html_soup = BeautifulSoup(markup=response.text, features='html.parser')

# Display the type of the parsed object
type(html_soup)

bs4.BeautifulSoup

In [10]:
# Collect every <div> whose class attribute is 'lister-item mode-advanced';
# each one wraps a single search result
movie_containers = html_soup.find_all('div', attrs={'class': 'lister-item mode-advanced'})

# Report the result type and the number of containers found, one per line
print(type(movie_containers), len(movie_containers), sep='\n')

<class 'bs4.element.ResultSet'>
50


In [16]:
# Take the first container (index 0) to explore the markup of a single result
first_movie = movie_containers[0]
# Bare last expression: the notebook displays the container's HTML
first_movie

<div class="lister-item mode-advanced">
<div class="lister-top-right">
<div class="ribbonize" data-caller="filmosearch" data-tconst="tt3315342"></div>
</div>
<div class="lister-item-image float-left">
<a href="/title/tt3315342/"> <img alt="Logan" class="loadlate" data-tconst="tt3315342" height="98" loadlate="https://m.media-amazon.com/images/M/MV5BYzc5MTU4N2EtYTkyMi00NjdhLTg3NWEtMTY4OTEyMzJhZTAzXkEyXkFqcGdeQXVyNjc1NTYyMjg@._V1_UX67_CR0,0,67,98_AL_.jpg" src="https://m.media-amazon.com/images/G/01/imdb/images/nopicture/large/film-184890147._CB466725069_.png" width="67"/>
</a> </div>
<div class="lister-item-content">
<h3 class="lister-item-header">
<span class="lister-item-index unbold text-primary">1.</span>
<a href="/title/tt3315342/">Logan</a>
<span class="lister-item-year text-muted unbold">(2017)</span>
</h3>
<p class="text-muted">
<span class="certificate">18A</span>
<span class="ghost">|</span>
<span class="runtime">137 min</span>
<span class="ghost">|</span>
<span class="genre">
A

In [22]:
# The title is the text of the first link inside the container's <h3> header
first_name = first_movie.find('h3').find('a').get_text()
first_name

'Logan'

In [25]:
# The release year is the text of the header's 'lister-item-year' span.
# Matching on that single class (instead of the full
# 'lister-item-year text-muted unbold' string) keeps the lookup working if the
# extra styling classes are changed or reordered.
first_year = first_movie.h3.find('span', class_='lister-item-year').text
first_year

'(2017)'

In [30]:
# The IMDb rating is the text of the first <strong> tag in the container,
# converted to a float
first_rating = float(first_movie.find('strong').get_text())
first_rating

8.1

In [35]:
# The Metascore is the text of the span carrying the 'metascore' class,
# converted to an integer
first_mscore = int(first_movie.find('span', attrs={'class': 'metascore'}).get_text())
first_mscore

77

In [43]:
# The vote count is read from the 'data-value' attribute of the span whose
# name attribute is 'nv', then converted to an integer
first_votes = int(first_movie.find('span', {'name': 'nv'}).attrs['data-value'])
first_votes

612536

In [45]:
# The title identifier (tconst) is the 'data-tconst' attribute of the
# 'userRatingValue' span. It is named first_tconst to match the other first_*
# values, and so the name movie_tconst is only ever used for the list of
# identifiers built later in the notebook.
first_tconst = first_movie.find('span', class_='userRatingValue')['data-tconst']
first_tconst

'tt3315342'

In [46]:
# One empty accumulator list per scraped field; the generator expression
# creates a separate list object for each of the five names
movie_names, movie_year, imdb_ratings, votes, movie_tconst = ([] for _ in range(5))

In [47]:
# Extract data from each movie container
for container in movie_containers:
    # Read every field before touching the lists: if one lookup fails, nothing
    # has been appended yet, so the five lists always stay the same length

    # Movie name
    name = container.h3.a.text

    # Movie year; match on the single 'lister-item-year' class so the lookup
    # does not depend on the extra styling classes or their order
    year = container.h3.find('span', class_='lister-item-year').text

    # Movie IMDb Rating
    rating = float(container.strong.text)

    # Number of votes
    vote = int(container.find('span', attrs = {'name':'nv'})['data-value'])

    # Movie tconst
    the_tconst = container.find('span', class_='userRatingValue')['data-tconst']

    # Store the complete record
    movie_names.append(name)
    movie_year.append(year)
    imdb_ratings.append(rating)
    votes.append(vote)
    movie_tconst.append(the_tconst)

In [48]:
# Use Pandas to chart out the lists 
import pandas as pd

In [51]:
# Assemble the scraped lists into a DataFrame, one column per field
test_df = pd.DataFrame({'movie': movie_names,
                        'year': movie_year,
                        'imdb': imdb_ratings,
                        'votes': votes,
                        'tconst': movie_tconst})

# info() prints its summary itself and returns None, so it is called directly;
# wrapping it in print() adds a stray "None" line to the output
test_df.info()
test_df

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
movie     50 non-null object
year      50 non-null object
imdb      50 non-null float64
votes     50 non-null int64
tconst    50 non-null object
dtypes: float64(1), int64(1), object(3)
memory usage: 2.1+ KB
None


Unnamed: 0,movie,year,imdb,votes,tconst
0,Logan,(2017),8.1,612536,tt3315342
1,Thor: Ragnarok,(2017),7.9,547661,tt3501632
2,Guardians of the Galaxy Vol. 2,(2017),7.6,536382,tt3896198
3,Star Wars: Episode VIII - The Last Jedi,(2017),7.0,535054,tt2527336
4,Wonder Woman,(2017),7.4,525383,tt0451279
5,Dunkirk,(2017),7.9,514006,tt5013056
6,Spider-Man: Homecoming,(2017),7.4,481473,tt2250912
7,Get Out,(I) (2017),7.7,456895,tt5052448
8,It,(I) (2017),7.3,431413,tt1396484
9,Blade Runner 2049,(2017),8.0,427375,tt1856101


In [53]:
# Scrape other webpages by changing the URL
# Page numbers 1 through 4 as strings, ready to be concatenated into the URL
pages = list(map(str, range(1, 5)))
pages

['1', '2', '3', '4']

In [54]:
# Years 2000 through 2017 (the range stops before 2018) as strings for the URL
years_url = list(map(str, range(2000, 2018)))
years_url

['2000',
 '2001',
 '2002',
 '2003',
 '2004',
 '2005',
 '2006',
 '2007',
 '2008',
 '2009',
 '2010',
 '2011',
 '2012',
 '2013',
 '2014',
 '2015',
 '2016',
 '2017']

In [55]:
# Throttle the crawl rate: sleep() pauses the loop between iterations, and
# randint() varies the pause length to mimic human behaviour
from time import sleep
from random import randint

for _ in range(5):
    print('Blah')

    # Wait a random whole number of seconds (1 to 4 inclusive) before the next pass
    sleep(randint(1, 4))

Blah
Blah
Blah
Blah
Blah


In [60]:
# Track the frequency/speed of requests to make sure the program is not
# overloading the server, which could get the IP address banned
from time import time
# clear_output is imported from IPython.display, the public display API;
# importing it from IPython.core.display is deprecated in recent IPython releases
from IPython.display import clear_output

start_time = time()
requests = 0

for _ in range(5):
    # Count each (simulated) request; the real crawl uses this counter to
    # terminate the loop if the expected number of requests is exceeded
    requests += 1

    sleep(randint(1,10))
    elapsed_time = time() - start_time
    print('Request #: {}; Frequency: {:.4f} request per second'.format(requests, requests/elapsed_time))

    # wait=True delays clearing until new output arrives, so only the most
    # recent request and frequency info is shown
    clear_output(wait = True)

Request #: 5; Frequency: 0.1470 request per second


In [64]:
# Monitor the status code of each response:
# a successful request has a status code of 200,
# so issue a warning whenever the status code is not 200
from warnings import warn
# Emit a sample warning to see how it appears in the notebook output
warn('Warning Simulation')

  """


In [77]:
pages = [str(i) for i in range(1,3)]
years_url = [str(i) for i in range(2000,2006)]
# Sent with every request; presumably so titles come back in English
headers = {"Accept-Language": "en-US, en;q=0.5"}

# One request is made per (year, page) pair, so this is the most the crawl
# should ever send
expected_requests = len(years_url) * len(pages)

# Lists to store the scraped data
movie_names = []
movie_year = []
imdb_ratings = []
votes = []
movie_tconst = []

# To monitor the loop
start_time = time()
requests = 0
stop_crawl = False

# For every year in years_url (2000 to 2005)
for year_url in years_url:

    # For every page in pages (1 and 2)
    for page in pages:

        # Make a get request
        response = get('http://www.imdb.com/search/title?release_date=' + year_url + '&sort=num_votes,desc&page=' + page, headers=headers)

        # Pause the loop
        sleep(randint(8,15))

        # Monitor the requests
        requests += 1
        elapsed_time = time() - start_time
        print('Request #: {}; Frequency: {:.4f} request per second'.format(requests, requests/elapsed_time))
        clear_output(wait = True)

        # If request is not successful, status code is not 200 so issue warning
        request_ok = response.status_code == 200
        if not request_ok:
            warn('Request: {}; Status code: {}'.format(requests, response.status_code))

        # Stop the crawl if number of requests is greater than expected
        if requests > expected_requests:
            warn('Number of requests was greater than expected. Quitting...')
            stop_crawl = True
            break

        # Skip failed requests instead of parsing an error page
        if not request_ok:
            continue

        # Parse content of the request/webpage with BeautifulSoup
        html_soup = BeautifulSoup(response.text, 'html.parser')

        # Select all movie containers by div class from that page
        movie_containers = html_soup.find_all('div', class_ = 'lister-item mode-advanced')

        # For every movie container on the page
        for container in movie_containers:
            # Read every field before touching the lists: if one lookup fails,
            # nothing has been appended yet, so the lists stay the same length

            # Movie name
            name = container.h3.a.text

            # Movie year; match on the single 'lister-item-year' class so the
            # lookup does not depend on the extra styling classes or their order
            year = container.h3.find('span', class_='lister-item-year').text

            # Movie IMDb Rating
            rating = float(container.strong.text)

            # Number of votes
            vote = int(container.find('span', attrs = {'name':'nv'})['data-value'])

            # Movie tconst
            the_tconst = container.find('span', class_='userRatingValue')['data-tconst']

            # Store the complete record
            movie_names.append(name)
            movie_year.append(year)
            imdb_ratings.append(rating)
            votes.append(vote)
            movie_tconst.append(the_tconst)

    # The break above only leaves the page loop; leave the year loop as well
    if stop_crawl:
        break

Request #: 12; Frequency: 0.0813 request per second


In [78]:
# Assemble the lists scraped across all years and pages into a DataFrame
test_df = pd.DataFrame({'movie': movie_names,
                        'year': movie_year,
                        'imdb': imdb_ratings,
                        'votes': votes,
                        'tconst': movie_tconst})

# info() prints its summary itself and returns None, so it is called directly;
# wrapping it in print() adds a stray "None" line to the output
test_df.info()
test_df

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600 entries, 0 to 599
Data columns (total 5 columns):
movie     600 non-null object
year      600 non-null object
imdb      600 non-null float64
votes     600 non-null int64
tconst    600 non-null object
dtypes: float64(1), int64(1), object(3)
memory usage: 23.6+ KB
None


Unnamed: 0,movie,year,imdb,votes,tconst
0,Gladiator,(2000),8.5,1284694,tt0172495
1,Memento,(2000),8.4,1079990,tt0209144
2,Snatch,(2000),8.3,754909,tt0208092
3,Requiem for a Dream,(2000),8.3,736361,tt0180093
4,X-Men,(2000),7.4,555955,tt0120903
...,...,...,...,...,...
595,Hard Candy,(2005),7.1,150507,tt0424136
596,Criminal Minds,(2005–2020),8.1,149046,tt0452046
597,Rome,(2005–2007),8.7,148527,tt0384766
598,Flightplan,(2005),6.3,148424,tt0408790


In [73]:
# Where his code differs from mine
pages = [str(i) for i in range(1,3)]
years_url = [str(i) for i in range(2000,2006)]
# Sent with every request; presumably so titles come back in English
headers = {"Accept-Language": "en-US, en;q=0.5"}

# One request is made per (year, page) pair, so this is the most the crawl
# should ever send
expected_requests = len(years_url) * len(pages)

# Redeclaring the lists to store data in
names = []
years = []
imdb_ratings = []
metascores = []
votes = []

# Preparing the monitoring of the loop
start_time = time()
requests = 0
stop_crawl = False

# For every year in years_url (2000-2005)
for year_url in years_url:

    # For every page in pages (1-2)
    for page in pages:

        # Make a get request
        response = get('http://www.imdb.com/search/title?release_date=' + year_url +
        '&sort=num_votes,desc&page=' + page, headers = headers)

        # Pause the loop
        sleep(randint(8,15))

        # Monitor the requests
        requests += 1
        elapsed_time = time() - start_time
        print('Request:{}; Frequency: {} requests/s'.format(requests, requests/elapsed_time))
        clear_output(wait = True)

        # Throw a warning for non-200 status codes
        request_ok = response.status_code == 200
        if not request_ok:
            warn('Request: {}; Status code: {}'.format(requests, response.status_code))

        # Stop the crawl if the number of requests is greater than expected
        if requests > expected_requests:
            warn('Number of requests was greater than expected.')
            stop_crawl = True
            break

        # Skip failed requests instead of parsing an error page
        if not request_ok:
            continue

        # Parse the content of the request with BeautifulSoup
        page_html = BeautifulSoup(response.text, 'html.parser')

        # Select all the movie containers from a single page
        mv_containers = page_html.find_all('div', class_ = 'lister-item mode-advanced')

        # For every movie container on the page
        for container in mv_containers:
            # If the movie has a Metascore, then:
            if container.find('div', class_ = 'ratings-metascore') is not None:
                # Read every field before touching the lists: if one lookup
                # fails, nothing has been appended yet, so the lists stay the
                # same length

                # Scrape the name
                name = container.h3.a.text

                # Scrape the year
                year = container.h3.find('span', class_ = 'lister-item-year').text

                # Scrape the IMDB rating
                imdb = float(container.strong.text)

                # Scrape the Metascore
                m_score = int(container.find('span', class_ = 'metascore').text)

                # Scrape the number of votes
                vote = int(container.find('span', attrs = {'name':'nv'})['data-value'])

                # Store the complete record
                names.append(name)
                years.append(year)
                imdb_ratings.append(imdb)
                metascores.append(m_score)
                votes.append(vote)

    # The break above only leaves the page loop; leave the year loop as well
    if stop_crawl:
        break

Request:12; Frequency: 0.081455316378573 requests/s


In [74]:
# Assemble the tutorial-style lists into a DataFrame.
# NOTE(review): the metascores list filled by the crawl is not included as a
# column here — confirm whether that is intentional.
test_df = pd.DataFrame({'movie': names,
                        'year': years,
                        'imdb': imdb_ratings,
                        'votes': votes})

# info() prints its summary itself and returns None, so it is called directly;
# wrapping it in print() adds a stray "None" line to the output
test_df.info()
test_df

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 556 entries, 0 to 555
Data columns (total 4 columns):
movie    556 non-null object
year     556 non-null object
imdb     556 non-null float64
votes    556 non-null int64
dtypes: float64(1), int64(1), object(2)
memory usage: 17.5+ KB
None


Unnamed: 0,movie,year,imdb,votes
0,Gladiator,(2000),8.5,1284694
1,Memento,(2000),8.4,1079990
2,Snatch,(2000),8.3,754909
3,Requiem for a Dream,(2000),8.3,736361
4,X-Men,(2000),7.4,555955
...,...,...,...,...
551,Hostel,(2005),5.9,164689
552,The Longest Yard,(2005),6.4,157138
553,Hard Candy,(2005),7.1,150507
554,Flightplan,(2005),6.3,148424
