In [1]:
import requests

In [2]:
from bs4 import BeautifulSoup

In [46]:
# IMDb advanced-search URL: release_date starting at 2017-01-01 (open-ended),
# sorted by number of votes in descending order.
url = (
    "https://www.imdb.com/search/title"
    "?release_date=2017-01-01,"
    "&sort=num_votes,desc"
)

In [47]:
# Download the search page.
# - timeout: without it requests can wait forever on a stalled connection.
# - raise_for_status(): report an HTTP error here, instead of as an empty
#   result list (and an IndexError) in the cells that follow.
response = requests.get(url, timeout=30)
response.raise_for_status()

In [48]:
# Parse the downloaded HTML using Python's built-in 'html.parser' backend.
soup = BeautifulSoup(markup=response.text, features='html.parser')

In [50]:
# Collect every <div> whose class attribute is 'lister-item mode-advanced';
# each one wraps a single entry of the result list.
movie_containers = soup.find_all('div', attrs = {'class': 'lister-item mode-advanced'})

In [51]:
# Show the type of the find_all result and how many containers it holds.
print(type(movie_containers), len(movie_containers), sep='\n')

<class 'bs4.element.ResultSet'>
50


## Extracting the data for a single movie

In [52]:
# Use the first container as a sample for working out the extraction logic.
first_movie = movie_containers[0]

In [53]:
# Pretty-print the container's HTML to see which tags hold each field.
print(first_movie.prettify())

<div class="lister-item mode-advanced">
 <div class="lister-top-right">
  <div class="ribbonize" data-caller="filmosearch" data-tconst="tt3315342">
  </div>
 </div>
 <div class="lister-item-image float-left">
  <a href="/title/tt3315342/?ref_=adv_li_i">
   <img alt="Logan: Wolverine" class="loadlate" data-tconst="tt3315342" height="98" loadlate="https://ia.media-imdb.com/images/M/MV5BYzc5MTU4N2EtYTkyMi00NjdhLTg3NWEtMTY4OTEyMzJhZTAzXkEyXkFqcGdeQXVyNjc1NTYyMjg@._V1_UX67_CR0,0,67,98_AL_.jpg" src="https://images-na.ssl-images-amazon.com/images/G/01/imdb/images/nopicture/large/film-184890147._CB499613450_.png" width="67"/>
  </a>
 </div>
 <div class="lister-item-content">
  <h3 class="lister-item-header">
   <span class="lister-item-index unbold text-primary">
    1.
   </span>
   <a href="/title/tt3315342/?ref_=adv_li_tt">
    Logan: Wolverine
   </a>
   <span class="lister-item-year text-muted unbold">
    (2017)
   </span>
  </h3>
  <p class="text-muted ">
   <span class="certificate">
 

The data points to extract for each movie:

* Title
* Year of release
* IMDb rating
* Metascore
* Number of votes

In [54]:
# The title is the text of the link inside the container's <h3> header.
first_title = first_movie.find('h3').find('a').text
first_title

'Logan: Wolverine'

In [57]:
# Match on the single 'lister-item-year' class, exactly as the scraping loops
# further down do. Passing the full string 'lister-item-year text-muted unbold'
# only matches when the class attribute is exactly that string, so the lookup
# would return None as soon as a styling class is added, removed or reordered.
first_year = first_movie.h3.find('span', class_ = 'lister-item-year').text
first_year

'(2017)'

In [58]:
# The rating is read from the first <strong> tag inside the container.
first_imdb = float(first_movie.find('strong').text)
first_imdb

8.1

In [59]:
# Look the span up by the 'metascore' class alone, as the scraping loops
# further down do. The string 'metascore favorable' only matches spans whose
# class attribute is exactly that, so find() would return None (and .text
# would raise) for any span carrying a different second class.
first_mscore_tag = first_movie.find('span', class_ = 'metascore')

# Keep the Tag and the parsed number under separate names.
first_mscore = int(first_mscore_tag.text)
print(first_mscore)

77


In [62]:
# Locate the span with name="nv" and convert its data-value attribute to int.
votes_tag = first_movie.find('span', attrs = {'name':'nv'})
first_votes = int(votes_tag["data-value"])
first_votes

473172

## The script for a single page

In [67]:
# Parallel result lists: position i of every list belongs to the same movie.
titles, years, imdb_ratings, metascores, votes = [], [], [], [], []

# Walk through every result container found on the page.
for container in movie_containers:

    # Entries without a Metascore block are skipped entirely.
    if container.find('div', class_ = 'ratings-metascore') is None:
        continue

    # Title: text of the link inside the <h3> header
    titles.append(container.h3.a.text)

    # Year: kept in its raw form, e.g. '(2017)'
    years.append(container.h3.find('span', class_ = 'lister-item-year').text)

    # IMDB rating: text of the first <strong> tag, as a float
    imdb_ratings.append(float(container.strong.text))

    # Metascore: text of the 'metascore' span, as an int
    metascores.append(int(container.find('span', class_ = 'metascore').text))

    # Votes: data-value attribute of the span named 'nv', as an int
    votes.append(int(container.find('span', attrs = {'name':'nv'})['data-value']))

In [68]:
import pandas as pd

In [69]:
# Combine the parallel lists into one table, one row per scraped movie.
test_df = pd.DataFrame(dict(movie=titles,
                            year=years,
                            imdb=imdb_ratings,
                            metascore=metascores,
                            votes=votes))

In [71]:
# Preview the first ten rows of the single-page result.
test_df.head(10)

Unnamed: 0,imdb,metascore,movie,votes,year
0,8.1,77,Logan: Wolverine,473172,(2017)
1,7.5,76,Wonder Woman,403327,(2017)
2,8.0,94,Dunkierka,381018,(2017)
3,7.3,85,Gwiezdne wojny: Ostatni Jedi,367555,(2017)
4,7.7,67,Strażnicy Galaktyki vol. 2,365050,(2017)
5,7.5,73,Spider-Man: Homecoming,307500,(2017)
6,7.9,74,Thor: Ragnarok,303229,(2017)
7,7.7,84,Uciekaj!,297220,(I) (2017)
8,7.7,86,Baby Driver,292476,(2017)
9,8.1,81,Blade Runner 2049,284937,(2017)


## Changing the URL's parameters

In [82]:
# URL parameter values, kept as strings so they can be concatenated into the
# request URL: result pages 1-4 and release years 2015-2017.
pages = list(map(str, range(1, 5)))
years_url = list(map(str, range(2015, 2018)))

In [74]:
from time import sleep
from random import randint

In [75]:
from time import time
from warnings import warn

# clear_output belongs to the public IPython.display API; importing it from
# IPython.core.display is a deprecated path.
from IPython.display import clear_output

In [83]:
# Parallel result lists: position i of every list belongs to the same movie.
titles = []
years = []
imdb_ratings = []
metascores = []
votes = []

# Every (year, page) combination to fetch. The crawl runs as ONE flat loop so
# that `break` stops the whole crawl; inside nested year/page loops it would
# only leave the page loop and the next year would still be requested.
targets = [(year_url, page) for year_url in years_url for page in pages]

# Derived from the inputs instead of a hard-coded number, so the safety check
# below stays meaningful when the year or page ranges change.
expected_requests = len(targets)

start_time = time()
requests_nmb = 0

# NOTE(review): the recorded results contain localized movie titles.
# Presumably an 'Accept-Language' request header would select English
# titles; confirm before relying on it.
for year_url, page in targets:
    # https matches the single-page URL used earlier in the notebook; the
    # timeout keeps one stalled connection from hanging the whole crawl.
    response = requests.get('https://www.imdb.com/search/title?release_date=' + year_url +
                            '&sort=num_votes,desc&page=' + page,
                            timeout=30)

    # Pause a random 8-15 seconds between requests
    sleep(randint(8,15))

    # Monitor the request rate
    requests_nmb += 1
    elapsed_time = time() - start_time

    print('Request: {}; Frequency: {} requests/s'.format(requests_nmb,requests_nmb/elapsed_time))
    # wait=True defers clearing until the next output arrives, so the status
    # line above stays visible until it is replaced.
    clear_output(wait=True)

    # Safety net: stop the whole crawl if more requests were made than the
    # year/page grid calls for (only reachable if the loop above is changed
    # to generate its targets dynamically).
    if requests_nmb > expected_requests:
        warn('Number of requests was greater than expected.')
        break

    # Throw a warning for non-200 status codes and skip the page: an error
    # page holds no movie containers worth parsing.
    if response.status_code != 200:
        warn('Requests: {}; Status code: {}'.format(requests_nmb,response.status_code))
        continue

    soup = BeautifulSoup(response.text,'html.parser')

    mv_containers = soup.find_all('div',class_= 'lister-item mode-advanced')

    # Extract data from individual movie container
    for container in mv_containers:

        # Only movies that have a Metascore are kept
        if container.find('div', class_ = 'ratings-metascore') is None:
            continue

        # Parse every field first ...
        title = container.h3.a.text
        year = container.h3.find('span', class_ = 'lister-item-year').text
        imdb = float(container.strong.text)
        m_score = int(container.find('span', class_ = 'metascore').text)
        vote = int(container.find('span', attrs = {'name':'nv'})['data-value'])

        # ... and append only once all of them succeeded, so an error in the
        # middle of a row can never leave the parallel lists misaligned.
        titles.append(title)
        years.append(year)
        imdb_ratings.append(imdb)
        metascores.append(m_score)
        votes.append(vote)

Request: 12; Frequency: 0.06917855882246769 requests/s


In [112]:
# Combine the parallel lists from the crawl into one table, one row per movie.
movie_ratings = pd.DataFrame({'movie': titles,
                              'year': years,
                              'imdb': imdb_ratings,
                              'metascore': metascores,
                              'votes': votes})

# info() prints its report itself and returns None, so it is called directly;
# wrapping it in print() would add a stray "None" line to the output.
movie_ratings.info()
movie_ratings.head(10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 450 entries, 0 to 449
Data columns (total 5 columns):
imdb         450 non-null float64
metascore    450 non-null int64
movie        450 non-null object
votes        450 non-null int64
year         450 non-null object
dtypes: float64(1), int64(2), object(2)
memory usage: 17.7+ KB
None


Unnamed: 0,imdb,metascore,movie,votes,year
0,8.0,81,Gwiezdne wojny: Przebudzenie mocy,736392,(2015)
1,8.1,90,Mad Max: Na drodze gniewu,707696,(2015)
2,8.0,80,Marsjanin,615245,(2015)
3,7.4,66,Avengers: Czas Ultrona,572170,(2015)
4,8.0,76,Zjawa,556781,(2015)
5,7.0,59,Jurassic World,488825,(2015)
6,8.2,94,W głowie się nie mieści,477349,(I) (2015)
7,7.3,64,Ant-Man,420111,(2015)
8,7.8,68,Nienawistna ósemka,388519,(2015)
9,6.8,60,Spectre,331487,(I) (2015)


In [113]:
# Summary of the frame: row count, column dtypes and non-null counts.
movie_ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 450 entries, 0 to 449
Data columns (total 5 columns):
imdb         450 non-null float64
metascore    450 non-null int64
movie        450 non-null object
votes        450 non-null int64
year         450 non-null object
dtypes: float64(1), int64(2), object(2)
memory usage: 17.7+ KB


## Cleaning the data

In [114]:
# Put the columns into a fixed, readable order: movie first, then the numbers.
column_order = ["movie", "year", "imdb", "metascore", "votes"]
movie_ratings = movie_ratings[column_order]
movie_ratings.head()

Unnamed: 0,movie,year,imdb,metascore,votes
0,Gwiezdne wojny: Przebudzenie mocy,(2015),8.0,81,736392
1,Mad Max: Na drodze gniewu,(2015),8.1,90,707696
2,Marsjanin,(2015),8.0,80,615245
3,Avengers: Czas Ultrona,(2015),7.4,66,572170
4,Zjawa,(2015),8.0,76,556781


In [115]:
# List the distinct raw values of the year column before cleaning.
movie_ratings["year"].unique()

array(['(2015)', '(I) (2015)', '(II) (2015)', '(VI) (2015)',
       '(III) (2015)', '(2016)', '(II) (2016)', '(I) (2016)',
       '(IX) (2016)', '(V) (2016)', '(2017)', '(I) (2017)', '(III) (2017)',
       '(II) (2017)'], dtype=object)

In [116]:
# Keep only the four-digit year from labels such as '(I) (2015)'.
# - raw string: '\d' inside a plain string literal is an invalid escape sequence
# - expand=False: a single capture group then yields a Series, not a
#   one-column DataFrame
# - astype(str) first: the cell can be re-run after the column is already int
#   (the .str accessor is not available on an integer column)
movie_ratings["year"] = (
    movie_ratings["year"]
    .astype(str)
    .str.extract(r'(\d{4})', expand=False)
    .astype(int)
)

In [117]:
# Check the distinct year values after the conversion.
movie_ratings["year"].unique()

array([2015, 2016, 2017])

In [118]:
# Re-inspect the column dtypes after the year conversion.
movie_ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 450 entries, 0 to 449
Data columns (total 5 columns):
movie        450 non-null object
year         450 non-null int64
imdb         450 non-null float64
metascore    450 non-null int64
votes        450 non-null int64
dtypes: float64(1), int64(3), object(1)
memory usage: 17.7+ KB


In [119]:
# Summary statistics for the two rating columns.
movie_ratings[["metascore","imdb"]].describe()

Unnamed: 0,metascore,imdb
count,450.0,450.0
mean,58.546667,6.679111
std,18.050629,0.841231
min,12.0,3.1
25%,44.25,6.2
50%,58.0,6.7
75%,72.75,7.3
max,99.0,8.5
