# Scraping data for over 2000 movies

In [None]:
# Learning sources:
# https://www.dataquest.io/blog/web-scraping-tutorial-python/
# https://www.dataquest.io/blog/web-scraping-beautifulsoup/

## Extracting data for a single movie

In [144]:
import requests

# IMDb title search restricted to 2017 releases, sorted by number of votes
# in descending order (see the query string).
url = "https://www.imdb.com/search/title/?release_date=2017-01-01,2017-12-31&sort=num_votes,desc"

# Always pass a timeout: without one, requests waits indefinitely when the
# server never answers, which would hang the notebook kernel.
page = requests.get(url, timeout=10)

# Keep the response as the cell's last expression so its status is displayed.
page

<Response [200]>

In [None]:
from bs4 import BeautifulSoup

# Parse the raw response bytes into a navigable tree with the "html.parser"
# backend.
soup = BeautifulSoup(markup=page.content, features="html.parser")

# Print the indented markup to inspect the structure of the page.
print(soup.prettify())

In [None]:
# Collect every <div> whose class attribute is "lister-item mode-advanced";
# each match is treated as one movie entry by the cells that follow.
movie_containers = soup.find_all(name="div", attrs={"class": "lister-item mode-advanced"})

# Show the type of the result and how many containers were found, one per line.
print(type(movie_containers), len(movie_containers), sep="\n")

Now we’ll select only the first container and extract, in turn, each item of interest:

- The name of the movie.
- The year of release.
- The IMDb rating.
- The Metascore.
- The number of votes.

In [None]:
# Render the first container's markup to see where each field lives.
first_container_html = movie_containers[0].prettify()
print(first_container_html)

In [None]:
# Work on the first container only, to figure out how each field is extracted.
first_movie = movie_containers[0]

movie_name = first_movie.h3.a.text.strip()
release_year = first_movie.h3.find("span", class_="lister-item-year").text.strip()
imdb_rating = float(first_movie.strong.text)

# Match on the "metascore" class alone. Searching for the exact string
# "metascore favorable" returns None for a score span that carries any other
# class string (presumably one per score band -- verify against the page
# markup), and calling .text on None raises AttributeError.
metascore_tag = first_movie.find("span", class_="metascore")
metascore = int(metascore_tag.text) if metascore_tag is not None else None

# Cast to int so the value has the same type as the vote counts collected by
# the scraping loops.
no_of_votes = int(first_movie.find("span", attrs={"name": "nv"})["data-value"])

print(movie_name)
print(release_year)
print(imdb_rating)
print(metascore)
print(no_of_votes)

In [None]:
# Vote count of the first container, exactly as stored in the "data-value"
# attribute of its span named "nv".
votes_row = movie_containers[0].find("span", {"name": "nv"})["data-value"]

In [None]:
# lists to store the scraped data

names = []
years = []
imdb_ratings = []
metascores = []
votes = []

for container in movie_containers:

    # Look the Metascore span up once, by its "metascore" class alone.
    # Filtering on the exact string "metascore favorable" would silently drop
    # every scored movie whose span carries a different class string, which
    # contradicts the intent of keeping all movies that have a Metascore.
    metascore_tag = container.find("span", class_="metascore")

    # Skip movies without a Metascore so that all lists stay the same length.
    if metascore_tag is None:
        continue

    # movie names
    names.append(container.h3.a.text.strip())
    # release year (kept as the raw label text)
    years.append(container.h3.find("span", class_="lister-item-year").text)
    # imdb ratings
    imdb_ratings.append(float(container.strong.text))
    # meta-scores
    metascores.append(int(metascore_tag.text))
    # number of votes
    votes.append(int(container.find("span", attrs={"name": "nv"})["data-value"]))

print("Done")
    

In [None]:
import pandas as pd

# Map each output column to the list scraped for it, then build the table in
# one step.
scraped_columns = {
    "Movie": names,
    "Release year": years,
    "IMDB ratings": imdb_ratings,
    "Meta-scores": metascores,
    "No of votes": votes,
}
df = pd.DataFrame(data=scraped_columns)

# Preview the first rows.
df.head()

## Script for extracting from multiple pages

### Changing the URL’s parameters

In [None]:
# pages: values for the "start" URL parameter, as strings, stepping by 50
starting_point = [str(offset) for offset in range(1, 152, 50)]

# years: release years to request, as strings so they can be concatenated
# into the URL
years_url = list(map(str, range(2016, 2018)))  # make it (2000, 2018) for more movies

### Controlling the crawl-rate

In [None]:
from time import sleep
from random import randint

# Print a message five times, pausing a random whole number of seconds
# (1 to 4, inclusive) after each one.
for _ in range(5):
    print("Hello")
    pause_seconds = randint(1, 4)
    sleep(pause_seconds)

### Monitoring the loop as it’s still going

In [None]:
# clear_output belongs to the public IPython.display API; importing it from
# IPython.core.display is a deprecated path.
from IPython.display import clear_output
from time import time

start_time = time()
req = 0

for _ in range(5):

    # a request would go here.
    req += 1

    sleep(randint(1, 3))

    # Seconds elapsed since the loop started; used to derive the request rate.
    time_taken = time() - start_time

    print("Request: {}, Frequence: {} requests/s".format(req, req/time_taken))

    # wait=True postpones the clearing until new output is available, so the
    # current line stays visible until the next one replaces it.
    clear_output(wait=True)


In [None]:
from warnings import warn

# Emit a sample warning to see how warnings are rendered in the notebook.
warn("warning simulation", category=UserWarning)

### Piecing everything together

In [None]:
# Redeclaring the lists to store data in
names = []
years = []
imdb_ratings = []
metascores = []
votes = []

# Preparing the monitoring of the loop
start_time = time()
req = 0

# One request is made per (year, start offset) pair, so this is the number of
# requests the crawl is expected to make. Deriving it from the two lists keeps
# the safety check below correct when the year range is widened.
expected_requests = len(years_url) * len(starting_point)

# Flatten the year/offset combinations into one sequence so that the `break`
# below stops the whole crawl rather than only the pages of the current year.
crawl_targets = [(year, star_point) for year in years_url for star_point in starting_point]

for year, star_point in crawl_targets:

    # Make a get request; the timeout keeps an unresponsive server from
    # hanging the kernel indefinitely.
    url = "https://www.imdb.com/search/title/?release_date=" + year + "&sort=num_votes,desc&start=" + star_point
    response = requests.get(url, timeout=10)

    # Pause the loop
    sleep(randint(3,8))

    # Monitor the requests
    req = req + 1
    taken_time = time() - start_time

    print("year: ", year, "Starting point: ", star_point)
    print('Request:{}; Frequency: {} requests/s'.format(req, req/taken_time))
    clear_output(wait = True)

    # Throw a warning for non-200 status codes (report the request counter,
    # not the `requests` module)
    if response.status_code != 200:
        warn('Request: {}; Status code: {}'.format(req, response.status_code))

    # Safety net: stop crawling if more requests were made than expected
    if req > expected_requests:
        warn('Number of requests was greater than expected.')
        break

    soup = BeautifulSoup(response.content, "html.parser")

    movie_containers = soup.find_all("div", class_="lister-item mode-advanced")

    for container in movie_containers:

        # Look the Metascore span up once, by its "metascore" class alone, so
        # that scored movies are kept whatever other class the span carries.
        metascore_tag = container.find("span", class_="metascore")

        # Skip movies without a Metascore so all lists stay the same length.
        if metascore_tag is None:
            continue

        # movie names
        names.append(container.h3.a.text.strip())
        # release year (raw label text; cleaned after the crawl)
        years.append(container.h3.find("span", class_="lister-item-year").text)
        # imdb ratings
        imdb_ratings.append(float(container.strong.text))
        # meta-scores
        metascores.append(int(metascore_tag.text))
        # number of votes
        votes.append(int(container.find("span", attrs = {"name": "nv"})["data-value"]))


print("Done")

In [None]:
# Assemble the scraped lists into one table, one column per field.
movie_ratings = pd.DataFrame({'Movie': names,
'Release year': years,
'IMDB rating': imdb_ratings,
'Meta-score': metascores,
'Number of votes': votes
})

# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() would add a stray "None" line after the summary.
movie_ratings.info()
movie_ratings.head(10)

### Cleaning the data before exporting

In [None]:
# List the distinct values currently stored in the "Release year" column.
movie_ratings.loc[:, "Release year"].unique()

In [None]:
# Reduce each release-year label to its four-digit year as an integer.
# The pattern captures the last run of four digits in the label, so it does
# not depend on the label ending in exactly "YYYY)".
# Casting to str first makes the cell safe to re-run: once the column holds
# integers, the .str accessor alone would raise.
movie_ratings["Release year"] = (
    movie_ratings["Release year"]
    .astype(str)
    .str.extract(r"(\d{4})\D*$", expand=False)
    .astype(int)
)

In [None]:
# Re-check the distinct "Release year" values after the conversion.
movie_ratings.loc[:, "Release year"].unique()

### Exporting scraped data

In [None]:
# Write the table to a CSV file; the path is relative, so it resolves against
# the current working directory.
movie_ratings.to_csv(path_or_buf="IMDB_Movie ratings.csv")

print("Data exported to csv")