In [1]:
# 1. import libraries
from bs4 import BeautifulSoup
import requests

In [2]:
# 2. url: we start with the 'second' page
# (the query string filters for feature films released 1990-01-01 to 1992-12-31
#  with a user rating of at least 7.5; start=51 presumably means "begin at result 51")
url = "https://www.imdb.com/search/title/?title_type=feature&release_date=1990-01-01,1992-12-31&user_rating=7.5,&start=51&ref_=adv_nxt"

In [3]:
# 3. download html with a get request
# (the timeout makes the call give up after 10 seconds instead of waiting
#  forever if the server never answers)
response = requests.get(url, timeout=10)
response.status_code # 200 status code means OK!

200

In [4]:
# 4.1. parse html (create the 'soup') with Python's built-in "html.parser"
soup = BeautifulSoup(response.content, "html.parser")
# 4.2. check that the html code looks like it should
# (a bare `soup` as the last line of the cell displays the whole parsed document)
soup


<!DOCTYPE html>

<html xmlns:fb="http://www.facebook.com/2008/fbml" xmlns:og="http://ogp.me/ns#">
<head>
<meta charset="utf-8"/>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<meta content="app-id=342792525, app-argument=imdb:///?src=mdot" name="apple-itunes-app"/>
<script type="text/javascript">var IMDbTimer={starttime: new Date().getTime(),pt:'java'};</script>
<script>
    if (typeof uet == 'function') {
      uet("bb", "LoadTitle", {wb: 1});
    }
</script>
<script>(function(t){ (t.events = t.events || {})["csm_head_pre_title"] = new Date().getTime(); })(IMDbTimer);</script>
<title>Feature Film,
Released between 1990-01-01 and 1992-12-31,
User Rating at least 7.5
(Sorted by Popularity Ascending) - IMDb</title>
<script>(function(t){ (t.events = t.events || {})["csm_head_post_title"] = new Date().getTime(); })(IMDbTimer);</script>
<script>
    if (typeof uet == 'function') {
      uet("be", "LoadTitle", {wb: 1});
    }
</script>
<script>
    if (typeof uex == 'function') {
 

In [None]:
# Preview the URL of every results page: `start` takes the values 1, 51, ..., 601.
iterations = range(1, 631, 50)

for i in iterations:
    start_at = format(i)
    url = f"https://www.imdb.com/search/title/?title_type=feature&release_date=1990-01-01,1992-12-31&user_rating=7.5,&start={start_at}&ref_=adv_nxt"
    print(url)

Respectful scraping:

There we have it, all the URLs we need! Before starting with the actual scraping, though, there's something we need to note when sending massive, automated requests to websites: it's rude. 

We just have 13 of them, which is not too many, but it's still a good practice to let a few seconds pass in between requests. 

Some pages don't like being scraped and will block your IP if they detect it's sending automated requests. Others might have a small server for the traffic they handle, and sending too many requests might crash the site. The `sleep` function from Python's `time` module will help us with that.

Here's how it works, waiting 2 seconds between each iteration in a for loop:

In [5]:
# we need a few more tools for this one:
# randint picks a random whole number, time.sleep pauses the program
from random import randint
import time
# quick check that sleep works: this pauses the notebook for 2 seconds
time.sleep(2)

In [6]:
# basic use of sleep in a for loop: show the counter, then pause a fixed 2 seconds
pause_seconds = 2
for i in range(5):
    print(i)
    time.sleep(pause_seconds)

0
1
2
3
4


In [7]:
# more human use of sleep: the pause length changes on every iteration

for i in range(5):
    print(i)
    # pick a whole number of seconds between 1 and 4 (both ends included)
    wait_time = randint(1, 4)
    print(f"I will sleep for {wait_time} seconds.")
    time.sleep(wait_time)

0
I will sleep for 2 seconds.
1
I will sleep for 1 seconds.
2
I will sleep for 3 seconds.
3
I will sleep for 4 seconds.
4
I will sleep for 1 seconds.


To make this more interactive:
- you can split the scripts below into separate cells and run them yourself, or rewrite them in your own Python style

In [12]:
# Assembling the script to send and store multiple requests:
# one GET request per results page, every response kept in the "pages" list.

pages = []
iterations = range(1, 631, 50)
for i in iterations:
    # assemble the url:
    start_at = str(i)
    url = "https://www.imdb.com/search/title/?title_type=feature&release_date=1990-01-01,1992-12-31&user_rating=7.5,&start=" + start_at + "&ref_=adv_nxt"

    # download html with a get request; the timeout keeps one stalled
    # connection from hanging the whole loop forever:
    response = requests.get(url, timeout=10)

    # monitor the process by printing the status code
    print("Status code: " + str(response.status_code))

    # store response into "pages" list (whatever its status code was)
    pages.append(response)

    # respectful nap between requests; skipped after the last page,
    # because there is no further request to space out:
    if i != iterations[-1]:
        wait_time = randint(1, 4)
        time.sleep(wait_time)

Status code: 200
Status code: 200
Status code: 200
Status code: 200
Status code: 200
Status code: 200
Status code: 200
Status code: 200
Status code: 200
Status code: 200
Status code: 200
Status code: 200
Status code: 200


In [13]:
# If you display the object `pages` after running the code above, you'll just see
# the response code messages. The html code is still accessible, though, and you
# can parse it the same way we've always done:

BeautifulSoup(pages[0].content, "html.parser")


<!DOCTYPE html>

<html xmlns:fb="http://www.facebook.com/2008/fbml" xmlns:og="http://ogp.me/ns#">
<head>
<meta charset="utf-8"/>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<meta content="app-id=342792525, app-argument=imdb:///?src=mdot" name="apple-itunes-app"/>
<script type="text/javascript">var IMDbTimer={starttime: new Date().getTime(),pt:'java'};</script>
<script>
    if (typeof uet == 'function') {
      uet("bb", "LoadTitle", {wb: 1});
    }
</script>
<script>(function(t){ (t.events = t.events || {})["csm_head_pre_title"] = new Date().getTime(); })(IMDbTimer);</script>
<title>Feature Film,
Released between 1990-01-01 and 1992-12-31,
User Rating at least 7.5
(Sorted by Popularity Ascending) - IMDb</title>
<script>(function(t){ (t.events = t.events || {})["csm_head_post_title"] = new Date().getTime(); })(IMDbTimer);</script>
<script>
    if (typeof uet == 'function') {
      uet("be", "LoadTitle", {wb: 1});
    }
</script>
<script>
    if (typeof uex == 'function') {
 

In [14]:
# Parse just the first page, for testing purposes
# (this replaces the earlier `soup` with the parsed first response in `pages`)
soup = BeautifulSoup(pages[0].content, "html.parser")

In [15]:
# Paste the Selector from the first movie title copied from Chrome Dev Tools
# (the div:nth-child(1) step pins it to the first movie block, so a single <a> comes back)
soup.select("#main > div > div.lister.list.detail.sub-list > div > div:nth-child(1) > div.lister-item-content > h3 > a")


[<a href="/title/tt0099785/">Kevin - Allein zu Haus</a>]

In [16]:
# Trim the selection: now it grabs all the titles
# (without the div:nth-child(1) step, the <a> inside the <h3> of every movie block matches)
soup.select("div.lister-item-content > h3 > a")

[<a href="/title/tt0099785/">Kevin - Allein zu Haus</a>,
 <a href="/title/tt0099685/">GoodFellas - Drei Jahrzehnte in der Mafia</a>,
 <a href="/title/tt0099674/">Der Pate 3</a>,
 <a href="/title/tt0102926/">Das Schweigen der Lämmer</a>,
 <a href="/title/tt0100802/">Total Recall - Die totale Erinnerung</a>,
 <a href="/title/tt0104952/">Mein Vetter Winnie</a>,
 <a href="/title/tt0101507/">Boyz n the Hood - Jungs im Viertel</a>,
 <a href="/title/tt0103064/">Terminator 2: Tag der Abrechnung</a>,
 <a href="/title/tt0105323/">Der Duft der Frauen</a>,
 <a href="/title/tt0104691/">Der letzte Mohikaner</a>,
 <a href="/title/tt0099487/">Edward mit den Scherenhänden</a>,
 <a href="/title/tt0099810/">Jagd auf Roter Oktober</a>,
 <a href="/title/tt0099348/">Der mit dem Wolf tanzt</a>,
 <a href="/title/tt0105236/">Reservoir Dogs - Wilde Hunde</a>,
 <a href="/title/tt0104257/">Eine Frage der Ehre</a>,
 <a href="/title/tt0105695/">Erbarmungslos</a>,
 <a href="/title/tt0102138/">JFK: Tatort Dallas</a>,

In [24]:
# Paste the Selector from the first movie synopsis copied from Chrome Dev Tools
soup.select("#main > div > div.lister.list.detail.sub-list > div > div:nth-child(1) > div.lister-item-content > p:nth-child(4)")

[<p class="text-muted">
     An eight-year-old troublemaker must protect his house from a pair of burglars when he is accidentally left home alone by his family during Christmas vacation.</p>]

In [18]:
# Trim the selection: now it grabs all the synopses
soup.select("div.lister-item-content > p:nth-child(4)")

[<p class="text-muted">
     An eight-year-old troublemaker must protect his house from a pair of burglars when he is accidentally left home alone by his family during Christmas vacation.</p>,
 <p class="text-muted">
     The story of <a href="/name/nm1453737">Henry Hill</a> and his life in the mob, covering his relationship with his wife Karen Hill and his mob partners Jimmy Conway and Tommy DeVito in the Italian-American crime syndicate.</p>,
 <p class="text-muted">
     Follows Michael Corleone, now in his 60s, as he seeks to free his family from crime and find a suitable successor to his empire.</p>,
 <p class="text-muted">
     A young F.B.I. cadet must receive the help of an incarcerated and manipulative cannibal killer to help catch another serial killer, a madman who skins his victims.</p>,
 <p class="text-muted">
     When a man goes for virtual vacation memories of the planet Mars, an unexpected and harrowing series of events forces him to go to the planet for real - or is he

One of the ugliest things about the code above is that the HTML element containing the synopsis does not have any combination of tag and attribute that makes it unique. We've had to use select("p:nth-child(4)"), which simply grabs the <p> element that is the 4th child of its parent. Not very elegant, and it will break if the page layout changes, but for now it works.

We have noticed how both the title and the synopsis are children of div.lister-item-content. That will make our looping task a bit simpler.

There are many approaches to do this. The one we'll follow is:

1. Loop through the pages we collected, parse them ("create the soup") and store the parsed pages in a list.
2. For each parsed page, select the "blocks of HTML elements" that contain all the information of each movie (the title, the synopsis and other stuff).
3. For each one of the "blocks" we collected in the previous step:
   - Get the movie title and store it in a list
   - Get the synopsis and store it in a list
Here's the code that does that:

In [19]:
# Parse every downloaded page and collect one title and one synopsis per movie.
# `titles` and `synopsis` are filled in lockstep, so entry k of both lists
# always belongs to the same movie.
pages_parsed = []
titles = []
synopsis = []

for page in pages:
    # parse the page ("create the soup") and keep the parsed version
    page_soup = BeautifulSoup(page.content, "html.parser")
    pages_parsed.append(page_soup)
    # select only the info about the movies: one block per movie
    movies_html = page_soup.select("div.lister-item-content")
    # for each movie, store title and synopsis into the lists
    for movie in movies_html:
        # select_one gives the first match, or None when nothing matches
        title_tag = movie.select_one("h3 > a")
        synopsis_tag = movie.select_one("p:nth-child(4)")
        # a missing element becomes an empty string instead of an IndexError,
        # so both lists keep the same length
        titles.append(title_tag.get_text() if title_tag is not None else "")
        synopsis.append(synopsis_tag.get_text().strip() if synopsis_tag is not None else "")

In [20]:
# Inspect the parsed pages (careful: this displays the full html of every page in the list)
pages_parsed

[
 <!DOCTYPE html>
 
 <html xmlns:fb="http://www.facebook.com/2008/fbml" xmlns:og="http://ogp.me/ns#">
 <head>
 <meta charset="utf-8"/>
 <meta content="IE=edge" http-equiv="X-UA-Compatible"/>
 <meta content="app-id=342792525, app-argument=imdb:///?src=mdot" name="apple-itunes-app"/>
 <script type="text/javascript">var IMDbTimer={starttime: new Date().getTime(),pt:'java'};</script>
 <script>
     if (typeof uet == 'function') {
       uet("bb", "LoadTitle", {wb: 1});
     }
 </script>
 <script>(function(t){ (t.events = t.events || {})["csm_head_pre_title"] = new Date().getTime(); })(IMDbTimer);</script>
 <title>Feature Film,
 Released between 1990-01-01 and 1992-12-31,
 User Rating at least 7.5
 (Sorted by Popularity Ascending) - IMDb</title>
 <script>(function(t){ (t.events = t.events || {})["csm_head_post_title"] = new Date().getTime(); })(IMDbTimer);</script>
 <script>
     if (typeof uet == 'function') {
       uet("be", "LoadTitle", {wb: 1});
     }
 </script>
 <script>
     if (ty

In [21]:
# Inspect the collected movie titles
titles

['Kevin - Allein zu Haus',
 'GoodFellas - Drei Jahrzehnte in der Mafia',
 'Der Pate 3',
 'Das Schweigen der Lämmer',
 'Total Recall - Die totale Erinnerung',
 'Mein Vetter Winnie',
 'Boyz n the Hood - Jungs im Viertel',
 'Terminator 2: Tag der Abrechnung',
 'Der Duft der Frauen',
 'Der letzte Mohikaner',
 'Edward mit den Scherenhänden',
 'Jagd auf Roter Oktober',
 'Der mit dem Wolf tanzt',
 'Reservoir Dogs - Wilde Hunde',
 'Eine Frage der Ehre',
 'Erbarmungslos',
 'JFK: Tatort Dallas',
 'Die Muppets Weihnachtsgeschichte',
 'Misery',
 'Die Schöne und das Biest',
 'Aladdin',
 'Grüne Tomaten',
 'Thelma & Louise',
 'König der Fischer',
 "Jacob's Ladder - In der Gewalt des Jenseits",
 'Armee der Finsternis',
 "Miller's Crossing",
 'Ein Engel an meiner Tafel',
 'Zeit des Erwachens',
 'Glengarry Glen Ross',
 'Barton Fink',
 'Malcolm X',
 'Chaplin - Das Leben der unsterblichen Filmlegende',
 'Die Commitments',
 'Braindead',
 'The Player',
 'Von Mäusen und Menschen',
 'Die zwei Leben der Veroni

In [22]:
# Inspect the collected synopses (appended in the same order as `titles`)
synopsis 

['An eight-year-old troublemaker must protect his house from a pair of burglars when he is accidentally left home alone by his family during Christmas vacation.',
 'The story of Henry Hill and his life in the mob, covering his relationship with his wife Karen Hill and his mob partners Jimmy Conway and Tommy DeVito in the Italian-American crime syndicate.',
 'Follows Michael Corleone, now in his 60s, as he seeks to free his family from crime and find a suitable successor to his empire.',
 'A young F.B.I. cadet must receive the help of an incarcerated and manipulative cannibal killer to help catch another serial killer, a madman who skins his victims.',
 'When a man goes for virtual vacation memories of the planet Mars, an unexpected and harrowing series of events forces him to go to the planet for real - or is he?',
 'Two New Yorkers accused of murder in rural Alabama while on their way back to college call in the help of one of their cousins, a loudmouth lawyer with no trial experience

In [26]:
import pandas as pd

In [34]:
# Combine the two lists into one table with one row per movie.
# The column key is "Synopsis" with no trailing space, so that
# movies["Synopsis"] finds the column.
movies = pd.DataFrame({"Titles": titles,
                       "Synopsis": synopsis,
                      })

In [35]:
# Display the assembled DataFrame (the middle rows are elided in the shown output)
movies

Unnamed: 0,Titles,Synopsis
0,Kevin - Allein zu Haus,An eight-year-old troublemaker must protect hi...
1,GoodFellas - Drei Jahrzehnte in der Mafia,The story of Henry Hill and his life in the mo...
2,Der Pate 3,"Follows Michael Corleone, now in his 60s, as h..."
3,Das Schweigen der Lämmer,A young F.B.I. cadet must receive the help of ...
4,Total Recall - Die totale Erinnerung,When a man goes for virtual vacation memories ...
...,...,...
626,Një djalë edhe një vajzë,Artan a teenager in high school starts hanging...
627,Innebunesc si-mi pare rau,Add a Plot
628,Linda McCartney: Behind the Lens,Add a Plot
629,Sobachye shchastye,"Dog's happiness is at it's highest, when this ..."
