# Beautiful Soup Tutorial -- Dev Sharma, Applied Analytics Club

Import libraries

In [1]:
from bs4 import BeautifulSoup
import requests

Define the URL and send an HTTP GET request to fetch the web page

In [2]:
# Page to scrape: IMDb search results for top-250 drama titles, best-rated first.
url = "https://www.imdb.com/search/title?genres=drama&groups=top_250&sort=user_rating,desc"
# Always pass a timeout: without one, requests waits indefinitely if the
# server stops responding and the cell never finishes.
res = requests.get(url, timeout=10)

Let's check the response variable

In [3]:
# Printing a Response object shows its HTTP status code in square brackets.
print(res)

<Response [200]>


A status code of 200 means 'OK': the request succeeded. Other status codes (e.g. 401 Unauthorized, 404 Not Found) are listed here: https://en.wikipedia.org/wiki/List_of_HTTP_status_codes


Use BeautifulSoup to parse the response variable

In [4]:
# Name the parser explicitly. Without it, BeautifulSoup picks whichever parser
# happens to be installed and emits a warning, so the parsed tree can differ
# from one machine to another. "html.parser" ships with Python itself.
soup = BeautifulSoup(res.text, "html.parser")
# print(soup)  # uncomment to inspect the whole parsed document (very long output)

### Selecting a single element

In [5]:
# select_one returns only the first element matching the CSS selector.
movie = soup.select_one(".lister-item-header a")
# Show the whole tag, then its visible text, then its href attribute,
# each on its own line.
print(movie, movie.text, movie["href"], sep="\n")

<a href="/title/tt0111161/">The Shawshank Redemption</a>
The Shawshank Redemption
/title/tt0111161/


### Selecting multiple elements

Use BeautifulSoup's `select` method, which returns every element matching a CSS selector, to scrape the desired content

In [6]:
# Tip: the SelectorGadget browser plug-in can help you find the right CSS selector.
# select() collects every element that matches the selector into a list.
movies = soup.select(".lister-item-header a")

print(movies)

[<a href="/title/tt0111161/">The Shawshank Redemption</a>, <a href="/title/tt0068646/">The Godfather</a>, <a href="/title/tt0468569/">The Dark Knight</a>, <a href="/title/tt0071562/">The Godfather: Part II</a>, <a href="/title/tt7286456/">Joker</a>, <a href="/title/tt0167260/">The Lord of the Rings: The Return of the King</a>, <a href="/title/tt0110912/">Pulp Fiction</a>, <a href="/title/tt0108052/">Schindler's List</a>, <a href="/title/tt0050083/">12 Angry Men</a>, <a href="/title/tt0137523/">Fight Club</a>, <a href="/title/tt0120737/">The Lord of the Rings: The Fellowship of the Ring</a>, <a href="/title/tt0109830/">Forrest Gump</a>, <a href="/title/tt3417422/">Drishyam</a>, <a href="/title/tt0167261/">The Lord of the Rings: The Two Towers</a>, <a href="/title/tt0099685/">Goodfellas</a>, <a href="/title/tt0073486/">One Flew Over the Cuckoo's Nest</a>, <a href="/title/tt0056058/">Harakiri</a>, <a href="/title/tt0816692/">Interstellar</a>, <a href="/title/tt0317248/">City of God</a>, <

In [7]:
# Build two parallel lists from the scraped anchors: the visible title of each
# movie, and its absolute URL (site prefix + the relative href from the tag).
movies_titles = [anchor.text for anchor in movies]
movies_links = ["http://imdb.com" + anchor["href"] for anchor in movies]

print(movies_titles)
print("\n")
print(movies_links)

['The Shawshank Redemption', 'The Godfather', 'The Dark Knight', 'The Godfather: Part II', 'Joker', 'The Lord of the Rings: The Return of the King', 'Pulp Fiction', "Schindler's List", '12 Angry Men', 'Fight Club', 'The Lord of the Rings: The Fellowship of the Ring', 'Forrest Gump', 'Drishyam', 'The Lord of the Rings: The Two Towers', 'Goodfellas', "One Flew Over the Cuckoo's Nest", 'Harakiri', 'Interstellar', 'City of God', 'Saving Private Ryan', 'The Green Mile', 'Life Is Beautiful', 'Se7en', 'The Silence of the Lambs', 'Seven Samurai', "It's a Wonderful Life", 'Parasite', 'Whiplash', 'The Intouchables', 'The Prestige', 'The Departed', 'The Pianist', 'Gladiator', 'American History X', 'Léon: The Professional', 'The Lion King', 'Cinema Paradiso', 'Grave of the Fireflies', 'Casablanca', 'The Great Dictator', 'Modern Times', 'City Lights', 'Capharnaüm', 'Your Name.', 'Dangal', 'Django Unchained', '3 Idiots', 'Taare Zameen Par', 'The Lives of Others', 'Oldeuboi']


['http://imdb.com/titl

## Challenge

### Scrape the 250 best TV shows' titles and links

Link: https://www.imdb.com/chart/toptv/?ref_=nv_tvv_250

In [8]:
# Answer
#
#
#
#
#

In [9]:
# Fetch the Top 250 TV shows chart. The timeout stops the cell from hanging on
# a stalled connection, and raise_for_status() turns an HTTP error response
# into an exception instead of silently parsing an error page.
url = "https://www.imdb.com/chart/toptv/?ref_=nv_tvv_250"
res = requests.get(url, timeout=10)
res.raise_for_status()
# Explicit parser: avoids BeautifulSoup's "no parser specified" warning and
# keeps parsing consistent across machines.
soup = BeautifulSoup(res.text, "html.parser")

# Select every <a> nested inside an element with class "titleColumn".
shows = soup.select(".titleColumn a")

print(shows[:10])
print("\n")

# Visible title text and absolute URL (site prefix + relative href) per show.
shows_titles = [title.text for title in shows]
shows_links = ["http://imdb.com"+title["href"] for title in shows]

print(shows_titles[:10])
print("\n")
print(shows_links[:10])

[<a href="/title/tt5491994/" title="David Attenborough">Planet Earth II</a>, <a href="/title/tt0795176/" title="David Attenborough, Sigourney Weaver">Planet Earth</a>, <a href="/title/tt0185906/" title="Scott Grimes, Damian Lewis">Band of Brothers</a>, <a href="/title/tt7366338/" title="Jessie Buckley, Jared Harris">Chernobyl</a>, <a href="/title/tt0903747/" title="Bryan Cranston, Aaron Paul">Breaking Bad</a>, <a href="/title/tt6769208/" title="David Attenborough, Peter Drost">Blue Planet II</a>, <a href="/title/tt0306414/" title="Dominic West, Lance Reddick">The Wire</a>, <a href="/title/tt0944947/" title="Emilia Clarke, Peter Dinklage">Game of Thrones</a>, <a href="/title/tt9253866/" title="David Attenborough">Our Planet</a>, <a href="/title/tt2395695/" title="Neil deGrasse Tyson, Stoney Emshwiller">Cosmos</a>]


['Planet Earth II', 'Planet Earth', 'Band of Brothers', 'Chernobyl', 'Breaking Bad', 'Blue Planet II', 'The Wire', 'Game of Thrones', 'Our Planet', 'Cosmos']


['http://imdb

## Bonus: Creating a for loop to scrape multiple pages

In [11]:
# Scrape the quote elements from the first `number_of_pages` pages of the site.
base_url = "http://quotes.toscrape.com/page/"
number_of_pages = 3
quotes = []

for i in range(1, number_of_pages + 1):
    url = base_url + str(i)  # URL manipulation: append the page number
    # The timeout prevents a stalled connection from hanging the loop;
    # raise_for_status() stops on an HTTP error instead of parsing an error page.
    res = requests.get(url, timeout=10)
    res.raise_for_status()
    # Explicit parser: avoids the "no parser specified" warning and keeps
    # parsing consistent across machines.
    soup = BeautifulSoup(res.text, "html.parser")
    # extend() appends in place; `quotes = quotes + ...` would copy the whole
    # accumulated list again on every page.
    quotes.extend(soup.select(".text"))

print(quotes[:10])
print("\n")
print("Length of quotes is",len(quotes))

# Keep only the visible text of each quote element.
quotes_text = [quote.text for quote in quotes]

print(quotes_text[:10])

[<span class="text" itemprop="text">“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”</span>, <span class="text" itemprop="text">“It is our choices, Harry, that show what we truly are, far more than our abilities.”</span>, <span class="text" itemprop="text">“There are only two ways to live your life. One is as though nothing is a miracle. The other is as though everything is a miracle.”</span>, <span class="text" itemprop="text">“The person, be it gentleman or lady, who has not pleasure in a good novel, must be intolerably stupid.”</span>, <span class="text" itemprop="text">“Imperfection is beauty, madness is genius and it's better to be absolutely ridiculous than absolutely boring.”</span>, <span class="text" itemprop="text">“Try not to become a man of success. Rather become a man of value.”</span>, <span class="text" itemprop="text">“It is better to be hated for what you are than to be loved for what you are not.”</spa