# IMDb 250 Webscraping

In [1]:
import pandas as pd
import urllib.request as URL
from bs4 import BeautifulSoup as BS

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Setting the header
header = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36',
          'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'}

In [3]:
# Source of the data
url = 'https://www.imdb.com/chart/top/'

In [4]:
# Requesting the data from the source
req = URL.Request(url, None, header)

In [7]:
# Opening the requested information
response = URL.urlopen(req)

In [8]:
# Reading the content using an html parser
content = BS(response.read(), 'html.parser')

In [9]:
# Displaying the title of the content 
content.title

<title>IMDb Top 250 Movies</title>

## Scraping the title of the movies 

In [10]:
top = content.find_all('h3', class_='ipc-title__text')[1:251]

In [11]:
top

[<h3 class="ipc-title__text">1. The Shawshank Redemption</h3>,
 <h3 class="ipc-title__text">2. The Godfather</h3>,
 <h3 class="ipc-title__text">3. The Dark Knight</h3>,
 <h3 class="ipc-title__text">4. The Godfather Part II</h3>,
 <h3 class="ipc-title__text">5. 12 Angry Men</h3>,
 <h3 class="ipc-title__text">6. Schindler's List</h3>,
 <h3 class="ipc-title__text">7. The Lord of the Rings: The Return of the King</h3>,
 <h3 class="ipc-title__text">8. Pulp Fiction</h3>,
 <h3 class="ipc-title__text">9. The Lord of the Rings: The Fellowship of the Ring</h3>,
 <h3 class="ipc-title__text">10. Il buono, il brutto, il cattivo</h3>,
 <h3 class="ipc-title__text">11. Forrest Gump</h3>,
 <h3 class="ipc-title__text">12. Fight Club</h3>,
 <h3 class="ipc-title__text">13. The Lord of the Rings: The Two Towers</h3>,
 <h3 class="ipc-title__text">14. Inception</h3>,
 <h3 class="ipc-title__text">15. Star Wars: Episode V - The Empire Strikes Back</h3>,
 <h3 class="ipc-title__text">16. The Matrix</h3>,
 <h3 cl

In [12]:
titles = []
for i, title in enumerate(top):
    if i < 10:
        titles.append(title.text[3:])
    elif 10 <= i < 100:
        titles.append(title.text[4:])
    else:
        titles.append(title.text[5:])

In [13]:
titles

['The Shawshank Redemption',
 'The Godfather',
 'The Dark Knight',
 'The Godfather Part II',
 '12 Angry Men',
 "Schindler's List",
 'The Lord of the Rings: The Return of the King',
 'Pulp Fiction',
 'The Lord of the Rings: The Fellowship of the Ring',
 ' Il buono, il brutto, il cattivo',
 ' Forrest Gump',
 ' Fight Club',
 ' The Lord of the Rings: The Two Towers',
 ' Inception',
 ' Star Wars: Episode V - The Empire Strikes Back',
 ' The Matrix',
 ' Goodfellas',
 " One Flew Over the Cuckoo's Nest",
 ' Se7en',
 " It's a Wonderful Life",
 ' Shichinin no samurai',
 ' Interstellar',
 ' The Silence of the Lambs',
 ' Saving Private Ryan',
 ' Cidade de Deus',
 ' La vita è bella',
 ' The Green Mile',
 ' Spider-Man: Across the Spider-Verse',
 ' Star Wars',
 ' Terminator 2: Judgment Day',
 ' Back to the Future',
 ' Sen to Chihiro no kamikakushi',
 ' The Pianist',
 ' Psycho',
 ' Gisaengchung',
 ' Gladiator',
 ' The Lion King',
 ' Léon',
 ' American History X',
 ' The Departed',
 ' Whiplash',
 ' The

## Release Year and Running Time

In [14]:
data = content.find_all('span', class_="sc-43986a27-8 jHYIIK cli-title-metadata-item")

In [15]:
global_list = []

for items in data:
    global_list.append(items.string) 

In [16]:
release_year = global_list[::3]

In [17]:
running_time = global_list[1::3][:77] + global_list[231:][::3]

## Finding the ratings of the movies

In [18]:
class_rating = 'ipc-rating-star ipc-rating-star--base ipc-rating-star--imdb ratingGroup--imdb-rating'
span_elements = content.find_all('span', class_=class_rating)

In [19]:
span_elements

[<span aria-label="IMDb rating: 9.3" class="ipc-rating-star ipc-rating-star--base ipc-rating-star--imdb ratingGroup--imdb-rating"><svg class="ipc-icon ipc-icon--star-inline" fill="currentColor" height="24" role="presentation" viewbox="0 0 24 24" width="24" xmlns="http://www.w3.org/2000/svg"><path d="M12 20.1l5.82 3.682c1.066.675 2.37-.322 2.09-1.584l-1.543-6.926 5.146-4.667c.94-.85.435-2.465-.799-2.567l-6.773-.602L13.29.89a1.38 1.38 0 0 0-2.581 0l-2.65 6.53-6.774.602C.052 8.126-.453 9.74.486 10.59l5.147 4.666-1.542 6.926c-.28 1.262 1.023 2.26 2.09 1.585L12 20.099z"></path></svg>9.3<span class="ipc-rating-star--voteCount"> (<!-- -->2.8M<!-- -->)</span></span>,
 <span aria-label="IMDb rating: 9.2" class="ipc-rating-star ipc-rating-star--base ipc-rating-star--imdb ratingGroup--imdb-rating"><svg class="ipc-icon ipc-icon--star-inline" fill="currentColor" height="24" role="presentation" viewbox="0 0 24 24" width="24" xmlns="http://www.w3.org/2000/svg"><path d="M12 20.1l5.82 3.682c1.066.675 2

In [20]:
rating_list = []
for span_element in span_elements:
    rating_list.append(span_element['aria-label'].split(': ')[1])

In [21]:
rating_list

['9.3',
 '9.2',
 '9.0',
 '9.0',
 '9.0',
 '9.0',
 '9.0',
 '8.9',
 '8.8',
 '8.8',
 '8.8',
 '8.8',
 '8.8',
 '8.8',
 '8.7',
 '8.7',
 '8.7',
 '8.7',
 '8.6',
 '8.6',
 '8.6',
 '8.7',
 '8.6',
 '8.6',
 '8.6',
 '8.6',
 '8.6',
 '8.7',
 '8.6',
 '8.6',
 '8.5',
 '8.6',
 '8.5',
 '8.5',
 '8.5',
 '8.5',
 '8.5',
 '8.5',
 '8.5',
 '8.5',
 '8.5',
 '8.5',
 '8.5',
 '8.5',
 '8.6',
 '8.5',
 '8.5',
 '8.5',
 '8.5',
 '8.5',
 '8.5',
 '8.5',
 '8.5',
 '8.4',
 '8.5',
 '8.4',
 '8.4',
 '8.4',
 '8.4',
 '8.4',
 '8.4',
 '8.4',
 '8.4',
 '8.4',
 '8.4',
 '8.4',
 '8.4',
 '8.4',
 '8.4',
 '8.4',
 '8.3',
 '8.4',
 '8.4',
 '8.4',
 '8.4',
 '8.3',
 '8.4',
 '8.3',
 '8.4',
 '8.4',
 '8.3',
 '8.3',
 '8.4',
 '8.3',
 '8.4',
 '8.4',
 '8.3',
 '8.4',
 '8.3',
 '8.4',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.2',
 '8.3',
 '8.2',
 '8.3',
 '8.2',
 '8.3',
 '8.2',
 '8.2',
 '8.2',
 '8.2',


## Creating the IMDb 250 List

In [22]:
#storing the scraped information in a dataframe
IMDb_250 = pd.DataFrame({'title': titles, 'release year':release_year, 'rating': rating_list})

In [23]:
#Displaying the Dataframe
IMDb_250

Unnamed: 0,title,release year,rating
0,The Shawshank Redemption,1994,9.3
1,The Godfather,1972,9.2
2,The Dark Knight,2008,9.0
3,The Godfather Part II,1974,9.0
4,12 Angry Men,1957,9.0
...,...,...,...
245,. Les quatre cents coups,1h 39m,8.1
246,. Aladdin,1h 30m,8.0
247,. Dances with Wolves,3h 1m,8.0
248,. Persona,1h 25m,8.1


                ________  ________   _______   ______ 
                 /_  __/ / / / ____/  / ____/ | / / __ \
                  / / / /_/ / __/    / __/ /  |/ / / / /
                 / / / __  / /___   / /___/ /|  / /_/ / 
                /_/ /_/ /_/_____/  /_____/_/ |_/_____/  