# Web Scraping IMDB Top 250 Movies

by Collins Kimotho

In [88]:
# Import the necessary libraries
import pandas as pd
from bs4 import BeautifulSoup
import requests
import csv
import re

In [89]:
# Define the URL of IMDB's top movies chart
url = "https://www.imdb.com/chart/top/?ref_=nv_mv_250"

# Send a GET request to the URL
page = requests.get(url)
page

<Response [200]>

In [90]:
# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(page.content, "html.parser")

In [91]:
# Find all the movie titles, ratings, and years using their respective HTML attributes
titles = pagesoup.findAll("td", attrs={"class": "titleColumn"})
ratings = pagesoup.findAll("td", attrs={"class": "ratingColumn imdbRating"})
years = pagesoup.findAll("span", attrs={"class": "secondaryInfo"})

In [93]:
# Open a CSV file named "imdb_movies.csv" in write mode
with open("imdb_movies.csv", "w", newline="") as file:
    writer = csv.writer(file)
    writer.writerow(['title', 'rating', 'year'])
    
    # Iterate through the lists of titles, ratings, and years simultaneously
    for title, rating, year in zip(titles, ratings, years):
        print(title.text, rating.text, year.text)
        writer.writerow([title.text.strip(), rating.text.strip(), year.text.strip()])
        
print("Data saved to imdb_movies.csv")


      1.
      The Shawshank Redemption
(1994)
 
9.2
 (1994)

      2.
      The Godfather
(1972)
 
9.2
 (1972)

      3.
      The Dark Knight
(2008)
 
9.0
 (2008)

      4.
      The Godfather Part II
(1974)
 
9.0
 (1974)

      5.
      12 Angry Men
(1957)
 
9.0
 (1957)

      6.
      Schindler's List
(1993)
 
8.9
 (1993)

      7.
      The Lord of the Rings: The Return of the King
(2003)
 
8.9
 (2003)

      8.
      Pulp Fiction
(1994)
 
8.8
 (1994)

      9.
      The Lord of the Rings: The Fellowship of the Ring
(2001)
 
8.8
 (2001)

      10.
      Il buono, il brutto, il cattivo
(1966)
 
8.8
 (1966)

      11.
      Forrest Gump
(1994)
 
8.8
 (1994)

      12.
      Fight Club
(1999)
 
8.7
 (1999)

      13.
      The Lord of the Rings: The Two Towers
(2002)
 
8.7
 (2002)

      14.
      Inception
(2010)
 
8.7
 (2010)

      15.
      Star Wars: Episode V - The Empire Strikes Back
(1980)
 
8.7
 (1980)

      16.
      The Matrix
(1999)
 
8.7
 (1999)

      17.
      GoodFe

In [94]:
# Read the CSV file into a DataFrame
path= r"C:\Users\kimotho\imdb_movies.csv"
movies=pd.read_csv(path, encoding='latin-1')
movies

Unnamed: 0,title,rating,year
0,1.\n The Shawshank Redemption\n(1994),9.2,(1994)
1,2.\n The Godfather\n(1972),9.2,(1972)
2,3.\n The Dark Knight\n(2008),9.0,(2008)
3,4.\n The Godfather Part II\n(1974),9.0,(1974)
4,5.\n 12 Angry Men\n(1957),9.0,(1957)
...,...,...,...
245,246.\n The Iron Giant\n(1999),8.0,(1999)
246,247.\n The Help\n(2011),8.0,(2011)
247,248.\n Aladdin\n(1992),8.0,(1992)
248,249.\n Dances with Wolves\n(1990),8.0,(1990)


In [95]:
# Extract the titles from the movies DataFrame and print them
titles = [value.split("\n")[1] for value in movies['title']]

for title in titles:
    print(title)

      The Shawshank Redemption
      The Godfather
      The Dark Knight
      The Godfather Part II
      12 Angry Men
      Schindler's List
      The Lord of the Rings: The Return of the King
      Pulp Fiction
      The Lord of the Rings: The Fellowship of the Ring
      Il buono, il brutto, il cattivo
      Forrest Gump
      Fight Club
      The Lord of the Rings: The Two Towers
      Inception
      Star Wars: Episode V - The Empire Strikes Back
      The Matrix
      GoodFellas
      One Flew Over the Cuckoo's Nest
      Se7en
      It's a Wonderful Life
      Shichinin no samurai
      The Silence of the Lambs
      Saving Private Ryan
      Cidade de Deus
      Interstellar
      La vita è bella
      The Green Mile
      Star Wars
      Terminator 2: Judgment Day
      Back to the Future
      Sen to Chihiro no kamikakushi
      The Pianist
      Psycho
      Gisaengchung
      Léon
      Gladiator
      The Lion King
      American History X
      The Departed
      Whiplas

In [96]:
# Assign the 'titles' list back to the 'title' column of the 'movies' DataFrame
movies=movies.assign(title = titles)
movies

Unnamed: 0,title,rating,year
0,The Shawshank Redemption,9.2,(1994)
1,The Godfather,9.2,(1972)
2,The Dark Knight,9.0,(2008)
3,The Godfather Part II,9.0,(1974)
4,12 Angry Men,9.0,(1957)
...,...,...,...
245,The Iron Giant,8.0,(1999)
246,The Help,8.0,(2011)
247,Aladdin,8.0,(1992)
248,Dances with Wolves,8.0,(1990)


In [97]:
movies.dtypes

title      object
rating    float64
year       object
dtype: object

In [98]:
# Remove brackets from the 'year' column using regular expressions
movies['year'] = movies['year'].apply(lambda x: re.sub(r'[()]', '', str(x)))
movies

Unnamed: 0,title,rating,year
0,The Shawshank Redemption,9.2,1994
1,The Godfather,9.2,1972
2,The Dark Knight,9.0,2008
3,The Godfather Part II,9.0,1974
4,12 Angry Men,9.0,1957
...,...,...,...
245,The Iron Giant,8.0,1999
246,The Help,8.0,2011
247,Aladdin,8.0,1992
248,Dances with Wolves,8.0,1990


In [99]:
# Convert the 'year' column to the integer data type
movies['year'] = movies['year'].astype(int)
movies.dtypes

title      object
rating    float64
year        int32
dtype: object

In [100]:
# Check for missing values in the movies dataframe
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 0 to 249
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   title   250 non-null    object 
 1   rating  250 non-null    float64
 2   year    250 non-null    int32  
dtypes: float64(1), int32(1), object(1)
memory usage: 5.0+ KB


In [102]:
movies.describe()

Unnamed: 0,rating,year
count,250.0,250.0
mean,8.2516,1986.524
std,0.235114,25.230373
min,8.0,1921.0
25%,8.1,1966.25
50%,8.2,1994.0
75%,8.4,2006.75
max,9.2,2023.0
