In [1]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

import requests
from bs4 import BeautifulSoup
import time

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

## Top 200 Movies from the IMDb Top 250 Chart, with Number of Reviews

In [2]:
# Scrape the IMDb Top 250 chart page and build a DataFrame of (title, vote count).

# Headless Chrome with an explicit user agent; `options` is reused by later cells.
options = Options()
options.add_argument("--headless")
options.add_argument("user-agent=Mozilla/5.0")

driver = webdriver.Chrome(options=options)
try:
    driver.get('https://www.imdb.com/chart/top/')
    # Fixed pause before reading page_source (presumably so the page can finish rendering).
    time.sleep(5)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
finally:
    # Always close the browser, even when loading or parsing fails, so a
    # failed run does not leave an orphaned headless Chrome process behind.
    driver.quit()

movies = []
movie_blocks = soup.find_all("li", class_="ipc-metadata-list-summary-item")

for item in movie_blocks:
    try:
        # title (kept exactly as shown in the <h3>, including any rank prefix)
        title_tag = item.find("h3")
        title = title_tag.text.strip() if title_tag else "N/A"

        # num of reviews: text such as "(3M)" or "(849K)"; missing tag counts as 0
        vote_tag = item.select_one("span.ipc-rating-star--voteCount")
        votes_raw = vote_tag.text.strip() if vote_tag else "0"
        votes_raw = (
            votes_raw
            .replace('\xa0', '')
            .replace('(', '')
            .replace(')', '')
            .replace(',', '')  # tolerate thousands separators such as "1,234"
        )

        # Expand the K / M abbreviation into a numeric multiplier.
        multiplier = 1
        if 'K' in votes_raw:
            multiplier = 1_000
            votes_raw = votes_raw.replace('K', '')
        elif 'M' in votes_raw:
            multiplier = 1_000_000
            votes_raw = votes_raw.replace('M', '')

        # round() rather than int(): the float product can land a hair below
        # the intended integer, and truncation would then lose one vote.
        votes = round(float(votes_raw) * multiplier)

        movies.append((title, votes))

    except Exception as e:
        # Best effort: report the bad row and keep going with the rest.
        print(f"Error parsing row: {e}")


df = pd.DataFrame(movies, columns=["Title", "NumReviews"])

print(df.head())
print(f"Total movies saved: {len(df)}")

                         Title  NumReviews
0  1. The Shawshank Redemption     3000000
1             2. The Godfather     2100000
2           3. The Dark Knight     3000000
3     4. The Godfather Part II     1400000
4              5. 12 Angry Men      920000
Total movies saved: 250


In [3]:
# Print the first 200 scraped rows as a single plain-text table.
print(df.iloc[:200].to_string())

                                                                        Title  NumReviews
0                                                 1. The Shawshank Redemption     3000000
1                                                            2. The Godfather     2100000
2                                                          3. The Dark Knight     3000000
3                                                    4. The Godfather Part II     1400000
4                                                             5. 12 Angry Men      920000
5                            6. The Lord of the Rings: The Return of the King     2100000
6                                                         7. Schindler's List     1500000
7                                                             8. Pulp Fiction     2300000
8                        9. The Lord of the Rings: The Fellowship of the Ring     2100000
9                                          10. The Good, the Bad and the Ugly      849000
10        

## Top Foreign Movies

In [7]:
# Scrape the IMDb list ls052393071 and build a DataFrame of (title, vote count).
# Uses the Chrome `options` object configured in an earlier cell.

driver = webdriver.Chrome(options=options)
try:
    driver.get('https://www.imdb.com/list/ls052393071/')
    # Fixed pause before reading page_source (presumably so the page can finish rendering).
    time.sleep(5)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
finally:
    # Always close the browser, even when loading or parsing fails, so a
    # failed run does not leave an orphaned headless Chrome process behind.
    driver.quit()

movies = []
movie_blocks = soup.find_all("li", class_="ipc-metadata-list-summary-item")

for item in movie_blocks:
    try:
        # title (kept exactly as shown in the <h3>, including any rank prefix)
        title_tag = item.find("h3")
        title = title_tag.text.strip() if title_tag else "N/A"

        # vote count: text such as "(426K)" or "(9.7K)"; missing tag counts as 0
        vote_tag = item.select_one("span.ipc-rating-star--voteCount")
        votes_raw = vote_tag.text.strip() if vote_tag else "0"
        votes_raw = (
            votes_raw
            .replace('\xa0', '')
            .replace('(', '')
            .replace(')', '')
            .replace(',', '')  # tolerate thousands separators such as "1,234"
        )

        # Expand the K / M abbreviation into a numeric multiplier.
        multiplier = 1
        if 'K' in votes_raw:
            multiplier = 1_000
            votes_raw = votes_raw.replace('K', '')
        elif 'M' in votes_raw:
            multiplier = 1_000_000
            votes_raw = votes_raw.replace('M', '')

        # round() rather than int(): the float product can land a hair below
        # the intended integer, and truncation would then lose one vote.
        votes = round(float(votes_raw) * multiplier)

        movies.append((title, votes))

    except Exception as e:
        # Best effort: report the bad row and keep going with the rest.
        print(f"Error parsing row: {e}")

df = pd.DataFrame(movies, columns=["Title", "NumReviews"])

print(df.head())
print(f"Total movies saved: {len(df)}")

                    Title  NumReviews
0  1. The Lives of Others      426000
1       2. Noi the Albino        9700
2             3. Das Boot      278000
3      4. Pan's Labyrinth      723000
4               5. Oldboy      675000
Total movies saved: 250


In [8]:
# Print the first 200 scraped rows as a single plain-text table.
print(df.iloc[:200].to_string())

                                              Title  NumReviews
0                            1. The Lives of Others      426000
1                                 2. Noi the Albino        9700
2                                       3. Das Boot      278000
3                                4. Pan's Labyrinth      723000
4                                         5. Oldboy      675000
5                                 6. Open Your Eyes       75000
6                          7. Max Manus: Man of War       31000
7                                        8. Respiro        5000
8                                   9. Run Lola Run      213000
9                                          10. Diva       15000
10   11. Spring, Summer, Fall, Winter... and Spring       89000
11               12. The Beat That My Heart Skipped       21000
12                                     13. The Wave      118000
13                           14. The Counterfeiters       48000
14                                  15. 

## Top 10 Directors

In [10]:
# Scrape the IMDb list ls026411399 and keep the heading text of its first ten entries.
# Uses the Chrome `options` object configured in an earlier cell.

driver = webdriver.Chrome(options=options)
try:
    driver.get('https://www.imdb.com/list/ls026411399/')
    # Fixed pause before reading page_source (presumably so the page can finish rendering).
    time.sleep(5)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
finally:
    # Always close the browser, even when loading or parsing fails, so a
    # failed run does not leave an orphaned headless Chrome process behind.
    driver.quit()

directors = []

blocks = soup.find_all("li", class_="ipc-metadata-list-summary-item")

# Only the first ten list entries are used.
for item in blocks[:10]:
    try:
        # The entry's <h3> text is stored as-is; "N/A" when the tag is missing.
        name_tag = item.find("h3")
        name = name_tag.text.strip() if name_tag else "N/A"
        directors.append(name)
    except Exception as e:
        # Best effort: report the bad entry and keep going with the rest.
        print("Error parsing director:", e)

In [12]:
# Emit the heading followed by each collected director entry on its own line.
print("Top 10 Directors", *directors, sep="\n")

Top 10 Directors
1. Christopher Nolan
2. Steven Spielberg
3. Quentin Tarantino
4. Martin Scorsese
5. Ridley Scott
6. David Fincher
7. Robert Zemeckis
8. Stanley Kubrick
9. Clint Eastwood
10. Francis Ford Coppola


## Recommending Movies Based on Actors