In [58]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

import requests
from bs4 import BeautifulSoup
import time

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

## Top 200 Movies Based on IMDb Ranking and Number of Reviews

In [56]:
# --- Fetch the chart page ---
# Load the page in headless Chrome and snapshot its HTML after a fixed wait.
options = Options()
options.add_argument("--headless")
# NOTE(review): presumably the custom user agent keeps the site from rejecting
# the default headless one — confirm it is still required.
options.add_argument("user-agent=Mozilla/5.0")

driver = webdriver.Chrome(options=options)
try:
    driver.get('https://www.imdb.com/chart/top/')
    # Fixed pause before reading the page source.
    time.sleep(5)

    soup = BeautifulSoup(driver.page_source, 'html.parser')
finally:
    # Always shut the browser down, even when loading or parsing raises,
    # so a failed run does not leave a Chrome process behind.
    driver.quit()

# --- Scrape Data ---
# Suffixes an abbreviated count may end with, and the factor each one implies.
_VOTE_SUFFIX_MULTIPLIERS = {"K": 1_000, "M": 1_000_000}


def _parse_vote_count(text):
    """Convert an abbreviated count string to an integer.

    Non-breaking spaces, parentheses and thousands separators are removed
    first, so "(3M)", "849K", "2.1m" and "1,234" are all accepted.

    Args:
        text: Raw text of the vote-count element.

    Returns:
        The count as an int, with a trailing K or M (either case) expanded.

    Raises:
        ValueError: If what remains after clean-up is not a number.
    """
    cleaned = text
    for junk in ('\xa0', '(', ')', ','):
        cleaned = cleaned.replace(junk, '')
    cleaned = cleaned.strip().upper()

    multiplier = 1
    if cleaned and cleaned[-1] in _VOTE_SUFFIX_MULTIPLIERS:
        multiplier = _VOTE_SUFFIX_MULTIPLIERS[cleaned[-1]]
        cleaned = cleaned[:-1]

    # round() rather than int(): truncation would turn a product that lands a
    # hair below a whole number (float representation error) into count - 1.
    return round(float(cleaned) * multiplier)


movies = []
movie_blocks = soup.find_all("li", class_="ipc-metadata-list-summary-item")

for item in movie_blocks:
    try:
        # Title
        # NOTE(review): the heading text includes the chart rank ("1. ..."),
        # as the saved output shows; it is kept as-is here — strip it if a
        # clean title is needed for joins.
        title_tag = item.find("h3")
        title = title_tag.text.strip() if title_tag else "N/A"

        # Votes: a row without a vote-count element is recorded with 0.
        vote_tag = item.select_one("span.ipc-rating-star--voteCount")
        votes = _parse_vote_count(vote_tag.text) if vote_tag else 0

        movies.append((title, votes))

    except (ValueError, OverflowError) as e:
        # Best effort: report the unparseable row and continue with the rest.
        print(f"Error parsing row: {e}")

# --- Build the DataFrame, write it to disk, and summarise ---
# Each (title, votes) tuple collected in `movies` becomes one row.
review_columns = ["Title", "NumReviews"]
df = pd.DataFrame(data=movies, columns=review_columns)

# index=False keeps the positional index out of the CSV.
reviews_csv_path = "top_250_movies_with_reviews.csv"
df.to_csv(reviews_csv_path, index=False)

# Quick sanity check: first rows plus the number of rows captured.
print(df.head())
print(f"Total movies saved: {len(df)}")

                         Title  NumReviews
0  1. The Shawshank Redemption     3000000
1             2. The Godfather     2100000
2           3. The Dark Knight     3000000
3     4. The Godfather Part II     1400000
4              5. 12 Angry Men      920000
Total movies saved: 250


In [57]:
# Show the first 200 rows in full; to_string() prevents pandas from eliding rows.
print(df.iloc[:200].to_string())

                                                                        Title  NumReviews
0                                                 1. The Shawshank Redemption     3000000
1                                                            2. The Godfather     2100000
2                                                          3. The Dark Knight     3000000
3                                                    4. The Godfather Part II     1400000
4                                                             5. 12 Angry Men      920000
5                            6. The Lord of the Rings: The Return of the King     2100000
6                                                         7. Schindler's List     1500000
7                                                             8. Pulp Fiction     2300000
8                        9. The Lord of the Rings: The Fellowship of the Ring     2100000
9                                          10. The Good, the Bad and the Ugly      849000
10        

## Top Foreign Movies

## Top 10 Directors

## Recommending Movies Based on Actors