In [None]:
# Import required libraries for HTTP requests and HTML parsing
import html

import requests
from bs4 import BeautifulSoup

In [None]:
def getPage(url, timeout=30):
    """
    Request a URL and return its HTML parsed into a BeautifulSoup object.

    Parameters
    ----------
    url : str
        Address of the page to download.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    BeautifulSoup or None
        The parsed document, or None when the request fails for any reason:
        connection problem, timeout, or an HTTP 4xx/5xx status.
    """

    # Set headers to mimic a real browser and avoid basic bot detection
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36',
               'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'}
    try:
        # The context manager closes the session (and its pooled connections)
        # once the response has been received
        with requests.Session() as session:
            # A timeout keeps one unresponsive server from hanging the notebook
            response = session.get(url, headers=headers, timeout=timeout)
            # Turn 4xx/5xx answers into exceptions so an error page is never
            # parsed as if it were the requested document
            response.raise_for_status()
    except requests.exceptions.RequestException:
        # Return None if there is any network/HTTP error
        return None
    # Parse the response HTML with BeautifulSoup
    return BeautifulSoup(response.text, 'html.parser')

In [None]:
# Target the IMDB Top 250 chart page and parse it
url = "https://www.imdb.com/chart/top"
soup = getPage(url)

# getPage signals a failed request by returning None; stop here with a clear
# message instead of failing in a later cell with an AttributeError.
if soup is None:
    raise RuntimeError(f"Could not download {url}")

In [None]:
# Locate the <script> element whose type marks it as JSON-LD; it carries the
# structured description of the chart
script_tag = soup.find('script', attrs={'type': 'application/ld+json'})

In [None]:
import json

# Extract and parse the JSON-LD content into a Python dict.
# The lookup yields None when no matching tag exists, and `.string` is None
# when the tag has no single text child; report that clearly instead of
# letting it surface as an AttributeError/TypeError.
if script_tag is None or script_tag.string is None:
    raise RuntimeError("JSON-LD <script> tag is missing or empty; the page layout may have changed")
json_data = script_tag.string
data = json.loads(json_data)

In [None]:
# JSON-LD keeps the chart as a list under this key, one element per movie
itemListElement = data["itemListElement"]

In [None]:
# Assemble one record per chart entry from the JSON-LD list; year and country
# start out as None and are filled in by later cells
movies = []
for rank, item in enumerate(itemListElement, start=1):
    details = item['item']
    movie = {
        'movie_ranking': rank,  # 1-based position in the chart
        'movie_title': details['name'],
        'movie_year': None,
        'movie_country': None,
        'movie_rating': details['aggregateRating']['ratingValue'],
        'movie_genre': details['genre'],
        # Fifth '/'-separated piece of the title URL is the IMDb id
        'movie_imdb_id': details['url'].split('/')[4],
        'movie_url': details['url'],
    }
    movies.append(movie)

In [None]:
# Locate the Next.js bootstrap <script>; its JSON carries extra metadata
# (e.g., release years) that the JSON-LD block lacks
script_tag = soup.find('script', attrs={'id': '__NEXT_DATA__', 'type': 'application/json'})

In [None]:
# Extract and parse the Next.js data JSON.
# The lookup yields None when no matching tag exists, and `.string` is None
# when the tag has no single text child; report that clearly instead of
# letting it surface as an AttributeError/TypeError.
if script_tag is None or script_tag.string is None:
    raise RuntimeError("__NEXT_DATA__ <script> tag is missing or empty; the page layout may have changed")
json_data = script_tag.string
data = json.loads(json_data)

In [None]:
# Walk down the nested Next.js payload, one key at a time, to the list of
# chart edges that carries the detailed movie metadata
edges = data
for key in ('props', 'pageProps', 'pageData', 'chartTitles', 'edges'):
    edges = edges[key]


In [None]:
# Extract release years from the Next.js data, in the order the edges appear
release_years = [edge['node']['releaseYear']['year'] for edge in edges]

# Years are paired with movies purely by position, and zip() stops at the
# shorter input. A length mismatch would silently leave movies without a year
# (or mis-pair them), so fail loudly instead.
if len(release_years) != len(movies):
    raise ValueError(
        f"Got {len(release_years)} release years for {len(movies)} movies; "
        "the two data sources are out of sync"
    )

# Match release years with movies and update the movie_year field
for movie, year in zip(movies, release_years):
    movie['movie_year'] = year

In [None]:
# Preview the first few records to verify data extraction; showing the whole
# list would flood the notebook output
movies[:5]

[{'movie_ranking': 1,
  'movie_title': 'The Shawshank Redemption',
  'movie_year': 1994,
  'movie_country': None,
  'movie_rating': 9.3,
  'movie_genre': 'Drama',
  'movie_imdb_id': 'tt0111161',
  'movie_url': 'https://www.imdb.com/title/tt0111161/'},
 {'movie_ranking': 2,
  'movie_title': 'The Godfather',
  'movie_year': 1972,
  'movie_country': None,
  'movie_rating': 9.2,
  'movie_genre': 'Crime, Drama',
  'movie_imdb_id': 'tt0068646',
  'movie_url': 'https://www.imdb.com/title/tt0068646/'},
 {'movie_ranking': 3,
  'movie_title': 'The Dark Knight',
  'movie_year': 2008,
  'movie_country': None,
  'movie_rating': 9.1,
  'movie_genre': 'Action, Crime, Drama',
  'movie_imdb_id': 'tt0468569',
  'movie_url': 'https://www.imdb.com/title/tt0468569/'},
 {'movie_ranking': 4,
  'movie_title': 'The Godfather Part II',
  'movie_year': 1974,
  'movie_country': None,
  'movie_rating': 9,
  'movie_genre': 'Crime, Drama',
  'movie_imdb_id': 'tt0071562',
  'movie_url': 'https://www.imdb.com/title/tt

In [None]:
# Scrape country information for each movie by visiting individual movie pages
import re

# Pattern for the "country of origin" links; compiled once, outside the loop
country_href = re.compile(r'country_of_origin=')

country_list = []
for movie in movies:
    # Dedicated names so the chart-level `url` / `soup` are not overwritten
    movie_page = getPage(movie['movie_url'])
    # getPage returns None when the request fails, and a page may have no
    # country link. Record None in both cases so country_list keeps exactly
    # one entry per movie and stays aligned with `movies`.
    country_link = movie_page.find('a', href=country_href) if movie_page is not None else None
    country_list.append(country_link.get_text() if country_link is not None else None)

# Preview only the first entries rather than dumping the whole list
country_list[:10]


['United States',
 'United States',
 'United States',
 'United States',
 'United States',
 'New Zealand',
 'United States',
 'New Zealand',
 'United States',
 'Italy',
 'United States',
 'New Zealand',
 'United States',
 'United States',
 'United States',
 'United States',
 'United States',
 'United States',
 'United States',
 'United States',
 'United States',
 'United States',
 'Japan',
 'United States',
 'United States',
 'Brazil',
 'Italy',
 'United States',
 'United States',
 'United States',
 'Japan',
 'France',
 'United States',
 'South Korea',
 'United States',
 'United States',
 'Japan',
 'United States',
 'United States',
 'Japan',
 'United Kingdom',
 'United States',
 'France',
 'United States',
 'United States',
 'Italy',
 'France',
 'United States',
 'United Kingdom',
 'United States',
 'United States',
 'United States',
 'Italy',
 'United States',
 'United States',
 'United States',
 'United States',
 'United States',
 'United States',
 'United States',
 'Germany',
 'Unit

In [None]:
# Copy each scraped country onto the movie at the same position; stops at the
# end of the shorter of the two lists
for idx in range(min(len(movies), len(country_list))):
    movies[idx]['movie_country'] = country_list[idx]

In [None]:
# Give every movie its own fresh, empty list to collect reviews into
for entry in movies:
    entry.update(reviews=[])

In [None]:
def fetch_reviews_graphql(movie_id: str, after: "str | None" = None) -> dict:
    """Fetch one page of reviews for a title from the IMDb GraphQL endpoint.

    Args:
        movie_id: IMDb title identifier (e.g. ``"tt0111161"``).
        after: Pagination cursor taken from the previous page's
            ``pageInfo['endCursor']``; ``None`` requests the first page.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: The server answered with a 4xx/5xx status.
        requests.RequestException: Any other request failure (including a
            timeout after 30 seconds).
    """

    # The Referer must be built from this function's own argument; relying on
    # a notebook-level variable makes the function fail (or send the wrong
    # title) whenever that global is missing or stale.
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
        "Content-Type": "application/json",
        "Referer": f"https://www.imdb.com/title/{movie_id}/reviews",
        "Accept": "application/json",
        "Origin": "https://www.imdb.com",
    }

    # Query variables are embedded directly in the payload
    payload = {
        "operationName": "TitleReviewsRefine",
        "variables": {
            "after": after,
            "const": movie_id,
            "first": 25,  # page size requested per call
            "locale": "en-US",
            "sort": {"by": "HELPFULNESS_SCORE", "order": "DESC"},
            "filter": {},
        },
        # Persisted-query reference: the request names a stored query by hash
        # instead of sending the query text
        "extensions": {
            "persistedQuery": {
                "sha256Hash": "d389bc70c27f09c00b663705f0112254e8a7c75cde1cfd30e63a2d98c1080c87",
                "version": 1,
            }
        },
    }

    # Send the request and surface HTTP errors as exceptions
    url = "https://caching.graphql.imdb.com/"
    resp = requests.post(url, json=payload, headers=headers, timeout=30)
    resp.raise_for_status()
    return resp.json()

In [None]:
def clean_text(text: str) -> str:
    """Strip HTML markup from ``text`` and collapse its whitespace.

    Args:
        text: Raw value to clean. Non-string values are converted with
            ``str()``; any falsy value (``None``, ``""``) yields ``""``.

    Returns:
        The text with tags removed, HTML entities decoded exactly once, and
        every run of whitespace reduced to a single space, with no leading
        or trailing whitespace.
    """
    if not text:
        return ""

    # Line-break tags separate sentences; turn them into a space first so the
    # words on either side are not glued together when tags are removed
    text = re.sub(r'<br\s*/?>', ' ', str(text), flags=re.IGNORECASE)

    # Remove all remaining HTML tags
    text = re.sub(r'<[^>]+>', '', text)

    # Decode entities in a single pass. Chained str.replace calls decode
    # twice ("&amp;lt;" -> "&lt;" -> "<") and miss numeric entities.
    text = html.unescape(text)

    # Normalize whitespace (spaces/tabs/newlines/non-breaking spaces) to single spaces
    return re.sub(r'\s+', ' ', text).strip()

In [None]:
# Scrape reviews for movies
from tqdm import tqdm

# Upper bound on the number of review pages requested per movie
MAX_REVIEW_PAGES = 20

for movie in tqdm(movies, desc='Scraping Reviews for Movies'):

    title_id = movie['movie_imdb_id']
    all_reviews = []
    after = None  # Cursor for pagination; None asks for the first page

    for page in range(MAX_REVIEW_PAGES):
        try:
            data = fetch_reviews_graphql(title_id, after)
        except requests.exceptions.RequestException as exc:
            # A failed request should not abort the whole run: keep the pages
            # already collected for this movie, report it, and move on.
            tqdm.write(f"{title_id}: stopped after {page} page(s): {exc}")
            break

        review_connection = data['data']['title']['reviews']

        # Extract review data and clean text content
        for review in review_connection['edges']:
            node = review['node']
            all_reviews.append({
                'review_rating': clean_text(node['authorRating']),
                'review_title': node['summary']['originalText'],
                'review_content': clean_text(node['text']['originalText']['plaidHtml']),
            })

        # Check if there are more pages available
        page_info = review_connection['pageInfo']
        if not page_info['hasNextPage']:
            break
        after = page_info['endCursor']  # Update cursor for next page
    movie['reviews'] = all_reviews


Scraping Reviews for Movies: 100%|██████████| 21/21 [05:50<00:00, 16.70s/it]


In [None]:
# Function to save movies and reviews data to CSV file
import csv


def save_to_csv(movies, output_file='movies_reviews_part.csv'):
    """Write one CSV row per review, each carrying its movie's metadata.

    Movies whose 'reviews' entry is empty contribute no rows. The file is
    overwritten if it already exists and is encoded as UTF-8.
    """
    header = (
        'movie_rank', 'movie_title', 'movie_year', 'movie_country',
        'movie_rating', 'movie_genre', 'movie_imdb_id', 'movie_url',
        'review_title', 'review_rating', 'review_content',
    )
    movie_keys = (
        'movie_ranking', 'movie_title', 'movie_year', 'movie_country',
        'movie_rating', 'movie_genre', 'movie_imdb_id', 'movie_url',
    )
    review_keys = ('review_title', 'review_rating', 'review_content')

    print(f"Saving data to {output_file}...")

    with open(output_file, 'w', newline='', encoding='utf-8') as csv_file:
        out = csv.writer(csv_file)
        out.writerow(header)

        for film in movies:
            film_reviews = film['reviews']
            if not film_reviews:
                continue
            # Movie columns are the same for every review of this movie
            film_cells = [film[key] for key in movie_keys]
            out.writerows(
                film_cells + [entry[key] for key in review_keys]
                for entry in film_reviews
            )

In [None]:
# Write every scraped movie/review pair out to the CSV file
save_to_csv(movies, output_file='movies_reviews_part.csv')

Saving data to movies_reviews_part3.csv...
