In [4]:
!pip install isodate --quiet

In [113]:
import httpx
from bs4 import BeautifulSoup
import json
import isodate
import pandas as pd
import numpy as np
from ast import literal_eval
import os
from pathlib import Path
import html
from io import StringIO

## Data Extraction

In [6]:
def _get_page(id_: str, title: bool = True, awards: bool = False):
    """Download an IMDb page and return its HTML text.

    Args:
        id_: IMDb identifier placed in the URL path.
        title: When true the ``/title/`` page is requested, otherwise the
            ``/name/`` page.
        awards: When true, ``/awards`` is appended to the URL.

    Returns:
        The response body as text when the status code is 200 and the body is
        non-empty, otherwise ``None``.
    """
    section = "title" if title else "name"
    url = f"https://www.imdb.com/{section}/{id_}"
    if awards:
        url = url + "/awards"

    request_headers = {
        "Content-Type": "application/json",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:129.0) Gecko/20100101 Firefox/129.0",
        "Referer": "https://www.imdb.com/chart/top/",
    }
    # timeout=None disables httpx's timeouts, so a stalled request blocks indefinitely.
    res = httpx.get(
        url,
        headers=request_headers,
        follow_redirects=True,
        timeout=None,
    )

    if res.status_code != 200 or not res.text:
        return None
    return res.text

In [7]:
def get_page(id_: str, title: bool=True):
    """Fetch the main (non-awards) IMDb page for ``id_``.

    Delegates to ``_get_page`` and returns whatever it returns: the HTML text,
    or ``None`` when the download did not succeed.
    """
    page = _get_page(id_=id_, title=title)
    return page

In [8]:
def extract_metadata(imdb_id: str, page: str):
    """Extract the embedded JSON metadata from a title page and save it to disk.

    Two ``<script>`` tags are read: the ``application/ld+json`` block and the
    ``__NEXT_DATA__`` block. From the latter, ``aboveTheFoldData`` and
    ``mainColumnData`` (under ``props.pageProps``) are merged into one dict.
    The result is written to ``json/<imdb_id>.json`` as
    ``{"meta": ..., "additional_meta": ...}``.

    Args:
        imdb_id: Identifier used for the output file name.
        page: HTML text of the title page.

    Raises:
        ValueError: If ``page`` is empty or either script tag is missing.
            (Previously these cases surfaced as TypeError / UnboundLocalError.)
    """
    if not page:
        raise ValueError(f"No page content to extract metadata from for {imdb_id}")

    soup = BeautifulSoup(page, "html.parser")

    main_script = soup.find("script", {"type": "application/ld+json"})
    if not main_script:
        raise ValueError(f"No application/ld+json script found for {imdb_id}")
    main_script_json = json.loads(main_script.text)

    additional_meta = soup.find("script", {"id": "__NEXT_DATA__"})
    if not additional_meta:
        raise ValueError(f"No __NEXT_DATA__ script found for {imdb_id}")
    props = json.loads(additional_meta.text).get("props", {}).get("pageProps", {})
    # Either section may be absent or null; fall back to {} so the merge below cannot fail.
    above_the_fold = props.get("aboveTheFoldData") or {}
    main_column = props.get("mainColumnData") or {}

    os.makedirs("json", exist_ok=True)

    data = {"meta": main_script_json, "additional_meta": above_the_fold | main_column}
    with open(f"json/{imdb_id}.json", "w", encoding="utf-8") as f:
        json.dump(data, f)

In [9]:
def process_page(page: str):
    """Parse the ``application/ld+json`` block of a title page into a flat dict.

    Args:
        page: HTML text of the title page.

    Returns:
        A dict of selected ld+json fields, with ``duration`` converted from an
        ISO-8601 duration string to minutes (``None`` when the page carries no
        duration). Returns ``None`` when the script tag is not found.
    """
    soup = BeautifulSoup(page, "html.parser")
    script = soup.find("script", {"type": "application/ld+json"})
    if not script:
        return None

    script_json = json.loads(script.text)
    # aggregateRating may be absent or null.
    aggregate_rating = script_json.get("aggregateRating") or {}
    # parse_duration cannot handle None, so only parse when a value is present.
    raw_duration = script_json.get("duration")
    duration_minutes = (
        isodate.parse_duration(raw_duration).total_seconds() / 60
        if raw_duration
        else None
    )

    return {
        "type": script_json.get("@type"),
        "url": script_json.get("url"),
        "name": script_json.get("name"),
        "image": script_json.get("image"),
        "description": script_json.get("description"),
        "rating": aggregate_rating.get("ratingValue"),
        "rating_count": aggregate_rating.get("ratingCount"),
        "content_rating": script_json.get("contentRating"),
        "genre": script_json.get("genre"),
        "keywords": script_json.get("keywords"),
        "release_date_1": script_json.get("datePublished"),
        "actors": script_json.get("actor"),
        "director": script_json.get("director"),
        "creator": script_json.get("creator"),
        "duration": duration_minutes,
    }

In [85]:
def process_additional_page_metadata(page: str):
    """Parse the ``__NEXT_DATA__`` block of a title page into a flat dict.

    Values are read from ``props.pageProps.aboveTheFoldData`` and
    ``props.pageProps.mainColumnData``. Any section that is missing or null
    yields ``None`` (or the documented default) instead of raising.

    The review total and the trivia total are reported under separate keys
    (``"reviews"`` and ``"trivia"``); previously both were written to
    ``"reviews"``, so the trivia total overwrote the review total.

    Args:
        page: HTML text of the title page.

    Returns:
        A dict of extracted values, or ``None`` when the script tag is not found.
    """

    def dig(node, *keys, default=None):
        """Follow ``keys`` through nested dicts; ``default`` when the path breaks."""
        *parents, leaf = keys
        for key in parents:
            if not isinstance(node, dict):
                return default
            node = node.get(key)
        if not isinstance(node, dict):
            return default
        return node.get(leaf, default)

    soup = BeautifulSoup(page, "html.parser")
    script = soup.find("script", {"id": "__NEXT_DATA__"})
    if not script:
        return None

    page_props = dig(json.loads(script.text), "props", "pageProps") or {}
    above = page_props.get("aboveTheFoldData") or {}
    main = page_props.get("mainColumnData") or {}

    budget = main.get("productionBudget")
    lifetime_gross = main.get("lifetimeGross")

    return {
        "release_year": dig(above, "releaseYear", "year"),
        "release_date_2": above.get("releaseDate", {}),
        "popularity": dig(above, "meterRanking", "currentRank"),
        "metascore": dig(above, "metacritic", "metascore", "score"),
        "plot": dig(above, "plot", "plotText", "plainText"),
        "interests": [
            dig(edge, "node", "primaryText", "text")
            for edge in dig(above, "interests", "edges") or []
        ],
        # amount defaults to 0 only when a budget/gross section exists but has no amount.
        "budget": {
            "amount": dig(budget, "budget", "amount", default=0) if budget else None,
            "currency": dig(budget, "budget", "currency") if budget else None,
        },
        "gross": {
            "amount": dig(lifetime_gross, "total", "amount", default=0) if lifetime_gross else None,
            "currency": dig(lifetime_gross, "total", "currency") if lifetime_gross else None,
        },
        "credits": dig(above, "credits", "total"),
        "reviews": dig(above, "reviews", "total"),
        "trivia": dig(above, "triviaTotal", "total"),
        "engagement_stats": above.get("engagementStatistics", {}),
        "countries": dig(above, "countriesOfOrigin", "countries", default=[]),
        "production": dig(above, "production", "edges", default=[]),
        "wins": dig(main, "wins", "total"),
        "nominations": dig(main, "nominationsExcludeWins", "total"),
        "prestigious_awards": main.get("prestigiousAwardSummary"),
    }

In [11]:
def get_awards_page(id_: str, title: bool =True):
    """Fetch the ``/awards`` sub-page for ``id_``.

    Delegates to ``_get_page`` with ``awards=True`` and returns its result:
    the HTML text, or ``None`` when the download did not succeed.
    """
    awards_page = _get_page(id_=id_, title=title, awards=True)
    return awards_page

In [12]:
def process_awards_page(page: str):
    """Walk the award sections of an awards page (work in progress).

    For every ``section.ipc-page-section`` that contains a
    ``ul.meta-data-award-list``, the text of each
    ``li.ipc-metadata-list-summary-item`` is printed followed by a separator
    line. Structured parsing is not enabled yet (see the disabled draft in the
    loop), so the returned list is always empty.

    Args:
        page: HTML text of the awards page.

    Returns:
        An empty list.
    """
    ignored_headings = (
        "Contribute to this page",
        "More from this title",
        "Recently viewed",
    )
    all_awards = []

    document = BeautifulSoup(page, "html.parser")
    for section in document.find_all("section", class_="ipc-page-section"):
        heading = section.find("h3")
        if heading and heading.text not in ignored_headings:
            # Only consumed by the disabled draft parser below.
            award_name = heading.text

        award_list = section.find("ul", class_="meta-data-award-list")
        if not award_list:
            continue

        for li in award_list.find_all("li", class_="ipc-metadata-list-summary-item"):
            print(li.text)
            print("=" * 20)

            # Disabled draft parser, kept for reference:
            #
            # data = {}
            # year_achievement = li.find("a", class_="ipc-metadata-list-summary-item__t")
            # if year_achievement:
            #     year, achievement = year_achievement.find(string=True, recursive=False).strip().split(" ")
            #
            # if li.find(class_="awardCategoryName"):
            #     award_category_name = li.find(class_="awardCategoryName").text
            #
            # winners_nominees_ul = li.find(class_="ipc-metadata-list-summary-item__stl")
            # if winners_nominees_ul:
            #     winners_nominees = winners_nominees_ul.find_all("li")
            #     for winner_nominee in winners_nominees:
            #         award_for = winner_nominee.text.strip()
            #         data = {
            #             "year": int(year),
            #             "award_name": award_name,
            #             "achievement": achievement,
            #             "award_category_name": award_category_name,
            #             "award_for": award_for,
            #         }
            #         all_awards.append(data)
    return all_awards

In [13]:
def get_hidden_awards():
    """Query IMDb's GraphQL endpoint for one page of a person's awards.

    Sends the persisted ``NameAwardsSubPagePagination`` query with hard-coded
    variables (name ``nm0000129``, event ``ev0000004``, a fixed ``after``
    cursor, 50 items).

    Returns:
        The decoded JSON response. (Previously the decoded response was
        discarded and the function always returned ``None``.)

    Raises:
        httpx.HTTPStatusError: If the endpoint answers with a 4xx/5xx status.
    """
    url = "https://caching.graphql.imdb.com"
    params = {
        "operationName": "NameAwardsSubPagePagination",
        "variables": json.dumps({"after":"YW4wMTk4ODAx","const":"nm0000129","filter":{"events":["ev0000004"]},"first":50,"locale":"en-US","originalTitleText":False,"queryParams":"nmawd"}),
        "extensions": json.dumps({"persistedQuery":{"sha256Hash":"aa506f0fa08d88049180ee0ca73bf7af8be09708af1a2066023c29d1ee37bb5d","version":1}})
    }
    res = httpx.get(
        url,
        params=params,
        headers={
            "content-type": "application/json",
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:129.0) Gecko/20100101 Firefox/129.0",
        },
    )
    # Fail with a clear HTTP error instead of a JSON decode error on an error page.
    res.raise_for_status()

    return res.json()

In [14]:
def get_top_250_movies():
    """Download the IMDb Top 250 chart page.

    Returns:
        The page HTML as text when the response status is 200, else ``None``.
    """
    request_headers = {
        "content-type": "application/json",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:129.0) Gecko/20100101 Firefox/129.0",
    }
    # timeout=None disables httpx's timeouts for this request.
    res = httpx.get(
        "https://www.imdb.com/chart/top/",
        headers=request_headers,
        follow_redirects=True,
        timeout=None,
    )
    return res.text if res.status_code == 200 else None

def process_top_250_movies(page: str):
    """Return the ``itemListElement`` entries embedded in the chart page.

    The entries come from the page's ``application/ld+json`` script tag.
    ``None`` is returned when that tag is absent (or when the decoded JSON has
    no ``itemListElement`` key).
    """
    document = BeautifulSoup(page, "html.parser")
    ld_json_tag = document.find("script", {"type":"application/ld+json"})
    if not ld_json_tag:
        return None
    chart = json.loads(ld_json_tag.text)
    return chart.get("itemListElement")

In [15]:
# Download the chart page once; the following cells parse this HTML.
# get_top_250_movies() returns None when the response status is not 200.
top_250_page = get_top_250_movies()

In [16]:
# Extract the ld+json "itemListElement" entries from the chart HTML
# (None when the script tag is not present in the page).
processed_top_250 = process_top_250_movies(top_250_page)

In [17]:
# The IMDb id is the last path segment of each entry's URL. Stripping the
# trailing slash first makes this work with or without it, and entries that
# carry no URL are skipped instead of raising AttributeError.
top_250_imdb_ids = [
    movie_url.rstrip("/").split("/")[-1]
    for movie_url in (entry.get("item", {}).get("url") for entry in processed_top_250)
    if movie_url
]

In [22]:
def get_raw_data(ids: list):
    """Download each title page in ``ids`` and dump its metadata to disk.

    For every id the title page is fetched with ``get_page`` and handed to
    ``extract_metadata``, which writes ``json/<id>.json``. Ids whose page could
    not be downloaded are reported and skipped.

    Args:
        ids: IMDb title ids to fetch. (Previously this argument was ignored and
            the global ``top_250_imdb_ids`` was iterated instead.)
    """
    for imdb_id in ids:
        print(f"Getting {imdb_id}...")
        page = get_page(id_=imdb_id, title=True)
        if page is None:
            # get_page returns None on a non-200 or empty response.
            print(f"Skipping {imdb_id}: page could not be downloaded")
            continue
        extract_metadata(imdb_id=imdb_id, page=page)

In [23]:
# Network-bound: fetches one title page per id and writes json/<id>.json for each.
get_raw_data(top_250_imdb_ids)

Getting tt0111161...
Getting tt0068646...
Getting tt0468569...
Getting tt0071562...
Getting tt0050083...
Getting tt0108052...
Getting tt0167260...
Getting tt0110912...
Getting tt0120737...
Getting tt0060196...
Getting tt0109830...
Getting tt0167261...
Getting tt0137523...
Getting tt1375666...
Getting tt0080684...
Getting tt0133093...
Getting tt0099685...
Getting tt0073486...
Getting tt0816692...
Getting tt0114369...
Getting tt0038650...
Getting tt0047478...
Getting tt0102926...
Getting tt0120815...
Getting tt0317248...
Getting tt0118799...
Getting tt0120689...
Getting tt0103064...
Getting tt0076759...
Getting tt0088763...
Getting tt0245429...
Getting tt0253474...
Getting tt6751668...
Getting tt0054215...
Getting tt0172495...
Getting tt0110357...
Getting tt9362722...
Getting tt0407887...
Getting tt2582802...
Getting tt0120586...
Getting tt0110413...
Getting tt0482571...
Getting tt0095327...
Getting tt0056058...
Getting tt15239678...
Getting tt0114814...
Getting tt0034583...
Getting tt16

## Processing

In [322]:
def _dig(node, *keys, default=None):
    """Follow ``keys`` through nested dicts; return ``default`` when the path breaks."""
    *parents, leaf = keys
    for key in parents:
        if not isinstance(node, dict):
            return default
        node = node.get(key)
    if not isinstance(node, dict):
        return default
    return node.get(leaf, default)


def _unescape(text):
    """HTML-unescape ``text``, passing ``None`` through unchanged."""
    return html.unescape(text) if text is not None else None


def _person_names(people):
    """Return the ``name`` of every entry in an ld+json person list (``[]`` if absent)."""
    return [person.get("name") for person in people or []]


def _build_record(title_id: str, data: dict) -> dict:
    """Flatten one saved ``{"meta": ..., "additional_meta": ...}`` dump into a row dict."""
    meta = data.get("meta") or {}
    extra = data.get("additional_meta") or {}

    release_date_data = extra.get("releaseDate")
    release_date = (
        f"{release_date_data.get('year')}-{release_date_data.get('month')}-{release_date_data.get('day')}"
        if release_date_data
        else None
    )

    runtime_seconds = _dig(extra, "runtime", "seconds")
    runtime_minutes = runtime_seconds / 60 if runtime_seconds is not None else None

    # NOTE(review): worldwideGross is read via "total", openingWeekendGross via
    # "gross" directly (as in the original code) — confirm against a saved dump.
    gross_amount = _dig(extra, "worldwideGross", "total", "amount")
    gross_currency = _dig(extra, "worldwideGross", "total", "currency")
    opening_weekend_gross_amount = _dig(extra, "openingWeekendGross", "gross", "amount")
    opening_weekend_gross_currency = _dig(extra, "openingWeekendGross", "gross", "currency")

    return {
        "title_id": title_id,
        "name": _unescape(meta.get("name")),
        "poster": meta.get("image"),
        "description": _unescape(meta.get("description")),
        "rating": _dig(meta, "aggregateRating", "ratingValue", default=0),
        "rating_count": _dig(meta, "aggregateRating", "ratingCount", default=0),
        "content_rating": meta.get("contentRating"),
        "genre": meta.get("genre"),
        "date_published": meta.get("datePublished"),
        "keywords": meta.get("keywords"),
        "actors": _person_names(meta.get("actor")),
        "director": _person_names(meta.get("director")),
        # Creator entries without a name are dropped.
        "creator": [name for name in _person_names(meta.get("creator")) if name],
        "release_year": _dig(extra, "releaseYear", "year"),
        "release_date": release_date,
        "runtime_minutes": runtime_minutes,
        "rank": _dig(extra, "ratingsSummary", "topRanking", "rank"),
        "metascore": _dig(extra, "metacritic", "metascore", "score"),
        "interests": [
            _dig(edge, "node", "primaryText", "text")
            for edge in _dig(extra, "interests", "edges") or []
        ],
        "plot": _unescape(_dig(extra, "plot", "plotText", "plainText")),
        # "language": _dig(extra, "plot", "language", "id"),
        "credits": _dig(extra, "credits", "total"),
        "user_reviews": _dig(extra, "reviews", "total"),
        "critic_reviews": _dig(extra, "criticReviewsTotal", "total"),
        "trivia": _dig(extra, "triviaTotal", "total"),
        "added_to_watchlist": _dig(
            extra, "engagementStatistics", "watchlistStatistics", "displayableCount", "text"
        ),
        "countries": [
            country.get("text")
            for country in _dig(extra, "countriesOfOrigin", "countries") or []
        ],
        "production": [
            _dig(edge, "node", "company", "companyText", "text")
            for edge in _dig(extra, "production", "edges") or []
        ],
        "wins": _dig(extra, "wins", "total", default=0),
        "nominations": _dig(extra, "nominationsExcludeWins", "total", default=0),
        "budget_amount": _dig(extra, "productionBudget", "budget", "amount"),
        "budget_currency": _dig(extra, "productionBudget", "budget", "currency"),
        "gross_amount": gross_amount,
        "gross_currency": gross_currency,
        # Previously these two columns were filled with the worldwide gross values.
        "opening_weekend_gross_amount": opening_weekend_gross_amount,
        "opening_weekend_gross_currency": opening_weekend_gross_currency,
        "aspect_ratios": [
            ratio.get("aspectRatio")
            for ratio in _dig(extra, "technicalSpecifications", "aspectRatios", "items") or []
        ],
        "colorations": [
            coloration.get("text")
            for coloration in _dig(extra, "technicalSpecifications", "colorations", "items") or []
        ],
        "spoken_languages": [
            language.get("text")
            for language in _dig(extra, "spokenLanguages", "spokenLanguages") or []
        ],
        "filming_locations": [
            _dig(edge, "node", "text")
            for edge in _dig(extra, "filmingLocations", "edges") or []
        ],
        "goofs": _dig(extra, "goofsTotal", "total"),
        "alternate_versions": _dig(extra, "alternateVersions", "total"),
        "prestigious_award_wins": _dig(extra, "prestigiousAwardSummary", "wins", default=0),
        "prestigious_award_nominations": _dig(
            extra, "prestigiousAwardSummary", "nominations", default=0
        ),
        "prestigious_award_type": _dig(extra, "prestigiousAwardSummary", "award", "text"),
    }


def process_pages(pages: list):
    """Load saved title dumps from ``./json`` and flatten each into a row dict.

    Args:
        pages: File names (not paths) of JSON dumps inside ``./json``. The part
            of the name before the first ``.`` becomes ``title_id``.

    Returns:
        A list with one dict per file, in the order given. Sections that are
        missing or null in a dump yield ``None`` (or an empty list / the
        documented default) rather than raising.
    """
    processed_pages = []
    for json_file in pages:
        with open(f"./json/{json_file}") as f:
            data = json.load(f)

        title_id = json_file.split(".")[0]
        processed_pages.append(_build_record(title_id, data))

    return processed_pages

In [101]:
# File names (without the directory part) of every saved dump in ./json.
pages = [
    file_name
    for file_name in os.listdir("./json")
    if file_name.endswith(".json")
]

In [323]:
# Flatten every saved dump into one row dict per title.
processed_pages = process_pages(pages)

A quick sanity check:

In [311]:
# One record per saved dump; 250 is expected for a complete Top 250 scrape.
len(processed_pages)

250

## Post-Processing

In [324]:
# Build the frame by serialising the records to a JSON string and reading it
# back with pandas (StringIO wraps the literal JSON text for read_json).
# NOTE(review): pd.DataFrame(processed_pages) may be equivalent — confirm the
# resulting dtypes match before switching.
df = pd.read_json(StringIO(json.dumps(processed_pages)))

In [325]:
# Order rows by chart rank and renumber the index from 0.
df = df.sort_values(by="rank", ignore_index=True)

Fixing dates: parse `date_published` and `release_date` into datetimes

In [326]:
# Parse the ld+json datePublished strings into datetimes. errors is left at its
# default here, unlike the coercing release_date conversion.
df["date_published"] = pd.to_datetime(df["date_published"])

In [327]:
# release_date was assembled as "year-month-day" from parts that may be missing;
# errors="coerce" turns any string that cannot be parsed into NaT.
df["release_date"] = pd.to_datetime(df["release_date"], errors="coerce")

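Fixing the colorations column: each title has a list of colorations; keep only the first entry (or `None` when the list is empty)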

In [328]:
# Keep only the first coloration of each list (None for an empty list).
# Values that are no longer lists are left untouched, so re-running this cell
# is harmless; no repr/literal_eval round trip is needed on in-memory lists.
df["colorations"] = df["colorations"].apply(
    lambda values: (values[0] if values else None) if isinstance(values, list) else values
)

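Fixing the filming locations column: each title has a list of filming locations; keep only the first entry (or `None` when the list is empty)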

In [329]:
# Keep only the first filming location of each list (None for an empty list).
# Values that are no longer lists are left untouched, so re-running this cell
# is harmless; no repr/literal_eval round trip is needed on in-memory lists.
df["filming_locations"] = df["filming_locations"].apply(
    lambda values: (values[0] if values else None) if isinstance(values, list) else values
)

In [287]:
def process_engagement_stats(added_by_str: str):
    """Convert a watchlist display string into an integer count.

    The first whitespace-separated token that starts with a digit is taken as
    the count. A trailing ``K``, ``M`` or ``B`` (any case) scales it by a
    thousand, a million or a billion; a token without a suffix is used as is
    (previously its last digit was dropped and the rest multiplied by 1000).

    Args:
        added_by_str: Display text such as ``"Added by 1.2M users"``.

    Returns:
        The count as an ``int``, or ``None`` when the input is not a string or
        contains no numeric token.
    """
    if not isinstance(added_by_str, str):
        return None

    count_token = next(
        (token for token in added_by_str.split() if token[:1].isdigit()), None
    )
    if count_token is None:
        return None

    count_token = count_token.replace(",", "")
    multipliers = {"K": 1_000, "M": 1_000_000, "B": 1_000_000_000}
    suffix = count_token[-1].upper()
    if suffix in multipliers:
        # round() rather than int() so float error (e.g. 1000.9999999) is not truncated.
        return round(float(count_token[:-1]) * multipliers[suffix])
    return round(float(count_token))

In [330]:
# Replace the watchlist display text with the numeric count parsed by
# process_engagement_stats.
df["added_to_watchlist"] = df["added_to_watchlist"].apply(process_engagement_stats)

In [331]:
# Persist the cleaned table; index=False keeps the positional index out of the file.
df.to_csv("IMDB_Top250.csv", index=False)