In [162]:
!pip install isodate --quiet

In [5]:
import httpx
from bs4 import BeautifulSoup
import json
import isodate
import pandas as pd
import numpy as np
from ast import literal_eval

## Data Extraction

In [6]:
def _get_page(id_: str, title: bool = True, awards: bool = False, timeout: float = 30.0):
    """Download the HTML of an IMDb title or name page.

    Args:
        id_: IMDb identifier inserted into the URL path.
        title: When true the ``/title/`` URL is requested, otherwise ``/name/``.
        awards: When true ``/awards`` is appended to the URL.
        timeout: Seconds to wait before giving up on the request. The request
            used to run with ``timeout=None`` (no timeout at all), so a single
            stalled connection could block the notebook indefinitely.

    Returns:
        The response body as text, or ``None`` when the request fails at the
        transport level, the status code is not 200, or the body is empty.
    """
    kind = "title" if title else "name"
    url = f"https://www.imdb.com/{kind}/{id_}"
    if awards:
        url += "/awards"

    try:
        res = httpx.get(
            url,
            headers={
                "Content-Type": "application/json",
                "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:129.0) Gecko/20100101 Firefox/129.0",
                "Referer": "https://www.imdb.com/chart/top/",
            },
            follow_redirects=True,
            timeout=timeout,
        )
    except httpx.HTTPError:
        # Timeouts and other transport failures are reported through the same
        # "None means failure" signal already used for non-200 responses.
        return None

    if res.status_code == 200 and res.text:
        return res.text

    return None

In [7]:
def get_page(id_: str, title: bool=True):
    """Fetch the main (non-awards) IMDb page for ``id_``.

    Thin wrapper around ``_get_page``; returns whatever that helper returns.
    """
    return _get_page(id_, title, awards=False)

In [8]:
def process_page(page: str):
    """Extract core metadata from the JSON-LD block of an IMDb page.

    Args:
        page: HTML source of the page.

    Returns:
        A dict built from the ``application/ld+json`` script, or ``None`` when
        the page has no such script. ``duration`` is expressed in minutes and
        is ``None`` when the JSON-LD carries no duration.
    """
    soup = BeautifulSoup(page, "html.parser")
    script = soup.find("script", {"type": "application/ld+json"})
    if not script:
        return None

    script_json = json.loads(script.text)

    # ``or {}`` covers both a missing key and an explicit JSON null.
    aggregate_rating = script_json.get("aggregateRating") or {}

    # parse_duration cannot handle None, so only parse when a value is present.
    raw_duration = script_json.get("duration")
    duration = (
        isodate.parse_duration(raw_duration).total_seconds() / 60
        if raw_duration
        else None
    )

    return {
        "type": script_json.get("@type"),
        "url": script_json.get("url"),
        "name": script_json.get("name"),
        "image": script_json.get("image"),
        "description": script_json.get("description"),
        "rating": aggregate_rating.get("ratingValue"),
        "rating_count": aggregate_rating.get("ratingCount"),
        "content_rating": script_json.get("contentRating"),
        "genre": script_json.get("genre"),
        "keywords": script_json.get("keywords"),
        "release_date_1": script_json.get("datePublished"),
        "actors": script_json.get("actor"),
        "director": script_json.get("director"),
        "creator": script_json.get("creator"),
        "duration": duration,
    }

In [9]:
def _dig(data, *keys, default=None):
    """Follow ``keys`` through nested dicts, returning ``default`` if the path breaks.

    The path is considered broken when a key is absent or an intermediate
    value is not a dict (for example a JSON null). A value that is present at
    the end of the path is returned unchanged, even when it is ``None``.
    """
    current = data
    for key in keys:
        if not isinstance(current, dict) or key not in current:
            return default
        current = current[key]
    return current


def process_additional_page_metadata(page: str):
    """Extract extra metadata from the ``__NEXT_DATA__`` script of an IMDb page.

    Args:
        page: HTML source of the page.

    Returns:
        A dict of fields read from ``props.pageProps.aboveTheFoldData`` and
        ``props.pageProps.mainColumnData``, or ``None`` when the page has no
        ``__NEXT_DATA__`` script. Missing or null intermediate nodes yield the
        field's default instead of raising.

    Note:
        ``"reviews"`` holds the review total. The trivia total, which used to
        overwrite it because the key was repeated, is exposed as ``"trivia"``.
    """
    soup = BeautifulSoup(page, "html.parser")
    script = soup.find("script", {"id": "__NEXT_DATA__"})
    if not script:
        return None

    script_json = json.loads(script.text)
    page_props = _dig(script_json, "props", "pageProps", default={})
    above = _dig(page_props, "aboveTheFoldData")
    main = _dig(page_props, "mainColumnData")

    budget = _dig(main, "productionBudget")
    lifetime_gross = _dig(main, "lifetimeGross")
    interest_edges = _dig(above, "interests", "edges", default=[]) or []

    return {
        "release_year": _dig(above, "releaseYear", "year"),
        "release_date_2": _dig(above, "releaseDate", default={}),
        "popularity": _dig(above, "meterRanking", "currentRank"),
        "metascore": _dig(above, "metacritic", "metascore", "score"),
        "plot": _dig(above, "plot", "plotText", "plainText"),
        "interests": [
            _dig(edge, "node", "primaryText", "text") for edge in interest_edges
        ],
        # Amount falls back to 0 only when a budget/gross node exists but has
        # no amount; with no node at all both fields are None.
        "budget": {
            "amount": _dig(budget, "budget", "amount", default=0) if budget else None,
            "currency": _dig(budget, "budget", "currency") if budget else None,
        },
        "gross": {
            "amount": _dig(lifetime_gross, "total", "amount", default=0) if lifetime_gross else None,
            "currency": _dig(lifetime_gross, "total", "currency") if lifetime_gross else None,
        },
        "credits": _dig(above, "credits", "total"),
        "reviews": _dig(above, "reviews", "total"),
        "trivia": _dig(above, "triviaTotal", "total"),
        "engagement_stats": _dig(above, "engagementStatistics", default={}),
        "countries": _dig(above, "countriesOfOrigin", "countries", default=[]),
        "production": _dig(above, "production", "edges", default=[]),
        "wins": _dig(main, "wins", "total"),
        "nominations": _dig(main, "nominationsExcludeWins", "total"),
        "prestigious_awards": _dig(main, "prestigiousAwardSummary"),
    }

In [141]:
def get_awards_page(id_: str, title: bool =True):
    """Fetch the ``/awards`` sub-page for ``id_`` via ``_get_page``."""
    return _get_page(id_, title, True)

In [142]:
def process_awards_page(page: str):
    """Walk the award sections of an IMDb awards page (exploratory).

    For every ``ipc-page-section`` that contains a ``meta-data-award-list``,
    the text of each list item is printed followed by a separator line.
    No records are built yet, so the returned list is always empty.

    TODO: turn each list item into records with the fields ``year``,
    ``award_name``, ``achievement``, ``award_category_name`` and ``award_for``.
    """
    ignored_headings = [
        "Contribute to this page",
        "More from this title",
        "Recently viewed"
    ]

    document = BeautifulSoup(page, "html.parser")
    collected = []

    for section in document.find_all("section", class_="ipc-page-section"):
        heading = section.find("h3")
        if heading and heading.text not in ignored_headings:
            # Remembered for the planned per-award records; not consumed yet.
            award_name = heading.text

        award_list = section.find("ul", class_="meta-data-award-list")
        if not award_list:
            continue

        for entry in award_list.find_all("li", class_="ipc-metadata-list-summary-item"):
            print(entry.text)
            print("=" * 20)

    return collected

In [151]:
def get_hidden_awards(name_id: str = "nm0000129", after: str = "YW4wMTk4ODAx",
                      events=("ev0000004",), first: int = 50):
    """Query IMDb's GraphQL endpoint with the ``NameAwardsSubPagePagination`` operation.

    Args:
        name_id: Value sent as the ``const`` query variable.
        after: Value sent as the ``after`` query variable
            (presumably a pagination cursor; confirm against the API).
        events: Values sent as ``filter.events``.
        first: Value sent as the ``first`` query variable.

    Returns:
        The decoded JSON body of the response. Earlier the decoded body was
        computed and then discarded, so the function always returned ``None``.
    """
    url = "https://caching.graphql.imdb.com"
    variables = {
        "after": after,
        "const": name_id,
        "filter": {"events": list(events)},
        "first": first,
        "locale": "en-US",
        "originalTitleText": False,
        "queryParams": "nmawd",
    }
    extensions = {
        "persistedQuery": {
            "sha256Hash": "aa506f0fa08d88049180ee0ca73bf7af8be09708af1a2066023c29d1ee37bb5d",
            "version": 1,
        }
    }
    params = {
        "operationName": "NameAwardsSubPagePagination",
        "variables": json.dumps(variables),
        "extensions": json.dumps(extensions),
    }
    res = httpx.get(
        url,
        params=params,
        headers={
            "content-type": "application/json",
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:129.0) Gecko/20100101 Firefox/129.0",
        },
    )

    return res.json()

In [275]:
def get_top_250_movies(timeout: float = 30.0):
    """Download the HTML of the IMDb Top 250 chart page.

    Args:
        timeout: Seconds to wait before giving up on the request. The request
            used to run with ``timeout=None`` (no timeout at all), which could
            hang forever on a stalled connection.

    Returns:
        The response body as text, or ``None`` when the request fails at the
        transport level or the status code is not 200.
    """
    url = "https://www.imdb.com/chart/top/"
    try:
        res = httpx.get(
            url,
            headers={
                "content-type": "application/json",
                "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:129.0) Gecko/20100101 Firefox/129.0",
            },
            follow_redirects=True,
            timeout=timeout,
        )
    except httpx.HTTPError:
        # Transport failures use the same None signal as a non-200 status.
        return None

    if res.status_code == 200:
        return res.text
    return None

def process_top_250_movies(page: str):
    """Return the ``itemListElement`` entry of the chart page's JSON-LD block.

    Yields ``None`` when the page has no ``application/ld+json`` script.
    """
    ld_json_tag = BeautifulSoup(page, "html.parser").find(
        "script", {"type": "application/ld+json"}
    )
    if not ld_json_tag:
        return None
    return json.loads(ld_json_tag.text).get("itemListElement")

In [273]:
# Raw HTML of the Top 250 chart page (None when the download fails).
top_250_page = get_top_250_movies()

In [278]:
# "itemListElement" entries parsed from the chart page's JSON-LD block.
processed_top_250 = process_top_250_movies(top_250_page)

In [465]:
# The id is the second-to-last "/"-separated piece of each entry's item URL.
top_250_imdb_ids = [
    entry.get("item", {}).get("url").rsplit("/", 2)[-2]
    for entry in processed_top_250
]

In [466]:
# Scrape every title and persist the merged records as one JSON array.
# There is no handling for failed downloads here: each page must parse.
imdb_top_250_results = []

for title in top_250_imdb_ids:
    print(f"Getting {title}...")
    page = get_page(id_=title, title=True)
    meta = process_page(page)
    additional_meta = process_additional_page_metadata(page)
    # On key clashes the __NEXT_DATA__ fields win over the JSON-LD ones.
    all_meta = {**meta, **additional_meta}
    imdb_top_250_results.append(all_meta)

with open("imdb_top_250.json", "w") as f:
    f.write(json.dumps(imdb_top_250_results))


Getting tt0111161...
Getting tt0068646...
Getting tt0468569...
Getting tt0071562...
Getting tt0050083...
Getting tt0108052...
Getting tt0167260...
Getting tt0110912...
Getting tt0120737...
Getting tt0060196...
Getting tt0109830...
Getting tt0167261...
Getting tt0137523...
Getting tt1375666...
Getting tt0080684...
Getting tt0133093...
Getting tt0099685...
Getting tt0073486...
Getting tt0816692...
Getting tt0114369...
Getting tt0038650...
Getting tt0047478...
Getting tt0102926...
Getting tt0120815...
Getting tt0317248...
Getting tt0118799...
Getting tt0120689...
Getting tt0103064...
Getting tt0076759...
Getting tt0088763...
Getting tt0245429...
Getting tt0253474...
Getting tt6751668...
Getting tt0054215...
Getting tt0172495...
Getting tt0110357...
Getting tt9362722...
Getting tt0407887...
Getting tt2582802...
Getting tt0120586...
Getting tt0110413...
Getting tt0482571...
Getting tt0095327...
Getting tt15239678...
Getting tt0056058...
Getting tt0114814...
Getting tt0034583...
Getting tt16

In [10]:
# Reload the scraped records from disk so post-processing does not need a re-scrape.
top250 = pd.read_json("imdb_top_250.json")

## Post-Processing

In [11]:
# Work on a copy: the cells below add and overwrite columns, and a plain
# assignment would alias `top250`, silently mutating the raw frame as well.
processed_data = top250.copy()

In [12]:
# Parse the "release_date_1" values into a datetime column.
processed_data["release_date"] = pd.to_datetime(processed_data["release_date_1"])

In [13]:
# 1-based rank derived from the row index.
# NOTE(review): assumes rows are still in chart order with a default 0..n-1 index — confirm.
processed_data["rank"] = processed_data.index + 1

In [14]:
# Keep only the id of the first listed country. An empty or missing list
# yields None instead of raising IndexError on the `[0]` lookup.
processed_data["countries"] = processed_data["countries"].apply(
    lambda x: next((c.get("id") for c in (literal_eval(str(x)) or [])), None)
)

In [15]:
# Reduce each production edge to its company display text.
processed_data["production"] = processed_data["production"].apply(
    lambda edges: [
        edge.get("node").get("company").get("companyText").get("text")
        for edge in literal_eval(str(edges))
    ]
)

In [20]:
def process_engagement_stats(stat):
    """Convert the watchlist display text of an engagement-stats dict to an int.

    The count is the third space-separated token of
    ``watchlistStatistics.displayableCount.text`` (for example the ``1.2M`` in
    ``"Added by 1.2M users"``).

    Args:
        stat: The engagement-stats dict, or its string representation.

    Returns:
        The count as an integer, or ``None`` when the display text is missing.
        A trailing ``K``/``M``/``B`` scales the number; a token without a
        suffix is taken literally (it used to be cut short and multiplied by
        1,000).
    """
    multipliers = {"K": 1_000, "M": 1_000_000, "B": 1_000_000_000}

    stat_dict = literal_eval(str(stat)) or {}
    watchlist = stat_dict.get("watchlistStatistics") or {}
    text = (watchlist.get("displayableCount") or {}).get("text")
    if not text:
        return None

    added_by_str = text.split(" ")[2]
    suffix = added_by_str[-1].upper()
    if suffix in multipliers:
        number = float(added_by_str[:-1])
        multiplier = multipliers[suffix]
    else:
        # Plain count such as "950" or "1,234": no scale letter to strip.
        number = float(added_by_str.replace(",", ""))
        multiplier = 1

    # round() guards against float products landing just below the integer.
    return int(round(number * multiplier))

In [22]:
# Numeric watchlist count derived from each row's engagement stats.
processed_data["added_by"] = processed_data["engagement_stats"].apply(process_engagement_stats)

In [30]:
# Keep only the "name" of every director entry.
processed_data["director"] = processed_data["director"].apply(
    lambda people: [person.get("name") for person in literal_eval(str(people))]
)

In [85]:
# Keep only the "name" of every actor entry.
processed_data["actors"] = processed_data["actors"].apply(
    lambda people: [person.get("name") for person in literal_eval(str(people))]
)

In [43]:
# Keep the "name" of creator entries, dropping entries with no (or empty) name.
processed_data["creator"] = processed_data["creator"].apply(
    lambda entries: [
        entry.get("name")
        for entry in literal_eval(str(entries))
        if entry.get("name")
    ]
)

In [56]:
# Split the nested budget dict into a numeric amount and a currency code.
# Amounts that cannot be converted become NaN (errors="coerce").
processed_data["budget_amount"] = pd.to_numeric(
    processed_data["budget"].apply(
        lambda budget: literal_eval(str(budget)).get("amount")
    ),
    errors="coerce",
)
processed_data["budget_currency"] = processed_data["budget"].apply(
    lambda budget: literal_eval(str(budget)).get("currency")
)

In [57]:
# Split the nested gross dict into a numeric amount and a currency code.
# Amounts that cannot be converted become NaN (errors="coerce").
processed_data["gross_amount"] = pd.to_numeric(
    processed_data["gross"].apply(
        lambda gross: literal_eval(str(gross)).get("amount")
    ),
    errors="coerce",
)
processed_data["gross_currency"] = processed_data["gross"].apply(
    lambda gross: literal_eval(str(gross)).get("currency")
)

In [71]:
def process_prestigious_awards_name(item):
    """Return the award's display text from a prestigious-award summary.

    ``item`` may be the summary dict or its string form; an empty or ``None``
    summary gives ``None``.
    """
    summary = literal_eval(str(item))
    if not summary:
        return None
    award = summary.get("award")
    return award.get("text")

In [74]:
def process_prestigious_awards_wins(item):
    """Return the ``wins`` count of a prestigious-award summary as an int.

    ``item`` may be the summary dict or its string form; an empty or ``None``
    summary gives ``None``.
    """
    summary = literal_eval(str(item))
    if not summary:
        return None
    return int(summary.get("wins"))

In [77]:
def process_prestigious_awards_nominations(item):
    """Return the ``nominations`` count of a prestigious-award summary as an int.

    ``item`` may be the summary dict or its string form; an empty or ``None``
    summary gives ``None``.
    """
    summary = literal_eval(str(item))
    if not summary:
        return None
    return int(summary.get("nominations"))

In [72]:
# Award display text taken from each row's prestigious-award summary (None when absent).
processed_data["prestigious_award_type"] = processed_data["prestigious_awards"].apply(process_prestigious_awards_name)

In [76]:
# Win count taken from each row's prestigious-award summary (None when absent).
processed_data["prestigious_award_wins"] = processed_data["prestigious_awards"].apply(process_prestigious_awards_wins)

In [78]:
# Nomination count taken from each row's prestigious-award summary (None when absent).
processed_data["prestigious_award_nominations"] = processed_data["prestigious_awards"].apply(process_prestigious_awards_nominations)

In [93]:
# Columns exported to the final dataset, in output order.
final = processed_data[[
    # fields from the JSON-LD block
    "url",
    "name",
    "image",
    "description",
    "rating",
    "rating_count",
    "content_rating",
    "genre",
    "keywords",
    "actors",
    "director",
    "creator",
    "duration",
    # fields from __NEXT_DATA__
    "release_year",
    "popularity",
    "metascore",
    "plot",
    "interests",
    "budget",
    "gross",
    "credits",
    "reviews",
    "countries",
    "production",
    "wins",
    "nominations",
    # columns derived during post-processing
    "release_date",
    "rank",
    "added_by",
    "budget_amount",
    "budget_currency",
    "gross_amount",
    "gross_currency",
    "prestigious_award_type",
    "prestigious_award_wins",
    "prestigious_award_nominations",
]]

In [95]:
# Rebind instead of mutating in place: `final` was derived by column
# selection, and in-place edits on derived frames are error-prone.
final = final.reset_index(drop=True)

In [96]:
# Export the final table; the row index is written too because index=False is not passed.
final.to_csv("IMDB_Top250.csv")