In [13]:
import requests
import pandas as pd
import time
import numpy as np

In [4]:
API_KEY = "33c3009d4610d7c8a3484f3bc865055b"          
BASE_URL = "https://api.themoviedb.org/3"

# The 19 IDs given in the lab (0 is invalid → we remove it)
movie_ids = [
    299534, 19995, 140607, 299536, 597, 135397, 420818,
    24428, 168259, 99861, 284054, 12445, 181808, 330457,
    351286, 109445, 321612, 260513
    # 0 was removed because TMDb has no movie with ID 0 → it returns 404
]

# === 2. Function to get one movie (with credits appended) ===
def fetch_movie(movie_id: int) -> dict:
    url = f"{BASE_URL}/movie/{movie_id}"
    params = {
        "api_key": API_KEY,
        "language": "en-US",
        "append_to_response": "credits"   # gets cast + crew in the same call
    }
    
    response = requests.get(url, params=params)
    
    if response.status_code == 200:
        return response.json()
    elif response.status_code == 404:
        print(f"Movie ID {movie_id} not found (404)")
        return None
    else:
        print(f"Error {response.status_code} for ID {movie_id}: {response.text}")
        return None

In [5]:

# === 3. Fetch all movies with polite delay (TMDb allows ~50 requests/sec, but we play nice) ===
print("Starting to fetch movies...")
raw_movies_data = []

for idx, mid in enumerate(movie_ids, 1):
    print(f"  [{idx:02d}/{len(movie_ids)}] Fetching ID {mid}...", end=" ")
    movie = fetch_movie(mid)
    if movie:
        raw_movies_data.append(movie)
        print("Success")
    else:
        print("Failed")
    
    time.sleep(0.22)  # ~4–5 requests per second → stays far below rate limit

print(f"\nFinished! Successfully fetched {len(raw_movies_data)} movies.")


Starting to fetch movies...
  [01/18] Fetching ID 299534... Success
  [02/18] Fetching ID 19995... Success
  [03/18] Fetching ID 140607... Success
  [04/18] Fetching ID 299536... Success
  [05/18] Fetching ID 597... Success
  [06/18] Fetching ID 135397... Success
  [07/18] Fetching ID 420818... Success
  [08/18] Fetching ID 24428... Success
  [09/18] Fetching ID 168259... Success
  [10/18] Fetching ID 99861... Success
  [11/18] Fetching ID 284054... Success
  [12/18] Fetching ID 12445... Success
  [13/18] Fetching ID 181808... Success
  [14/18] Fetching ID 330457... Success
  [15/18] Fetching ID 351286... Success
  [16/18] Fetching ID 109445... Success
  [17/18] Fetching ID 321612... Success
  [18/18] Fetching ID 260513... Success

Finished! Successfully fetched 18 movies.


In [6]:
# === 4. Convert the list of JSONs into a Pandas DataFrame ===
df_raw = pd.DataFrame(raw_movies_data)


print(f"\nDataFrame shape: {df_raw.shape}")
print("Columns:", list(df_raw.columns))
print("\nFirst 2 rows sample:")
display(df_raw[['id', 'title', 'release_date', 'budget', 'revenue']].head(10))


df_raw.to_json("raw_tmdb_movies.json", orient="records", indent=2)

print("\nRaw data saved as raw_tmdb_movies.json and .csv")


DataFrame shape: (18, 27)
Columns: ['adult', 'backdrop_path', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id', 'imdb_id', 'origin_country', 'original_language', 'original_title', 'overview', 'popularity', 'poster_path', 'production_companies', 'production_countries', 'release_date', 'revenue', 'runtime', 'spoken_languages', 'status', 'tagline', 'title', 'video', 'vote_average', 'vote_count', 'credits']

First 2 rows sample:


Unnamed: 0,id,title,release_date,budget,revenue
0,299534,Avengers: Endgame,2019-04-24,356000000,2799439100
1,19995,Avatar,2009-12-15,237000000,2923706026
2,140607,Star Wars: The Force Awakens,2015-12-15,245000000,2068223624
3,299536,Avengers: Infinity War,2018-04-25,300000000,2052415039
4,597,Titanic,1997-11-18,200000000,2264162353
5,135397,Jurassic World,2015-06-06,150000000,1671537444
6,420818,The Lion King,2019-07-12,260000000,1662020819
7,24428,The Avengers,2012-04-25,220000000,1518815515
8,168259,Furious 7,2015-04-01,190000000,1515400000
9,99861,Avengers: Age of Ultron,2015-04-22,235000000,1405403694



Raw data saved as raw_tmdb_movies.json and .csv


In [10]:
df_raw.info()
print(f"Original shape: {df_raw.shape}")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18 entries, 0 to 17
Data columns (total 27 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  18 non-null     bool   
 1   backdrop_path          18 non-null     object 
 2   belongs_to_collection  16 non-null     object 
 3   budget                 18 non-null     int64  
 4   genres                 18 non-null     object 
 5   homepage               18 non-null     object 
 6   id                     18 non-null     int64  
 7   imdb_id                18 non-null     object 
 8   origin_country         18 non-null     object 
 9   original_language      18 non-null     object 
 10  original_title         18 non-null     object 
 11  overview               18 non-null     object 
 12  popularity             18 non-null     float64
 13  poster_path            18 non-null     object 
 14  production_companies   18 non-null     object 
 15  producti

##### STEP 2 – DATA CLEANING & PREPROCESSING

In [None]:
# === 1. Drop irrelevant columns (exactly as specified) ===
drop_cols = ['adult', 'imdb_id', 'original_title', 'video', 'homepage']
df_clean = df_raw.drop(columns=[col for col in drop_cols if col in df_raw.columns], errors='ignore')
print(f"\nDropped {len(drop_cols)} irrelevant columns. New shape: {df_clean.shape}")

# === 2. Helper functions to extract from JSON-like columns ===
def extract_collection_name(collection):
    """Extract name from belongs_to_collection dict or return NaN."""
    if isinstance(collection, dict) and 'name' in collection:
        return collection['name']
    return np.nan

def extract_genres(genres_list):
    """Extract genre names as 'Action|Adventure' or NaN."""
    if isinstance(genres_list, list) and len(genres_list) > 0:
        return '|'.join([genre['name'] for genre in genres_list if 'name' in genre])
    return np.nan

def extract_spoken_languages(languages_list):
    """Extract ISO 639-1 codes as 'en|fr|zh' or NaN."""
    if isinstance(languages_list, list) and len(languages_list) > 0:
        return '|'.join([lang['iso_639_1'] for lang in languages_list if 'iso_639_1' in lang])
    return np.nan

def extract_production_countries(countries_list):
    """Extract country names as 'United States|United Kingdom' or NaN."""
    if isinstance(countries_list, list) and len(countries_list) > 0:
        return '|'.join([country['name'] for country in countries_list if 'name' in country])
    return np.nan

def extract_production_companies(companies_list):
    """Extract company names as 'Marvel Studios|Walt Disney' or NaN."""
    if isinstance(companies_list, list) and len(companies_list) > 0:
        return '|'.join([company['name'] for company in companies_list if 'name' in company])
    return np.nan

# === 3. Apply extractions ===
print("\nExtracting nested fields...")

df_clean['collection_name'] = df_clean['belongs_to_collection'].apply(extract_collection_name)
df_clean['genres'] = df_clean['genres'].apply(extract_genres)
df_clean['spoken_languages'] = df_clean['spoken_languages'].apply(extract_spoken_languages)
df_clean['production_countries'] = df_clean['production_countries'].apply(extract_production_countries)
df_clean['production_companies'] = df_clean['production_companies'].apply(extract_production_companies)

# Clean up original JSON columns (optional: drop them to save space)
json_cols = ['belongs_to_collection', 'genres', 'production_countries', 'production_companies', 'spoken_languages']
df_clean = df_clean.drop(columns=[col for col in json_cols if col in df_clean.columns], errors='ignore')

# === 4. Inspect extracted columns with value_counts() ===
print("\n=== INSPECTION: Value Counts for Extracted Columns ===")

inspection_cols = ['collection_name', 'genres', 'spoken_languages', 'production_countries', 'production_companies']

for col in inspection_cols:
    if col in df_clean.columns:
        print(f"\n{col.upper()}:")
        print(df_clean[col].value_counts().head(10))  # Top 10 to spot patterns/anomalies
        print(f"  → Unique values: {df_clean[col].nunique()}, NaNs: {df_clean[col].isna().sum()}")
        print("-" * 50)

print("\nSample of cleaned data:")
display(df_clean[['id','origin_country', 'original_language', 'overview', 'popularity', 'status', 'credits', 'collection_name']].head(5))

##df_clean.head()


Dropped 5 irrelevant columns. New shape: (18, 22)

Extracting nested fields...

=== INSPECTION: Value Counts for Extracted Columns ===

COLLECTION_NAME:
collection_name
The Avengers Collection                4
Star Wars Collection                   2
Frozen Collection                      2
Jurassic Park Collection               2
Avatar Collection                      1
The Lion King (Reboot) Collection      1
The Fast and the Furious Collection    1
Black Panther Collection               1
Harry Potter Collection                1
The Incredibles Collection             1
Name: count, dtype: int64
  → Unique values: 10, NaNs: 2
--------------------------------------------------

Sample of cleaned data:


Unnamed: 0,id,origin_country,original_language,overview,popularity,status,credits,collection_name
0,299534,[US],en,After the devastating events of Avengers: Infi...,12.0878,Released,"{'cast': [{'adult': False, 'gender': 2, 'id': ...",The Avengers Collection
1,19995,[US],en,"In the 22nd century, a paraplegic Marine is di...",38.2316,Released,"{'cast': [{'adult': False, 'gender': 2, 'id': ...",Avatar Collection
2,140607,[US],en,Thirty years after defeating the Galactic Empi...,7.5842,Released,"{'cast': [{'adult': False, 'gender': 2, 'id': ...",Star Wars Collection
3,299536,[US],en,As the Avengers and their allies have continue...,20.7267,Released,"{'cast': [{'adult': False, 'gender': 2, 'id': ...",The Avengers Collection
4,597,[US],en,101-year-old Rose DeWitt Bukater tells the sto...,23.4289,Released,"{'cast': [{'adult': False, 'gender': 2, 'id': ...",
