In [None]:
# import necessary packages and modules
import pandas as pd
import numpy as np
# NOTE(review): this pulls the `matplotlib` name out of pyplot's namespace;
# `import matplotlib` would be the conventional spelling — confirm before changing.
from matplotlib.pyplot import matplotlib
import json
import os
import sys
from pathlib import Path
# Add the project's source folder to the import path. '../src' is a relative
# path, so it is resolved against the kernel's current working directory.
sys.path.append(str(Path('../src')))
# Project-local modules (presumably resolved through the sys.path entry added above).
from data_fetching import create_movie_dataframe
from data_cleaning import drop_irrelevant_columns, evaluate_json_columns, inspect_extracted_columns, convert_datatypes, handle_unrealistic_values, remove_duplicates_and_invalid_rows,filter_released_movies, finalize_dataframe, save_dataframe, clean_data
from analysis import rank_movies, calculate_metrics, compute_kpi_rankings, filter_specific_movies, compute_franchise_standalone_stats, compute_franchise_performance,compute_director_performance, save_analysis_results, perform_analysis
from visualization import create_visualization
# Project configuration: movie IDs to fetch, API credentials/URL, and data directories.
from config import MOVIE_IDS,TMDB_API_KEY,BASE_URL,RAW_DATA_DIR, PROCESSED_DATA_DIR

In [None]:
# Build the raw movie DataFrame for the configured movie IDs.
# (The log output of this cell shows cached responses being reused.)
raw_df = create_movie_dataframe(
    MOVIE_IDS,
    TMDB_API_KEY,
    BASE_URL,
    RAW_DATA_DIR,
)
print(f"Initial DataFrame shape: {raw_df.shape}")
raw_df.head()

2025-04-23 06:01:47,954 - INFO - Loaded cached data from data\raw\raw_movies_20250423_054238.json
2025-04-23 06:01:55,246 - ERROR - Max retries reached for movie ID 0
2025-04-23 06:01:55,252 - INFO - Using cached data for movie ID 299534
2025-04-23 06:01:55,254 - INFO - Using cached data for movie ID 19995
2025-04-23 06:01:55,256 - INFO - Using cached data for movie ID 140607
2025-04-23 06:01:55,258 - INFO - Using cached data for movie ID 299536
2025-04-23 06:01:55,259 - INFO - Using cached data for movie ID 597
2025-04-23 06:01:55,261 - INFO - Using cached data for movie ID 135397
2025-04-23 06:01:55,263 - INFO - Using cached data for movie ID 420818
2025-04-23 06:01:55,265 - INFO - Using cached data for movie ID 24428
2025-04-23 06:01:55,266 - INFO - Using cached data for movie ID 168259
2025-04-23 06:01:55,268 - INFO - Using cached data for movie ID 99861
2025-04-23 06:01:55,270 - INFO - Using cached data for movie ID 284054
2025-04-23 06:01:55,271 - INFO - Using cached data for mov

Initial DataFrame shape: (18, 27)


Unnamed: 0,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,imdb_id,origin_country,original_language,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,credits
0,False,/7RyHsO4yDXtBv1zUU3mTpHeQ0d5.jpg,"{'id': 86311, 'name': 'The Avengers Collection...",356000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 878, ...",https://www.marvel.com/movies/avengers-endgame,299534,tt4154796,[US],en,...,2799439100,181,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Avenge the fallen.,Avengers: Endgame,False,8.237,26247,"{'cast': [{'adult': False, 'gender': 2, 'id': ..."
1,False,/vL5LR6WdxWPjLPFRLe133jXWsh5.jpg,"{'id': 87096, 'name': 'Avatar Collection', 'po...",237000000,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",https://www.avatar.com/movies/avatar,19995,tt0499549,[US],en,...,2923706026,162,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Enter the world of Pandora.,Avatar,False,7.588,32162,"{'cast': [{'adult': False, 'gender': 2, 'id': ..."
2,False,/k6EOrckWFuz7I4z4wiRwz8zsj4H.jpg,"{'id': 10, 'name': 'Star Wars Collection', 'po...",245000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 28, '...",http://www.starwars.com/films/star-wars-episod...,140607,tt2488496,[US],en,...,2068223624,136,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Every generation has a story.,Star Wars: The Force Awakens,False,7.262,19693,"{'cast': [{'adult': False, 'gender': 2, 'id': ..."
3,False,/mDfJG3LC3Dqb67AZ52x3Z0jU0uB.jpg,"{'id': 86311, 'name': 'The Avengers Collection...",300000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 28, '...",https://www.marvel.com/movies/avengers-infinit...,299536,tt4154756,[US],en,...,2052415039,149,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Destiny arrives all the same.,Avengers: Infinity War,False,8.235,30432,"{'cast': [{'adult': False, 'gender': 2, 'id': ..."
4,False,/sCzcYW9h55WcesOqA12cgEr9Exw.jpg,,200000000,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",https://www.paramountmovies.com/movies/titanic,597,tt0120338,[US],en,...,2264162353,194,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Nothing on Earth could come between them.,Titanic,False,7.906,25915,"{'cast': [{'adult': False, 'gender': 2, 'id': ..."


Run the function below to clean the data. It returns a cleaned DataFrame, which is also saved in the `data/processed` folder.

In [3]:
# One-shot cleaning: run the whole pipeline and persist the result.
# Per the cell output, the helper writes a timestamped variant of this file name.
df_cleaned = clean_data(
    raw_df,
    save_path=Path(PROCESSED_DATA_DIR, "cleaned_movies.parquet"),
)
df_cleaned.head()

Number of duplicate rows: 0
Rows with title='unknown': 0
Rows with missing id or title: 0
DataFrame shape after dropping unknown titles: (18, 24)
Cleaned DataFrame shape after dropping rows with < 10 non-null values: (18, 24)
Cleaned DataFrame shape after filtering for 'Released' movies: (18, 23)
Cleaned DataFrame shape: (18, 22)
Cleaned data saved to data\processed\cleaned_movies_20250423_060156.parquet
Timestamp saved to data\processed\latest_timestamp.txt


Unnamed: 0,id,title,tagline,release_date,genres,belongs_to_collection,original_language,budget_musd,revenue_musd,production_companies,...,vote_average,popularity,runtime,overview,spoken_languages,poster_path,cast,cast_size,director,crew_size
0,299534,Avengers: Endgame,Avenge the fallen.,2019-04-24,Adventure | Science Fiction | Action,The Avengers Collection,en,356.0,2799.4391,Marvel Studios,...,8.237,53.8808,181,After the devastating events of Avengers: Infi...,en | ja | xh,/ulzhLuWrPK07P1YkdWQLZnQh1JL.jpg,Robert Downey Jr. | Chris Evans | Mark Ruffalo...,105,Anthony Russo | Joe Russo,593
1,19995,Avatar,Enter the world of Pandora.,2009-12-15,Action | Adventure | Fantasy | Science Fiction,Avatar Collection,en,237.0,2923.706026,Dune Entertainment | Lightstorm Entertainment ...,...,7.588,30.3068,162,"In the 22nd century, a paraplegic Marine is di...",en | es,/kyeqWdyUXW608qlYkRqosgbbJyK.jpg,Sam Worthington | Zoe Saldaña | Sigourney Weav...,65,James Cameron,986
2,140607,Star Wars: The Force Awakens,Every generation has a story.,2015-12-15,Adventure | Action | Science Fiction,Star Wars Collection,en,245.0,2068.223624,Lucasfilm Ltd. | Bad Robot,...,7.262,15.2615,136,Thirty years after defeating the Galactic Empi...,en,/wqnLdwVXoBjKibFRR5U3y0aDUhs.jpg,Harrison Ford | Mark Hamill | Carrie Fisher | ...,182,J.J. Abrams,257
3,299536,Avengers: Infinity War,Destiny arrives all the same.,2018-04-25,Adventure | Action | Science Fiction,The Avengers Collection,en,300.0,2052.415039,Marvel Studios,...,8.235,97.1289,149,As the Avengers and their allies have continue...,en | xh,/7WsyChQLEftFiDOVTGkv3hFpyyt.jpg,Robert Downey Jr. | Chris Evans | Chris Hemswo...,69,Anthony Russo | Joe Russo,724
4,597,Titanic,Nothing on Earth could come between them.,1997-11-18,Drama | Romance,,en,200.0,2264.162353,Paramount Pictures | 20th Century Fox | Lights...,...,7.906,46.9587,194,101-year-old Rose DeWitt Bukater tells the sto...,en | fr | de | sv | it | ru,/9xjZS2rlVxm8SFx8kPC3aIGCOYQ.jpg,Leonardo DiCaprio | Kate Winslet | Billy Zane ...,116,James Cameron,258


Optionally, run the individual functions below to see the cleaning process step by step.

In [4]:
# Work on a copy so the step-by-step walkthrough does not rebind raw_df
df = raw_df.copy()
# Show the starting shape and the first rows
print(f"Initial DataFrame shape: {df.shape}")
df.head()

Initial DataFrame shape: (18, 27)


Unnamed: 0,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,imdb_id,origin_country,original_language,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,credits
0,False,/7RyHsO4yDXtBv1zUU3mTpHeQ0d5.jpg,"{'id': 86311, 'name': 'The Avengers Collection...",356000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 878, ...",https://www.marvel.com/movies/avengers-endgame,299534,tt4154796,[US],en,...,2799439100,181,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Avenge the fallen.,Avengers: Endgame,False,8.237,26247,"{'cast': [{'adult': False, 'gender': 2, 'id': ..."
1,False,/vL5LR6WdxWPjLPFRLe133jXWsh5.jpg,"{'id': 87096, 'name': 'Avatar Collection', 'po...",237000000,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",https://www.avatar.com/movies/avatar,19995,tt0499549,[US],en,...,2923706026,162,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Enter the world of Pandora.,Avatar,False,7.588,32162,"{'cast': [{'adult': False, 'gender': 2, 'id': ..."
2,False,/k6EOrckWFuz7I4z4wiRwz8zsj4H.jpg,"{'id': 10, 'name': 'Star Wars Collection', 'po...",245000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 28, '...",http://www.starwars.com/films/star-wars-episod...,140607,tt2488496,[US],en,...,2068223624,136,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Every generation has a story.,Star Wars: The Force Awakens,False,7.262,19693,"{'cast': [{'adult': False, 'gender': 2, 'id': ..."
3,False,/mDfJG3LC3Dqb67AZ52x3Z0jU0uB.jpg,"{'id': 86311, 'name': 'The Avengers Collection...",300000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 28, '...",https://www.marvel.com/movies/avengers-infinit...,299536,tt4154756,[US],en,...,2052415039,149,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Destiny arrives all the same.,Avengers: Infinity War,False,8.235,30432,"{'cast': [{'adult': False, 'gender': 2, 'id': ..."
4,False,/sCzcYW9h55WcesOqA12cgEr9Exw.jpg,,200000000,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",https://www.paramountmovies.com/movies/titanic,597,tt0120338,[US],en,...,2264162353,194,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Nothing on Earth could come between them.,Titanic,False,7.906,25915,"{'cast': [{'adult': False, 'gender': 2, 'id': ..."


In [5]:
# Step 1: strip the columns that the helper treats as irrelevant
df = drop_irrelevant_columns(df)
print(f"Shape after dropping irrelevant columns: {df.shape}")
df.head()

Shape after dropping irrelevant columns: (18, 21)


Unnamed: 0,belongs_to_collection,budget,genres,id,origin_country,original_language,overview,popularity,poster_path,production_companies,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,credits
0,"{'id': 86311, 'name': 'The Avengers Collection...",356000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 878, ...",299534,[US],en,After the devastating events of Avengers: Infi...,53.8808,/ulzhLuWrPK07P1YkdWQLZnQh1JL.jpg,"[{'id': 420, 'logo_path': '/hUzeosd33nzE5MCNsZ...",...,2019-04-24,2799439100,181,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Avenge the fallen.,Avengers: Endgame,8.237,26247,"{'cast': [{'adult': False, 'gender': 2, 'id': ..."
1,"{'id': 87096, 'name': 'Avatar Collection', 'po...",237000000,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",19995,[US],en,"In the 22nd century, a paraplegic Marine is di...",30.3068,/kyeqWdyUXW608qlYkRqosgbbJyK.jpg,"[{'id': 444, 'logo_path': None, 'name': 'Dune ...",...,2009-12-15,2923706026,162,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Enter the world of Pandora.,Avatar,7.588,32162,"{'cast': [{'adult': False, 'gender': 2, 'id': ..."
2,"{'id': 10, 'name': 'Star Wars Collection', 'po...",245000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 28, '...",140607,[US],en,Thirty years after defeating the Galactic Empi...,15.2615,/wqnLdwVXoBjKibFRR5U3y0aDUhs.jpg,"[{'id': 1, 'logo_path': '/tlVSws0RvvtPBwViUyOF...",...,2015-12-15,2068223624,136,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Every generation has a story.,Star Wars: The Force Awakens,7.262,19693,"{'cast': [{'adult': False, 'gender': 2, 'id': ..."
3,"{'id': 86311, 'name': 'The Avengers Collection...",300000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 28, '...",299536,[US],en,As the Avengers and their allies have continue...,97.1289,/7WsyChQLEftFiDOVTGkv3hFpyyt.jpg,"[{'id': 420, 'logo_path': '/hUzeosd33nzE5MCNsZ...",...,2018-04-25,2052415039,149,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Destiny arrives all the same.,Avengers: Infinity War,8.235,30432,"{'cast': [{'adult': False, 'gender': 2, 'id': ..."
4,,200000000,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",597,[US],en,101-year-old Rose DeWitt Bukater tells the sto...,46.9587,/9xjZS2rlVxm8SFx8kPC3aIGCOYQ.jpg,"[{'id': 4, 'logo_path': '/gz66EfNoYPqHTYI4q9UE...",...,1997-11-18,2264162353,194,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Nothing on Earth could come between them.,Titanic,7.906,25915,"{'cast': [{'adult': False, 'gender': 2, 'id': ..."


In [6]:
# Step 2: evaluate the JSON-like columns, then preview the extracted/derived columns
df = evaluate_json_columns(df)
print(f"Shape after evaluating JSON columns: {df.shape}")
df[
    [
        "belongs_to_collection",
        "genres",
        "production_companies",
        "production_countries",
        "spoken_languages",
        "cast",
        "director",
        "cast_size",
        "crew_size",
    ]
].head()

Shape after evaluating JSON columns: (18, 24)


Unnamed: 0,belongs_to_collection,genres,production_companies,production_countries,spoken_languages,cast,director,cast_size,crew_size
0,The Avengers Collection,Adventure | Science Fiction | Action,Marvel Studios,US,en | ja | xh,Robert Downey Jr. | Chris Evans | Mark Ruffalo...,Anthony Russo | Joe Russo,105,593
1,Avatar Collection,Action | Adventure | Fantasy | Science Fiction,Dune Entertainment | Lightstorm Entertainment ...,US | GB,en | es,Sam Worthington | Zoe Saldaña | Sigourney Weav...,James Cameron,65,986
2,Star Wars Collection,Adventure | Action | Science Fiction,Lucasfilm Ltd. | Bad Robot,US,en,Harrison Ford | Mark Hamill | Carrie Fisher | ...,J.J. Abrams,182,257
3,The Avengers Collection,Adventure | Action | Science Fiction,Marvel Studios,US,en | xh,Robert Downey Jr. | Chris Evans | Chris Hemswo...,Anthony Russo | Joe Russo,69,724
4,,Drama | Romance,Paramount Pictures | 20th Century Fox | Lights...,US,en | fr | de | sv | it | ru,Leonardo DiCaprio | Kate Winslet | Billy Zane ...,James Cameron,116,258


In [7]:
# Step 3: print the value distribution of each extracted column
inspect_extracted_columns(
    df,
    columns_to_inspect=[
        "belongs_to_collection",
        "genres",
        "production_countries",
        "spoken_languages",
        "production_companies",
        "director",
    ],
)


Belongs To Collection distribution:
belongs_to_collection
The Avengers Collection                4
Star Wars Collection                   2
NaN                                    2
Frozen Collection                      2
Jurassic Park Collection               2
Avatar Collection                      1
The Lion King (Reboot) Collection      1
The Fast and the Furious Collection    1
Black Panther Collection               1
Harry Potter Collection                1
Name: count, dtype: int64

Genres distribution:
genres
Adventure | Action | Science Fiction               3
Action | Adventure | Science Fiction | Thriller    2
Action | Adventure | Science Fiction               2
Action | Adventure | Fantasy | Science Fiction     1
Drama | Romance                                    1
Adventure | Science Fiction | Action               1
Adventure | Drama | Family | Animation             1
Science Fiction | Action | Adventure               1
Action | Thriller | Crime                          1

In [8]:
# Show the column dtypes as they stand before convert_datatypes is applied
print(df.dtypes)


belongs_to_collection     object
budget                     int64
genres                    object
id                         int64
origin_country            object
original_language         object
overview                  object
popularity               float64
poster_path               object
production_companies      object
production_countries      object
release_date              object
revenue                    int64
runtime                    int64
spoken_languages          object
status                    object
tagline                   object
title                     object
vote_average             float64
vote_count                 int64
cast                      object
cast_size                  int64
director                  object
crew_size                  int64
dtype: object


In [9]:
# Step 4: cast columns to their target dtypes and list the result
df = convert_datatypes(df)
print("Data types after conversion:", df.dtypes, sep="\n")

Data types after conversion:
belongs_to_collection          category
budget                          float64
genres                         category
id                                int64
origin_country                 category
original_language              category
overview                 string[python]
popularity                      float64
poster_path              string[python]
production_companies           category
production_countries           category
release_date             datetime64[ns]
revenue                         float64
runtime                           int64
spoken_languages               category
status                         category
tagline                  string[python]
title                    string[python]
vote_average                    float64
vote_count                        int64
cast                     string[python]
cast_size                         int64
director                 string[python]
crew_size                         int64
dtype: obje

In [10]:
# Step 5: handle unrealistic values, then count the missing entries per column
df = handle_unrealistic_values(df)
print("Missing values:", df.isna().sum(), sep="\n")

Missing values:
belongs_to_collection    2
budget                   0
genres                   0
id                       0
origin_country           0
original_language        0
overview                 0
popularity               0
poster_path              0
production_companies     0
production_countries     0
release_date             0
revenue                  0
runtime                  0
spoken_languages         0
status                   0
tagline                  0
title                    0
vote_average             0
vote_count               0
cast                     0
cast_size                0
director                 0
crew_size                0
dtype: int64


In [11]:
# Remove duplicates and invalid rows
# The helper prints its own diagnostics (see this cell's output); its return
# value replaces df.
df = remove_duplicates_and_invalid_rows(df)

Number of duplicate rows: 0
Rows with title='unknown': 0
Rows with missing id or title: 0
DataFrame shape after dropping unknown titles: (18, 24)
Cleaned DataFrame shape after dropping rows with < 10 non-null values: (18, 24)


In [12]:
# Step 7: apply the released-movies filter and report the resulting shape
df = filter_released_movies(df)
print(f"Shape after filtering released movies: {df.shape}")

Cleaned DataFrame shape after filtering for 'Released' movies: (18, 23)
Shape after filtering released movies: (18, 23)


In [13]:
# Finalize DataFrame
# Per this cell's output, the finalized frame has 22 columns and reports
# budget/revenue as budget_musd/revenue_musd.
df = finalize_dataframe(df)
df.head()

Cleaned DataFrame shape: (18, 22)


Unnamed: 0,id,title,tagline,release_date,genres,belongs_to_collection,original_language,budget_musd,revenue_musd,production_companies,...,vote_average,popularity,runtime,overview,spoken_languages,poster_path,cast,cast_size,director,crew_size
0,299534,Avengers: Endgame,Avenge the fallen.,2019-04-24,Adventure | Science Fiction | Action,The Avengers Collection,en,356.0,2799.4391,Marvel Studios,...,8.237,53.8808,181,After the devastating events of Avengers: Infi...,en | ja | xh,/ulzhLuWrPK07P1YkdWQLZnQh1JL.jpg,Robert Downey Jr. | Chris Evans | Mark Ruffalo...,105,Anthony Russo | Joe Russo,593
1,19995,Avatar,Enter the world of Pandora.,2009-12-15,Action | Adventure | Fantasy | Science Fiction,Avatar Collection,en,237.0,2923.706026,Dune Entertainment | Lightstorm Entertainment ...,...,7.588,30.3068,162,"In the 22nd century, a paraplegic Marine is di...",en | es,/kyeqWdyUXW608qlYkRqosgbbJyK.jpg,Sam Worthington | Zoe Saldaña | Sigourney Weav...,65,James Cameron,986
2,140607,Star Wars: The Force Awakens,Every generation has a story.,2015-12-15,Adventure | Action | Science Fiction,Star Wars Collection,en,245.0,2068.223624,Lucasfilm Ltd. | Bad Robot,...,7.262,15.2615,136,Thirty years after defeating the Galactic Empi...,en,/wqnLdwVXoBjKibFRR5U3y0aDUhs.jpg,Harrison Ford | Mark Hamill | Carrie Fisher | ...,182,J.J. Abrams,257
3,299536,Avengers: Infinity War,Destiny arrives all the same.,2018-04-25,Adventure | Action | Science Fiction,The Avengers Collection,en,300.0,2052.415039,Marvel Studios,...,8.235,97.1289,149,As the Avengers and their allies have continue...,en | xh,/7WsyChQLEftFiDOVTGkv3hFpyyt.jpg,Robert Downey Jr. | Chris Evans | Chris Hemswo...,69,Anthony Russo | Joe Russo,724
4,597,Titanic,Nothing on Earth could come between them.,1997-11-18,Drama | Romance,,en,200.0,2264.162353,Paramount Pictures | 20th Century Fox | Lights...,...,7.906,46.9587,194,101-year-old Rose DeWitt Bukater tells the sto...,en | fr | de | sv | it | ru,/9xjZS2rlVxm8SFx8kPC3aIGCOYQ.jpg,Leonardo DiCaprio | Kate Winslet | Billy Zane ...,116,James Cameron,258


In [None]:
# Persist the step-by-step cleaned DataFrame under the processed-data directory
save_dataframe(
    df,
    save_path=Path(PROCESSED_DATA_DIR, "cleaned_movies.parquet"),
)

Cleaned data saved to data\processed\cleaned_movies_20250423_060158.parquet
Timestamp saved to data\processed\latest_timestamp.txt


### Perform Analysis

In [15]:
# Start the analysis from the one-shot cleaned frame; working on a copy keeps
# df_cleaned available unchanged for the cells further down.
analysis_df = df_cleaned.copy()

In [16]:
# Confirm the shape of the analysis input and preview its first rows
print(f"Initial DataFrame shape: {analysis_df.shape}")
analysis_df.head()

Initial DataFrame shape: (18, 22)


Unnamed: 0,id,title,tagline,release_date,genres,belongs_to_collection,original_language,budget_musd,revenue_musd,production_companies,...,vote_average,popularity,runtime,overview,spoken_languages,poster_path,cast,cast_size,director,crew_size
0,299534,Avengers: Endgame,Avenge the fallen.,2019-04-24,Adventure | Science Fiction | Action,The Avengers Collection,en,356.0,2799.4391,Marvel Studios,...,8.237,53.8808,181,After the devastating events of Avengers: Infi...,en | ja | xh,/ulzhLuWrPK07P1YkdWQLZnQh1JL.jpg,Robert Downey Jr. | Chris Evans | Mark Ruffalo...,105,Anthony Russo | Joe Russo,593
1,19995,Avatar,Enter the world of Pandora.,2009-12-15,Action | Adventure | Fantasy | Science Fiction,Avatar Collection,en,237.0,2923.706026,Dune Entertainment | Lightstorm Entertainment ...,...,7.588,30.3068,162,"In the 22nd century, a paraplegic Marine is di...",en | es,/kyeqWdyUXW608qlYkRqosgbbJyK.jpg,Sam Worthington | Zoe Saldaña | Sigourney Weav...,65,James Cameron,986
2,140607,Star Wars: The Force Awakens,Every generation has a story.,2015-12-15,Adventure | Action | Science Fiction,Star Wars Collection,en,245.0,2068.223624,Lucasfilm Ltd. | Bad Robot,...,7.262,15.2615,136,Thirty years after defeating the Galactic Empi...,en,/wqnLdwVXoBjKibFRR5U3y0aDUhs.jpg,Harrison Ford | Mark Hamill | Carrie Fisher | ...,182,J.J. Abrams,257
3,299536,Avengers: Infinity War,Destiny arrives all the same.,2018-04-25,Adventure | Action | Science Fiction,The Avengers Collection,en,300.0,2052.415039,Marvel Studios,...,8.235,97.1289,149,As the Avengers and their allies have continue...,en | xh,/7WsyChQLEftFiDOVTGkv3hFpyyt.jpg,Robert Downey Jr. | Chris Evans | Chris Hemswo...,69,Anthony Russo | Joe Russo,724
4,597,Titanic,Nothing on Earth could come between them.,1997-11-18,Drama | Romance,,en,200.0,2264.162353,Paramount Pictures | 20th Century Fox | Lights...,...,7.906,46.9587,194,101-year-old Rose DeWitt Bukater tells the sto...,en | fr | de | sv | it | ru,/9xjZS2rlVxm8SFx8kPC3aIGCOYQ.jpg,Leonardo DiCaprio | Kate Winslet | Billy Zane ...,116,James Cameron,258


In [17]:
# Add the derived metric columns (profit_musd and roi) and preview them
analysis_df = calculate_metrics(analysis_df)
print(
    "DataFrame with new metrics:",
    analysis_df[["title", "profit_musd", "roi"]].head(),
    sep="\n",
)

DataFrame with new metrics:
                          title  profit_musd    roi
0             Avengers: Endgame      2443.44   7.86
1                        Avatar      2686.71  12.34
2  Star Wars: The Force Awakens      1823.22   8.44
3        Avengers: Infinity War      1752.42   6.84
4                       Titanic      2064.16  11.32


In [18]:
# Build the KPI rankings and print two of them: highest revenue and highest rated
kpis = compute_kpi_rankings(analysis_df)
print(
    "Top 5 Highest Revenue Movies:",
    kpis["highest_revenue"][["title", "revenue_musd"]],
    sep="\n",
)
print(
    "\nTop 5 Highest Rated Movies:",
    kpis["highest_rated"][["title", "vote_average"]],
    sep="\n",
)

Top 5 Highest Revenue Movies:
                          title  revenue_musd
1                        Avatar   2923.706026
0             Avengers: Endgame   2799.439100
4                       Titanic   2264.162353
2  Star Wars: The Force Awakens   2068.223624
3        Avengers: Infinity War   2052.415039

Top 5 Highest Rated Movies:
                                           title  vote_average
0                              Avengers: Endgame         8.237
3                         Avengers: Infinity War         8.235
11  Harry Potter and the Deathly Hallows: Part 2         8.087
4                                        Titanic         7.906
7                                   The Avengers         7.740


In [19]:
# Run the predefined movie searches and print the two result sets
specific_movies = filter_specific_movies(analysis_df)
print(
    "Bruce Willis Sci-Fi Action Movies:",
    specific_movies["sci_fi_action_bruce_willis"][["title", "vote_average"]],
    sep="\n",
)
print(
    "\nUma Thurman & Tarantino Movies:",
    specific_movies["uma_thurman_tarentino_directed"][["title", "runtime"]],
    sep="\n",
)

Bruce Willis Sci-Fi Action Movies:
Empty DataFrame
Columns: [title, vote_average]
Index: []

Uma Thurman & Tarantino Movies:
Empty DataFrame
Columns: [title, runtime]
Index: []


Both results are empty because the DataFrame doesn't include any movies featuring Bruce Willis or Uma Thurman.

In [20]:
# Compare aggregate statistics for franchise movies against standalone movies
franchise_standalone = compute_franchise_standalone_stats(analysis_df)
print(
    "Franchise Stats:",
    franchise_standalone["franchise_stats"],
    sep="\n",
)
print(
    "\nStandalone Stats:",
    franchise_standalone["standalone_stats"],
    sep="\n",
)

Franchise Stats:
{'mean_revenue': np.float64(1682.6419708125), 'median_roi': np.float64(7.785), 'mean_budget': np.float64(219.875), 'mean_popularity': np.float64(35.50724375), 'mean_rating': np.float64(7.3805625)}

Standalone Stats:
{'mean_revenue': np.float64(1765.1391585000001), 'median_roi': np.float64(9.615), 'mean_budget': np.float64(180.0), 'mean_popularity': np.float64(54.98005), 'mean_rating': np.float64(7.4384999999999994)}


In [21]:
# Aggregate per-franchise performance and show the first five rows
franchise_performance = compute_franchise_performance(analysis_df)
print(
    "Top 5 Franchises by Total Revenue:",
    franchise_performance.head(5),
    sep="\n",
)

Top 5 Franchises by Total Revenue:
                          num_movies  total_budget_musd  mean_budget_musd  \
belongs_to_collection                                                       
The Avengers Collection            4             1241.0            310.25   
Star Wars Collection               2              445.0            222.50   
Jurassic Park Collection           2              320.0            160.00   
Avatar Collection                  1              237.0            237.00   
Frozen Collection                  2              300.0            150.00   

                          total_revenue_musd  mean_revenue_musd  mean_rating  
belongs_to_collection                                                         
The Avengers Collection          7776.073348        1944.018337      7.87075  
Star Wars Collection             3400.922454        1700.461227      7.02050  
Jurassic Park Collection         2982.003740        1491.001870      6.61500  
Avatar Collection             

In [22]:
# Aggregate per-director performance and show the first five rows
director_performance = compute_director_performance(analysis_df)
print(
    "Top 5 Directors by Total Revenue:",
    director_performance.head(5),
    sep="\n",
)

Top 5 Directors by Total Revenue:
                           num_movies  total_revenue_musd  mean_rating
director                                                              
James Cameron                       2         5187.868379       7.7470
Anthony Russo | Joe Russo           2         4851.854139       8.2360
Joss Whedon                         2         2924.219209       7.5055
Jennifer Lee | Chris Buck           2         2727.902485       7.2485
J.J. Abrams                         1         2068.223624       7.2620


In [23]:
# Save the updated DataFrame
# NOTE(review): analysis_df is rebound to the helper's return value; this
# assumes save_analysis_results returns the DataFrame — TODO confirm.
analysis_df = save_analysis_results(analysis_df)


In [None]:
# Preview a fresh copy of the cleaned data before the one-shot analysis.
# NOTE(review): `analysis_result` (singular) is not referenced by the later
# cells visible in this notebook, which use `analysis_results` instead.
analysis_result = df_cleaned.copy()
analysis_result.head()

Unnamed: 0,id,title,tagline,release_date,genres,belongs_to_collection,original_language,budget_musd,revenue_musd,production_companies,...,vote_average,popularity,runtime,overview,spoken_languages,poster_path,cast,cast_size,director,crew_size
0,299534,Avengers: Endgame,Avenge the fallen.,2019-04-24,Adventure | Science Fiction | Action,The Avengers Collection,en,356.0,2799.4391,Marvel Studios,...,8.237,53.8808,181,After the devastating events of Avengers: Infi...,en | ja | xh,/ulzhLuWrPK07P1YkdWQLZnQh1JL.jpg,Robert Downey Jr. | Chris Evans | Mark Ruffalo...,105,Anthony Russo | Joe Russo,593
1,19995,Avatar,Enter the world of Pandora.,2009-12-15,Action | Adventure | Fantasy | Science Fiction,Avatar Collection,en,237.0,2923.706026,Dune Entertainment | Lightstorm Entertainment ...,...,7.588,30.3068,162,"In the 22nd century, a paraplegic Marine is di...",en | es,/kyeqWdyUXW608qlYkRqosgbbJyK.jpg,Sam Worthington | Zoe Saldaña | Sigourney Weav...,65,James Cameron,986
2,140607,Star Wars: The Force Awakens,Every generation has a story.,2015-12-15,Adventure | Action | Science Fiction,Star Wars Collection,en,245.0,2068.223624,Lucasfilm Ltd. | Bad Robot,...,7.262,15.2615,136,Thirty years after defeating the Galactic Empi...,en,/wqnLdwVXoBjKibFRR5U3y0aDUhs.jpg,Harrison Ford | Mark Hamill | Carrie Fisher | ...,182,J.J. Abrams,257
3,299536,Avengers: Infinity War,Destiny arrives all the same.,2018-04-25,Adventure | Action | Science Fiction,The Avengers Collection,en,300.0,2052.415039,Marvel Studios,...,8.235,97.1289,149,As the Avengers and their allies have continue...,en | xh,/7WsyChQLEftFiDOVTGkv3hFpyyt.jpg,Robert Downey Jr. | Chris Evans | Chris Hemswo...,69,Anthony Russo | Joe Russo,724
4,597,Titanic,Nothing on Earth could come between them.,1997-11-18,Drama | Romance,,en,200.0,2264.162353,Paramount Pictures | 20th Century Fox | Lights...,...,7.906,46.9587,194,101-year-old Rose DeWitt Bukater tells the sto...,en | fr | de | sv | it | ru,/9xjZS2rlVxm8SFx8kPC3aIGCOYQ.jpg,Leonardo DiCaprio | Kate Winslet | Billy Zane ...,116,James Cameron,258


In [24]:
# One-shot analysis: run the full analysis on the cleaned data and list the
# columns of the DataFrame stored under the 'full_dataframe' key.
analysis_results = perform_analysis(df_cleaned)
print(f"Analyzed DataFrame columns: {analysis_results['full_dataframe'].columns.tolist()}")

Analyzed DataFrame columns: ['id', 'title', 'tagline', 'release_date', 'genres', 'belongs_to_collection', 'original_language', 'budget_musd', 'revenue_musd', 'production_companies', 'production_countries', 'vote_count', 'vote_average', 'popularity', 'runtime', 'overview', 'spoken_languages', 'poster_path', 'cast', 'cast_size', 'director', 'crew_size', 'profit_musd', 'roi']


In [27]:
# Generate the report figures from the analysed DataFrame; per this cell's
# output, the PNG files are saved to the reports/figures folder.
create_visualization(analysis_results['full_dataframe'])

Plotting Revenue vs Budget trends...
revenue_vs_budget.png saved to reports\figures
Plotting ROI Distribution by Genre for top genres...
roi_by_genre.png saved to reports\figures
Plotting Popularity vs Rating...
popularity_vs_rating.png saved to reports\figures
Plotting Yearly Trends in Box Office Performance...
yearly_trends.png saved to reports\figures
Plotting Franchise vs Standalone Success...
franchise_vs_standalone.png saved to reports\figures
Visualizations created successfully and saved to reports\figures
