## Add columns for gross and local budget using movie_budgets: Dataset:
BoxOfficeMojo dataset: https://www.boxofficemojo.com/title/tt0499549/?ref_=bo_se_r_1  
The dataset is taken from GitHub: https://github.com/ntdoris/movie-revenue-analysis/tree/main


In [96]:
import numpy as np
import pandas as pd
import ast
import re
import matplotlib.pyplot as plt
import seaborn as sns
import os

#### df_movie_budgets

In [97]:
# Read the gzipped movie-budgets CSV. Compared with the gross table it is
# cleaner and additionally carries a production_budget column.
df_movie_budgets = pd.read_csv(
    filepath_or_buffer='movie-revenue-analysis/zippedData/tn.movie_budgets.csv.gz',
)
df_movie_budgets.head()

Unnamed: 0,id,release_date,movie,production_budget,domestic_gross,worldwide_gross
0,1,"Dec 18, 2009",Avatar,"$425,000,000","$760,507,625","$2,776,345,279"
1,2,"May 20, 2011",Pirates of the Caribbean: On Stranger Tides,"$410,600,000","$241,063,875","$1,045,663,875"
2,3,"Jun 7, 2019",Dark Phoenix,"$350,000,000","$42,762,350","$149,762,350"
3,4,"May 1, 2015",Avengers: Age of Ultron,"$330,600,000","$459,005,868","$1,403,013,963"
4,5,"Dec 15, 2017",Star Wars Ep. VIII: The Last Jedi,"$317,000,000","$620,181,382","$1,316,721,747"


In [98]:
# Convert the currency columns from strings such as "$425,000,000" to numbers.
# The pattern is written as a raw string: inside a normal literal "\$" is an
# invalid escape sequence, which newer Python versions warn about.
for money_col in ('production_budget', 'domestic_gross', 'worldwide_gross'):
    df_movie_budgets[money_col] = pd.to_numeric(
        df_movie_budgets[money_col].str.replace(r'[\$,]', '', regex=True)
    )

# Rename the title/date columns to 'Name' / 'Release_Date' and drop the 'id'
# column. Chaining returns a new frame instead of mutating in place.
df_movie_budgets = (
    df_movie_budgets
    .rename(columns={'movie': 'Name', 'release_date': 'Release_Date'})
    .drop(columns=['id'])
)

In [99]:
# Preview the first rows to check that the currency columns are now numeric
# and that the columns were renamed.
df_movie_budgets.head()

Unnamed: 0,Release_Date,Name,production_budget,domestic_gross,worldwide_gross
0,"Dec 18, 2009",Avatar,425000000,760507625,2776345279
1,"May 20, 2011",Pirates of the Caribbean: On Stranger Tides,410600000,241063875,1045663875
2,"Jun 7, 2019",Dark Phoenix,350000000,42762350,149762350
3,"May 1, 2015",Avengers: Age of Ultron,330600000,459005868,1403013963
4,"Dec 15, 2017",Star Wars Ep. VIII: The Last Jedi,317000000,620181382,1316721747


In [100]:
## DATA CLEANING df_movie_budgets
print(f"Number of rows before dropping: {df_movie_budgets.shape[0]}")

# Keep rows whose domestic gross is present and non-zero ...
has_domestic_gross = (
    df_movie_budgets['domestic_gross'].notna()
    & (df_movie_budgets['domestic_gross'] != 0)
)
# ... and whose worldwide gross differs from the domestic gross
# (equal values would mean a foreign gross of 0).
has_foreign_gross = df_movie_budgets['worldwide_gross'] != df_movie_budgets['domestic_gross']

df_movie_budgets = df_movie_budgets[has_domestic_gross & has_foreign_gross]
print(f"Number of rows after dropping: {df_movie_budgets.shape[0]}")

# Foreign gross is whatever part of the worldwide gross is not domestic.
df_movie_budgets = df_movie_budgets.assign(
    foreign_gross=lambda kept: kept['worldwide_gross'] - kept['domestic_gross']
)
df_movie_budgets.head()

Number of rows before dropping: 5782
Number of rows after dropping: 3982


Unnamed: 0,Release_Date,Name,production_budget,domestic_gross,worldwide_gross,foreign_gross
0,"Dec 18, 2009",Avatar,425000000,760507625,2776345279,2015837654
1,"May 20, 2011",Pirates of the Caribbean: On Stranger Tides,410600000,241063875,1045663875,804600000
2,"Jun 7, 2019",Dark Phoenix,350000000,42762350,149762350,107000000
3,"May 1, 2015",Avengers: Age of Ultron,330600000,459005868,1403013963,944008095
4,"Dec 15, 2017",Star Wars Ep. VIII: The Last Jedi,317000000,620181382,1316721747,696540365


In [101]:
# Snapshot the cleaned budgets table under its own name so later edits to the
# snapshot leave df_movie_budgets untouched (copy() is deep by default).
df_movie_budgets_cleaned = df_movie_budgets.copy()
df_movie_budgets_cleaned

Unnamed: 0,Release_Date,Name,production_budget,domestic_gross,worldwide_gross,foreign_gross
0,"Dec 18, 2009",Avatar,425000000,760507625,2776345279,2015837654
1,"May 20, 2011",Pirates of the Caribbean: On Stranger Tides,410600000,241063875,1045663875,804600000
2,"Jun 7, 2019",Dark Phoenix,350000000,42762350,149762350,107000000
3,"May 1, 2015",Avengers: Age of Ultron,330600000,459005868,1403013963,944008095
4,"Dec 15, 2017",Star Wars Ep. VIII: The Last Jedi,317000000,620181382,1316721747,696540365
...,...,...,...,...,...,...
5759,"Jun 2, 2006",The Puffy Chair,15000,194523,195254,731
5773,"Feb 26, 1993",El Mariachi,7000,2040920,2041928,1008
5774,"Oct 8, 2004",Primer,7000,424760,841926,417166
5775,"May 26, 2006",Cavite,7000,70071,71644,1573


#### df_movie_gross

In [102]:
# Load the gzipped gross table, rename the title/year columns to the shared
# 'Name' / 'Release_Date' names and drop the 'studio' column.
# Note: 'Release_Date' holds only the release year in this table.
# The path no longer contains the doubled separator, and the steps are chained
# so no frame is mutated in place.
df_movie_gross = (
    pd.read_csv('movie-revenue-analysis/zippedData/bom.movie_gross.csv.gz')
    .rename(columns={'title': 'Name', 'year': 'Release_Date'})
    .drop(columns=['studio'])
)
df_movie_gross

Unnamed: 0,Name,domestic_gross,foreign_gross,Release_Date
0,Toy Story 3,415000000.0,652000000,2010
1,Alice in Wonderland (2010),334200000.0,691300000,2010
2,Harry Potter and the Deathly Hallows Part 1,296000000.0,664300000,2010
3,Inception,292600000.0,535700000,2010
4,Shrek Forever After,238700000.0,513900000,2010
...,...,...,...,...
3382,The Quake,6200.0,,2018
3383,Edward II (2018 re-release),4800.0,,2018
3384,El Pacto,2500.0,,2018
3385,The Swan,2400.0,,2018


In [103]:
# DATA CLEANING FOR df_movie_gross
# Parse foreign_gross as numbers; entries that cannot be parsed become NaN.
# NOTE(review): errors='coerce' discards unparseable values silently — worth
# checking how many rows are affected before they are dropped later.
foreign_numeric = pd.to_numeric(df_movie_gross['foreign_gross'], errors='coerce')
df_movie_gross['foreign_gross'] = foreign_numeric

# Worldwide gross is the sum of both parts (NaN if either part is NaN).
df_movie_gross['worldwide_gross'] = df_movie_gross['domestic_gross'].add(foreign_numeric)
df_movie_gross.head()

Unnamed: 0,Name,domestic_gross,foreign_gross,Release_Date,worldwide_gross
0,Toy Story 3,415000000.0,652000000.0,2010,1067000000.0
1,Alice in Wonderland (2010),334200000.0,691300000.0,2010,1025500000.0
2,Harry Potter and the Deathly Hallows Part 1,296000000.0,664300000.0,2010,960300000.0
3,Inception,292600000.0,535700000.0,2010,828300000.0
4,Shrek Forever After,238700000.0,513900000.0,2010,752600000.0


In [104]:
print(f"Number of rows before dropping: {df_movie_gross.shape[0]}")
# Keep only rows where both gross figures are present and non-zero.
domestic_ok = df_movie_gross['domestic_gross'].notna() & df_movie_gross['domestic_gross'].ne(0)
foreign_ok = df_movie_gross['foreign_gross'].notna() & df_movie_gross['foreign_gross'].ne(0)
df_movie_gross = df_movie_gross[domestic_ok & foreign_ok]
print(f"Number of rows after dropping: {df_movie_gross.shape[0]}")



Number of rows before dropping: 3387
Number of rows after dropping: 2004


In [105]:
# Snapshot the cleaned gross table; copy() is deep by default, so the
# snapshot can be modified independently of df_movie_gross.
df_movie_gross_cleaned = df_movie_gross.copy()
df_movie_gross_cleaned.head()

Unnamed: 0,Name,domestic_gross,foreign_gross,Release_Date,worldwide_gross
0,Toy Story 3,415000000.0,652000000.0,2010,1067000000.0
1,Alice in Wonderland (2010),334200000.0,691300000.0,2010,1025500000.0
2,Harry Potter and the Deathly Hallows Part 1,296000000.0,664300000.0,2010,960300000.0
3,Inception,292600000.0,535700000.0,2010,828300000.0
4,Shrek Forever After,238700000.0,513900000.0,2010,752600000.0


#### Combined df_movie_gross, df_movie_budgets

In [106]:
# Normalise titles (trim whitespace, lower-case) so both tables share one key format.
for titled_frame in (df_movie_gross_cleaned, df_movie_budgets_cleaned):
    titled_frame['Name'] = titled_frame['Name'].str.strip().str.lower()

# Rows of the gross table whose title does not occur in the budgets table.
absent_from_budgets = ~df_movie_gross_cleaned['Name'].isin(df_movie_budgets_cleaned['Name'])
missing_movies = df_movie_gross_cleaned.loc[
    absent_from_budgets,
    ['Name', 'Release_Date', 'domestic_gross', 'foreign_gross', 'worldwide_gross'],
]

# Append them below the budgets rows. The appended rows have no
# production_budget, so that column is NaN for them.
# NOTE(review): Release_Date ends up mixing full date strings (budgets rows)
# with bare years (gross rows) — consider normalising to one format.
df_movie_combined = pd.concat(
    [df_movie_budgets_cleaned, missing_movies],
    ignore_index=True,
)
df_movie_combined

Unnamed: 0,Release_Date,Name,production_budget,domestic_gross,worldwide_gross,foreign_gross
0,"Dec 18, 2009",avatar,425000000.0,760507625.0,2.776345e+09,2.015838e+09
1,"May 20, 2011",pirates of the caribbean: on stranger tides,410600000.0,241063875.0,1.045664e+09,8.046000e+08
2,"Jun 7, 2019",dark phoenix,350000000.0,42762350.0,1.497624e+08,1.070000e+08
3,"May 1, 2015",avengers: age of ultron,330600000.0,459005868.0,1.403014e+09,9.440081e+08
4,"Dec 15, 2017",star wars ep. viii: the last jedi,317000000.0,620181382.0,1.316722e+09,6.965404e+08
...,...,...,...,...,...,...
4888,2018,i still see you,,1400.0,1.501400e+06,1.500000e+06
4889,2018,the catcher was a spy,,725000.0,9.540000e+05,2.290000e+05
4890,2018,time freak,,10000.0,2.660000e+05,2.560000e+05
4891,2018,reign of judges: title of liberty - concept short,,93200.0,9.840000e+04,5.200000e+03


In [108]:
# Add an all-missing 'languages' column, then keep an independent
# (deep, the default) copy of the result as the Box Office Mojo table.
df_movie_combined = df_movie_combined.assign(languages=pd.NA)
df_box_office_mojo = df_movie_combined.copy()

In [110]:
# Persist the combined table as CSV, leaving out the index column.
df_box_office_mojo.to_csv(
    path_or_buf='../../data/box_office_mojo_cleaned.csv',
    index=False,
)

## Combination with the original dataset

In [91]:
# Load the cleaned original dataset (cmu_cleaned.csv).
# A relative path keeps the notebook runnable on other machines; the previous
# absolute path pointed into one user's home directory.
# NOTE(review): assumes '../../data' (the folder box_office_mojo_cleaned.csv
# is written to) is the same 'data' directory that holds cmu_cleaned.csv — confirm.
df_cleaned = pd.read_csv('../../data/cmu_cleaned.csv')

In [None]:
# Display the loaded table (long frames are shown truncated to their first and last rows).
df_cleaned

Unnamed: 0,Freebase_ID,Name,Release_Date,Revenue,Languages,Countries
0,/m/03vyhn,Ghosts of Mars,2001,14010832.0,english,united states of america
7,/m/02qc0j7,Alexander's Ragtime Band,1938,3600000.0,english,united states of america
13,/m/016ywb,Henry V,1989,10161099.0,english,united kingdom
17,/m/0kcn7,Mary Poppins,1964,102272727.0,english,united states of america
21,/m/02wjqm,New Rose Hotel,1999,21521.0,english,united states of america
...,...,...,...,...,...,...
81695,/m/0f7hw,Coming to America,1988,288752301.0,english,united states of america
81720,/m/0kvgqb,Spaced Invaders,1990,15369573.0,english,united states of america
81725,/m/0660qx,State and Main,2000,6944471.0,"italian, english","france, united states of america"
81726,/m/030xw6,Guilty as Sin,1993,22886222.0,,united states of america


In [None]:
# Display the combined gross/budget table.
# NOTE(review): this cell has no execution count, and its saved output lists a
# different column set than the output of the cell that builds
# df_movie_combined — re-run to refresh it.
df_movie_combined

Unnamed: 0,release_date,Name,domestic_gross,worldwide_gross,foreign_gross
0,"Dec 18, 2009",avatar,760507625.0,2.776345e+09,2.015838e+09
1,"May 20, 2011",pirates of the caribbean: on stranger tides,241063875.0,1.045664e+09,8.046000e+08
2,"Jun 7, 2019",dark phoenix,42762350.0,1.497624e+08,1.070000e+08
3,"May 1, 2015",avengers: age of ultron,459005868.0,1.403014e+09,9.440081e+08
4,"Dec 15, 2017",star wars ep. viii: the last jedi,620181382.0,1.316722e+09,6.965404e+08
...,...,...,...,...,...
4888,2018,i still see you,1400.0,1.501400e+06,1.500000e+06
4889,2018,the catcher was a spy,725000.0,9.540000e+05,2.290000e+05
4890,2018,time freak,10000.0,2.660000e+05,2.560000e+05
4891,2018,reign of judges: title of liberty - concept short,93200.0,9.840000e+04,5.200000e+03


Intersection between those 2 datasets

In [92]:
# Attach gross figures to the original dataset for every movie whose title
# is found in the combined table. Titles are compared after trimming
# whitespace and lower-casing.
for frame_to_normalise in (df_movie_combined, df_cleaned):
    frame_to_normalise['Name'] = frame_to_normalise['Name'].str.strip().str.lower()

# Left join: every row of df_cleaned is kept, and rows without a title match
# get NaN in domestic_gross / worldwide_gross. production_budget is not
# carried over by this merge.
# NOTE(review): the key is the title alone, so same-titled films from
# different years can be paired with each other, and duplicated titles
# multiply rows — consider matching on release year as well.
gross_by_title = df_movie_combined[['Name', 'domestic_gross', 'worldwide_gross']]
df_cleaned_with_budget = df_cleaned.merge(gross_by_title, on='Name', how='left')

df_cleaned_with_budget.head()

Unnamed: 0,Freebase_ID,Name,Release_Date,Revenue,Languages,Countries,domestic_gross,worldwide_gross
0,/m/03vyhn,ghosts of mars,2001,14010832.0,english,united states of america,,
1,/m/02qc0j7,alexander's ragtime band,1938,3600000.0,english,united states of america,,
2,/m/016ywb,henry v,1989,10161099.0,english,united kingdom,10161099.0,10176701.0
3,/m/0kcn7,mary poppins,1964,102272727.0,english,united states of america,,
4,/m/02wjqm,new rose hotel,1999,21521.0,english,united states of america,,


In [93]:
print(f"Number of rows before cleaning: {len(df_cleaned_with_budget)}")
# Discard every row with a missing value in any column (the dropna defaults,
# spelled out explicitly).
# NOTE(review): this also removes rows that only lack e.g. Languages or
# Countries, not just rows without a gross match — pass subset=[...] if only
# the gross columns should decide.
df_cleaned_with_budget_drop = df_cleaned_with_budget.dropna(axis=0, how='any')
print(f"Number of rows after cleaning: {len(df_cleaned_with_budget_drop)}")

Number of rows before cleaning: 8389
Number of rows after cleaning: 2622


In [95]:
# Display the merged rows that remain after dropping rows with missing values.
df_cleaned_with_budget_drop

Unnamed: 0,Freebase_ID,Name,Release_Date,Revenue,Languages,Countries,domestic_gross,worldwide_gross
2,/m/016ywb,henry v,1989,10161099.0,english,united kingdom,10161099.0,10176701.0
25,/m/0g08ws,the astronaut farmer,2006,11130889.0,"english, spanish",united states of america,11003643.0,11141213.0
26,/m/01bwgr,straw dogs,1971,11148828.0,english,"united states of america, united kingdom",10324441.0,11253821.0
29,/m/01xlqd,grease,1978,394589888.0,english,united states of america,181813770.0,387510179.0
30,/m/08m1s2,becoming jane,2007,37311672.0,english,"ireland, united kingdom",18663911.0,39380876.0
...,...,...,...,...,...,...,...,...
8366,/m/049kmb,beverly hills cop iii,1994,119208989.0,english,united states of america,42586861.0,119180938.0
8368,/m/0gjk1d,dead man walking,1995,83088295.0,english,"united states of america, united kingdom",39387284.0,83088295.0
8377,/m/05zkcsk,adam,2009,2549605.0,english,united states of america,2283291.0,2834485.0
8379,/m/01s7w3,twister,1996,494471524.0,english,united states of america,241688385.0,495700000.0
