# Grace Techau
## Box Office Revenue & Letterboxd Ratings Project 
### NOTEBOOK 1
### Importing & Cleaning Box Office Revenue Kaggle Data Set

In [38]:
# Import necessary packages
import pandas as pd 

In [40]:
# Load the raw Kaggle box office revenue data set
# ("2010-2024 Movies Box Ofice Collection" - spelled as in the file name).
box_office_revenue = pd.read_csv(
    "2010-2024_Movies_Box_Ofice_Collection_raw.csv",
    encoding="utf-8",
)

In [42]:
# Sanity-check the load: print the frame's dimensions, then show its first 10 rows
print(box_office_revenue.shape)
display(box_office_revenue.iloc[:10])

(2800, 8)


Unnamed: 0,Rank,Release Group,Worldwide,Domestic,Domestic_percent,Foreign,Foreign_percent,year
0,0,Toy Story 3,1066969703,415004880,38.90%,651964823,61.10%,2010
1,1,Alice in Wonderland,1025467110,334191110,32.60%,691276000,67.40%,2010
2,2,Harry Potter and the Deathly Hallows: Part 1,960283305,295983305,30.80%,664300000,69.20%,2010
3,3,Inception,828258695,292576195,35.30%,535682500,64.70%,2010
4,4,Shrek Forever After,752600867,238736787,31.70%,513864080,68.30%,2010
5,5,The Twilight Saga: Eclipse,698491347,300531751,43%,397959596,57%,2010
6,6,Iron Man 2,623933331,312433331,50.10%,311500000,49.90%,2010
7,7,Tangled,592461732,200821936,33.90%,391639796,66.10%,2010
8,8,Despicable Me,543239815,251639815,46.30%,291600000,53.70%,2010
9,9,How to Train Your Dragon,494878759,217581231,44%,277297528,56%,2010


In [44]:
# Build the working ("clean") frame from the raw one: remove the "Rank" column,
# then rename the remaining columns to snake_case names for later integration.
column_renames = {
    'Release Group': 'title',
    'Worldwide': 'worldwide_revenue',
    'Domestic': 'domestic_revenue',
    'Domestic_percent': 'domestic_percent',
    'Foreign': 'foreign_revenue',
    'Foreign_percent': 'foreign_percent',
    'year': 'year',
}
box_office_revenue_clean = (
    box_office_revenue
    .drop(columns='Rank')
    .rename(columns=column_renames)
)

print("Clean Column Names", "-"*50, box_office_revenue_clean.columns, sep="\n")

Clean Column Names
--------------------------------------------------
Index(['title', 'worldwide_revenue', 'domestic_revenue', 'domestic_percent',
       'foreign_revenue', 'foreign_percent', 'year'],
      dtype='object')


In [46]:
# Report each column's dtype before any conversion is applied
print(
    "Data Types of Dataframe before cleaning",
    "-"*50,
    box_office_revenue_clean.dtypes,
    sep="\n",
)

Data Types of Dataframe before cleaning
--------------------------------------------------
title                object
worldwide_revenue    object
domestic_revenue     object
domestic_percent     object
foreign_revenue      object
foreign_percent      object
year                  int64
dtype: object


In [48]:
# The revenue and percent columns are read in as object (string) dtype and must
# become numeric. Each column has one character removed before parsing:
# ',' for the three revenue columns and '%' for the two percent columns.
# Values that still cannot be parsed become NaN (errors='coerce').
for column_name, char_to_strip in [
    ('worldwide_revenue', ','),
    ('domestic_revenue', ','),
    ('foreign_revenue', ','),
    ('domestic_percent', '%'),
    ('foreign_percent', '%'),
]:
    stripped_values = box_office_revenue_clean[column_name].str.replace(char_to_strip, '', regex=False)
    box_office_revenue_clean[column_name] = pd.to_numeric(stripped_values, errors='coerce')

# Remove leading/trailing white space from the title column
box_office_revenue_clean['title'] = box_office_revenue_clean['title'].str.strip()

print(
    "Data Types of Dataframe after cleaning",
    "-"*50,
    box_office_revenue_clean.dtypes,
    sep="\n",
)

Data Types of Dataframe after cleaning
--------------------------------------------------
title                 object
worldwide_revenue      int64
domestic_revenue       int64
domestic_percent     float64
foreign_revenue        int64
foreign_percent      float64
year                   int64
dtype: object


In [50]:
# Convert the percentage fields into fractions between 0 and 1, rounded to
# 2 decimal places. The columns are overwritten with value / 100, so running
# this cell a second time would divide by 100 again.
for percent_column in ('domestic_percent', 'foreign_percent'):
    box_office_revenue_clean[percent_column] = box_office_revenue_clean[percent_column].div(100).round(2)

# Check that the transformation worked
print("New percentage fields as values from 0 to 1")
display(box_office_revenue_clean[['domestic_percent', 'foreign_percent']].head(5))

New percentage fields as values from 0 to 1


Unnamed: 0,domestic_percent,foreign_percent
0,0.39,0.61
1,0.33,0.67
2,0.31,0.69
3,0.35,0.65
4,0.32,0.68


In [52]:
# Rescale the three revenue columns to millions, rounded to 2 decimal places.
# The columns are overwritten with value / 1,000,000, so running this cell a
# second time would rescale them again.
for revenue_column in ('worldwide_revenue', 'domestic_revenue', 'foreign_revenue'):
    box_office_revenue_clean[revenue_column] = box_office_revenue_clean[revenue_column].div(1000000).round(2)

In [54]:
# Show the first 5 rows of the rescaled revenue columns
print("Revenue Columns Represented in Millions of U.S $", '-'*50, sep="\n")
revenue_columns = ['worldwide_revenue', 'domestic_revenue', 'foreign_revenue']
display(box_office_revenue_clean[revenue_columns].head(5))

Revenue Columns Represented in Millions of U.S $
--------------------------------------------------


Unnamed: 0,worldwide_revenue,domestic_revenue,foreign_revenue
0,1066.97,415.0,651.96
1,1025.47,334.19,691.28
2,960.28,295.98,664.3
3,828.26,292.58,535.68
4,752.6,238.74,513.86


In [56]:
# Keep only the 2017-2019 releases and drop every other year in the data set.
# The drop list is derived from the years to keep instead of being typed out by
# hand, so a year that a hand-written list forgot (e.g. 2024) cannot survive.
years_to_keep = [2017, 2018, 2019]
years_to_drop = [
    year for year in box_office_revenue_clean['year'].unique().tolist()
    if year not in years_to_keep
]

rows_to_drop = box_office_revenue_clean[box_office_revenue_clean['year'].isin(years_to_drop)].index

# .drop() returns a new frame, so later column assignments do not act on a slice
box_office_revenue_clean = box_office_revenue_clean.drop(rows_to_drop)

In [58]:
# Check that only the years we want (2017-2019) are still present.
# The label names the full ranges that were removed (2010-2016 and 2020-2023).
print("Years included after dropping 2010-2016 and 2020-2023")
print("-"*50)
print(box_office_revenue_clean['year'].unique())

Years included after dropping 2010, 2020-2023
--------------------------------------------------
[2017 2018 2019]


In [60]:
# Some movie titles appear to have had a "-" replaced with the digit 0.
# The data is like this in the Kaggle data set itself - it is not an encoding
# problem introduced when reading the CSV file.

# Show every title that contains a '0'
zero_in_title = box_office_revenue_clean['title'].str.contains('0', na=False)
print("List of Title Names with 0's in Them", "-"*50, sep="\n")
display(box_office_revenue_clean.loc[zero_in_title, 'title'])
print("\n")

# A number of these titles have had dashes (-) replaced with 0.
# We do not want to replace ALL 0's with dashes
# (e.g. titles that contain years or other numeric values with 0's in them).
# Define function to replace zeros ('0') with dashes ('-')
def fix_dashes(title):
    """Restore dashes that appear in a title as the digit '0'.

    The title is split on white space and each word is examined:

    * a word that is exactly '0' becomes '-';
    * a word that contains '0' and is not purely numeric once its zeros are
      removed (e.g. 'Spider0Man:', 'Re0release') has every '0' replaced by '-';
    * a word that is purely numeric apart from its zeros (e.g. '2049', '30')
      is left unchanged.

    Parameters
    ----------
    title : str
        The movie title to repair. Non-string values (e.g. NaN for a missing
        title) are returned unchanged.

    Returns
    -------
    str
        The repaired title, with words re-joined by single spaces.
    """
    # A missing title has no .split(); pass it through instead of raising
    if not isinstance(title, str):
        return title

    words = title.split()
    for i, word in enumerate(words):
        if word == '0':
            # Stand-alone zero, e.g. "Mission: Impossible 0 Fallout"
            words[i] = '-'
        elif '0' in word and not word.replace('0', '').isdigit():
            # Zero embedded in a non-numeric word, e.g. "Ant0Man"
            words[i] = word.replace('0', '-')
    return ' '.join(words)

# Store the repaired titles in a new column so they can be compared with the originals
box_office_revenue_clean['title2'] = box_office_revenue_clean['title'].apply(fix_dashes)

# Boolean masks marking the rows whose title contains a '0', before and after the fix
zeros_before_fix = box_office_revenue_clean['title'].str.contains('0', na=False)
zeros_after_fix = box_office_revenue_clean['title2'].str.contains('0', na=False)

print(f"Number of rows with 0's before function applied: {int(zeros_before_fix.sum())}")
print(f"Number of rows with 0's after function applied: {int(zeros_after_fix.sum())}")
print("\n")

print("Title Names with 0's After Function Applied", "-"*50, sep="\n")
display(box_office_revenue_clean.loc[zeros_after_fix, 'title2'])
print("All dashes seemed to be fixed! The only 0's left are ones that are included 0's in years or numeric digits included in the title.")

List of Title Names with 0's in Them
--------------------------------------------------


1400              Star Wars: Episode VIII 0 The Last Jedi
1405                               Spider0Man: Homecoming
1429                    The Ex0File 3: Return of the Exes
1434                                    Blade Runner 2049
1565                                Reset 2017 Re0release
1583    A Chinese Odyssey: Part 2 0 Cinderella 2017 Re...
1588                                Leap! 2017 Re0release
1590                                The 80Year Engagement
1597                                  Épouse0moi mon pote
1607                        Mission: Impossible 0 Fallout
1610                                 Ant0Man and the Wasp
1624                    Spider0Man: Into the Spider0Verse
1702                        Spirited Away 2018 Re0release
1722                 Once Upon a Deadpool 2018 Re0release
1732                    Bajrangi Bhaijaan 2018 Re0release
1779                   My Neighbor Totoro 2018 Re0release
1788                   The Witch: Part 1 0 The Subversion
1803          



Number of rows with 0's before function applied: 27
Number of rows with 0's after function applied: 9


Title Names with 0's After Function Applied
--------------------------------------------------


1434                                    Blade Runner 2049
1565                                Reset 2017 Re-release
1583    A Chinese Odyssey: Part 2 - Cinderella 2017 Re...
1588                                Leap! 2017 Re-release
1702                        Spirited Away 2018 Re-release
1722                 Once Upon a Deadpool 2018 Re-release
1732                    Bajrangi Bhaijaan 2018 Re-release
1779                   My Neighbor Totoro 2018 Re-release
1974                                             Super 30
Name: title2, dtype: object

All dashes seemed to be fixed! The only 0's left are ones that are included 0's in years or numeric digits included in the title.


In [62]:
# Remove the original title column and give the repaired "title2" column its
# name, so the frame has a single "title" column again.
box_office_revenue_clean = (
    box_office_revenue_clean
    .drop(columns='title')
    .rename(columns={'title2': 'title'})
)

In [64]:
# Show the dimensions and contents of the finished frame
print("Final Data Frame", "-"*50, box_office_revenue_clean.shape, sep="\n")
display(box_office_revenue_clean)

Final Data Frame
--------------------------------------------------
(600, 7)


Unnamed: 0,worldwide_revenue,domestic_revenue,domestic_percent,foreign_revenue,foreign_percent,year,title
1400,1332.54,620.18,0.46,712.36,0.54,2017,Star Wars: Episode VIII - The Last Jedi
1401,1263.52,504.01,0.40,759.51,0.60,2017,Beauty and the Beast
1402,1236.01,226.01,0.18,1010.00,0.82,2017,The Fate of the Furious
1403,1034.80,264.62,0.26,770.18,0.74,2017,Despicable Me 3
1404,962.08,404.52,0.42,557.56,0.58,2017,Jumanji: Welcome to the Jungle
...,...,...,...,...,...,...,...
1995,19.36,0.00,0.00,19.36,1.00,2019,The Specials
1996,19.02,0.09,0.00,18.92,1.00,2019,Always Miss You
1997,18.90,0.00,0.00,18.90,1.00,2019,An Officer and a Spy
1998,18.74,0.00,0.00,18.74,1.00,2019,Crayon Shin-chan: Honeymoon Hurricane - The Lo...


In [66]:
# Save the cleaned data frame to a distinctly named CSV file
# (column header row included, row index omitted).
box_office_revenue_clean.to_csv(
    "box_office_revenue_2017.2019_clean.csv",
    header=True,
    index=False,
    encoding='utf-8',
)