# Grace Techau
## Box Office Revenue & Letterboxd Ratings Project 
### NOTEBOOK 5
### Merging Clean Letterboxd and Box Office Revenue Files 

In [2]:
# import required libraries
import pandas as pd 

### Import full clean Letterboxd data files from years 2017, 2018 and 2019 to pandas data frames

In [4]:
# Load the clean 2017 Letterboxd file into its own data frame.
letterboxd_2017 = pd.read_csv(
    "letterboxd_movie_data_2017_clean.csv",
    encoding="utf-8",
)

# Report the row count as a quick check on what was loaded.
print(f"Number of Letterboxd movies in 2017: {letterboxd_2017.shape[0]}")

Number of Letterboxd movies in 2017: 3252


In [5]:
# Load the clean 2018 Letterboxd file into its own data frame.
letterboxd_2018 = pd.read_csv(
    "letterboxd_movie_data_2018_clean.csv",
    encoding="utf-8",
)

# Report the row count as a quick check on what was loaded.
print(f"Number of Letterboxd movies in 2018: {letterboxd_2018.shape[0]}")

Number of Letterboxd movies in 2018: 3419


In [6]:
# Load the clean 2019 Letterboxd file into its own data frame.
letterboxd_2019 = pd.read_csv(
    "letterboxd_movie_data_2019_clean.csv",
    encoding="utf-8",
)

# Report the row count as a quick check on what was loaded.
print(f"Number of Letterboxd movies in 2019: {letterboxd_2019.shape[0]}")

Number of Letterboxd movies in 2019: 3704


**Merge Letterboxd data frames from years 2017, 2018, and 2019 into one Letterboxd movie data frame**

In [8]:
# Stack the three yearly frames on top of each other (row-wise).
# ignore_index=True discards the per-year row labels and renumbers from 0.
letterboxd_merge = pd.concat(
    [letterboxd_2017, letterboxd_2018, letterboxd_2019],
    axis="index",
    ignore_index=True,
)

print(f"Total number of movies scraped from Letterboxd: {letterboxd_merge.shape[0]}")
print("-" * 50)
# head() returns the first five rows by default.
display(letterboxd_merge.head())

Total number of movies scraped from Letterboxd: 10375
--------------------------------------------------


Unnamed: 0,title,year,number_ratings,average_rating,length,genres
0,Get Out,2017,2706282,4.2,104,"Horror, Mystery, Thriller"
1,Lady Bird,2017,2062281,3.8,94,"Comedy, Drama, Moving Relationship Stories"
2,Call Me by Your Name,2017,1687099,3.9,132,"Romance, Drama, Moving Relationship Stories"
3,Baby Driver,2017,1933993,3.7,113,"Crime, Action, Drugs And Gangsters"
4,Blade Runner 2049,2017,1411904,4.1,164,"Science Fiction, Drama, Humanity And The World..."


Since the Letterboxd data was scraped, this final merge of clean data across years could contain duplicate rows. I checked for exact duplicates and removed them, keeping the first occurrence of each.

In [10]:
# Collect every row that is an exact copy of another row (all columns equal).
# keep=False flags all members of each duplicate group, not only the extras.
duplicates = letterboxd_merge[letterboxd_merge.duplicated(keep=False)]

# "total records" is the number of rows involved in duplication (originals
# plus copies); "duplicate records" is the number of surplus copies, i.e. the
# rows that drop_duplicates() removes below.
# Single quotes are used inside the f-string on purpose: reusing double quotes
# inside a double-quoted f-string is a SyntaxError before Python 3.12.
print(f"total records: {len(duplicates)}")
print(f"duplicate records: {duplicates.duplicated(keep='first').sum()}")

# Keep the first occurrence of each duplicated row and discard the rest.
letterboxd_merge = letterboxd_merge.drop_duplicates()

total records: 44
duplicate records: 22


**Save cleaned Letterboxd data from years 2017 to 2019 into one csv file** 

In [12]:
# Write the combined 2017-2019 Letterboxd frame to one CSV file.
# index=False leaves the row labels out of the file.
letterboxd_merge.to_csv(
    "letterboxd_movie_data_2017.2019_clean.csv",
    header=True,
    index=False,
    encoding="utf-8",
)

In [13]:
## Read the combined clean CSV file back into a separate pandas data frame,
## which is the one merged with the Box Office data

letterboxd_data = pd.read_csv(
    "letterboxd_movie_data_2017.2019_clean.csv",
    encoding="utf-8",
)

### Import Box Office Revenue data file into pandas data frame

In [15]:
# Load the clean 2017-2019 box office revenue file.
box_office = pd.read_csv(
    "box_office_revenue_2017.2019_clean.csv",
    encoding="utf-8",
)

print(f"Number of movies from Kaggle Box Office Revenue: {box_office.shape[0]}")
print("-" * 50)
# head() returns the first five rows by default.
display(box_office.head())

Number of movies from Kaggle Box Office Revenue: 600
--------------------------------------------------


Unnamed: 0,worldwide_revenue,domestic_revenue,domestic_percent,foreign_revenue,foreign_percent,year,title
0,1332.54,620.18,0.46,712.36,0.54,2017,Star Wars: Episode VIII - The Last Jedi
1,1263.52,504.01,0.4,759.51,0.6,2017,Beauty and the Beast
2,1236.01,226.01,0.18,1010.0,0.82,2017,The Fate of the Furious
3,1034.8,264.62,0.26,770.18,0.74,2017,Despicable Me 3
4,962.08,404.52,0.42,557.56,0.58,2017,Jumanji: Welcome to the Jungle


## Merge Letterboxd and Box Office Revenue data frames

In [17]:
## Inner join of the two data frames on the ('title', 'year') pair.
## Only rows whose title and year appear in both frames are kept;
## box office rows with no exactly matching Letterboxd row are dropped.

film_data = box_office.merge(letterboxd_data, how="inner", on=["title", "year"])

print(f"Number of movies in final data set: {film_data.shape[0]}")
print("-" * 50)
# head() returns the first five rows by default.
display(film_data.head())

Number of movies in final data set: 491
--------------------------------------------------


Unnamed: 0,worldwide_revenue,domestic_revenue,domestic_percent,foreign_revenue,foreign_percent,year,title,number_ratings,average_rating,length,genres
0,1263.52,504.01,0.4,759.51,0.6,2017,Beauty and the Beast,495549,3.1,129,"Romance, Fantasy, Family"
1,1236.01,226.01,0.18,1010.0,0.82,2017,The Fate of the Furious,229455,2.7,136,"Crime, Action, Thriller"
2,1034.8,264.62,0.26,770.18,0.74,2017,Despicable Me 3,278201,2.6,90,"Action, Animation, Family"
3,962.08,404.52,0.42,557.56,0.58,2017,Jumanji: Welcome to the Jungle,674676,3.1,119,"Fantasy, Adventure, Comedy"
4,880.17,334.2,0.38,545.97,0.62,2017,Spider-Man: Homecoming,1688069,3.5,133,"Action, Drama, Adventure"


## Final Cleaning of the Full Data Set Before Analysis

In [19]:
## Select the columns in their final order

film_data = film_data[
    [
        "title",
        "year",
        "length",
        "worldwide_revenue",
        "domestic_revenue",
        "domestic_percent",
        "foreign_revenue",
        "foreign_percent",
        "number_ratings",
        "average_rating",
        "genres",
    ]
]

# Heading and separator are printed on two lines, then the first two rows are shown.
print("Re-ordered Columns", "-" * 50, sep="\n")
display(film_data.head(2))

Re-ordered Columns
--------------------------------------------------


Unnamed: 0,title,year,length,worldwide_revenue,domestic_revenue,domestic_percent,foreign_revenue,foreign_percent,number_ratings,average_rating,genres
0,Beauty and the Beast,2017,129,1263.52,504.01,0.4,759.51,0.6,495549,3.1,"Romance, Fantasy, Family"
1,The Fate of the Furious,2017,136,1236.01,226.01,0.18,1010.0,0.82,229455,2.7,"Crime, Action, Thriller"


In [20]:
## Check for duplicates in the final data set.
## duplicated() is called without a subset, so it compares every column:
## only fully identical rows are flagged. Rows that share a title and year
## but differ in any other column are not detected by this check.

duplicates_2 = film_data[film_data.duplicated(keep=False)]

# Single quotes are used inside the f-string on purpose: reusing double quotes
# inside a double-quoted f-string is a SyntaxError before Python 3.12.
print(f"total records: {len(film_data)}")
print(f"duplicate records: {duplicates_2.duplicated(keep='first').sum()}")

total records: 491
duplicate records: 0


In [21]:
## Show the dtype of every column in the final data set

# Heading and separator are printed on two lines, followed by the dtype listing.
print("Final Data Types", "-" * 50, sep="\n")
print(film_data.dtypes)

Final Data Types
--------------------------------------------------
title                 object
year                   int64
length                 int64
worldwide_revenue    float64
domestic_revenue     float64
domestic_percent     float64
foreign_revenue      float64
foreign_percent      float64
number_ratings         int64
average_rating       float64
genres                object
dtype: object


## Save the clean & fully merged data frame to a CSV file 

In [23]:
# Write the merged film data to the final CSV file.
# index=False leaves the row labels out of the file.
film_data.to_csv(
    "2017_2019_final_film_data.csv",
    header=True,
    index=False,
    encoding="utf-8",
)