## Final Project Submission

Please fill out:
* Student name: Abdulrahman Aber, Benjamin Bai, Charles Pan, Kevin Rivera
* Student pace: Full Time
* Scheduled project review date/time: 
* Instructor name: David Elliott
* Blog post URL:


# Imports

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import sqlite3 as sq
conn = sq.connect('./zippedData/im.db/im.db')
cur = conn.cursor()

# Load all source files as dataframes

In [2]:
bom_movie_gross_df = pd.read_csv('./zippedData/bom.movie_gross.csv.gz')

In [3]:
movie_basics_df = pd.read_sql('''select * from movie_basics''', conn)

In [4]:
directors_df = pd.read_sql("""select * from directors""",conn)

In [5]:
known_for_df = pd.read_sql("""select * from known_for""",conn)

In [6]:
movie_akas_df = pd.read_sql("""select * from movie_akas""",conn)

In [7]:
movie_ratings_df = pd.read_sql("""select * from movie_ratings""",conn)

In [8]:
persons_df = pd.read_sql("""select * from persons""",conn)

In [9]:
principals_df = pd.read_sql("""select * from principals""",conn)

In [10]:
writers_df = pd.read_sql("""select * from writers""",conn)

In [11]:
rt_movie_info_df = pd.read_csv('./zippedData/rt.movie_info.tsv.gz', sep='\t')

In [12]:
rt_reviews_df = pd.read_csv('./zippedData/rt.reviews.tsv.gz', sep='\t', encoding= 'unicode_escape')

In [13]:
tmdb_movies_df = pd.read_csv('./zippedData/tmdb.movies.csv.gz')

In [14]:
tn_movie_budgets_df = pd.read_csv('./zippedData/tn.movie_budgets.csv.gz',
                                 parse_dates=['release_date'])

# TN Movie Budgets Clean Up Process

In [15]:
# Remove $ from production_budget, convert to float
tn_movie_budgets_df['production_budget'] = tn_movie_budgets_df['production_budget'].str.replace(',', '').str.replace('$', '').astype(float)

# Remove $ from domestic_gross, convert to float
tn_movie_budgets_df['domestic_gross'] = tn_movie_budgets_df['domestic_gross'].str.replace(',', '').str.replace('$', '').astype(float)

# Remove $ from worldwide_gross, convert to float
tn_movie_budgets_df['worldwide_gross'] = tn_movie_budgets_df['worldwide_gross'].str.replace(',', '').str.replace('$', '').astype(float)

# new column release_year from release_date to filter based on time frame in scope
tn_movie_budgets_df['release_year'] = pd.DatetimeIndex(tn_movie_budgets_df['release_date']).year

# time frame: 2010-Present; SAG definition of theatrical: budget > 2 million
tn_movie_budgets_df = tn_movie_budgets_df[(tn_movie_budgets_df.release_year >= 2010) & (tn_movie_budgets_df.production_budget > 1999999)]

# Merge TN Movie Budgets with IMDB Movie Basics for Genres Analysis

In [16]:
# rename movie_basics column to match tn_movie_budgets
movie_basics_df.rename(columns={"primary_title":"movie"}, inplace = True)

# merge movie_budgets and imdb movie_basics
moviebudgets_moviebasics_df = tn_movie_budgets_df.merge(movie_basics_df, how='inner', on='movie')

# drop columns that are not relevant to Genre analysis
moviebudgets_moviebasics_df.drop(columns = ['id','release_date', 'movie_id', 'original_title', 'runtime_minutes', 'domestic_gross' ],axis=1,inplace=True)

# remove duplicate movies
moviebudgets_moviebasics_df.drop_duplicates(subset='movie', keep='first', inplace=True, ignore_index=False)

# add column for net_return
moviebudgets_moviebasics_df['net_return'] = (moviebudgets_moviebasics_df['worldwide_gross'] - moviebudgets_moviebasics_df['production_budget'])

# add column for ROI as a %
moviebudgets_moviebasics_df['roi_percent'] = ((moviebudgets_moviebasics_df['net_return'] / moviebudgets_moviebasics_df['production_budget'])*100)

# Genres Analysis Visualizations

In [26]:
# budgets_basics_netreturn_df created to get top 100 movies by net_return
budgets_basics_netreturn_df = moviebudgets_moviebasics_df.sort_values(by='net_return',ascending=False).head(100)
# split genres string to list
budgets_basics_netreturn_df['genres'] = budgets_basics_netreturn_df['genres'].str.split(",")
# split genres lists to unique rows for counts of individual genres instead of combinations
budgets_basics_netreturn_df = budgets_basics_netreturn_df.explode('genres')
# means by genre
netreturn_mean_df = budgets_basics_netreturn_df.groupby(['genres']).mean()

Unnamed: 0_level_0,production_budget,worldwide_gross,release_year,start_year,net_return,roi_percent
genres,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Action,179122000.0,895587600.0,2014.576271,2014.576271,716465600.0,442.9265
Adventure,172015400.0,877921600.0,2014.358974,2014.320513,705906300.0,469.02481
Animation,131478300.0,802796700.0,2013.826087,2013.73913,671318400.0,598.969647
Biography,82666670.0,658416700.0,2015.666667,2015.666667,575750000.0,888.560449
Comedy,119142900.0,766081300.0,2014.571429,2014.321429,646938500.0,702.441986
Crime,173000000.0,954726400.0,2013.8,2013.0,781726400.0,442.812533
Drama,109000000.0,749574100.0,2014.5,2013.285714,640574100.0,782.378584
Family,162555600.0,740918800.0,2014.666667,2013.888889,578363300.0,407.918792
Fantasy,189922200.0,848988500.0,2014.333333,2014.166667,659066300.0,421.292771
Horror,35000000.0,697458000.0,2017.0,2017.0,662458000.0,1892.737054


In [28]:
# budgets_basics_roi_df created to get top 100 movies by ROI%
budgets_basics_roi_df = moviebudgets_moviebasics_df.sort_values(by='roi_percent',ascending=False).head(100)
# split genres string to list
budgets_basics_roi_df['genres'] = budgets_basics_roi_df['genres'].str.split(",")
# split genres lists to unique rows for counts of individual genres instead of combinations
budgets_basics_roi_df = budgets_basics_roi_df.explode('genres')
# means by genre
roi_mean_df = budgets_basics_roi_df.groupby(['genres']).mean()
roi_mean_df

Unnamed: 0_level_0,production_budget,worldwide_gross,release_year,start_year,net_return,roi_percent
genres,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Action,29642860.0,332642100.0,2015.428571,2015.285714,302999200.0,1124.257164
Adventure,53625000.0,647332700.0,2015.333333,2015.166667,593707700.0,1074.309057
Animation,67250000.0,843257400.0,2015.333333,2015.166667,776007400.0,1197.222125
Biography,21090910.0,274490400.0,2015.272727,2015.272727,253399500.0,1291.034893
Comedy,28296880.0,350276600.0,2014.09375,2013.84375,321979800.0,1297.821099
Crime,11000000.0,131318600.0,2017.0,2017.0,120318600.0,1079.692293
Documentary,16375000.0,222853200.0,2016.75,2013.5,206478200.0,1244.007954
Drama,15576000.0,196346300.0,2014.52,2013.96,180770300.0,1269.266814
Family,18750000.0,202705700.0,2014.75,2014.5,183955700.0,1021.465637
Fantasy,26416670.0,281861100.0,2013.666667,2013.666667,255444500.0,1179.837904
