## Importing libraries



We imported the Pandas, Numpy, Seaborn, and Matplotlib libraries so we could use them to manipulate the dataset and create visualizations for it.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from matplotlib.lines import Line2D

## Importing tn.movie_budgets.csv.gz and imdb.title.basics.csv.gz CSV Files and Merging them

In [2]:
df = pd.read_csv('data/zippedData/tn.movie_budgets.csv.gz')
df_2 = pd.read_csv('data/zippedData/imdb.title.basics.csv.gz')
df_meg = pd.merge(df, df_2, how='inner', left_on='movie', right_on='primary_title')
df_meg.head()

Unnamed: 0,id,release_date,movie,production_budget,domestic_gross,worldwide_gross,tconst,primary_title,original_title,start_year,runtime_minutes,genres
0,1,"Dec 18, 2009",Avatar,"$425,000,000","$760,507,625","$2,776,345,279",tt1775309,Avatar,Abatâ,2011,93.0,Horror
1,2,"May 20, 2011",Pirates of the Caribbean: On Stranger Tides,"$410,600,000","$241,063,875","$1,045,663,875",tt1298650,Pirates of the Caribbean: On Stranger Tides,Pirates of the Caribbean: On Stranger Tides,2011,136.0,"Action,Adventure,Fantasy"
2,3,"Jun 7, 2019",Dark Phoenix,"$350,000,000","$42,762,350","$149,762,350",tt6565702,Dark Phoenix,Dark Phoenix,2019,113.0,"Action,Adventure,Sci-Fi"
3,4,"May 1, 2015",Avengers: Age of Ultron,"$330,600,000","$459,005,868","$1,403,013,963",tt2395427,Avengers: Age of Ultron,Avengers: Age of Ultron,2015,141.0,"Action,Adventure,Sci-Fi"
4,7,"Apr 27, 2018",Avengers: Infinity War,"$300,000,000","$678,815,482","$2,048,134,200",tt4154756,Avengers: Infinity War,Avengers: Infinity War,2018,149.0,"Action,Adventure,Sci-Fi"


## Checking to see how much of the Column is missing based off Percentage

In [3]:
for col in df_meg.columns:
    pct_missing = np.mean(df_meg[col].isnull())
    print('{} - {}%'.format(col, round(pct_missing*100)))

id - 0.0%
release_date - 0.0%
movie - 0.0%
production_budget - 0.0%
domestic_gross - 0.0%
worldwide_gross - 0.0%
tconst - 0.0%
primary_title - 0.0%
original_title - 0.0%
start_year - 0.0%
runtime_minutes - 13.0%
genres - 2.0%


## Dropping as well as Splitting and Replacing Columns

In [4]:
cols_to_drop = ['runtime_minutes','start_year','original_title','id' ,'production_budget', 'release_date','tconst','primary_title']
df_mod_2 = df_meg.drop(cols_to_drop, axis=1)
df_mod_2["genres"] = df_mod_2["genres"].apply(lambda x: x.split(",") if type(x) == str else x)
df_mod_2["domestic_gross"] = df["domestic_gross"].str.replace(",","").str.replace("$","").astype(int)
df_mod_2["worldwide_gross"] = df["worldwide_gross"].str.replace(",","").str.replace("$","").astype(int)
df_mod_2.head()

Unnamed: 0,movie,domestic_gross,worldwide_gross,genres
0,Avatar,760507625,2776345279,[Horror]
1,Pirates of the Caribbean: On Stranger Tides,241063875,1045663875,"[Action, Adventure, Fantasy]"
2,Dark Phoenix,42762350,149762350,"[Action, Adventure, Sci-Fi]"
3,Avengers: Age of Ultron,459005868,1403013963,"[Action, Adventure, Sci-Fi]"
4,Avengers: Infinity War,620181382,1316721747,"[Action, Adventure, Sci-Fi]"


## Exploding the Genres Column as well as changing the Zeros to Nan then Dropping Nan's

In [5]:
df_explode = df_mod_2.explode('genres')
df_explode.replace(0, np.nan, inplace=True)
df_explode[df_explode.isna().any(axis=1)]
df_explode = df_explode.dropna()
print(df.isnull().sum())

id                   0
release_date         0
movie                0
production_budget    0
domestic_gross       0
worldwide_gross      0
dtype: int64


## Grouping by Genres means and sourting it by worldwide_gross as well as setting index to Movies

In [6]:
df_explode.groupby('genres').mean()
df_explode= df_explode.sort_values("worldwide_gross", ascending=False).set_index('movie')
df_top_50 = df_explode

## Plotting the data in a Bar Plot

In [None]:
plt.figure(figsize=(10, 8))
sns.barplot(x='genres', 
            y='worldwide_gross',
            order=["Adventure","Animation","Fantasy","Musical",'Sci-Fi','Action', 'Family',"Western","History",'Documentary', 'Sport',"Comedy",'Thriller',"Reality-TV",'News','Biography','Drama', 'War','Crime','Mystery', 'Romance','Horror',"Music"],
            data=df_top_50,)
plt.ylabel("Worldwide Gross in Hundreds of Millions", size=14)
plt.xlabel("Genres", size=14)
plt.title(" Movies Worldwide Gross per Genre", size=18)
plt.tick_params(axis='x', labelrotation=90)
plt.savefig("images/grouped_barplot_Seaborn_barplot_Python_corrected.png")

## Work on tmdb_movies_data.csv.zip CSV

## Importing CSV and Dropping Columns

In [None]:
df3 = pd.read_csv('data/zippedData/tmdb_movies_data.csv.zip')
cols_to_drop = ['id', 'imdb_id','cast', 'homepage', 'director','tagline', 'overview', 'production_companies','keywords' ,'vote_count','genres','release_date','release_year','vote_average','budget_adj','popularity','revenue_adj']
df_mod = df3.drop(cols_to_drop, axis=1)
df_mod.head()

## Replacing zeros with nan and dropping the nan's

In [None]:
df_mod.replace(0, np.nan, inplace=True)
df_mod[df_mod.isna().any(axis=1)]
df_mod = df_mod.dropna()
print(df_mod.isnull().sum())

## Sorting Values in ascending order for Revenue

In [None]:
df_mod= df_mod.sort_values("revenue", ascending=False)
df_mod.describe()

## Setting parameters based of a film has to be 40 min to be considered a movie and removing the outliers that skew the Data

In [None]:
df_mod2 = df_mod.loc[(df_mod['runtime'] >= 40) & (df_mod['runtime'] <= 190) & (df_mod['revenue'] > 226830568) & (df_mod['revenue'] < 2068178225)]
df_mod2.head()

## Creating  variables for insertion into Graph

In [None]:
runtime_25_percentile = df_mod2['runtime'].quantile(.25)
runtime_75_percentile = df_mod2['runtime'].quantile(.75)

## Plotting Data in a Scatter Plot

In [None]:
fig, ax = plt.subplots(figsize=(14,7))
data = df_mod2[['runtime', 'revenue']]
x = data['runtime']
y = data['revenue']
plt.scatter(x, y)
z = np.polyfit(x, y, 0)
p = np.poly1d(z)
plt.plot(x,p(x),"r-")
plt.title('Runtime Comparison to Revenue', fontsize=35)
plt.xlabel('Movie Time - min -', fontsize=20)
plt.ylabel('Revenue in hundereds million', fontsize= 20)
plt.fill_between([runtime_25_percentile, runtime_75_percentile], max(y), facecolor='orange', alpha=.2)
plt.savefig("images/Runtime_Comparison_line_added.png")
plt.show()

# Working on Rotten Tomatoes Movies CSV

## Calling the CSV

In [None]:
df_4 = pd.read_csv('data/zippedData/rotten_tomatoes_movies.csv.gz')
df_4.head(2)

## Dropping columns and Null values

In [None]:
cols_to_drop = ['critics_consensus', 'movie_info','directors', 'authors', 'actors', 'streaming_release_date','runtime', 'production_company', 'tomatometer_count', 
                'audience_status', 'tomatometer_rating', 'audience_count', 'tomatometer_top_critics_count','audience_rating','original_release_date', 'rotten_tomatoes_link' ,'tomatometer_fresh_critics_count', 'tomatometer_rotten_critics_count']
df_dropped_colums = df_4.drop(cols_to_drop, axis=1)
df_dropped_colums.replace(0, np.nan, inplace=True)
df_dropped_colums[df_dropped_colums.isna().any(axis=1)]
df_dropped_colums = df_dropped_colums.dropna()
print(df_dropped_colums.isnull().sum())

## Splitting the Genres Column on the ","  and Exploding and isolating the Adventure Genres

In [None]:
df_dropped_colums["genres"] = df_dropped_colums["genres"].apply(lambda x: x.split(",") if type(x) == str else x)

df_explode_2 = df_dropped_colums.explode('genres')

new_df = df_explode_2[df_explode_2['genres'].str.contains('Action & Adventure')]
new_df

## Setting Index replacing the Ratings with values and setting those values to 1 then Grouping by content_rating, tomatometer_status

In [None]:
new_df.set_index('genres')
new_df.replace({'G': 0, 'PG': 1, 'PG-13': 2, 'NC17': 3, 'NR': 4, 'R': 5})
new_df['movie_count'] = 1
df_cor = new_df.groupby(['content_rating', 'tomatometer_status']).count()
df_cor

## Resetting Index and Dropping the Generes column to keep the Genre Column

In [None]:
df_cor = df_cor.reset_index()
cols_to_drop = ['genres']
df_drop = df_cor.drop(cols_to_drop, axis=1)
df_drop.head()

## Ploting the data in a Bar Graph

In [None]:
plt.figure(figsize=(10, 8))
sns.barplot(x='content_rating', 
            y='movie_count', 
            hue='tomatometer_status',
            order=["NC17","G","PG","PG-13",'NR', 'R'],
            data=df_cor, palette=['green', 'blue', 'red'])
plt.ylabel("Count Total", size=14)
plt.xlabel("Movie Rating", size=14)
plt.title("Rotten Tomatos Generated Ratings for Adventure Movies", size=18)
plt.savefig("images/Rotten_tomatose_Ratings.png")
plt.legend()