# 1. Overview
# 2. Column-by-column   (each section = EDA plot + markdown takeaway)
   2.1 BUDGET
       • hist / KDE
       • skew number
       • markdown: “Right-skewed → median imputation”
   2.2 RUNTIME
       • box-plot, etc.
       …
# 3. Relationships & Correlation
   3.1 Budget vs Revenue (scatter + LOWESS)
   3.2 Vote_Count vs Popularity
   3.3 Genre vs Revenue (violin or bar)
   3.4 Correlation heat-map of numeric features
   3.5 Pair-plot (optional, if not too heavy)
# 4. EDA Conclusions  (bullet list of variables to model)

In [9]:
# Packages required for 01_eda.ipynb

import pandas as pd
import numpy as np
from scipy.stats import skew
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
import matplotlib.ticker as ticker

In [10]:
# Read in raw data and merge csv files from imdb and rotten tomatoes

movie_info = pd.read_csv('../data/raw/movie_info.csv', index_col=0)
critic_reviews = pd.read_csv('../data/raw/CriticReviews_2018-2020.csv', index_col=0)
# Merge the two dataframes on the 'review_object_title' column
merged_df = pd.merge(movie_info, critic_reviews, on='review_object_title', how='outer', suffixes=('_movie', '_review'))

merged_df.head(100)

Unnamed: 0,review_object_title,genres,budget,runtime,original_language,tmdb_budget,tmdb_revenue,release_date,tmdb_popularity,tmdb_vote_avg,...,reviewer_rating_rotten,reviewer_rating_actual,reviewer rating,tmeter,review_src_url,publication,review_object_type,review_object_year,review_object_href,critic_name
0,#Alive,Action|Horror|Thriller|Science Fiction,0.0,86.0,en,0.0,0.0,7/7/2020,75.455,3.6,...,True,,,0.88,https://www.rottentomatoes.com/source-1879,1879.0,Movie,2020.0,https://www.rottentomatoes.com/m/alive_2020,Felix Vasquez Jr.
1,#Alive,Action|Horror|Thriller|Science Fiction,0.0,86.0,en,0.0,0.0,7/7/2020,75.455,3.6,...,True,4/5/2021 0:00,,0.88,https://www.rottentomatoes.com/source-720,720.0,Movie,2020.0,https://www.rottentomatoes.com/m/alive_2020,John Lui
2,#Alive,Action|Horror|Thriller|Science Fiction,0.0,86.0,en,0.0,0.0,7/7/2020,75.455,3.6,...,True,3/5/2021 0:00,,0.88,https://www.rottentomatoes.com/source-3784,3784.0,Movie,2020.0,https://www.rottentomatoes.com/m/alive_2020,Matt Donato
3,#Alive,Action|Horror|Thriller|Science Fiction,0.0,86.0,en,0.0,0.0,7/7/2020,75.455,3.6,...,True,,,0.88,https://www.rottentomatoes.com/source-2715,2715.0,Movie,2020.0,https://www.rottentomatoes.com/m/alive_2020,Scott Weinberg
4,#AnneFrank. Parallel Stories (Vite parallele),,,,,,,,,,...,True,4/5/2021 0:00,,1.00,https://www.rottentomatoes.com/source-437,437.0,Movie,2019.0,https://www.rottentomatoes.com/m/annefrank_par...,Edward Porter
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,12 Hour Shift,Thriller|Horror|Comedy|Crime,0.0,86.0,en,0.0,0.0,10/2/2020,85.685,5.2,...,True,2.5/4,,0.76,https://www.rottentomatoes.com/source-2586,2586.0,Movie,2020.0,https://www.rottentomatoes.com/m/12_hour_shift,Joey Magidson
96,12 Hour Shift,Thriller|Horror|Comedy|Crime,0.0,86.0,en,0.0,0.0,10/2/2020,85.685,5.2,...,True,3/5/2021 0:00,,0.76,https://www.rottentomatoes.com/source-28,28.0,Movie,2020.0,https://www.rottentomatoes.com/m/12_hour_shift,Josh Kupecki
97,12 Hour Shift,Thriller|Horror|Comedy|Crime,0.0,86.0,en,0.0,0.0,10/2/2020,85.685,5.2,...,True,4/5/2021 0:00,,0.76,https://www.rottentomatoes.com/source-3062,3062.0,Movie,2020.0,https://www.rottentomatoes.com/m/12_hour_shift,Kat Hughes
98,12 Hour Shift,Thriller|Horror|Comedy|Crime,0.0,86.0,en,0.0,0.0,10/2/2020,85.685,5.2,...,True,B,,0.76,https://www.rottentomatoes.com/source-833,833.0,Movie,2020.0,https://www.rottentomatoes.com/m/12_hour_shift,Katie Rife


In [11]:
# RENAME MovieID and MovieTitle
merged_df = merged_df.rename(columns={'Unnamed: 0': 'Movie_ID', 'review_object_title': 'Movie_Title'})

# MOVIE ID
# Reset the MovieID column to start at 1 for each unique MovieTitle
merged_df['Movie_ID'] = merged_df.groupby('Movie_Title').ngroup() + 1

# drop columns because we made our own index called MovieID
merged_df.drop(['tmdbid'], axis=1, inplace=True)

In [12]:
# create a column indicating how much of a row is missing
# Create a boolean mask indicating which cells contain Not available, NaN, or the string 'Not available'
na_mask = merged_df.isna() | merged_df.isin(['Not available'])

# Count the number of cells in each row that contain Not available, NaN, or 'Not available'
na_counts = na_mask.sum(axis=1)

# Calculate the percentage of cells in each row that contain Not available, NaN, or 'Not available'
na_percent = na_counts / len(merged_df.columns) * 100

# Add the new column to the DataFrame
merged_df['NA_Percent'] = na_percent
#merged_df

In [13]:
# Sort the DataFrame by NA_Percent in descending order
merged_df.sort_values('NA_Percent', ascending=False)

Unnamed: 0,Movie_Title,genres,budget,runtime,original_language,tmdb_budget,tmdb_revenue,release_date,tmdb_popularity,tmdb_vote_avg,...,reviewer rating,tmeter,review_src_url,publication,review_object_type,review_object_year,review_object_href,critic_name,Movie_ID,NA_Percent
19344,Chèche Lavi (Looking for Life),,,,,,,,,,...,,,,,,,,,1352,92.857143
103422,When Arabs Danced (Au temps où les Arabes dans...,,,,,,,,,,...,,,,,,,,,7092,92.857143
81095,The Demons (Les démons),,,,,,,,,,...,,,,,,,,,5662,92.857143
81061,The Decline (Jusqu'au déclin),,,,,,,,,,...,,,,,,,,,5656,92.857143
1689,A Beautiful Summer (Le Bel été),,,,,,,,,,...,,,,,,,,,133,92.857143
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58603,On the Record,Documentary|Music,0.0,96.0,fr,0.0,0.0,9/17/2020,15.414,7.0,...,,0.99,https://www.rottentomatoes.com/source-2443,2443.0,Movie,2020.0,https://www.rottentomatoes.com/m/on_the_record,Emma Simmonds,4099,3.571429
58604,On the Record,Documentary|Music,0.0,96.0,fr,0.0,0.0,9/17/2020,15.414,7.0,...,,0.99,https://www.rottentomatoes.com/source-226,226.0,Movie,2020.0,https://www.rottentomatoes.com/m/on_the_record,Eric Kohn,4099,3.571429
58605,On the Record,Documentary|Music,0.0,96.0,fr,0.0,0.0,9/17/2020,15.414,7.0,...,,0.99,https://www.rottentomatoes.com/source-1002,1002.0,Movie,2020.0,https://www.rottentomatoes.com/m/on_the_record,Glenn Dunks,4099,3.571429
58606,On the Record,Documentary|Music,0.0,96.0,fr,0.0,0.0,9/17/2020,15.414,7.0,...,,0.99,https://www.rottentomatoes.com/source-2305,2305.0,Movie,2020.0,https://www.rottentomatoes.com/m/on_the_record,Jordan Raup,4099,3.571429


In [14]:
# remove rows that have more than 50% Na values. These rows don't have enough information to use in a sentiment analysis. 
# Also, Not available of these columns contain the important review or reviewer rating necessary for the analysis. 
merged_df = merged_df[merged_df['NA_Percent'] <= 50]
merged_df
#319 rows have been removed.

Unnamed: 0,Movie_Title,genres,budget,runtime,original_language,tmdb_budget,tmdb_revenue,release_date,tmdb_popularity,tmdb_vote_avg,...,reviewer rating,tmeter,review_src_url,publication,review_object_type,review_object_year,review_object_href,critic_name,Movie_ID,NA_Percent
0,#Alive,Action|Horror|Thriller|Science Fiction,0.0,86.0,en,0.0,0.0,7/7/2020,75.455,3.6,...,,0.88,https://www.rottentomatoes.com/source-1879,1879.0,Movie,2020.0,https://www.rottentomatoes.com/m/alive_2020,Felix Vasquez Jr.,1,7.142857
1,#Alive,Action|Horror|Thriller|Science Fiction,0.0,86.0,en,0.0,0.0,7/7/2020,75.455,3.6,...,,0.88,https://www.rottentomatoes.com/source-720,720.0,Movie,2020.0,https://www.rottentomatoes.com/m/alive_2020,John Lui,1,3.571429
2,#Alive,Action|Horror|Thriller|Science Fiction,0.0,86.0,en,0.0,0.0,7/7/2020,75.455,3.6,...,,0.88,https://www.rottentomatoes.com/source-3784,3784.0,Movie,2020.0,https://www.rottentomatoes.com/m/alive_2020,Matt Donato,1,3.571429
3,#Alive,Action|Horror|Thriller|Science Fiction,0.0,86.0,en,0.0,0.0,7/7/2020,75.455,3.6,...,,0.88,https://www.rottentomatoes.com/source-2715,2715.0,Movie,2020.0,https://www.rottentomatoes.com/m/alive_2020,Scott Weinberg,1,7.142857
4,#AnneFrank. Parallel Stories (Vite parallele),,,,,,,,,,...,,1.00,https://www.rottentomatoes.com/source-437,437.0,Movie,2019.0,https://www.rottentomatoes.com/m/annefrank_par...,Edward Porter,2,39.285714
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106641,ÌÐndÌ¦g,,,,,,,,,,...,,0.93,https://www.rottentomatoes.com/source-2753,2753.0,Movie,2019.0,https://www.rottentomatoes.com/m/ondog,James Mottram,7312,39.285714
106642,ÌÐndÌ¦g,,,,,,,,,,...,,0.93,https://www.rottentomatoes.com/source-466,466.0,Movie,2019.0,https://www.rottentomatoes.com/m/ondog,Jessica Kiang,7312,42.857143
106643,ÌÐndÌ¦g,,,,,,,,,,...,,0.93,https://www.rottentomatoes.com/source-1069,1069.0,Movie,2019.0,https://www.rottentomatoes.com/m/ondog,Jonathan Romney,7312,42.857143
106644,ÌÐndÌ¦g,,,,,,,,,,...,,0.93,https://www.rottentomatoes.com/source-3108,3108.0,Movie,2019.0,https://www.rottentomatoes.com/m/ondog,Nicholas Bell,7312,39.285714


In [15]:
# drop the NA_Percent column
merged_df = merged_df.drop('NA_Percent', axis=1)
#merged_df

# CHECK FOR NAs in columns 
# Calculate the percentage of NaN values for each column
col_na_percentage = merged_df.isna().sum() / merged_df.shape[0] * 100
# Add a new row at the top of the DataFrame with the column NaN percentages
merged_df.loc[-1] = col_na_percentage.round(2)
merged_df.index = merged_df.index + 1
merged_df = merged_df.sort_index()
merged_df.head(2)

Unnamed: 0,Movie_Title,genres,budget,runtime,original_language,tmdb_budget,tmdb_revenue,release_date,tmdb_popularity,tmdb_vote_avg,...,reviewer_rating_actual,reviewer rating,tmeter,review_src_url,publication,review_object_type,review_object_year,review_object_href,critic_name,Movie_ID
0,0.0,8.91,8.12,8.24,8.12,8.12,8.12,8.36,8.12,8.12,...,33.89,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,#Alive,Action|Horror|Thriller|Science Fiction,0.0,86.0,en,0.0,0.0,7/7/2020,75.455,3.6,...,,,0.88,https://www.rottentomatoes.com/source-1879,1879.0,Movie,2020.0,https://www.rottentomatoes.com/m/alive_2020,Felix Vasquez Jr.,1.0


In [16]:
# EDA — Budget

# % missing
budget_na_pct = merged_df['budget'].isna().mean() * 100
print(f"Budget NA %: {budget_na_pct:.2f}%")

# skew check
budget_skew = skew(merged_df['budget'].dropna())
print(f"Skew of budget: {budget_skew:.2f}")

# EDA Conclusion:
# Skew = 2.7 right-skewed - use median imputation

Budget NA %: 8.12%
Skew of budget: 2.73


In [None]:
# EDA – Runtime

# % missing
runtime_na_pct = merged_df['runtime'].isna().mean() * 100
print(f"Runtime NA %: {runtime_na_pct:.2f}%")

# skew check
runtime_skew = skew(merged_df['runtime'].dropna())
print(f"Skew of runtime: {runtime_skew:.2f}")

# EDA conclusion:
# Skew = 3.5 right-skewed - use median imputation

Runtime NA %: 8.24%
Skew of runtime: 3.47


In [18]:
# EDA – Original Language

# % missing
lang_na_pct = merged_df['original_language'].isna().mean() * 100
print(f"Original Language NA %: {lang_na_pct:.2f}%")

# EDA conclusion:
# ~8% missing - fill with 'Not available'
# values are abbreviated - map to full language names

Original Language NA %: 8.12%


In [19]:
# EDA – Revenue

# % missing
revenue_na_pct = merged_df['tmdb_revenue'].isna().mean() * 100
print(f"Revenue NA %: {revenue_na_pct:.2f}%")

# skew check
revenue_skew = skew(merged_df['tmdb_revenue'].dropna())
print(f"Skew of Revenue: {revenue_skew:.2f}")

# EDA conclusion:
# ~8% missing or zero values
# highly right-skewed (skew > 4.0) - use median imputation

Revenue NA %: 8.12%
Skew of Revenue: 4.84


In [20]:
# EDA – Release_Date

# % missing
release_na_pct = merged_df['release_date'].isna().mean() * 100
print(f"Release_Date NA %: {release_na_pct:.2f}%")

# EDA conclusion:
# ~8% missing values
# impute with 'Not available'

Release_Date NA %: 8.36%


In [21]:
# EDA – Popularity

# % missing
popularity_na_pct = merged_df['tmdb_popularity'].isna().mean() * 100
print(f"Popularity NA %: {popularity_na_pct:.2f}%")

# distribution shape
popularity_skew = skew(merged_df['tmdb_popularity'].dropna())
print(f"Skewness: {popularity_skew:.2f}")

# EDA conclusion:
# ~8% missing values
# heavily skewed (~17) - use median imputation

Popularity NA %: 8.12%
Skewness: 17.00


In [None]:
# EDA – Vote_Average

# % missing
vote_avg_na_pct = merged_df['tmdb_vote_avg'].isna().mean() * 100
print(f"Vote_Average NA %: {vote_avg_na_pct:.2f}%")

# distribution shape
vote_avg_skew = skew(merged_df['tmdb_vote_avg'].dropna())
print(f"Skewness: {vote_avg_skew:.2f}")

# EDA conclusion:
# ~8% missing values
# left-skewed (~ -2.8) - use median imputation

Vote_Average NA %: 8.12%
Skewness: -2.79


In [None]:
# EDA – Vote_Count

# % missing
vote_count_na_pct = merged_df['tmdb_vote_count'].isna().mean() * 100
print(f"Vote_Count NA %: {vote_count_na_pct:.2f}%")

# distribution shape
vote_count_skew = skew(merged_df['tmdb_vote_count'].dropna())
print(f"Skewness: {vote_count_skew:.2f}")

# EDA conclusion:
# ~8% missing values
# right-skewed (~3.1) - use median imputation

Vote_Count NA %: 8.12%
Skewness: 3.08


In [26]:
# EDA - Review_ID

# 0% missing
merged_df['review_id'].isna().sum()

np.int64(0)

In [27]:
# EDA - Critic_ID

# 0% missing
merged_df['critic_id'].isna().sum()

np.int64(0)

In [28]:
# EDA - Created_Date

# 0% missing
merged_df['created_date'].isna().sum()

np.int64(0)

In [29]:
# EDA - Pub_Date

# 0% missing
merged_df['pub_date'].isna().sum()

np.int64(0)

In [30]:
# CONTENT
# 0.01% NA
merged_df['content'].isna().mean()
merged_df['content'].head(100)

0                                                   0.01
1      Never boring consistently creepy and inspires ...
2      Who knew that zombie films could be as inventi...
3      '#Alive' finds drama and tension within one ma...
4      We're all a bit zombied out these days but thi...
                             ...                        
96     "12 Hour Shift" is a comedy but there aren't a...
97     Bettis is perfectly cast as Mandy her hazy dis...
98     A nifty thrifty razor-sharp comedy packed full...
99     The material is edgy and at times outrageously...
100    Grant puts us in the shoes of her harried hero...
Name: content, Length: 100, dtype: object

In [31]:
# PUBLISHER
# 0% NA
merged_df['publisher'].isna().mean()
merged_df['publisher'].head(100)

0                                0.0
1                        Film Threat
2      The Straits Times (Singapore)
3         Film Journal International
4                       Apollo Guide
                   ...              
96                    Hollywood News
97                  Austin Chronicle
98                               THN
99                           AV Club
100                         Guardian
Name: publisher, Length: 100, dtype: object

In [32]:
# REVIEWER RATING ROTTEN
# 0% NA
# all values are TRUE → drop
merged_df['reviewer_rating_rotten'].value_counts()

reviewer_rating_rotten
True    106318
0.0          1
Name: count, dtype: int64

In [33]:
# REVIEWER RATING ACTUAL
# preview raw values and unique patterns
merged_df['reviewer_rating_actual'].head(100)
merged_df['reviewer_rating_actual'].unique()[:25]

array([33.89, nan, '4/5/2021 0:00', '3/5/2021 0:00', 'A-',
       '2/5/2021 0:00', '74/100', 'A', '5/5/2021 0:00', '3/4/2021 0:00',
       '2.5/5', '3.5/4', 'B+', 'B-', 'D', '1.5/5', 'C', '6.5/10', '2.5/4',
       'D+', '1.5/4', 'B', '9/10/2021 0:00', '8/10/2021 0:00', '2.0/4.0'],
      dtype=object)

In [34]:
# REVIEW SRC URL, REVIEW OBJECT TYPE, REVIEW OBJECT HREF
# 0% NA
# inspect these columns before deciding to drop
merged_df[['review_src_url', 'review_object_type', 'review_object_href']].head(100)

Unnamed: 0,review_src_url,review_object_type,review_object_href
0,0.0,0.0,0.0
1,https://www.rottentomatoes.com/source-1879,Movie,https://www.rottentomatoes.com/m/alive_2020
2,https://www.rottentomatoes.com/source-720,Movie,https://www.rottentomatoes.com/m/alive_2020
3,https://www.rottentomatoes.com/source-3784,Movie,https://www.rottentomatoes.com/m/alive_2020
4,https://www.rottentomatoes.com/source-2715,Movie,https://www.rottentomatoes.com/m/alive_2020
...,...,...,...
96,https://www.rottentomatoes.com/source-2586,Movie,https://www.rottentomatoes.com/m/12_hour_shift
97,https://www.rottentomatoes.com/source-28,Movie,https://www.rottentomatoes.com/m/12_hour_shift
98,https://www.rottentomatoes.com/source-3062,Movie,https://www.rottentomatoes.com/m/12_hour_shift
99,https://www.rottentomatoes.com/source-833,Movie,https://www.rottentomatoes.com/m/12_hour_shift


In [35]:
# PUBLICATION
# 0% NA
# inspect values before converting
merged_df['publication'].head(100)

0         0.0
1      1879.0
2       720.0
3      3784.0
4      2715.0
        ...  
96     2586.0
97       28.0
98     3062.0
99      833.0
100    1676.0
Name: publication, Length: 100, dtype: float64

In [36]:
# REVIEW OBJECT YEAR
# 0% NA
# inspect values before converting
merged_df['review_object_year'].head(100)

0         0.0
1      2020.0
2      2020.0
3      2020.0
4      2020.0
        ...  
96     2020.0
97     2020.0
98     2020.0
99     2020.0
100    2020.0
Name: review_object_year, Length: 100, dtype: float64

In [37]:
# CRITIC NAME
# 0% NA
# inspect values before converting
merged_df['critic_name'].head(100)

0                    0.0
1      Felix Vasquez Jr.
2               John Lui
3            Matt Donato
4         Scott Weinberg
             ...        
96         Joey Magidson
97          Josh Kupecki
98            Kat Hughes
99            Katie Rife
100        Kristy Puchko
Name: critic_name, Length: 100, dtype: object

In [39]:
print(merged_df.columns)

Index(['Movie_Title', 'genres', 'budget', 'runtime', 'original_language',
       'tmdb_budget', 'tmdb_revenue', 'release_date', 'tmdb_popularity',
       'tmdb_vote_avg', 'tmdb_vote_count', 'review_id', 'critic_id',
       'created_date', 'pub_date', 'content', 'publisher',
       'reviewer_rating_rotten', 'reviewer_rating_actual', 'reviewer rating',
       'tmeter', 'review_src_url', 'publication', 'review_object_type',
       'review_object_year', 'review_object_href', 'critic_name', 'Movie_ID'],
      dtype='object')


In [40]:
# ROI
# preview Budget and Revenue before calculating ROI
merged_df[['budget', 'revenue']].describe()

KeyError: "['revenue'] not in index"

In [None]:
# MONTH
# convert Release_Date to datetime
merged_df['Release_Date'] = pd.to_datetime(merged_df['Release_Date'], errors='coerce')
# preview converted dates
merged_df['Release_Date'].head(100)

In [None]:
# add markdown section explaining this part now exploring relationships between variables

In [None]:
# add code to bring in the cleaned data output from feature-engineering.ipynb

In [None]:
# Create a histogram of the 'Revenue' column
fig, ax = plt.subplots(figsize=(8,6))
ax.hist(df['Revenue'], bins=20, edgecolor='black')

# Customize x-axis labels
ax.set_xlabel('Revenue (in billions)')
ax.set_ylabel('Frequency')
ax.xaxis.set_major_formatter(ticker.FuncFormatter(lambda x, pos: f'{x/1e9:g}'))
ax.set_xlim(0, 3e9)

plt.title('Histogram of Revenue')
plt.show()

In [None]:
df['Revenue'].plot(kind='box')
plt.ylabel('Revenue (in millions)')
plt.title('Boxplot of Revenue')
plt.show()

In [None]:
x = df['Budget']
y = df['Revenue']

plt.scatter(x, y, alpha=0.5)
plt.title('Revenue vs. Budget')
plt.xlabel('Budget (in billions)')
plt.ylabel('Revenue (in billions)')
plt.show()

In [None]:
#impute 
median_runtime = df['Runtime'].median()
df.loc[df['Runtime'] == 0, 'Runtime'] = median_runtime
df.loc[df['Runtime'] > 200, 'Runtime'] = median_runtime

In [None]:
x = df['Runtime']
y = df['Revenue']

plt.scatter(x, y, alpha=0.5)
plt.title('Revenue vs. Runtime')
plt.xlabel('Runtime')
plt.ylabel('Revenue in Billions')
plt.show()

In [None]:
x = df['Vote_Average']
y = df['Revenue']

plt.scatter(x, y, alpha=0.5)
plt.title('Revenue vs. Vote_Average')
plt.xlabel('Vote_Average')
plt.ylabel('Revenue in Billions')
plt.show()

In [None]:
x = df['Vote_Count']
y = df['Revenue']

plt.scatter(x, y, alpha=0.5)
plt.title('Revenue vs. Vote_Count')
plt.xlabel('Vote_Count')
plt.ylabel('Revenue in Billions')
plt.show()

In [None]:
monthly_revenue = df.groupby('Month')['Revenue'].mean()

# Create a new DataFrame with the monthly labels and average revenues
monthlyrevenue = pd.DataFrame({
    'Month': monthly_revenue.index,
    'Average Revenue': monthly_revenue.values
})

In [None]:
# Define default blue color instead of the green color map
colors = 'C0'  # 'C0' is the default blue color in Matplotlib

# Create an ordered categorical data type for the 'Month' column
month_order = ['January', 'February', 'March', 'April', 'May', 'June', 'July',
               'August', 'September', 'October', 'November', 'December']
cat_dtype = pd.api.types.CategoricalDtype(categories=month_order, ordered=True)
monthlyrevenue['Month'] = monthlyrevenue['Month'].astype(cat_dtype)

# Sort the dataframe by the 'Month' column
monthlyrevenue = monthlyrevenue.sort_values('Month')

# Create a bar plot with monthly revenue and color the bars by revenue
plt.bar(monthlyrevenue['Month'], monthlyrevenue['Average Revenue'], color=colors)

# Customize the plot
plt.xlabel('Month')
plt.ylabel('Average Revenue ($)')
plt.title('Average Revenue by Month')
plt.xticks(rotation=45)

# Set the y-axis tick labels
tick_labels = ['${:,.0f}'.format(x) for x in plt.yticks()[0]]
plt.yticks(plt.yticks()[0], tick_labels)

# Display the plot
plt.show()

In [None]:
genre_revenue = df.groupby('Genre_1')['Revenue'].mean().reset_index()

# Create a bar plot with genre_1 and revenue
plt.bar(genre_revenue['Genre_1'], genre_revenue['Revenue'])
# Customize the plot
plt.xlabel('Genre')
plt.ylabel('Average Revenue (in Hundred Millions)')
plt.title('Average Revenue by Genre')
plt.xticks(rotation=45, ha='right')

# Display the plot
plt.show()

In [None]:
df['Vote_Average'].plot(kind='density')
plt.title('Distribution of Vote Average')
plt.xlabel('Vote Average')
plt.show()

In [None]:
x = df['Runtime']
y = df['Vote_Average']

plt.scatter(x, y, alpha=0.5)
plt.title('Vote Average vs. Runtime')
plt.xlabel('Runtime')
plt.ylabel('Vote Average')
plt.show()

In [None]:
x = df['Vote_Average']
y = df['Revenue']

plt.scatter(x, y, alpha=0.5)
plt.title('Revenue vs. Vote Average')
plt.xlabel('Vote_Average')
plt.ylabel('Revenue')
plt.show()

In [None]:
x = df['Vote_Average']
y = df['Vote_Count']

plt.scatter(x, y, alpha=0.5)
plt.title('Vote Count vs. Vote Average')
plt.xlabel('Vote Average')
plt.ylabel('Vote Count')
plt.show()

In [None]:
x = df['Vote_Average']
y = df['Reviewer_Rating']

plt.scatter(x, y, alpha=0.5)
plt.title('Reviewer Rating vs. Vote Average')
plt.xlabel('Vote Average')
plt.ylabel('Reviewer Rating')
plt.show()


In [None]:
df_filtered = df[df['Tomato_Meter'] >= 0]
# create the scatter plot using the filtered dataframe
x = df_filtered['Vote_Average']
y = df_filtered['Tomato_Meter']

plt.scatter(x, y, alpha=0.5)
plt.title('Tomato Meter vs. Vote Average')
plt.xlabel('Tomato_Meter')
plt.ylabel('Vote Average')
plt.show()

In [None]:
# Group the data by month and calculate the mean of the 'Vote Average' column
month_vote_avg = df.groupby('Month')['Vote_Average'].mean().reset_index()
plt.figure(figsize=(10, 5))
# Create a bar chart of vote average by month
plt.bar(month_vote_avg['Month'], month_vote_avg['Vote_Average'])
# Add data labels to the bars
for i, v in enumerate(month_vote_avg['Vote_Average']):
    plt.text(i, v, '{:.2f}'.format(v), ha='center', va='bottom')
# Customize the plot
plt.xlabel('Month')
plt.ylabel('Vote Average')
plt.title('Vote Average by Month')
plt.xticks(rotation=45)

# Display the plot
plt.show()

In [None]:
# Group the data by genre and calculate the mean of the 'Vote Average' column
genre_vote_avg = df.groupby('Genre_1')['Vote_Average'].mean().reset_index()
plt.figure(figsize=(10, 5))
# Create a bar chart of vote average by genre
plt.bar(genre_vote_avg['Genre_1'], genre_vote_avg['Vote_Average'])

# Add labels to the bars
for i, v in enumerate(genre_vote_avg['Vote_Average']):
    plt.text(i, v, '{:.2f}'.format(v), ha='center', va='bottom')

# Customize the plot
plt.xlabel('Genre')
plt.ylabel('Vote Average')
plt.title('Vote Average by Genre')
plt.xticks(rotation=60)

# Display the plot
plt.show()

In [None]:
x = df['Budget']
y = df['Vote_Count']

plt.scatter(x, y, alpha=0.5)
plt.title('Vote Count vs. Budget')
plt.xlabel('Budget (in billions)')
plt.ylabel('Vote Count')
plt.show()

In [None]:
x = df['Revenue']
y = df['Vote_Count']

plt.scatter(x, y, alpha=0.5)
plt.title('Vote Count vs. Revenue')
plt.xlabel('Revenue')
plt.ylabel('Vote Count')
plt.show()

In [None]:
x = df['Vote_Average']
y = df['Vote_Count']

plt.scatter(x, y, alpha=0.5)
plt.title('Vote Count vs. Vote Average')
plt.xlabel('Vote Average')
plt.ylabel('Vote Count')
plt.show()

In [None]:
x = df['Reviewer_Rating']
y = df['Vote_Count']

plt.scatter(x, y, alpha=0.5)
plt.title('Vote Count vs. Reviewer Rating')
plt.xlabel('Reviewer Rating')
plt.ylabel('Vote Count')
plt.show()

In [None]:
# Convert the 'Month' column to a categorical data type
month_order = ['January', 'February', 'March', 'April', 'May', 'June', 'July',
               'August', 'September', 'October', 'November', 'December']
cat_dtype = pd.api.types.CategoricalDtype(categories=month_order, ordered=True)
df['Month'] = df['Month'].astype(cat_dtype)

# Group the data by month and calculate the sum of the 'Vote Count' column
monthly_vote_count = df.groupby('Month', observed=True)['Vote_Count'].sum().reset_index()

# Create a bar chart of vote count by month
plt.bar(monthly_vote_count['Month'], monthly_vote_count['Vote_Count'])

# Customize the plot
plt.xlabel('Month')
plt.ylabel('Vote Count')
plt.title('Vote Count by Month')
plt.xticks(rotation=45)

# Display the plot
plt.show()


In [None]:
# Group the data by genre and calculate the sum of the 'Vote Count' column
genre_vote_count = df.groupby('Genre_1')['Vote_Count'].sum().reset_index()
plt.figure(figsize=(10, 6))
# Create a bar chart of vote count by genre
plt.bar(genre_vote_count['Genre_1'], genre_vote_count['Vote_Count'])

# Customize the plot
plt.xlabel('Genre')
plt.ylabel('Vote Count')
plt.title('Vote Count by Genre')
plt.xticks(rotation=90)

# Display the plot
plt.show()


In [None]:
df_cleaned = df[df['Release_Date'] != 'Not available']
print(df_cleaned['Release_Date'].dtype)
df_cleaned['Release_Month'] = pd.to_datetime(df_cleaned['Release_Date']).dt.month

df_cleaned_rating = df_cleaned.dropna(subset=['Reviewer_Rating'])
df_cleaned_month_rating = df_cleaned_rating.dropna(subset=['Release_Month'])

# Group df_cleaned by month and get mean of Reviewer_Rating 
month_rating = df_cleaned_month_rating.groupby(['Release_Month'])['Reviewer_Rating'].mean()
import calendar
month_abbr = list(calendar.month_abbr)[1:] # Define a list of month abbreviations


df_filtered = df_cleaned[df_cleaned['Reviewer_Rating'] > 0.6]
count_by_month = df_filtered['Release_Month'].value_counts()
count_by_month = count_by_month.sort_index()

# Define month labels
month_labels = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 
                'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

plt.bar(month_labels, count_by_month)
plt.title('Count of Movies with Reviewer_Rating > 0.6 by Release Month')
plt.xlabel('Month')
plt.ylabel('Count')
plt.show()

# September is traditionally known as the start of the "fall movie season," where studios release 
# prestige films in the lead up to award season. This is because many of the high-profile film festivals 
# (Venice Film Festival and the Toronto International Film Festival) take place in September. As a result, 
# studios may choose to release their films in September in order to coincide with these festivals and 
# generate buzz and positive reviews for their films. Additionally, the period from September to December 
# leads up to the award season, during which studios release their most highly regarded and critically 
# acclaimed films in the hopes of receiving nominations and awards. This background information could 
# explain the higher count of reviewer_ratings above .6 from September to December. 

In [None]:
# select only the numeric columns
numeric_cols = df.select_dtypes(include=['int64', 'float64'])

# compute the correlation matrix
corr_matrix = numeric_cols.corr()

print(corr_matrix)

In [None]:
# Create a histogram of Revenue by Month
df.groupby('Month', observed=True)['Revenue'].sum().plot(kind='bar', title='Revenue by Month', figsize=(8, 6))
plt.xlabel('Month')
plt.ylabel('Revenue')
plt.show()

In [None]:
# Create a histogram of the 'Budget' column
plt.figure(figsize=(8,6))
plt.title('Histogram of Budget')
plt.xlabel('Budget (in billions)')
plt.ylabel('Frequency')
plt.hist(df['Budget'], bins=20, edgecolor='black')

plt.show()

In [None]:
# Create a histogram of the 'Runtime' column
plt.figure(figsize=(8,6))
plt.title('Histogram of Runtime')
plt.xlabel('Runtime (in minutes)')
plt.ylabel('Frequency')
plt.hist(df['Runtime'], bins=20, edgecolor='black')

# Customize x-axis labels
plt.xticks(rotation=45, ha='right')
plt.ticklabel_format(style='plain', axis='x')

plt.show()

In [None]:
# add markdown cell summarizing the findings from the EDA