In [5]:
# Packages required for exploratory data analysis: pandas, numpy, scipy, and matplotlib

# Data manipulation
import pandas as pd
import numpy as np
from scipy.stats import skew
# Visualization
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
import matplotlib.ticker as ticker

In [None]:
# LOAD THE RAW CSV FILES AND MERGE THEM
# The first column of each file is used as the row index.
movie_info = pd.read_csv('../data/raw/movie_info.csv', index_col=0)
critic_reviews = pd.read_csv('../data/raw/CriticReviews_2018-2020.csv', index_col=0)

# Outer-join on the shared 'review_object_title' column so that rows found in
# only one of the two sources are kept; any other column name present in both
# frames receives the '_movie' / '_review' suffix.
merged_df = movie_info.merge(
    critic_reviews,
    how='outer',
    on='review_object_title',
    suffixes=('_movie', '_review'),
)

merged_df.head(100)

In [None]:
# RENAME columns, BUILD Movie_ID, DROP tmdbid
# One chain, three steps:
#   1. rename 'Unnamed: 0' -> 'Movie_ID' and 'review_object_title' -> 'Movie_Title'
#      (rename silently skips labels that are not present)
#   2. overwrite Movie_ID with the 1-based group number of each Movie_Title
#   3. drop 'tmdbid', which is superseded by our own Movie_ID
merged_df = (
    merged_df
    .rename(columns={'Unnamed: 0': 'Movie_ID', 'review_object_title': 'Movie_Title'})
    .assign(Movie_ID=lambda frame: frame.groupby('Movie_Title').ngroup() + 1)
    .drop(columns=['tmdbid'])
)

In [None]:
# Add a column recording what percentage of each row is missing.
# A cell counts as missing when it is NaN/None or holds the placeholder
# string 'Not available'.

# Work on the data columns only: if this cell is re-run, the helper column
# NA_Percent already exists and must not be counted as a data column.
data_cols = merged_df.drop(columns='NA_Percent', errors='ignore')

# Boolean mask, True where a cell is missing
na_mask = data_cols.isna() | data_cols.isin(['Not available'])

# Number of missing cells in each row
na_counts = na_mask.sum(axis=1)

# Missing cells as a percentage of the number of data columns
na_percent = na_counts / na_mask.shape[1] * 100

# Store the result on the DataFrame
merged_df['NA_Percent'] = na_percent

In [None]:
# Display the rows ordered from most to least incomplete.
# The sorted copy is only shown; merged_df itself is not reassigned.
merged_df.sort_values(by='NA_Percent', ascending=False)

In [None]:
# Remove rows in which more than 50% of the values are missing. Such rows do
# not carry enough information to be useful in a sentiment analysis.
# Original note: none of these rows contain the review or the reviewer rating
# needed for the analysis.
keep_rows = merged_df['NA_Percent'].le(50)
merged_df = merged_df.loc[keep_rows]
merged_df
# Original run: 319 rows were removed.

In [None]:
# Drop the helper NA_Percent column now that the row filter has been applied.
# errors='ignore' keeps this cell re-runnable after the column is gone.
merged_df = merged_df.drop(columns='NA_Percent', errors='ignore')

# CHECK FOR NAs in columns
# Percentage of NaN values for each column
col_na_percentage = merged_df.isna().sum() / merged_df.shape[0] * 100

# Show the percentages as their own summary table. They are deliberately NOT
# written into merged_df: a row of percentages stored among the data rows
# would be treated as a real observation by every later statistic (NA %,
# skew, value counts) and would change the dtype of non-float columns.
# NOTE(review): if any later cell removes row 0 on the assumption that it
# holds these percentages, that step must be removed as well.
col_na_percentage.round(2).to_frame(name='NA %')

In [None]:
# EDA — Budget
budget_col = merged_df['budget']

# Percentage of rows without a budget value
budget_na_pct = budget_col.isna().mean() * 100
print(f"Budget NA %: {budget_na_pct:.2f}%")

# Skewness of the budgets that are present (scipy.stats.skew)
budget_skew = skew(budget_col.dropna())
print(f"Skew of budget: {budget_skew:.2f}")

# EDA conclusion:
# skew of about 2.7, i.e. right-skewed - use median imputation

Budget NA %: 8.31%
Skew of budget: 2.73


In [None]:
# EDA – Runtime
runtime_col = merged_df['runtime']

# Percentage of rows without a runtime value
runtime_na_pct = runtime_col.isna().mean() * 100
print(f"Runtime NA %: {runtime_na_pct:.2f}%")

# Skewness of the runtimes that are present (scipy.stats.skew)
runtime_skew = skew(runtime_col.dropna())
print(f"Skew of runtime: {runtime_skew:.2f}")

# EDA conclusion:
# skew of about 3.45, i.e. right-skewed - use median imputation

Runtime NA %: 8.43%
Skew of runtime: 3.45


In [None]:
# EDA – Original Language
language_col = merged_df['original_language']

# Percentage of rows without an original_language value
lang_na_pct = language_col.isna().mean() * 100
print(f"Original Language NA %: {lang_na_pct:.2f}%")

# EDA conclusion:
# roughly 8% missing - fill with 'Not available'
# the values are abbreviations - map them to full language names

In [None]:
# EDA – Revenue
revenue_col = merged_df['tmdb_revenue']

# Percentage of rows without a revenue value
revenue_na_pct = revenue_col.isna().mean() * 100
print(f"Revenue NA %: {revenue_na_pct:.2f}%")

# Skewness of the revenues that are present (scipy.stats.skew)
revenue_skew = skew(revenue_col.dropna())
print(f"Skew of Revenue: {revenue_skew:.2f}")

# EDA conclusion:
# roughly 8% missing or zero values
# highly right-skewed (skew > 4.0) - use median imputation

In [None]:
# EDA – Release_Date
release_col = merged_df['release_date']

# Percentage of rows without a release date
release_na_pct = release_col.isna().mean() * 100
print(f"Release_Date NA %: {release_na_pct:.2f}%")

# EDA conclusion:
# roughly 8% missing values
# impute with 'Not available'

In [None]:
# EDA – Popularity
popularity_col = merged_df['tmdb_popularity']

# Percentage of rows without a popularity value
popularity_na_pct = popularity_col.isna().mean() * 100
print(f"Popularity NA %: {popularity_na_pct:.2f}%")

# Shape of the distribution: skewness of the values that are present
popularity_skew = skew(popularity_col.dropna())
print(f"Skewness: {popularity_skew:.2f}")

# EDA conclusion:
# roughly 8% missing values
# heavily skewed (about 17) - use median imputation

In [None]:
# EDA – Vote_Average
vote_avg_col = merged_df['tmdb_vote_avg']

# Percentage of rows without a vote average
vote_avg_na_pct = vote_avg_col.isna().mean() * 100
print(f"Vote_Average NA %: {vote_avg_na_pct:.2f}%")

# Shape of the distribution: skewness of the values that are present
vote_avg_skew = skew(vote_avg_col.dropna())
print(f"Skewness: {vote_avg_skew:.2f}")

# EDA conclusion:
# roughly 8% missing values
# left-skewed (about -2.79) - use median imputation

In [None]:
# EDA – Vote_Count
vote_count_col = merged_df['tmdb_vote_count']

# Percentage of rows without a vote count
vote_count_na_pct = vote_count_col.isna().mean() * 100
print(f"Vote_Count NA %: {vote_count_na_pct:.2f}%")

# Shape of the distribution: skewness of the values that are present
vote_count_skew = skew(vote_count_col.dropna())
print(f"Skewness: {vote_count_skew:.2f}")

# EDA conclusion:
# roughly 8% missing values
# right-skewed (about 3.08) - use median imputation

In [7]:
# EDA - Review_ID

# Number of rows whose review_id is missing.
# NOTE(review): the recorded output of this cell is 329, which does not match
# the original "0% missing" remark - confirm after a clean Restart & Run All.
merged_df['review_id'].isnull().sum()


np.int64(329)

In [None]:
# EDA - Critic_ID

# Number of rows whose critic_id is missing (original note: 0% missing)
merged_df['critic_id'].isnull().sum()

In [8]:
# EDA - Created_Date

# Number of rows whose created_date is missing.
# NOTE(review): the recorded output of this cell is 329, which does not match
# the original "0% missing" remark - confirm after a clean Restart & Run All.
merged_df['created_date'].isnull().sum()

np.int64(329)

In [None]:
# EDA - Pub_Date

# Number of rows whose pub_date is missing (original note: 0% missing)
merged_df['pub_date'].isnull().sum()

In [None]:
# CONTENT
# Report the share of missing review texts, then preview the column.
# The NA rate is printed explicitly: a notebook cell only displays its last
# expression, so a bare `.isna().mean()` above `.head()` is never shown.
# Original note: 0.01% NA - TODO confirm whether that figure was a fraction
# or a percentage.
content_na_pct = merged_df['content'].isna().mean() * 100
print(f"Content NA %: {content_na_pct:.2f}%")

merged_df['content'].head(100)

In [None]:
# PUBLISHER
# Report the share of missing publisher values, then preview the column.
# The NA rate is printed explicitly: a notebook cell only displays its last
# expression, so a bare `.isna().mean()` above `.head()` is never shown.
# Original note: 0% NA.
publisher_na_pct = merged_df['publisher'].isna().mean() * 100
print(f"Publisher NA %: {publisher_na_pct:.2f}%")

merged_df['publisher'].head(100)

In [None]:
# REVIEWER RATING ROTTEN
# Tally how often each distinct value occurs (missing values are not counted).
# Original notes: 0% NA; every value is TRUE, so the column can be dropped.
merged_df['reviewer_rating_rotten'].value_counts(dropna=True)

In [None]:
# REVIEW SRC URL, REVIEW OBJECT TYPE, REVIEW OBJECT HREF
# Original note: 0% NA.
# Look at the first 100 rows of these columns before deciding whether to drop them.
link_cols = ['review_src_url', 'review_object_type', 'review_object_href']
merged_df[link_cols].head(n=100)

In [None]:
# PUBLICATION
# Original note: 0% NA.
# Look at the first 100 values before converting the column.
merged_df['publication'].head(n=100)

In [None]:
# REVIEW OBJECT YEAR
# Original note: 0% NA.
# Look at the first 100 values before converting the column.
merged_df['review_object_year'].head(n=100)

In [None]:
# CRITIC NAME
# Original note: 0% NA.
# Look at the first 100 values before converting the column.
merged_df['critic_name'].head(n=100)

In [None]:
# ROI
# Summary statistics for budget and revenue before calculating ROI.
# The columns are addressed by the names used in the EDA cells above
# ('budget', 'tmdb_revenue'). No cell above creates 'Budget' or 'Revenue',
# so those labels raise KeyError on a fresh Restart & Run All.
merged_df[['budget', 'tmdb_revenue']].describe()

In [None]:
# MONTH
# Convert release_date to datetime; values that cannot be parsed become NaT
# (errors='coerce').
# The column is addressed as 'release_date', the name used in the Release_Date
# EDA cell above. No cell above creates a 'Release_Date' column, so that label
# raises KeyError on a fresh Restart & Run All.
# NOTE(review): any later cell that expects 'Release_Date' should read
# 'release_date' instead.
merged_df['release_date'] = pd.to_datetime(merged_df['release_date'], errors='coerce')

# Preview the converted dates
merged_df['release_date'].head(100)