# Import packages and dataset

In [None]:
# essentials
import os
import warnings
from collections import Counter

import numpy as np
import pandas as pd

# visualizations
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns

# Warnings
warnings.filterwarnings("ignore")

In [None]:
# List the matplotlib style sheets available in this environment
style_names = plt.style.available
print(style_names)

In [None]:
# Load the movie dataset; the path is resolved relative to the working directory
df = pd.read_csv(filepath_or_buffer='Movies.csv')

In [None]:
# Summarise the raw frame (columns, dtypes, non-null counts) before any cleaning
df.info()

In [None]:
# Count rows that are exact repeats of an earlier row
is_duplicate_row = df.duplicated()
is_duplicate_row.sum()

In [None]:
# Missing values per column, in column order
missing_mask = df.isna()
missing_mask.sum()

In [None]:
# Missing values per column, most incomplete columns first
missing_per_column = df.isna().sum()
missing_per_column.sort_values(ascending=False)

# Clean and prepare the data

## Convert data types

In [None]:
# Convert text columns to numeric dtypes for the analysis below.
# The Runtime and Gross conversions are guarded so that re-running this cell
# (when the columns are already numeric) is a no-op instead of raising on `.str`.

# Release year: entries that cannot be parsed become NaN (errors='coerce')
df['Released_Year'] = pd.to_numeric(df['Released_Year'], errors='coerce')

# Runtime: drop the literal 'min' suffix so that, e.g., the average length can be computed
if not pd.api.types.is_numeric_dtype(df['Runtime']):
    df['Runtime'] = (
        df['Runtime']
        .str.replace('min', '', regex=False)  # literal match, independent of pandas' regex default
        .str.strip()
        .astype(int)
    )

# Gross: stored as a string with thousands separators (e.g., 1,000,000);
# remove the commas so it can be used in scatter plots, bar charts or other plots.
# Missing values stay NaN here.
if not pd.api.types.is_numeric_dtype(df['Gross']):
    df['Gross'] = df['Gross'].str.replace(',', '', regex=False).astype(float)

In [None]:
# Re-inspect dtypes and non-null counts after the type conversions
df.info()

## Handle missing values

In [None]:
# Drop rows without a usable release year (includes values coerced to NaN by
# pd.to_numeric earlier in the notebook).
df = df.dropna(subset=['Released_Year'])

# Fill the remaining gaps and assign the result back to `df`.
# The previous `df[col].fillna(..., inplace=True)` form operates on a temporary
# column object: it is deprecated in pandas 2.x and has no effect under
# Copy-on-Write, and the warning was hidden by the global warnings filter.
df = df.fillna({
    'Certificate': 'Not Rated',             # neutral category for unknown certificates
    'Meta_score': df['Meta_score'].mean(),  # average of the rows kept above
    'Gross': 0,                             # NOTE: these zeros show up as real values in describe() and plots
})

In [None]:
# Re-check missing values per column after cleaning
remaining_missing_mask = df.isna()
remaining_missing_mask.sum()

In [None]:
# Same check after cleaning, largest remaining gaps first
remaining_missing = df.isna().sum()
remaining_missing.sort_values(ascending=False)

# Explore the data

## Descriptive Statistics and Value Counts

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe()

In [None]:
# How many movies carry each certificate
certificate_column = df['Certificate']
certificate_column.value_counts()

In [None]:
# Ten most frequent values of the raw Genre string (not split into individual genres)
genre_string_counts = df['Genre'].value_counts()
genre_string_counts.head(10)

In [None]:
# Movie count per release year, listing the 20 latest years first
movies_by_year = df['Released_Year'].value_counts()
movies_by_year.sort_index(ascending=False).head(20)

In [None]:
# plt.figure(figsize=(12, 5))
# df['Released_Year'].value_counts().sort_index().plot(kind='line')
# plt.title("Number of Movies released per Year")
# plt.xlabel("Year")
# plt.ylabel("Number of movies")
# plt.grid(True)
#
# # Save the picture before showing the plot
# plt.savefig("insight_movies_per_year.png", dpi=300, bbox_inches='tight')
#
# plt.show()
# plt.close()

In [None]:
# Personalized version: number of movies per release year as a purple line chart
os.makedirs("assets", exist_ok=True)  # savefig does not create missing folders

movies_per_year = df['Released_Year'].value_counts().sort_index()

# Create exactly one figure; a second plt.figure() call would leave an empty
# figure behind that is displayed blank and never closed.
fig, ax = plt.subplots(figsize=(12, 5))
movies_per_year.plot(
    kind='line',
    ax=ax,
    color='#A020F0',      # Purple line
    marker='o',
    markersize=4,
    linewidth=2,
)

ax.set_title("Number of Movies Released per Year", fontsize=16, color='black')
ax.set_xlabel("Year", fontsize=12, color='black')
ax.set_ylabel("Number of Movies", fontsize=12, color='black')
ax.tick_params(axis='both', labelsize=10, labelcolor='black')
ax.grid(True, linestyle='--', alpha=0.6)

# Save to the assets folder before showing the figure
fig.savefig("assets/insight1_movies_per_year.png", dpi=300, bbox_inches='tight')
plt.show()
plt.close(fig)

The number of movies in this dataset increases with release year, peaking around the 2000s, then declines slightly toward 2020.

In [None]:
# Mean IMDB rating for each certificate, highest first
mean_rating_by_certificate = df.groupby('Certificate')['IMDB_Rating'].mean()
mean_rating_by_certificate.sort_values(ascending=False)

## Compare average IMDB ratings per certificate

In [None]:
# Rank certificates by their average IMDB rating (descending)
certificate_groups = df.groupby('Certificate')
certificate_groups['IMDB_Rating'].mean().sort_values(ascending=False)

In [None]:
# Same ranking as a two-column frame so every certificate is listed
rating_by_certificate = df.groupby('Certificate')['IMDB_Rating'].mean()
rating_by_certificate.sort_values(ascending=False).reset_index()

In [None]:
# plt.figure(figsize=(10, 6))
# df.groupby('Certificate')['IMDB_Rating'].mean().sort_values(ascending=False).plot(kind='bar', color='skyblue')
#
# plt.title("Average IMDB Rating by Movie Certificate")
# plt.xlabel("Certificate")
# plt.ylabel("Average IMDB Rating")
# plt.xticks(rotation=45)
# plt.grid(axis='y')
#
# # Save before show
# plt.savefig("insight_imdb_vs_certificate.png", dpi=300, bbox_inches='tight')
# plt.show()
# plt.close()

In [None]:
# Personalized version: average IMDB rating per certificate as purple bars
os.makedirs("assets", exist_ok=True)  # savefig does not create missing folders

rating_by_certificate = (
    df.groupby('Certificate')['IMDB_Rating']
    .mean()
    .sort_values(ascending=False)
)

fig, ax = plt.subplots(figsize=(10, 6))
rating_by_certificate.plot(
    kind='bar',
    ax=ax,
    color='#A020F0',      # Purple bars
    edgecolor='black',
)

ax.set_title("Average IMDB Rating by Movie Certificate", color='black')
ax.set_xlabel("Certificate", color='black')
ax.set_ylabel("Average IMDB Rating", color='black')
ax.tick_params(axis='x', labelrotation=45, labelcolor='black')
ax.tick_params(axis='y', labelcolor='black')
ax.grid(axis='y')

# Saved under assets/ like the other insight figures (was written to the
# working directory, unlike insight1/3/4/5).
fig.savefig("assets/insight2_imdb_vs_certificate.png", dpi=300, bbox_inches='tight')
plt.show()
plt.close(fig)

## Top 10 most frequent actor appearances

In [None]:
# Combine the four star columns into one flat array of names (row by row)
all_actors = df[['Star1', 'Star2', 'Star3', 'Star4']].to_numpy().ravel()
# Ignore missing entries so NaN is never counted as an "actor"
all_actors = all_actors[pd.notna(all_actors)]

# Count appearances and keep the ten most frequent names
actor_counts = Counter(all_actors)
top_actors = dict(actor_counts.most_common(10))

# Plot
os.makedirs("assets", exist_ok=True)  # savefig does not create missing folders

fig, ax = plt.subplots(figsize=(10, 5))
ax.bar(
    list(top_actors.keys()),
    list(top_actors.values()),
    color='#A020F0',
    edgecolor='black',
)

ax.set_title("Top 10 Most Frequent Actors", fontsize=16, color='black')
ax.set_xlabel("Actor", fontsize=12, color='black')
ax.set_ylabel("Number of Appearances", fontsize=12, color='black')
ax.tick_params(axis='x', labelrotation=45, labelsize=10, labelcolor='black')
ax.tick_params(axis='y', labelsize=10, labelcolor='black')
ax.grid(axis='y', linestyle='--', alpha=0.6)

fig.tight_layout()
fig.savefig("assets/insight3_top_actors.png", dpi=300, bbox_inches='tight')
plt.show()
plt.close(fig)

## Top 10 most common genres

In [None]:
# Count individual genres: each Genre value is a ', '-separated list,
# so split it and count every genre on its own
genre_split = df['Genre'].dropna().str.split(', ')
flat_genres = [genre for sublist in genre_split for genre in sublist]
genre_counts = Counter(flat_genres)
top_genres = dict(genre_counts.most_common(10))

# Plot horizontal bar chart
os.makedirs("assets", exist_ok=True)  # savefig does not create missing folders

fig, ax = plt.subplots(figsize=(10, 5))
ax.barh(
    list(top_genres.keys()),
    list(top_genres.values()),
    color='#A020F0',
    edgecolor='black',
)

ax.set_title("Top 10 Most Common Genres", fontsize=16, color='black')
ax.set_xlabel("Number of Movies", fontsize=12, color='black')
ax.set_ylabel("Genre", fontsize=12, color='black')
ax.tick_params(axis='both', labelsize=10, labelcolor='black')
ax.invert_yaxis()  # most common genre at the top
ax.grid(axis='x', linestyle='--', alpha=0.6)

fig.tight_layout()
fig.savefig("assets/insight4_top_genres.png", dpi=300, bbox_inches='tight')
plt.show()
plt.close(fig)

## Rating vs. Gross

In [None]:
# Scatter of IMDB rating against gross revenue; colour encodes Meta_score and
# marker area scales with the number of votes.
# (matplotlib.pyplot is already imported as `plt` in the import cell.)
plt.style.use('default')  # Clean white background

os.makedirs("assets", exist_ok=True)  # savefig does not create missing folders

VOTES_PER_POINT = 25000  # divisor that turns vote counts into a readable dot size

fig, ax = plt.subplots(figsize=(12, 6))
scatter = ax.scatter(
    df['IMDB_Rating'],
    df['Gross'],  # NOTE: missing Gross was filled with 0 during cleaning, so those rows sit on the x-axis
    c=df['Meta_score'],
    s=df['No_of_Votes'] / VOTES_PER_POINT,  # Adjust dot size
    cmap='Purples',
    edgecolor='black',
    alpha=0.7,
)

# Bind the colorbar to this scatter explicitly instead of relying on pyplot's
# "current image" state
fig.colorbar(scatter, ax=ax, label='Meta Score')
ax.set_xlabel("IMDB Rating", color='black')
ax.set_ylabel("Gross Revenue", color='black')
ax.set_title("IMDB Rating vs. Gross Revenue\n(Meta Score & Number of Votes)", fontsize=14, color='black')

ax.tick_params(axis='both', labelcolor='black')
ax.grid(True, linestyle='--', alpha=0.6)

fig.tight_layout()
fig.savefig("assets/insight5_rating_vs_gross.png", dpi=300, bbox_inches='tight')
plt.show()
plt.close(fig)