# 🎬 Movie Rating Analysis
### A Data-Driven Exploration of the TMDB Movie Dataset



##### 📦 Importing Required Libraries

In [None]:
import numpy as numpy
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

##### 🎨 Setting Plot Style and Palette

In [None]:
# Configure the global look of every figure drawn below.
# NOTE(review): all three calls write to matplotlib's global rcParams, so their order matters.
# `plt.style.use('ggplot')` runs last and presumably replaces the colour cycle and the
# grid/background chosen by the two seaborn calls — confirm which look is intended.
sns.set_palette("Set2")
sns.set_style("whitegrid")
plt.style.use('ggplot')

##### 📂 Loading the TMDB Datasets


In [None]:
# Load the two TMDB CSV exports, pairing each variable with the file of the same name:
# `movies` holds the movie metadata file and `credits` holds the credits file.
# (With the paths the other way round, every preview of `movies` shows credits data
# and vice versa.)
movies = pd.read_csv('/content/tmdb_5000_movies.csv')
credits = pd.read_csv('/content/tmdb_5000_credits.csv')

FileNotFoundError: [Errno 2] No such file or directory: '/content/tmdb_5000_credits.csv'

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive.
# NOTE(review): the CSV files in this notebook are read from /content/, not from the
# mounted drive, and the load cell sits above this one — confirm whether the files
# should instead be read from /content/drive/... with this cell moved before the load.
from google.colab import drive
drive.mount('/content/drive')

##### 🔍 Previewing Movies Dataset

In [None]:
# Show the first three rows of `movies` as a quick sanity check of the load.
movies.head(3)

##### 👥 Previewing Credits Dataset

In [None]:
# Show the first four rows of `credits` as a quick sanity check of the load.
credits.head(4)


##### 🔗 Merging Movies and Credits Data

In [None]:
def _with_movie_id(frame):
    """Return `frame` with its TMDB id column named `movie_id`.

    Assumes the id column is called either `movie_id` or `id` — TODO confirm
    against the CSV headers.
    """
    if 'movie_id' in frame.columns:
        return frame
    return frame.rename(columns={'id': 'movie_id'})


# Join the two tables on the movie id rather than on `title`. Titles are not
# guaranteed to be unique, and a title join pairs every film with the rows of every
# other film sharing its name, which adds mismatched duplicate rows to the result.
# `title` is dropped from the right-hand table so the result keeps a single `title`
# column instead of `title_x` / `title_y`.
movies = _with_movie_id(movies).merge(
    _with_movie_id(credits).drop(columns=['title']),
    on='movie_id',
)

##### 🔍 Previewing Merged Dataset

In [None]:
# Show one row of the merged frame to check which columns the merge produced.
movies.head(1)

##### 🧾 Dataset Information Overview

In [None]:
# Print column names, dtypes and non-null counts of the merged frame.
movies.info()

##### ✏️ Renaming Columns for Clarity

In [None]:
# Rename the merge-suffixed id column `id_x` to `movie_id`.
# NOTE(review): `id_x` only exists if both merged frames carried an `id` column.
# `rename` silently ignores labels that are absent, so this line may be a no-op —
# confirm with `movies.columns`.
movies = movies.rename(columns={'id_x': 'movie_id'})

##### 📊 Selecting Relevant Columns

In [None]:

# Keep only the columns the analysis below works with.
analysis_columns = [
    'movie_id', 'title', 'release_date', 'runtime', 'vote_average', 'vote_count',
    'revenue', 'budget', 'popularity', 'genres', 'cast', 'crew',
]
# `.copy()` makes `analysis_df` an independent frame. Without it, later column
# assignments write into a slice of `movies` and pandas emits SettingWithCopyWarning.
analysis_df = movies[analysis_columns].copy()

##### 📅 Converting and Extracting Release Year

In [None]:
# Parse the release dates (values that cannot be parsed become NaT) and derive the year.
release_dates = pd.to_datetime(analysis_df['release_date'], errors='coerce')
# `assign` builds a new frame instead of writing into `analysis_df` in place, which
# avoids SettingWithCopyWarning on a sliced frame. `release_date` keeps its position
# and `release_year` is appended as the last column.
analysis_df = analysis_df.assign(
    release_date=release_dates,
    release_year=release_dates.dt.year,
)

##### 🔎 Previewing Analysis DataFrame

In [None]:
# Show the first five rows of the analysis frame.
analysis_df.head()

##### ❓ Exploratory Data Analysis Questions



Each numbered question below is answered by a short code cell that works on `analysis_df`.

**1. Display Titles of Movies Having Runtime ≥ 180 Minutes**

In [None]:
# Films running three hours or longer, longest first; only the ten longest are displayed.
is_long = analysis_df['runtime'] >= 180
long_movies = (
    analysis_df.loc[is_long, ['title', 'runtime']]
    .sort_values('runtime', ascending=False)
)
print("Top Movies with Runtime >= 180 minutes")
display(long_movies.head(10))

**2. In Which Year Was the Highest Average Rating?**

In [None]:
# Mean rating for every release year, in chronological order.
avg_rating_year = analysis_df.groupby('release_year')['vote_average'].mean().sort_index(ascending=True)
# The year column is float whenever a release date is missing, which labels the bars
# "2009.0" and prints the answer the same way. groupby has already dropped missing
# years, so casting the index to int is safe and gives whole-number labels.
avg_rating_year.index = avg_rating_year.index.astype(int)

avg_rating_year.plot(kind='bar', figsize=(15,5), title='Average Movie Rating by Year', color='skyblue')
plt.ylabel('Average Rating')
plt.xlabel('Year')
plt.grid(True)
plt.show()

# idxmax returns the index label (the year) of the largest mean rating.
print("Year with Highest Average Rating:", avg_rating_year.idxmax())


**3. Display Top 10 Longest Movies**

In [None]:
# Rank all films by runtime and keep the ten longest.
runtime_ranking = analysis_df[['title', 'runtime']].sort_values(by='runtime', ascending=False)
longest_movies = runtime_ranking.head(10)

# Horizontal bars; the y axis is flipped so the longest film sits at the top.
ax = longest_movies.plot(
    kind='barh',
    x='title',
    y='runtime',
    color='orange',
    figsize=(10,6),
    title='Top 10 Longest Movies',
)
ax.set_xlabel('Runtime (minutes)')
ax.invert_yaxis()
plt.show()

display(longest_movies)


**4. In Which Year Was the Highest Average Revenue?**

In [None]:
# Mean revenue for every release year, in chronological order.
avg_revenue_year = analysis_df.groupby('release_year')['revenue'].mean().sort_index(ascending=True)
# The year column is float whenever a release date is missing, which labels the bars
# "2009.0" and prints the answer the same way. groupby has already dropped missing
# years, so casting the index to int is safe and gives whole-number labels.
avg_revenue_year.index = avg_revenue_year.index.astype(int)

avg_revenue_year.plot(kind='bar', figsize=(15,5), title='Average Revenue by Year', color='lightgreen')
plt.ylabel('Average Revenue')
plt.xlabel('Year')
plt.grid(True)
plt.show()

# idxmax returns the index label (the year) of the largest mean revenue.
print("Year with Highest Average Revenue:", avg_revenue_year.idxmax())




**5. Display Number of Movies Released Per Year**

In [None]:
# Number of films per release year, ordered by year.
movies_per_year = analysis_df['release_year'].value_counts().sort_index()

# Line chart of the yearly counts, one marker per year.
fig, ax = plt.subplots(figsize=(14,5))
sns.lineplot(x=movies_per_year.index, y=movies_per_year.values, marker='o', ax=ax)
ax.set_title('Number of Movies Released Per Year')
ax.set_xlabel('Year')
ax.set_ylabel('Number of Movies')
ax.grid(True)
plt.show()


**6. Find Most Popular Movie Title (Highest Revenue)**


In [None]:
# Rank films by revenue and keep the single highest-grossing one.
revenue_ranking = analysis_df[['title', 'revenue']].sort_values(by='revenue', ascending=False)
top_revenue_movie = revenue_ranking.iloc[:1]
print("Most Popular Movie by Revenue:")
display(top_revenue_movie)


**7. Display Top 10 Highest Rated Movie Titles and Their Directors**

In [None]:
# Flatten director list
import ast

# Function to extract director name from crew
def get_director(crew_str):
    """Return the name of the first crew member whose job is 'Director'.

    Parameters
    ----------
    crew_str : str
        The crew list serialised as text, either as JSON or as a Python literal:
        a list of dicts carrying at least the ``job`` and ``name`` keys.

    Returns
    -------
    str or None
        The director's name, or None when the value is not a string, cannot be
        parsed, is not a list, or lists no director.
    """
    import json  # local import: only this helper needs it

    if not isinstance(crew_str, str):
        return None  # e.g. NaN for a missing crew cell

    try:
        # Try JSON first so that null / true / false inside the crew data are understood.
        crew_list = json.loads(crew_str)
    except ValueError:
        try:
            # Fall back to Python-literal syntax (single-quoted strings, None, ...).
            crew_list = ast.literal_eval(crew_str)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            return None

    if not isinstance(crew_list, list):
        return None

    for person in crew_list:
        # Skip malformed entries instead of abandoning the whole crew list.
        if isinstance(person, dict) and person.get('job') == 'Director':
            return person.get('name')
    return None

# Build a working frame of title, rating and crew, with the director derived from crew.
top_rated = analysis_df[['title', 'vote_average', 'crew']].assign(
    director=lambda frame: frame['crew'].apply(get_director)
)

# Ten best-rated films, highest rating first.
rating_ranking = top_rated.sort_values(by='vote_average', ascending=False)
top_10_rated = rating_ranking.head(10)

print("Top 10 Highest Rated Movies and Their Directors:")
display(top_10_rated[['title', 'vote_average', 'director']])



**8. Display Top 10 Highest Revenue Movie Titles**

In [None]:
# Ten highest-grossing films, largest revenue first.
revenue_sorted = analysis_df[['title', 'revenue']].sort_values(by='revenue', ascending=False)
top_10_revenue = revenue_sorted.head(10)

# Horizontal bar chart of those ten films.
fig, ax = plt.subplots(figsize=(10,6))
sns.barplot(data=top_10_revenue, x='revenue', y='title', palette='viridis', ax=ax)
ax.set_title('Top 10 Highest Revenue Movies')
ax.set_xlabel('Revenue')
ax.set_ylabel('Movie Title')
fig.tight_layout()
plt.show()

display(top_10_revenue)


**9. Find Average Rating of Movies Year Wise**

In [None]:
# Mean rating per release year.
ratings_grouped_by_year = analysis_df.groupby('release_year')['vote_average']
yearly_avg_rating = ratings_grouped_by_year.mean()

# Line chart of the yearly means, one marker per year.
fig, ax = plt.subplots(figsize=(14,5))
sns.lineplot(
    x=yearly_avg_rating.index,
    y=yearly_avg_rating.values,
    marker='o',
    color='teal',
    ax=ax,
)
ax.set_title('Year-wise Average Movie Ratings')
ax.set_xlabel('Year')
ax.set_ylabel('Average Rating')
ax.grid(True)
plt.show()


**10. In Which Year Was the Highest Average Revenue Recorded?**

In [None]:
# Mean revenue per release year, then the single year with the largest mean.
revenue_by_year = analysis_df.groupby('release_year')['revenue'].mean()
highest_avg_revenue = revenue_by_year.sort_values(ascending=False).iloc[:1]
print("Year with Highest Average Revenue:")
display(highest_avg_revenue)


**11. Top 10 Directors by Average Movie Rating**

In [None]:
from collections import defaultdict
import ast

# Ratings collected per director name: {name: [vote_average, ...]}
director_ratings = defaultdict(list)

# Walk the crew and rating columns side by side (no need to build a Series per row).
for crew_text, rating in zip(analysis_df['crew'], analysis_df['vote_average']):
    try:
        crew_list = ast.literal_eval(crew_text)
        for person in crew_list:
            if person.get('job') == 'Director':
                director_ratings[person['name']].append(rating)
    except (ValueError, SyntaxError, TypeError, AttributeError, KeyError):
        # Best effort: a crew cell that cannot be parsed, or an entry that is not a
        # dict with a 'name', ends processing of that film only. Only these expected
        # failures are caught, so interrupts and real bugs still surface.
        continue

# One row per director with the plain mean of that director's film ratings.
avg_director_rating = pd.DataFrame(
    [(name, sum(ratings) / len(ratings)) for name, ratings in director_ratings.items()],
    columns=['director', 'average_rating'],
)

# Sorting and showing top 10
top_directors = avg_director_rating.sort_values(by='average_rating', ascending=False).head(10)

print("Top 10 Directors by Average Rating:")
display(top_directors)



**12. Display Top 10 Lengthy Movies**

In [None]:
# Ten longest films, longest first.
by_runtime = analysis_df[['title', 'runtime']].sort_values(by='runtime', ascending=False)
top_long_movies = by_runtime.head(10)

# Horizontal bar chart of those ten films.
fig, ax = plt.subplots(figsize=(10,6))
sns.barplot(data=top_long_movies, x='runtime', y='title', palette='mako', ax=ax)
ax.set_title('Top 10 Longest Movies')
ax.set_xlabel('Runtime (minutes)')
ax.set_ylabel('Movie Title')
fig.tight_layout()
plt.show()

display(top_long_movies)


**13. Does Rating Affect the Revenue? (Scatter Plot)**

In [None]:
# One semi-transparent point per film: rating on x, revenue on y.
fig, ax = plt.subplots(figsize=(8,6))
sns.scatterplot(data=analysis_df, x='vote_average', y='revenue', alpha=0.6, ax=ax)
ax.set_title('Revenue vs. Rating')
ax.set_xlabel('Vote Average')
ax.set_ylabel('Revenue')
fig.tight_layout()
plt.show()



**14. Classify Movies Based on Ratings [Excellent, Good, Average]**

In [None]:
def classify_rating(rating):
    """Map a numeric rating to 'Excellent' (>= 8), 'Good' (>= 6) or 'Average' (otherwise).

    A value that satisfies neither threshold comparison (including NaN) is 'Average'.
    """
    # Thresholds are checked from the highest band downwards; the first match wins.
    for floor, label in ((8, 'Excellent'), (6, 'Good')):
        if rating >= floor:
            return label
    return 'Average'

# Label every film with its rating band. `assign` returns a new frame rather than
# writing a column into `analysis_df` in place, which avoids SettingWithCopyWarning
# when `analysis_df` is a slice of another frame.
analysis_df = analysis_df.assign(
    rating_category=analysis_df['vote_average'].apply(classify_rating)
)

# Films per band, most common band first.
category_counts = analysis_df['rating_category'].value_counts()

# Pie chart of the share of films in each band.
plt.figure(figsize=(6,6))
plt.pie(category_counts, labels=category_counts.index, autopct='%1.1f%%', startangle=140, colors=['#76c7c0','#f4a261','#e76f51'])
plt.title('Rating Classification of Movies')
plt.tight_layout()
plt.show()

print(category_counts)


**15. Count Number of Action Movies**

In [None]:
def _is_action(genres):
    """Return True when a film's genres include 'Action'.

    Handles the raw text form of the `genres` cell and the parsed form (a list of
    dicts with a 'name' key), so the count stays correct whether or not a later
    cell has already converted the column to lists. Any other value (e.g. NaN)
    counts as not Action.
    """
    if isinstance(genres, str):
        return 'Action' in genres
    if isinstance(genres, list):
        return any(isinstance(g, dict) and g.get('name') == 'Action' for g in genres)
    return False


action_movies_count = analysis_df['genres'].apply(_is_action).sum()
print(f"Number of Action Movies: {action_movies_count}")


**16. List All Unique Movie Genres**

In [None]:
# Build and print a one-column table of every distinct genre name, sorted alphabetically.

import pandas as pd
import ast

# Collect every genre name that appears in any film (duplicates included for now).
all_genres = []
for raw_genres in analysis_df['genres']:
    if isinstance(raw_genres, str):
        # Text cell: parse it; cells that do not parse are skipped.
        try:
            parsed = ast.literal_eval(raw_genres)
        except (ValueError, SyntaxError):
            continue
        candidates = parsed if isinstance(parsed, list) else []
    elif isinstance(raw_genres, list):
        # Already a list of genre dicts.
        candidates = raw_genres
    else:
        continue
    all_genres.extend(
        entry['name']
        for entry in candidates
        if isinstance(entry, dict) and 'name' in entry
    )

# Reduce to distinct names.
unique_genres = set(all_genres)

# Sorted list of the distinct names, wrapped in a one-column frame.
unique_genres_list = sorted(unique_genres)
df_genres = pd.DataFrame(unique_genres_list, columns=['Genre'])

# Display the table
print(df_genres)

**17. How Many Films of Each Genre Were Made? (Bar Plot)**

In [None]:
import ast


def _parse_genres(value):
    """Turn the text form of a genres cell into a list; leave anything else untouched."""
    if isinstance(value, str) and value.startswith('['):
        return ast.literal_eval(value)
    return value


# Parse the genres column into real lists of dicts. `assign` returns a new frame
# rather than writing the column into `analysis_df` in place, which avoids
# SettingWithCopyWarning when `analysis_df` is a slice of another frame. Values that
# are already lists pass through unchanged, so the cell is safe to re-run.
analysis_df = analysis_df.assign(genres=analysis_df['genres'].apply(_parse_genres))

# Step 1: one record per (film, genre) pair, carrying the film's rating.
rows = [
    {'genre': genre['name'], 'vote_average': rating}
    for genres, rating in zip(analysis_df['genres'], analysis_df['vote_average'])
    if isinstance(genres, list)
    for genre in genres
    if isinstance(genre, dict) and 'name' in genre
]

flat_df = pd.DataFrame(rows)

# Step 2: per genre, the number of rated films and their mean rating.
genre_stats = flat_df.groupby('genre').agg(
    movie_count=('vote_average', 'count'),
    avg_vote=('vote_average', 'mean')
).sort_values(by='movie_count', ascending=False).reset_index()

# Step 3: horizontal bar chart of film counts, most common genre at the top.
plt.figure(figsize=(12, 8))
sns.barplot(data=genre_stats, x='movie_count', y='genre', palette='magma')

plt.title('Number of Movies by Genre')
plt.xlabel('Number of Movies')
plt.ylabel('Genre')
plt.tight_layout()
plt.show()




**18. What is the Average Rating per Genre? (Bar Plot)**

In [None]:
# Mean rating of each genre, best-rated genre first, as a two-column frame.
rating_by_genre = flat_df.groupby('genre')['vote_average'].mean()
avg_rating_genre = rating_by_genre.sort_values(ascending=False).reset_index()

# Horizontal bar chart of the per-genre means.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=avg_rating_genre, x='vote_average', y='genre', palette='coolwarm', ax=ax)
ax.set_title('Average Rating by Genre')
ax.set_xlabel('Average Rating')
ax.set_ylabel('Genre')
fig.tight_layout()
plt.show()




**19. What is the Average Film Duration by Genre? (Bar Plot)**

In [None]:
# One record per (film, genre) pair, carrying the film's runtime.
# Expects `genres` to hold parsed lists of dicts. Cells that are not lists and
# entries without a 'name' are skipped, so one malformed or missing value
# does not abort the whole cell.
genre_runtime_rows = [
    {'genre': genre['name'], 'runtime': runtime}
    for genres, runtime in zip(analysis_df['genres'], analysis_df['runtime'])
    if isinstance(genres, list)
    for genre in genres
    if isinstance(genre, dict) and 'name' in genre
]

genre_runtime_df = pd.DataFrame(genre_runtime_rows)

# Mean runtime per genre, longest first.
avg_runtime_genre = genre_runtime_df.groupby('genre')['runtime'].mean().sort_values(ascending=False).reset_index()

# Horizontal bar chart of the per-genre mean runtime.
plt.figure(figsize=(10, 6))
sns.barplot(data=avg_runtime_genre, x='runtime', y='genre', palette='magma')
plt.title('Average Film Duration by Genre')
plt.xlabel('Duration (minutes)')
plt.ylabel('Genre')
plt.tight_layout()
plt.show()



In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Read the movie metadata and discard identifier columns, which carry no analytic
# meaning; errors='ignore' tolerates either identifier being absent from the file.
df = pd.read_csv('/content/tmdb_5000_movies.csv').drop(columns=['id', 'movie_id'], errors='ignore')

# Keep only the float64 / int64 columns.
numeric_df = df.select_dtypes(include=['float64', 'int64'])

# Pairwise correlation between those columns.
corr_matrix = numeric_df.corr()

# Annotated heatmap of the correlation matrix, labelled with the column names on both axes.
column_labels = corr_matrix.columns
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    corr_matrix,
    annot=True,
    fmt=".2f",
    cmap='coolwarm',
    linewidths=0.5,
    xticklabels=column_labels,
    yticklabels=column_labels,
    ax=ax,
)
ax.set_title("Correlation Matrix of Numerical Features in Movie Dataset")
fig.tight_layout()
plt.show()



**20. Display Number of Movies by Genre**

In [None]:
from collections import Counter

# Count how often each genre name occurs across all films.
# Expects `genres` to hold parsed lists of dicts. Cells that are not lists and
# entries without a 'name' are skipped, so one malformed or missing value
# does not abort the whole cell.
genre_list = [
    genre_dict['name']
    for genre_entry in analysis_df['genres']
    if isinstance(genre_entry, list)
    for genre_dict in genre_entry
    if isinstance(genre_dict, dict) and 'name' in genre_dict
]

# Count occurrences
genre_counts = Counter(genre_list)

# Two-column frame of (genre, count), most common genre first.
genre_df = pd.DataFrame(list(genre_counts.items()), columns=['genre', 'count']).sort_values(by='count', ascending=False)

# Horizontal bar chart of the counts.
plt.figure(figsize=(10, 6))
sns.barplot(data=genre_df, x='count', y='genre', palette='cubehelix')
plt.title('Number of Movies per Genre')
plt.xlabel('Count')
plt.ylabel('Genre')
plt.tight_layout()
plt.show()

# View the DataFrame
display(genre_df)
