In [10]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Import the TMDB dataset
# Both CSVs are read from the same folder. Keeping the folder name in a single
# constant prevents per-path typos (the credits path used to read 'conent/').
DATA_DIR = 'content'
movies_df = pd.read_csv(f'{DATA_DIR}/tmdb_5000_movies.csv')
credits_df = pd.read_csv(f'{DATA_DIR}/tmdb_5000_credits.csv')

# Display basic information about the datasets
print("Movies Dataset Shape:", movies_df.shape)
print("\nCredits Dataset Shape:", credits_df.shape)

print("\nMovies Dataset Columns:")
print(movies_df.columns.tolist())

print("\nCredits Dataset Columns:")
print(credits_df.columns.tolist())

# Display first few rows of movies dataset
# (left as the last expression so the notebook renders it as a rich table)
print("\nFirst 5 rows of Movies Dataset:")
movies_df.head()


FileNotFoundError: [Errno 2] No such file or directory: 'archive/tmdb_5000_movies.csv'

In [None]:
# Data Cleaning Analysis - Step 1: Identify Issues
# Read-only audit: nothing in this cell modifies movies_df or credits_df.

banner_rule = "=" * 60
section_rule = "-" * 60

print(banner_rule)
print("DATA QUALITY ASSESSMENT")
print(banner_rule)

# 1. Missing values per movies column, as an absolute count and a share of all rows
print("\n1. MISSING VALUES IN MOVIES DATASET:")
print(section_rule)
missing_movies = movies_df.isnull().sum()
missing_pct = (missing_movies / len(movies_df)) * 100
missing_summary = pd.DataFrame({'Missing Count': missing_movies, 'Percentage': missing_pct})
has_missing = missing_summary['Missing Count'] > 0
print(missing_summary[has_missing].sort_values('Missing Count', ascending=False))

# 2. Same check for the credits data (counts only)
print("\n2. MISSING VALUES IN CREDITS DATASET:")
print(section_rule)
missing_credits = credits_df.isnull().sum()
print(missing_credits[missing_credits > 0])

# 3. Fully duplicated rows in either frame
print("\n3. DUPLICATE ROWS:")
print(section_rule)
for frame_name, frame in (("movies_df", movies_df), ("credits_df", credits_df)):
    print(f"Duplicates in {frame_name}: {frame.duplicated().sum()}")

# 4. Column dtypes as inferred by read_csv
print("\n4. DATA TYPES:")
print(section_rule)
print(movies_df.dtypes)

# 5. Count literal zeros in the numeric columns that the cleaning step inspects next
print("\n5. ZERO VALUES IN KEY COLUMNS:")
print(section_rule)
for column in ('budget', 'revenue', 'runtime'):
    print(f"Movies with {column} = 0: {(movies_df[column] == 0).sum()}")

# 6. Summary statistics (last expression, so the notebook renders the table)
print("\n6. BASIC STATISTICS:")
print(section_rule)
movies_df.describe()


In [None]:
# Data Cleaning - Step 1: Handle Missing Values and Zero Values

print("="*60)
print("DATA CLEANING - STEP 1")
print("="*60)

# Work on copies so the raw frames stay untouched and this cell can be re-run safely
movies_cleaned = movies_df.copy()
credits_cleaned = credits_df.copy()

print(f"Original dataset shape: {movies_cleaned.shape}")

# 1. Handle zero values in critical columns (these are likely missing data, not actual zeros)
print("\n1. HANDLING ZERO VALUES:")
print("-" * 40)

# Replace 0 values with NaN for budget, revenue, runtime
movies_cleaned['budget'] = movies_cleaned['budget'].replace(0, np.nan)
movies_cleaned['revenue'] = movies_cleaned['revenue'].replace(0, np.nan)
movies_cleaned['runtime'] = movies_cleaned['runtime'].replace(0, np.nan)

# Counts are taken from the raw frame because the zeros no longer exist in the copy
print(f"Budget 0s replaced with NaN: {(movies_df['budget'] == 0).sum()}")
print(f"Revenue 0s replaced with NaN: {(movies_df['revenue'] == 0).sum()}")
print(f"Runtime 0s replaced with NaN: {(movies_df['runtime'] == 0).sum()}")

# 2. Handle missing values in non-critical columns
print("\n2. HANDLING MISSING VALUES:")
print("-" * 40)

# Fill missing values for non-critical columns.
# A per-column dict passed to DataFrame.fillna replaces the previous
# `df[col].fillna(..., inplace=True)` calls: that chained in-place form operates on a
# temporary Series and does not reliably write back to the frame (it warns on recent
# pandas and is a no-op under Copy-on-Write).
movies_cleaned = movies_cleaned.fillna({
    'homepage': 'No homepage available',
    'tagline': 'No tagline available',
    'overview': 'No overview available',
})

print("Missing values filled for homepage, tagline, overview")

# 3. Remove rows with critical missing data
print("\n3. REMOVING ROWS WITH CRITICAL MISSING DATA:")
print("-" * 40)

# Only release_date is treated as critical here: rows without it are dropped.
# Missing budget/revenue/runtime values are kept as NaN.
before_removal = len(movies_cleaned)
movies_cleaned = movies_cleaned.dropna(subset=['release_date'])  # Remove if no release date
after_removal = len(movies_cleaned)

print(f"Rows removed due to missing release_date: {before_removal - after_removal}")
print(f"Final dataset shape: {movies_cleaned.shape}")

# 4. Check remaining missing values
print("\n4. REMAINING MISSING VALUES:")
print("-" * 40)
remaining_missing = movies_cleaned.isnull().sum()
print(remaining_missing[remaining_missing > 0])

print("\n✅ Data Cleaning Step 1 Complete!")


In [None]:
# Data Cleaning - Step 2: Data Type Conversion and JSON Parsing

# Parsers used by the helper functions defined further down in this cell
import ast
import json

print("=" * 60, "DATA CLEANING - STEP 2", "=" * 60, sep="\n")

# 1. Convert release_date to datetime
print("1. CONVERTING DATA TYPES:", "-" * 40, sep="\n")

# errors='coerce' turns any unparseable date into NaT instead of raising
release_dates = pd.to_datetime(movies_cleaned['release_date'], errors='coerce')
movies_cleaned['release_date'] = release_dates
print("✅ release_date converted to datetime")

# 2. Parse JSON columns for better analysis
print("\n2. PARSING JSON COLUMNS:", "-" * 40, sep="\n")

def parse_json_column(df, column_name):
    """Add a ``<column_name>_parsed`` column holding the ``name`` of each JSON object.

    Every cell of ``df[column_name]`` is expected to be a JSON-encoded list of
    objects. The new column holds, per row, the list of ``name`` values found in
    that list; entries that are not dicts or have no ``name`` key are skipped and
    missing cells become an empty list. ``df`` is modified in place.

    Any column is accepted. The original per-column branches were identical, and
    a column outside that list used to report success without creating anything.

    Parameters:
        df: DataFrame to read from and add the parsed column to.
        column_name: name of the column containing the JSON strings.

    Returns:
        True when the parsed column was created, False when the column is
        missing or a cell could not be parsed (the error is printed, and ``df``
        is left without the new column).
    """
    def _decode(raw):
        # Missing cell -> empty list, so downstream code can always iterate
        if pd.isna(raw):
            return []
        try:
            # Real JSON parser: understands null/true/false and JSON string escapes
            return json.loads(raw)
        except ValueError:
            # Fallback for cells written as Python literals (e.g. single-quoted strings)
            return ast.literal_eval(raw)

    def _names(items):
        return [item['name'] for item in items if isinstance(item, dict) and 'name' in item]

    try:
        parsed_names = df[column_name].apply(_decode).apply(_names)
    except (KeyError, TypeError, ValueError, SyntaxError) as e:
        print(f"❌ Error parsing {column_name}: {e}")
        return False

    # Assign only after every row parsed, so a failure never leaves a partial column
    df[f'{column_name}_parsed'] = parsed_names
    print(f"✅ {column_name} parsed successfully")
    return True

# Parse JSON columns
# Each call adds a '<column>_parsed' list column to movies_cleaned in place.
json_columns = [
    'genres',
    'keywords',
    'production_companies',
    'production_countries',
    'spoken_languages',
]
for col in json_columns:
    parse_json_column(movies_cleaned, col)

# 3. Parse cast and crew from credits dataset
print("\n3. PARSING CREDITS DATA:", "-" * 40, sep="\n")

def parse_cast_crew(credits_df):
    """Parse the JSON ``cast`` and ``crew`` columns and derive helper columns.

    Adds four columns to ``credits_df`` in place:
      * ``cast_parsed`` / ``crew_parsed``: the decoded lists (empty list for
        missing cells),
      * ``top_actors``: ``name`` of the dict entries among the first three cast
        items,
      * ``director``: ``name`` of the first crew entry whose ``job`` is
        ``'Director'``, or ``'Unknown'`` when there is none.

    Entries that are not dicts, or that lack a ``name`` key, are skipped for both
    derived columns (previously only the actor extraction was guarded).

    Parameters:
        credits_df: DataFrame with ``cast`` and ``crew`` columns of JSON strings.

    Returns:
        True on success. False when a column is missing or a cell cannot be
        parsed; the error is printed and no column is added.
    """
    def _decode(raw):
        # Missing cell -> empty list, so downstream code can always iterate
        if pd.isna(raw):
            return []
        try:
            # Real JSON parser: understands null/true/false and JSON string escapes
            return json.loads(raw)
        except ValueError:
            # Fallback for cells written as Python literals (e.g. single-quoted strings)
            return ast.literal_eval(raw)

    def _top_actor_names(cast, limit=3):
        # Slice first, then filter: same selection rule as before (first `limit` entries)
        return [member['name'] for member in cast[:limit]
                if isinstance(member, dict) and 'name' in member]

    def _director_name(crew):
        return next(
            (person['name'] for person in crew
             if isinstance(person, dict) and person.get('job') == 'Director' and 'name' in person),
            'Unknown',
        )

    try:
        cast_parsed = credits_df['cast'].apply(_decode)
        top_actors = cast_parsed.apply(_top_actor_names)
        crew_parsed = credits_df['crew'].apply(_decode)
        director = crew_parsed.apply(_director_name)
    except (KeyError, TypeError, ValueError, SyntaxError) as e:
        print(f"❌ Error parsing credits: {e}")
        return False

    # Write back only once everything parsed, so a failure cannot leave the
    # frame with some of the derived columns but not the others.
    credits_df['cast_parsed'] = cast_parsed
    credits_df['top_actors'] = top_actors
    credits_df['crew_parsed'] = crew_parsed
    credits_df['director'] = director

    print("✅ Cast and crew data parsed successfully")
    return True

# Run the credits parser; it adds its derived columns to credits_cleaned in place
parse_cast_crew(credits_cleaned)

print("\n✅ Data Cleaning Step 2 Complete!")
for label, frame in (("movies", movies_cleaned), ("credits", credits_cleaned)):
    print(f"Cleaned {label} dataset shape: {frame.shape}")


In [None]:
# Data Cleaning - Step 3: Merge Datasets and Create Features for Recommendation System

print("="*60)
print("DATA CLEANING - STEP 3")
print("="*60)

# 1. Merge movies and credits datasets
print("1. MERGING DATASETS:")
print("-" * 40)

# Left join keeps every cleaned movie; movies 'id' is matched against credits 'movie_id'.
# NOTE(review): any column name present in both frames gets pandas' default _x/_y
# suffixes, which is presumably why original_title is used further down - confirm.
merged_df = movies_cleaned.merge(credits_cleaned, left_on='id', right_on='movie_id', how='left')
print(f"✅ Datasets merged successfully")
print(f"Merged dataset shape: {merged_df.shape}")

# Check column names after merge to understand the naming
print(f"Columns after merge: {list(merged_df.columns)}")

# 2. Create new features for recommendation system
print("\n2. CREATING RECOMMENDATION FEATURES:")
print("-" * 40)

# Calculate profit (revenue - budget); NaN whenever either side is missing
merged_df['profit'] = merged_df['revenue'] - merged_df['budget']
# replace(0, NaN) guards the division so a zero budget yields NaN rather than inf
merged_df['profit_margin'] = merged_df['profit'] / merged_df['budget'].replace(0, np.nan)

# Create year feature from release_date
merged_df['year'] = merged_df['release_date'].dt.year

# Create decade feature (e.g. 1997 -> 1990)
merged_df['decade'] = (merged_df['year'] // 10) * 10

# Calculate popularity score (min-max normalized to the 0..1 range)
popularity_min = merged_df['popularity'].min()
popularity_max = merged_df['popularity'].max()
merged_df['popularity_normalized'] = (merged_df['popularity'] - popularity_min) / (popularity_max - popularity_min)

# Create genre count
merged_df['genre_count'] = merged_df['genres_parsed'].apply(lambda x: len(x) if isinstance(x, list) else 0)

# Create text features for content-based filtering.
# Step 1 fills missing overview/tagline cells with the placeholder sentences below.
# They are blanked out here so they do not end up in the text features: otherwise
# every movie lacking a tagline would share the tokens "No tagline available".
# Series.replace with a scalar matches whole cell values only, so real text that
# merely contains these words is untouched.
overview_text = merged_df['overview'].fillna('').replace('No overview available', '')
tagline_text = merged_df['tagline'].fillna('').replace('No tagline available', '')
# Use the title column from the movies dataset (original_title instead of title)
title_text = merged_df['original_title'].fillna('')
merged_df['combined_text'] = overview_text + ' ' + tagline_text + ' ' + title_text

print("✅ New features created:")
print("  - profit, profit_margin")
print("  - year, decade")
print("  - popularity_normalized")
print("  - genre_count")
print("  - combined_text (for content-based filtering)")

# 3. Final data quality check
print("\n3. FINAL DATA QUALITY CHECK:")
print("-" * 40)

print(f"Final dataset shape: {merged_df.shape}")
print(f"Final missing values:")
final_missing = merged_df.isnull().sum()
print(final_missing[final_missing > 0])

# 4. Display sample of cleaned data
print("\n4. SAMPLE OF CLEANED DATA:")
print("-" * 40)
print("Sample movies with key features:")
sample_cols = ['original_title', 'year', 'genres_parsed', 'director', 'vote_average', 'budget', 'revenue', 'profit']
print(merged_df[sample_cols].head())

print("\n✅ Data Cleaning Complete! Dataset ready for recommendation system development.")


In [11]:
# Data Analysis and Visualization - Part 1: Basic Statistics and Distribution
# This cell reads merged_df, so the cleaning and merge cells must have been run first.

banner = "=" * 60
rule = "-" * 50

print(banner)
print("DATA ANALYSIS AND VISUALIZATION")
print(banner)

# Set up plotting style
plt.style.use('default')
plt.rcParams.update({'figure.figsize': (18, 12), 'font.size': 10})


def _usd(amount):
    """Format a dollar amount with thousands separators and no decimals."""
    return f"${amount:,.0f}"


def _positive(series):
    """Keep strictly positive values (NaN and zeros fall out) so log10 is defined."""
    return series[series > 0]


# 1. Basic Statistics Overview
print("1. BASIC STATISTICS OVERVIEW:")
print(rule)

# Financial statistics
print("💰 FINANCIAL STATISTICS:")
print(f"Average Budget: {_usd(merged_df['budget'].mean())}")
print(f"Average Revenue: {_usd(merged_df['revenue'].mean())}")
print(f"Average Profit: {_usd(merged_df['profit'].mean())}")
print(f"Highest Budget Movie: {_usd(merged_df['budget'].max())}")
print(f"Highest Revenue Movie: {_usd(merged_df['revenue'].max())}")

# Rating statistics
print("\n⭐ RATING STATISTICS:")
print(f"Average Rating: {merged_df['vote_average'].mean():.2f}/10")
print(f"Highest Rated Movie: {merged_df['vote_average'].max():.2f}/10")
print(f"Average Runtime: {merged_df['runtime'].mean():.0f} minutes")

# 2. Distribution Analysis with Better Scaling
print("\n2. DISTRIBUTION ANALYSIS (Improved Scaling):")
print(rule)

# One 2x3 grid; every panel shares the same bar styling
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
fig.suptitle('Movie Dataset Distribution Analysis (Improved Scaling)', fontsize=16, fontweight='bold')
bar_style = dict(alpha=0.7, edgecolor='black')

# Budget distribution - histogram of log10(budget), ticks relabelled in dollars
budget_data = _positive(merged_df['budget'])
ax = axes[0, 0]
ax.hist(np.log10(budget_data), bins=50, color='skyblue', **bar_style)
ax.set_title('Budget Distribution (Log Scale)')
ax.set_xlabel('Log10(Budget) - $1M to $100M')
ax.set_ylabel('Frequency')
ax.set_xticks([6, 7, 8])
ax.set_xticklabels(['$1M', '$10M', '$100M'])

# Revenue distribution - same treatment as budget
revenue_data = _positive(merged_df['revenue'])
ax = axes[0, 1]
ax.hist(np.log10(revenue_data), bins=50, color='lightgreen', **bar_style)
ax.set_title('Revenue Distribution (Log Scale)')
ax.set_xlabel('Log10(Revenue) - $1M to $1B')
ax.set_ylabel('Frequency')
ax.set_xticks([6, 7, 8, 9])
ax.set_xticklabels(['$1M', '$10M', '$100M', '$1B'])

# Rating distribution - linear scale
ax = axes[0, 2]
ax.hist(merged_df['vote_average'], bins=30, color='orange', **bar_style)
ax.set_title('Rating Distribution')
ax.set_xlabel('Rating (/10)')
ax.set_ylabel('Frequency')

# Runtime distribution - linear scale, missing runtimes dropped
ax = axes[1, 0]
ax.hist(merged_df['runtime'].dropna(), bins=40, color='pink', **bar_style)
ax.set_title('Runtime Distribution')
ax.set_xlabel('Runtime (minutes)')
ax.set_ylabel('Frequency')

# Popularity distribution - log scale
popularity_data = _positive(merged_df['popularity'])
ax = axes[1, 1]
ax.hist(np.log10(popularity_data), bins=50, color='lightcoral', **bar_style)
ax.set_title('Popularity Distribution (Log Scale)')
ax.set_xlabel('Log10(Popularity) - 0.1 to 100')
ax.set_ylabel('Frequency')
ax.set_xticks([-1, 0, 1, 2])
ax.set_xticklabels(['0.1', '1', '10', '100'])

# Genre count distribution - one bin per integer count
genre_bins = range(0, merged_df['genre_count'].max() + 2)
ax = axes[1, 2]
ax.hist(merged_df['genre_count'], bins=genre_bins, color='lightblue', **bar_style)
ax.set_title('Number of Genres per Movie')
ax.set_xlabel('Number of Genres')
ax.set_ylabel('Frequency')

plt.tight_layout()
plt.show()

print("✅ Distribution analysis with improved scaling completed!")


DATA ANALYSIS AND VISUALIZATION
1. BASIC STATISTICS OVERVIEW:
--------------------------------------------------
💰 FINANCIAL STATISTICS:


NameError: name 'merged_df' is not defined

In [None]:
# Data Analysis and Visualization - Part 2: Genre Analysis and Time Trends
# Reads merged_df (needs the genres_parsed, genre_count and decade columns).

from collections import Counter

banner = "=" * 60
rule = "-" * 50

print(banner)
print("GENRE ANALYSIS AND TIME TRENDS")
print(banner)

# 1. Genre Analysis
print("1. GENRE ANALYSIS:")
print(rule)

# Flatten the per-movie genre lists into one list; non-list cells are skipped
all_genres = [
    genre_name
    for genres_list in merged_df['genres_parsed']
    if isinstance(genres_list, list)
    for genre_name in genres_list
]

# Count genre frequency
genre_counts = Counter(all_genres)

print("🎬 TOP 10 MOST COMMON GENRES:")
print("-" * 30)
rank = 0
for genre_name, n_movies in genre_counts.most_common(10):
    rank += 1
    print(f"{rank:2d}. {genre_name:<20} ({n_movies:3d} movies)")

# 2. Genre Visualization
fig, axes = plt.subplots(1, 2, figsize=(20, 8))
genre_ax, count_ax = axes

# Top 15 genres bar chart (y axis inverted so the most common genre is on top)
top_genres = dict(genre_counts.most_common(15))
genre_ax.barh(list(top_genres), list(top_genres.values()), color='steelblue', alpha=0.8)
genre_ax.set_title('Top 15 Movie Genres', fontsize=14, fontweight='bold')
genre_ax.set_xlabel('Number of Movies')
genre_ax.invert_yaxis()

# Genre count distribution - one bin per integer count
genre_bins = range(0, merged_df['genre_count'].max() + 2)
count_ax.hist(merged_df['genre_count'], bins=genre_bins,
              color='lightgreen', alpha=0.8, edgecolor='black')
count_ax.set_title('Distribution of Genre Count per Movie', fontsize=14, fontweight='bold')
count_ax.set_xlabel('Number of Genres')
count_ax.set_ylabel('Number of Movies')

plt.tight_layout()
plt.show()

# 3. Time Analysis
print("\n2. TIME ANALYSIS:")
print(rule)

# Movies by decade, in chronological order
decade_counts = merged_df['decade'].value_counts().sort_index()
print("📅 MOVIES BY DECADE:")
print("-" * 25)
for decade, count in decade_counts.items():
    print(f"{decade}s: {count:3d} movies")

# Per-decade averages used by the trend panels
by_decade = merged_df.groupby('decade')
rating_by_decade = by_decade['vote_average'].mean()
budget_by_decade = by_decade['budget'].mean()
revenue_by_decade = by_decade['revenue'].mean()

# 4. Time Trends Visualization
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# The three bar panels differ only in data, colour and labels
bar_panels = [
    (axes[0, 0], decade_counts, 'coral', 'Movies Released by Decade', 'Number of Movies'),
    (axes[1, 0], budget_by_decade, 'gold', 'Average Budget by Decade', 'Average Budget ($)'),
    (axes[1, 1], revenue_by_decade, 'purple', 'Average Revenue by Decade', 'Average Revenue ($)'),
]
for panel_ax, series, bar_color, panel_title, y_label in bar_panels:
    series.plot(kind='bar', ax=panel_ax, color=bar_color, alpha=0.8)
    panel_ax.set_title(panel_title, fontweight='bold')
    panel_ax.set_xlabel('Decade')
    panel_ax.set_ylabel(y_label)
    panel_ax.tick_params(axis='x', rotation=45)

# Average rating by decade is drawn as a line instead of bars
rating_ax = axes[0, 1]
rating_by_decade.plot(kind='line', ax=rating_ax, marker='o', color='darkgreen', linewidth=2)
rating_ax.set_title('Average Rating by Decade', fontweight='bold')
rating_ax.set_xlabel('Decade')
rating_ax.set_ylabel('Average Rating')
rating_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("✅ Genre and time analysis completed!")


In [None]:
# Data Analysis and Visualization - Part 3: Correlation Analysis and Top Movies
# Reads merged_df (needs the profit, year, genre_count and genres_parsed columns).

print("="*60)
print("CORRELATION ANALYSIS AND TOP MOVIES")
print("="*60)

# 1. Correlation Analysis
print("1. CORRELATION ANALYSIS:")
print("-" * 50)

# Select numerical columns for correlation
numerical_cols = ['budget', 'revenue', 'profit', 'runtime', 'popularity',
                  'vote_average', 'vote_count', 'genre_count']
correlation_matrix = merged_df[numerical_cols].corr()

print("🔗 KEY CORRELATIONS:")
print("-" * 25)
print(f"Budget vs Revenue: {correlation_matrix.loc['budget', 'revenue']:.3f}")
print(f"Budget vs Rating: {correlation_matrix.loc['budget', 'vote_average']:.3f}")
print(f"Revenue vs Rating: {correlation_matrix.loc['revenue', 'vote_average']:.3f}")
print(f"Popularity vs Rating: {correlation_matrix.loc['popularity', 'vote_average']:.3f}")
print(f"Runtime vs Rating: {correlation_matrix.loc['runtime', 'vote_average']:.3f}")

# 2. Correlation Heatmap
fig, axes = plt.subplots(1, 2, figsize=(20, 8))

# Correlation heatmap; the upper triangle (including the diagonal) is masked
# because it only mirrors the lower one
import seaborn as sns
mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))
sns.heatmap(correlation_matrix, mask=mask, annot=True, cmap='coolwarm', center=0,
            square=True, ax=axes[0], cbar_kws={"shrink": .8})
axes[0].set_title('Correlation Matrix Heatmap', fontsize=14, fontweight='bold')

# Budget vs Revenue scatter plot
scatter = axes[1].scatter(merged_df['budget'], merged_df['revenue'],
                         c=merged_df['vote_average'], cmap='viridis', alpha=0.6)
axes[1].set_xlabel('Budget ($)')
axes[1].set_ylabel('Revenue ($)')
axes[1].set_title('Budget vs Revenue (colored by Rating)', fontweight='bold')
plt.colorbar(scatter, ax=axes[1], label='Rating')

plt.tight_layout()
plt.show()

# 3. Top Movies Analysis
print("\n2. TOP MOVIES ANALYSIS:")
print("-" * 50)

# Each ranking is computed once and reused for both the printed list and the chart.
# NOTE(review): the rating rank uses vote_average alone and ignores vote_count, so
# titles with only a handful of votes may top the list - consider a minimum-votes filter.
top_10_rated = merged_df.nlargest(10, 'vote_average')
top_10_grossing = merged_df.nlargest(10, 'revenue')

# Top rated movies
top_rated = top_10_rated[['original_title', 'year', 'vote_average', 'genres_parsed']]
print("🏆 TOP 10 HIGHEST RATED MOVIES:")
print("-" * 40)
for i, (idx, row) in enumerate(top_rated.iterrows(), 1):
    # Show at most two genres; an empty genre list prints as 'Unknown'
    genres = ', '.join(row['genres_parsed'][:2]) if row['genres_parsed'] else 'Unknown'
    print(f"{i:2d}. {row['original_title']:<30} ({row['year']}) - {row['vote_average']:.1f}/10 - {genres}")

# Top grossing movies
top_grossing = top_10_grossing[['original_title', 'year', 'revenue', 'budget', 'profit']]
print(f"\n💰 TOP 10 HIGHEST GROSSING MOVIES:")
print("-" * 45)
for i, (idx, row) in enumerate(top_grossing.iterrows(), 1):
    print(f"{i:2d}. {row['original_title']:<30} ({row['year']}) - ${row['revenue']:,.0f}")

# Most profitable movies
top_profitable = merged_df.nlargest(10, 'profit')[['original_title', 'year', 'profit', 'budget']]
print(f"\n📈 TOP 10 MOST PROFITABLE MOVIES:")
print("-" * 40)
for i, (idx, row) in enumerate(top_profitable.iterrows(), 1):
    print(f"{i:2d}. {row['original_title']:<30} ({row['year']}) - ${row['profit']:,.0f} profit")


def _short_title(title, limit=20):
    """Truncate long titles for axis labels, marking the cut with '...'."""
    return title[:limit] + '...' if len(title) > limit else title


# 4. Top Movies Visualization
fig, axes = plt.subplots(2, 2, figsize=(20, 12))

# Top rated movies
axes[0, 0].barh(range(len(top_10_rated)), top_10_rated['vote_average'], color='gold', alpha=0.8)
axes[0, 0].set_yticks(range(len(top_10_rated)))
axes[0, 0].set_yticklabels([_short_title(title) for title in top_10_rated['original_title']], fontsize=8)
axes[0, 0].set_title('Top 10 Highest Rated Movies', fontweight='bold')
axes[0, 0].set_xlabel('Rating')
# barh draws position 0 at the bottom; invert so rank 1 is on top, matching the
# printed list and the genre bar chart in the previous cell
axes[0, 0].invert_yaxis()

# Top grossing movies
axes[0, 1].barh(range(len(top_10_grossing)), top_10_grossing['revenue']/1e9, color='green', alpha=0.8)
axes[0, 1].set_yticks(range(len(top_10_grossing)))
axes[0, 1].set_yticklabels([_short_title(title) for title in top_10_grossing['original_title']], fontsize=8)
axes[0, 1].set_title('Top 10 Highest Grossing Movies', fontweight='bold')
axes[0, 1].set_xlabel('Revenue (Billions $)')
axes[0, 1].invert_yaxis()  # rank 1 on top, as above

# Budget vs Rating scatter
axes[1, 0].scatter(merged_df['budget']/1e6, merged_df['vote_average'], alpha=0.6, color='purple')
axes[1, 0].set_xlabel('Budget (Millions $)')
axes[1, 0].set_ylabel('Rating')
axes[1, 0].set_title('Budget vs Rating', fontweight='bold')
axes[1, 0].grid(True, alpha=0.3)

# Runtime vs Rating scatter
axes[1, 1].scatter(merged_df['runtime'], merged_df['vote_average'], alpha=0.6, color='orange')
axes[1, 1].set_xlabel('Runtime (minutes)')
axes[1, 1].set_ylabel('Rating')
axes[1, 1].set_title('Runtime vs Rating', fontweight='bold')
axes[1, 1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("✅ Correlation analysis and top movies analysis completed!")
