# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter

# Load the dataset from 'data.csv' (a relative path, so it is resolved against
# the current working directory). This runs at module load time; the
# resulting DataFrame is the one passed to analyze_movie_dataset() below.
df = pd.read_csv('data.csv')

# Question 1: What is the average rating of all movies in the dataset?

def find_average_rating(df):
    """Print and return the mean of the ``averageRating`` column.

    Args:
        df: DataFrame containing a numeric ``averageRating`` column.

    Returns:
        The mean rating as a float. ``Series.mean`` skips missing values;
        the result is NaN when there are no ratings at all.
    """
    # Reduce the column to a single number: formatting the whole Series
    # with ``:.2f`` raises TypeError.
    avg_rating = float(df['averageRating'].mean())
    print(f"Question 1: The average rating of all movies is {avg_rating:.2f}")
    return avg_rating

# Question 2: Which movie has the highest rating?

def find_highest_rated_movie(df):
    """Print and return the title and rating of the top-rated movie.

    Args:
        df: DataFrame containing ``title`` and ``averageRating`` columns.

    Returns:
        A ``(title, rating)`` tuple read from the row whose
        ``averageRating`` is the largest (as located by ``idxmax``).
    """
    # Find the index label of the maximum rating, then pull that row.
    top_label = df['averageRating'].idxmax()
    top_row = df.loc[top_label]

    title = top_row['title']
    rating = top_row['averageRating']

    print(f"Question 2: The highest rated movie is '{title}' with a rating of {rating}")
    return title, rating

# Question 3: What are the most common genres in the dataset?

def find_most_common_genres(df, top_n=5):
    """Print and return the most frequent genres in the dataset.

    Each entry of the ``genres`` column is treated as a comma-separated
    list of genre names; every name counts once per movie row it appears in.

    Args:
        df: DataFrame containing a ``genres`` column of comma-separated
            strings. Missing values are skipped.
        top_n: How many of the most frequent genres to report (default 5).

    Returns:
        A list of ``(genre, count)`` tuples, most frequent first, as
        produced by ``Counter.most_common``.
    """
    genre_counts = Counter()
    # dropna(): a missing genres cell is a float NaN/None and has no .split().
    for genres in df['genres'].dropna():
        # Count individual genre names (strings), not the per-movie list:
        # lists are unhashable and cannot be Counter keys.
        names = (name.strip() for name in genres.split(','))
        genre_counts.update(name for name in names if name)

    most_common = genre_counts.most_common(top_n)

    print("Question 3: The most common genres are:")
    for genre, count in most_common:
        print(f"  {genre}: {count} movies")

    return most_common

# Question 4: How many movies were released in each decade?

def count_movies_by_decade(df):
    """Count movies per release decade, print the counts and plot them.

    Side effect: adds (or overwrites) a ``decade`` column on ``df`` holding
    the first year of each movie's decade (e.g. 1994 -> 1990).

    Args:
        df: DataFrame containing a numeric ``releaseYear`` column.

    Returns:
        A Series indexed by decade (ascending) whose values are the number
        of movies released in that decade. Rows with a missing release year
        are not counted.
    """
    # Floor to the start of the decade. ``% 100`` would keep only the last
    # two digits of the year (1994 -> 94), which is not a decade.
    df['decade'] = (df['releaseYear'] // 10) * 10

    # Grouping movies by decade. Missing years make the column float, so
    # drop them and cast back to int to get labels like "1990s", not
    # "1990.0s".
    decade_counts = (
        df['decade'].dropna().astype(int).value_counts().sort_index()
    )

    print("Question 4: Number of movies by decade:")
    for decade, count in decade_counts.items():
        print(f"  {decade}s: {count} movies")

    # Nothing to draw when no row has a usable release year.
    if not decade_counts.empty:
        plt.figure(figsize=(10, 6))
        decade_counts.plot(kind='bar', color='skyblue')
        plt.title('Number of Movies Released by Decade')
        plt.xlabel('Decade')
        plt.ylabel('Number of Movies')
        plt.xticks(rotation=45)
        plt.grid(axis='y', alpha=0.3)
        plt.tight_layout()
        plt.show()

    return decade_counts

def analyze_movie_dataset(df):
    """Run all four analysis questions on ``df`` and collect their results.

    Each question function prints its own report; a blank line is printed
    after each one to separate the sections.

    Args:
        df: The movie DataFrame handed to every question function.

    Returns:
        A dict with the keys ``average_rating``, ``highest_rated_movie``
        (itself a dict with ``title`` and ``rating``), ``common_genres``
        and ``movies_by_decade``.
    """
    print("===== BEGINNER-FRIENDLY MOVIE DATASET ANALYSIS =====\n")

    results = {}

    # Question 1
    results["average_rating"] = find_average_rating(df)
    print()

    # Question 2
    top_title, top_rating = find_highest_rated_movie(df)
    results["highest_rated_movie"] = {"title": top_title, "rating": top_rating}
    print()

    # Question 3
    results["common_genres"] = find_most_common_genres(df)
    print()

    # Question 4
    results["movies_by_decade"] = count_movies_by_decade(df)
    print()

    return results

# Run the full analysis only when executed as a script, not when imported.
if __name__ == "__main__":
    analyze_movie_dataset(df)