<a href="https://colab.research.google.com/github/dineshnain00/Imdb_Analysis/blob/main/IMDb_Movie_Analysis_with_Plotly.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# =======================================
# IMDb Movie Analysis with Plotly
# =======================================

import csv
import gzip
from io import BytesIO

import numpy as np
import pandas as pd
import plotly.express as px
import requests

# ---------------------------------------
# 1️⃣ Download IMDb datasets
# ---------------------------------------
def download_imdb_dataset(url, timeout=60):
    """Download a gzip-compressed file and return its decompressed bytes.

    Parameters
    ----------
    url : str
        Address of the ``.gz`` file to fetch.
    timeout : float or tuple, optional
        Passed straight to ``requests.get``. Without it a stalled server
        would block the notebook indefinitely. Defaults to 60.

    Returns
    -------
    bytes
        The decompressed payload.

    Raises
    ------
    Exception
        If the server answers with any status code other than 200.
    """
    print(f"Downloading {url} ...")
    # The whole body is needed in memory for gzip.decompress, so streaming
    # brings nothing; the context manager releases the connection afterwards.
    with requests.get(url, timeout=timeout) as response:
        if response.status_code != 200:
            raise Exception(f"Failed to download {url}")
        return gzip.decompress(response.content)

basics_url = "https://datasets.imdbws.com/title.basics.tsv.gz"
ratings_url = "https://datasets.imdbws.com/title.ratings.tsv.gz"

basics_data = download_imdb_dataset(basics_url)
ratings_data = download_imdb_dataset(ratings_url)

# Parsing options shared by both IMDb files:
# - every column is read as text (dtype=str) and converted explicitly later;
# - IMDb writes a missing value as the literal "\N", so that is the ONLY token
#   treated as NA (keep_default_na=False keeps titles such as "NA" or "None");
# - the files are plain tab-separated text with no quoting convention, so
#   QUOTE_NONE stops a title that starts with '"' from swallowing the
#   following fields/rows.
imdb_read_options = dict(
    sep="\t",
    dtype=str,
    na_values=["\\N"],
    keep_default_na=False,
    quoting=csv.QUOTE_NONE,
)

basics_df = pd.read_csv(BytesIO(basics_data), **imdb_read_options)
ratings_df = pd.read_csv(BytesIO(ratings_data), **imdb_read_options)

Downloading https://datasets.imdbws.com/title.basics.tsv.gz ...
Downloading https://datasets.imdbws.com/title.ratings.tsv.gz ...


In [4]:

# ---------------------------------------
# 2️⃣ Clean & Merge Data
# ---------------------------------------
# Build the working table in one pipeline:
#   1. keep only rows whose titleType is "movie",
#   2. left-join the ratings on the title id (tconst),
#   3. turn the two rating columns into numbers (unparseable -> NaN),
#   4. drop titles that ended up without an average rating,
#   5. keep only movies with at least 1000 votes.
movies_df = (
    basics_df.loc[basics_df["titleType"].eq("movie")]
    .merge(ratings_df, on="tconst", how="left")
    .assign(
        averageRating=lambda frame: pd.to_numeric(frame["averageRating"], errors="coerce"),
        numVotes=lambda frame: pd.to_numeric(frame["numVotes"], errors="coerce"),
    )
    .dropna(subset=["averageRating"])
    .loc[lambda frame: frame["numVotes"] >= 1000]
)

In [5]:
# ---------------------------------------
# 3️⃣ Split multi-genre movies properly
# ---------------------------------------
# A title without genre information must end up labelled "Unknown".
# IMDb encodes a missing value as the literal string "\N"; a frame read as
# plain text without declaring that marker carries the string instead of NaN,
# so both representations are mapped to "Unknown" here.
movies_df["genres"] = (
    movies_df["genres"].fillna("Unknown").replace({"\\N": "Unknown"})
)

# "Action,Drama" -> ["Action", "Drama"], then one output row per (movie, genre).
movies_df["genres_list"] = movies_df["genres"].str.split(",")
movies_exploded = movies_df.explode("genres_list").rename(columns={"genres_list": "genre"})

In [6]:
# ---------------------------------------
# 4️⃣ Simulate extra data (Picture Quality, OTT, Cinema)
# ---------------------------------------
# These three columns are random placeholders, not real data.
# movies_exploded holds one row per (movie, genre) pair, so the values are
# drawn once per movie (tconst) and then mapped onto every row of that movie;
# drawing per row would give the same film a different picture quality and
# different sales figures in each of its genre rows.
np.random.seed(42)  # fixed seed -> identical simulated values on every run

movie_ids = movies_exploded["tconst"].unique()
simulated_by_movie = pd.DataFrame(
    {
        "Picture_Quality": np.random.choice(["HD", "Full HD", "4K"], size=len(movie_ids)),
        "OTT_Sales": np.random.randint(10000, 500000, size=len(movie_ids)),
        "Cinema_Collection": np.random.randint(50000, 10000000, size=len(movie_ids)),
    },
    index=movie_ids,
)

for simulated_column in simulated_by_movie.columns:
    movies_exploded[simulated_column] = movies_exploded["tconst"].map(
        simulated_by_movie[simulated_column]
    )

In [7]:


# ---------------------------------------
# 5️⃣ Analysis Computations
# ---------------------------------------
# Top Movies by Genre (Top 5 per genre)
# Sort once by genre and descending rating, then take the first five rows of
# every genre. This avoids groupby.apply, whose handling of the grouping
# column is deprecated in recent pandas releases, and keeps the "genre"
# column that the scatter plot needs.
top_by_genre = (
    movies_exploded.sort_values(["genre", "averageRating"], ascending=[True, False])
    .groupby("genre", sort=False)
    .head(5)
    .reset_index(drop=True)
)


def _top_movies(ranked_by, n=10):
    """Return the n best DISTINCT movies of movies_exploded for one column.

    movies_exploded has one row per (movie, genre), so a plain sort + head
    could list the same film several times. Rows are sorted in descending
    order with a stable sort (ties keep their existing order, making the
    result deterministic) and only the first row per tconst is kept.
    """
    return (
        movies_exploded.sort_values(ranked_by, ascending=False, kind="stable")
        .drop_duplicates(subset="tconst")
        .head(n)
    )


# Top Movies by Rating
top_by_rating = _top_movies("averageRating")

# Top Movies by OTT and Cinema Collections
top_ott = _top_movies("OTT_Sales")
top_cinema = _top_movies("Cinema_Collection")





In [9]:

# ---------------------------------------
# 6️⃣ Interactive Visualizations (Plotly)
# ---------------------------------------

# ⭐ Horizontal bar chart of the top-rated movies table.
# One bar per title, coloured by genre; the y axis is reversed so the first
# row of the table is drawn at the top.
fig_rating = px.bar(
    data_frame=top_by_rating,
    title="⭐ Top 10 Movies by IMDb Rating",
    orientation="h",
    y="primaryTitle",
    x="averageRating",
    color="genre",
    hover_data=["numVotes", "genre"],
).update_layout(yaxis_autorange="reversed")

fig_rating.show()

In [8]:
# 💻 Horizontal bar chart of the top OTT-sales table (simulated figures).
# Bars are coloured by picture quality; rating and genre appear on hover.
fig_ott = px.bar(
    data_frame=top_ott,
    title="💻 Top 10 Movies by OTT Sales (Simulated)",
    orientation="h",
    y="primaryTitle",
    x="OTT_Sales",
    color="Picture_Quality",
    hover_data=["averageRating", "genre"],
)
# Reverse the y axis so the first row of the table sits at the top.
fig_ott.update_layout({"yaxis": {"autorange": "reversed"}})
fig_ott.show()

In [10]:
# 🎥 Horizontal bar chart of the top cinema-collection table (simulated figures).
# Bars are coloured by genre; rating and picture quality appear on hover.
fig_cinema = px.bar(
    data_frame=top_cinema,
    title="🎥 Top 10 Movies by Cinema Box Office Collection (Simulated)",
    orientation="h",
    y="primaryTitle",
    x="Cinema_Collection",
    color="genre",
    hover_data=["averageRating", "Picture_Quality"],
)
# Reverse the y axis so the first row of the table sits at the top.
fig_cinema.layout.yaxis.autorange = "reversed"
fig_cinema.show()

In [11]:
# 🎭 Bubble chart of the top-5-per-genre table: rating on x, genre on y,
# bubble size from the vote count, colour from the picture quality, and the
# movie title on hover. (No dropdown widget is created by this cell.)
fig_genre = px.scatter(
    data_frame=top_by_genre,
    title="🎭 Top 5 Movies by Genre (Interactive)",
    y="genre",
    x="averageRating",
    color="Picture_Quality",
    size="numVotes",
    hover_data=["primaryTitle"],
)
# Thin dark outline around every marker.
fig_genre.update_traces(marker_line_width=0.5, marker_line_color='DarkSlateGrey')
fig_genre.show()

In [12]:
# ---------------------------------------
# 7️⃣ Export results (optional)
# ---------------------------------------
# Write each result table to a CSV file in the working directory
# (row index omitted), in the same order as the tables were built.
csv_exports = {
    "top_movies_by_genre.csv": top_by_genre,
    "top_movies_by_rating.csv": top_by_rating,
    "top_movies_by_ott.csv": top_ott,
    "top_movies_by_cinema.csv": top_cinema,
}
for csv_name, result_table in csv_exports.items():
    result_table.to_csv(csv_name, index=False)

print("\n✅ Analysis completed and interactive charts displayed successfully.")


✅ Analysis completed and interactive charts displayed successfully.
