In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import requests
from src.data.dataloader import DataLoader

%load_ext autoreload
%autoreload 2

In [None]:
# Instantiate the project's data loader (imported from src.data.dataloader);
# the tables used in the following cells are read through this object.
dataloader = DataLoader()

In [None]:
# Load the character table, print how many entries it has, and preview it.
characters = dataloader.load_characters()
print(len(characters))
characters.head()

In [None]:
# Load the movie table that the rest of this notebook analyses.
# NOTE(review): presumably one row per movie with its character/actor data
# joined in — confirm against DataLoader.load_movies_with_characters.
movies_with_characters = dataloader.load_movies_with_characters()

# Print the number of entries and preview the first rows.
print(len(movies_with_characters))
movies_with_characters.head()

In [None]:
def get_revenue(wikidata_id, timeout=10):
    """Look up a movie's box-office figure on Wikidata.

    Queries the ``wbgetentities`` API for ``wikidata_id`` and reads the first
    statement of property ``P2142`` (the box-office claim).

    Args:
        wikidata_id: Wikidata entity ID as a string (e.g. ``"Q12345"``).
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The amount of the first ``P2142`` statement as a ``float``, or
        ``None`` when the ID is missing, the request fails, or the entity has
        no usable box-office value.
    """
    # A missing ID (None, NaN, "") cannot be resolved; skip the network call.
    if not isinstance(wikidata_id, str) or not wikidata_id:
        return None

    url = "https://www.wikidata.org/w/api.php"
    params = {
        "action": "wbgetentities",
        "ids": wikidata_id,
        "format": "json",
        "languages": "en",
    }

    try:
        # Without a timeout a stalled connection would block the caller forever.
        response = requests.get(url, params=params, timeout=timeout)
        response.raise_for_status()
        data = response.json()
    except (requests.exceptions.RequestException, ValueError):
        # Network failure, HTTP error status, or a body that is not valid JSON.
        return None

    try:
        statements = data["entities"][wikidata_id]["claims"]["P2142"]
        # Statements whose snak has no concrete value carry no "datavalue"
        # key; that case is handled by the except clause below.
        amount = statements[0]["mainsnak"]["datavalue"]["value"]["amount"]
        return float(amount)
    except (KeyError, IndexError, TypeError, ValueError):
        # Entity unknown, no box-office claim, or an unexpected payload shape.
        return None

def update_movie_revenue(movie_df, verbose=False):
    """Fill missing box-office revenues in ``movie_df`` from Wikidata.

    Rows whose "Movie box office revenue" is already set are left untouched.
    For every other row the revenue is looked up with ``get_revenue`` using
    the row's "wikidata_id"; a returned revenue of 0 is stored as missing.

    Args:
        movie_df: DataFrame with the columns "Movie box office revenue" and
            "wikidata_id". It is modified in place.
        verbose: When true, print each looked-up revenue (debugging aid).

    Returns:
        The same ``movie_df`` object, for convenient reassignment.
    """
    revenue_column = "Movie box office revenue"
    # wikidata_id -> revenue, so each ID costs at most one HTTP request even
    # when it appears on several rows.
    fetched = {}

    missing = movie_df[revenue_column].isna()
    for index, wikidata_id in movie_df.loc[missing, "wikidata_id"].items():
        if wikidata_id not in fetched:
            revenue = get_revenue(wikidata_id)
            # A reported revenue of exactly 0 is treated as "unknown".
            fetched[wikidata_id] = None if revenue == 0 else revenue
        revenue = fetched[wikidata_id]

        if verbose:
            print("revenue", revenue)

        movie_df.at[index, revenue_column] = revenue

    return movie_df

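# Disabled by default: this issues Wikidata HTTP requests for the movies with a
# missing revenue, which is slow. Uncomment to backfill the revenue column.
# movies_with_characters = update_movie_revenue(movies_with_characters)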

In [None]:
# Genres are stored as one comma-separated string per movie; split them so
# each listed genre is tallied individually.
genres_per_movie = movies_with_characters["Movie genres"].str.split(", ")
genre_counts = genres_per_movie.explode().value_counts()

# value_counts() orders by frequency, so the first 20 entries are the top 20.
plt.figure(figsize=(15, 6))
genre_counts.head(20).plot(
    kind="bar",
    title="Top 20 genres",
    xlabel="Movie Genre",
    ylabel="Count",
    alpha=0.75,
)

In [None]:
# Tally movies per release-date label, keep only the purely numeric labels
# (which discards placeholders such as "nan"), and sort the labels in
# descending order so the most recent years come first.
release_label_counts = movies_with_characters["Movie release date"].value_counts()
is_numeric_label = release_label_counts.index.str.isnumeric()
year_counts = release_label_counts[is_numeric_label].sort_index(ascending=False)

# Plot the first 50 labels of the descending series.
plt.figure(figsize=(20, 6))
year_counts.head(50).plot(
    kind="bar",
    title="Number of movies in last 50 years",
    xlabel="Year",
    ylabel="Count",
    alpha=0.75,
)

In [None]:
# Window of years to show: skip the newest N_SKIPPED_YEARS distinct years,
# then take the next N_YEARS_SHOWN.
# NOTE(review): presumably the newest years are skipped because their data is
# incomplete — confirm.
N_SKIPPED_YEARS = 3
N_YEARS_SHOWN = 12
TOP_N_GENRES = 3

# Parse the release year once, without modifying the original dataset;
# values that are not numeric become NaN.
release_year = pd.to_numeric(
    movies_with_characters["Movie release date"], errors="coerce"
)

recent_years = (
    release_year.dropna()
    .astype(int)
    .sort_values(ascending=False)
    .unique()[N_SKIPPED_YEARS : N_SKIPPED_YEARS + N_YEARS_SHOWN]
)

# Collect one Series of top-genre counts per year and combine them in a single
# concat afterwards (growing a DataFrame inside the loop is quadratic).
top_genres_by_year = {}
for year in recent_years:
    yearly_genre_counts = (
        movies_with_characters.loc[release_year == year, "Movie genres"]
        .str.split(", ")
        .explode()
        .value_counts()
    )
    top_genres_by_year[year] = yearly_genre_counts.head(TOP_N_GENRES)

# Rows are years, columns are genres; a genre outside a year's top list
# counts as 0 for that year.
top_genres_per_year = pd.concat(top_genres_by_year, axis=1).T.fillna(0)

# DataFrame.plot opens its own figure unless it is given an Axes, so create
# the figure explicitly and pass the Axes in (no stray empty figure).
fig, ax = plt.subplots(figsize=(15, 8))
top_genres_per_year.plot(kind="bar", stacked=True, width=0.8, ax=ax)

ax.set_title(f"Top {TOP_N_GENRES} Movie Genres Per Year")
ax.set_xlabel("Year")
ax.set_ylabel("Number of Movies")
ax.tick_params(axis="x", labelrotation=45)
ax.legend(title="Genres", bbox_to_anchor=(1.05, 1))
fig.tight_layout()
plt.show()

In [None]:
# Languages are stored as one comma-separated string per movie; split them so
# each listed language is tallied individually.
language_counts = (
    movies_with_characters["Movie languages"].str.split(", ").explode().value_counts()
)
# Drop labels shorter than two characters.
# NOTE(review): presumably empty or placeholder entries — confirm.
language_counts = language_counts[language_counts.index.str.len() > 1]
language_counts[:20].plot(
    kind="bar", title="Top 20 languages", xlabel="Language", ylabel="Count"
)

In [None]:
# Tally every gender label found in the comma-separated `actor_gender` column.
gender_labels = movies_with_characters["actor_gender"].str.split(", ").explode()
gender_counts = gender_labels.value_counts()

# Only the "F" and "M" labels are plotted, in that order.
plt.figure(figsize=(8, 6))
female_and_male = gender_counts.loc[["F", "M"]]
female_and_male.plot(kind="bar", color=["pink", "blue"])

plt.xlabel("Gender")
plt.ylabel("Number of Actors")
plt.title("Number of Female vs Male Actors")
plt.xticks(rotation=0)
plt.show()

In [None]:
# Average box-office revenue per release-date label. Labels without any
# revenue data and the literal "nan" placeholder label are discarded.
revenue_by_release = movies_with_characters.groupby("Movie release date")[
    "Movie box office revenue"
]
mean_revenue_per_year = (
    revenue_by_release.mean()
    .dropna()
    .loc[lambda means: means.index != "nan"]
)

# Line styling kept in one place: blue solid line with hollow round markers.
line_style = {
    "color": "b",
    "marker": ".",
    "linestyle": "solid",
    "markersize": 12,
    "markerfacecolor": "white",
}

plt.figure(figsize=(25, 7))
plt.plot(mean_revenue_per_year.index, mean_revenue_per_year.values, **line_style)

plt.xlabel("Year")
plt.ylabel("Mean Revenue")
plt.title("Mean Movie Box Office Revenue Per Year")
plt.xticks(rotation=45)
plt.grid(True)
plt.show()

Note: roughly 20k movies in the dataset do not have any characters associated with them.

We will not merge in the name clusters, because doing so would introduce a baseline bias: we would only be considering movies that have been successful and have sequels.

In [None]:
# Imports for the plot-text clustering below (TF-IDF -> truncated SVD -> k-means)
import numpy as np
from sklearn.cluster import KMeans
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer

# Seed shared by every stochastic step so that re-running the cell yields the
# same clusters.
RANDOM_STATE = 42

# Preprocess and vectorize the plot text
tfidf = TfidfVectorizer(
    max_features=1000,  # Limit to top 1000 terms
    stop_words="english",
    ngram_range=(1, 2),  # Consider both single words and bigrams
    min_df=5,  # Ignore terms that appear in fewer than 5 documents
)

# Create document-term matrix. TfidfVectorizer raises on NaN documents, so a
# missing plot is represented by an empty string.
plot_texts = movies_with_characters["plot"].fillna("")
plot_features = tfidf.fit_transform(plot_texts)

# Reduce dimensionality (optional but recommended for better clustering).
# The default TruncatedSVD solver is randomized, so it must be seeded as well
# for the clustering to be reproducible.
svd = TruncatedSVD(n_components=100, random_state=RANDOM_STATE)
plot_features_reduced = svd.fit_transform(plot_features)

# Cluster the movies; the label of each movie is stored in a new "cluster"
# column of movies_with_characters.
n_clusters = 20  # You can adjust this number
kmeans = KMeans(n_clusters=n_clusters, random_state=RANDOM_STATE)
movies_with_characters["cluster"] = kmeans.fit_predict(plot_features_reduced)


# Analyze the clusters
def get_top_terms_per_cluster(n_terms=10):
    """Print the highest-weighted TF-IDF terms of every k-means cluster.

    Uses the notebook-level ``tfidf``, ``svd`` and ``kmeans`` objects, which
    must already be fitted. Terms are listed from the highest to the lowest
    centroid weight.

    Args:
        n_terms: Number of terms to print per cluster.
    """
    # Map the cluster centers from the reduced SVD space back to the original
    # TF-IDF feature space, where each column corresponds to one term.
    original_space_centroids = svd.inverse_transform(kmeans.cluster_centers_)
    # Look the vocabulary up once instead of once per printed term.
    feature_names = tfidf.get_feature_names_out()

    # Iterate over the fitted centroids themselves so the loop cannot drift
    # out of sync with a later change to the global n_clusters.
    for cluster, centroid in enumerate(original_space_centroids):
        # argsort is ascending: reverse it so the strongest term comes first.
        top_indices = np.argsort(centroid)[::-1][:n_terms]
        top_terms = [feature_names[i] for i in top_indices]
        print(f"\nCluster {cluster} top terms:")
        print(", ".join(top_terms))


# Display results: print the top terms of every cluster (output goes to stdout)
get_top_terms_per_cluster()