In [12]:
import json

import pandas as pd

character_metadata_path = "../../data/character.metadata.tsv"
movie_metadata_path = "../../data/movie.metadata.tsv"
name_cluster_path = "../../data/name.clusters.tsv"
plot_summaries_path = "../../data/plot_summaries.tsv"
tvtropes_path = "../../data/tvtropes.clusters.tsv"


characterMetadata = pd.read_csv(character_metadata_path, sep="\t")
movieMetadata = pd.read_csv(movie_metadata_path, sep="\t")
nameCluster = pd.read_csv(
    name_cluster_path,
    sep="\t",
    names=["Character name", "Freebase character/actor map ID"],
)
plotSummaries = pd.read_csv(
    plot_summaries_path, sep="\t", names=["Wikipedia movie ID", "plot"]
)
tvtropes = pd.read_csv(tvtropes_path, sep="\t", names=["trope", "details"])
tvtropes = pd.concat(
    [tvtropes["trope"], tvtropes["details"].apply(json.loads).apply(pd.Series)], axis=1
)
tvtropes = tvtropes.rename(columns={"id": "Freebase character/actor map ID"})


In [None]:
print("Character Metadata")
display(characterMetadata.head())
print("Movie Metadata")
display(movieMetadata.head())
print("Name Cluster")
display(nameCluster.head())
print("Plot Summaries")
display(plotSummaries.head())
print("TV Tropes")
display(tvtropes.head())

In [66]:
movies = pd.merge(
    movieMetadata,
    characterMetadata,
    on=["Wikipedia movie ID", "Freebase movie ID"],
    how="inner",
)
movies = pd.merge(movies, plotSummaries, on="Wikipedia movie ID", how="inner")
# merge with tvtropes
# movies = pd.merge(movies, tvtropes, on="Freebase character/actor map ID", how="inner")

20k movies don't have any characters

We will not merge name clusters because there is a baseline bias - we would only consider movies that have been successful and have sequels

In [None]:
len(movies), len(movieMetadata), len(movies.groupby("Wikipedia movie ID").count())

In [69]:
# Add these imports
import numpy as np
from sklearn.cluster import KMeans
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer

# Preprocess and vectorize the plot text
tfidf = TfidfVectorizer(
    max_features=1000,  # Limit to top 1000 terms
    stop_words="english",
    ngram_range=(1, 2),  # Consider both single words and bigrams
    min_df=5,  # Ignore terms that appear in less than 5 documents
)

# Create document-term matrix
plot_features = tfidf.fit_transform(movies["plot"])

# Reduce dimensionality (optional but recommended for better clustering)
svd = TruncatedSVD(n_components=100)
plot_features_reduced = svd.fit_transform(plot_features)

# Cluster the movies
n_clusters = 10  # You can adjust this number
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
movies["cluster"] = kmeans.fit_predict(plot_features_reduced)


# Analyze the clusters
def get_top_terms_per_cluster():
    # Get the cluster centers in terms of the original TF-IDF features
    original_space_centroids = svd.inverse_transform(kmeans.cluster_centers_)

    for cluster in range(n_clusters):
        top_indices = np.argsort(original_space_centroids[cluster])[
            -10:
        ]  # Top 10 terms
        top_terms = [tfidf.get_feature_names_out()[i] for i in top_indices]
        print(f"\nCluster {cluster} top terms:")
        print(", ".join(top_terms))


# Display results
get_top_terms_per_cluster()


Cluster 0 top terms:
characters, follows, man, director, set, young, movie, life, story, film

Cluster 1 top terms:
daughter, girl, story, marry, married, life, marriage, falls, falls love, love

Cluster 2 top terms:
later, home, finds, goes, money, man, car, tells, house, police

Cluster 3 top terms:
crew, soldiers, world, men, captain, king, army, earth, ship, war

Cluster 4 top terms:
father, love, school, new, tells, elizabeth, john, mother, peter, mary

Cluster 5 top terms:
time, wife, love, man, life, home, tells, family, susan, david

Cluster 6 top terms:
time, friends, town, wife, jack, school, young, man, life, new

Cluster 7 top terms:
son, george, tells, house, home, ring, father, family, new, sam

Cluster 8 top terms:
husband, home, kids, child, mother, house, parents, wife, family, children

Cluster 9 top terms:
brother, old, house, home, life, daughter, son, mother, family, father
