In [2]:
import json

import pandas as pd

character_metadata_path = "../../data/character.metadata.tsv"
movie_metadata_path = "../../data/movie.metadata.tsv"
name_cluster_path = "../../data/name.clusters.tsv"
plot_summaries_path = "../../data/plot_summaries.tsv"
tvtropes_path = "../../data/tvtropes.clusters.tsv"


characterMetadata = pd.read_csv(character_metadata_path, sep="\t")
movieMetadata = pd.read_csv(movie_metadata_path, sep="\t")
nameCluster = pd.read_csv(
    name_cluster_path,
    sep="\t",
    names=["Character name", "Freebase character/actor map ID"],
)
plotSummaries = pd.read_csv(
    plot_summaries_path, sep="\t", names=["Wikipedia movie ID", "plot"]
)
tvtropes = pd.read_csv(tvtropes_path, sep="\t", names=["trope", "details"])
tvtropes = pd.concat(
    [tvtropes["trope"], tvtropes["details"].apply(json.loads).apply(pd.Series)], axis=1
)
tvtropes = tvtropes.rename(columns={"id": "Freebase character/actor map ID"})


In [3]:
print("Character Metadata")
display(characterMetadata.head())
print("Movie Metadata")
display(movieMetadata.head())
print("Name Cluster")
display(nameCluster.head())
print("Plot Summaries")
display(plotSummaries.head())
print("TV Tropes")
display(tvtropes.head())

Character Metadata


Unnamed: 0,Wikipedia movie ID,Freebase movie ID,Movie release date,Character name,Actor date of birth,Actor gender,Actor height (in meters),Actor ethnicity (Freebase ID),Actor name,Actor age at movie release,Freebase character/actor map ID,Freebase character ID,Freebase actor ID
0,975900,/m/03vyhn,2001-08-24,Akooshay,1958-08-26,F,1.62,,Wanda De Jesus,42.0,/m/0bgchxw,/m/0bgcj3x,/m/03wcfv7
1,975900,/m/03vyhn,2001-08-24,Lieutenant Melanie Ballard,1974-08-15,F,1.78,/m/044038p,Natasha Henstridge,27.0,/m/0jys3m,/m/0bgchn4,/m/0346l4
2,975900,/m/03vyhn,2001-08-24,Desolation Williams,1969-06-15,M,1.727,/m/0x67,Ice Cube,32.0,/m/0jys3g,/m/0bgchn_,/m/01vw26l
3,975900,/m/03vyhn,2001-08-24,Sgt Jericho Butler,1967-09-12,M,1.75,,Jason Statham,33.0,/m/02vchl6,/m/0bgchnq,/m/034hyc
4,975900,/m/03vyhn,2001-08-24,Bashira Kincaid,1977-09-25,F,1.65,,Clea DuVall,23.0,/m/02vbb3r,/m/0bgchp9,/m/01y9xg


Movie Metadata


Unnamed: 0,Wikipedia movie ID,Freebase movie ID,Movie name,Movie release date,Movie box office revenue,Movie runtime,Movie languages (Freebase ID:name tuples),Movie countries (Freebase ID:name tuples),Movie genres (Freebase ID:name tuples)
0,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science..."
1,3196793,/m/08yl5d,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16,,95.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/02n4kr"": ""Mystery"", ""/m/03bxz7"": ""Biograp..."
2,28463795,/m/0crgdbh,Brun bitter,1988,,83.0,"{""/m/05f_3"": ""Norwegian Language""}","{""/m/05b4w"": ""Norway""}","{""/m/0lsxr"": ""Crime Fiction"", ""/m/07s9rl0"": ""D..."
3,9363483,/m/0285_cd,White Of The Eye,1987,,110.0,"{""/m/02h40lc"": ""English Language""}","{""/m/07ssc"": ""United Kingdom""}","{""/m/01jfsb"": ""Thriller"", ""/m/0glj9q"": ""Erotic..."
4,261236,/m/01mrr1,A Woman in Flames,1983,,106.0,"{""/m/04306rv"": ""German Language""}","{""/m/0345h"": ""Germany""}","{""/m/07s9rl0"": ""Drama""}"


Name Cluster


Unnamed: 0,Character name,Freebase character/actor map ID
0,Stuart Little,/m/0k3w9c
1,Stuart Little,/m/0k3wcx
2,Stuart Little,/m/0k3wbn
3,John Doe,/m/0jyg35
4,John Doe,/m/0k2_zn


Plot Summaries


Unnamed: 0,Wikipedia movie ID,plot
0,23890098,"Shlykov, a hard-working taxi driver and Lyosha..."
1,31186339,The nation of Panem consists of a wealthy Capi...
2,20663735,Poovalli Induchoodan is sentenced for six yea...
3,2231378,"The Lemon Drop Kid , a New York City swindler,..."
4,595909,Seventh-day Adventist Church pastor Michael Ch...


TV Tropes


Unnamed: 0,trope,char,movie,Freebase character/actor map ID,actor
0,absent_minded_professor,Professor Philip Brainard,Flubber,/m/0jy9q0,Robin Williams
1,absent_minded_professor,Professor Keenbean,Richie Rich,/m/02vchl3,Michael McShane
2,absent_minded_professor,Dr. Reinhardt Lane,The Shadow,/m/0k6fkc,Ian McKellen
3,absent_minded_professor,Dr. Harold Medford,Them!,/m/0k6_br,Edmund Gwenn
4,absent_minded_professor,Daniel Jackson,Stargate,/m/0k3rhh,James Spader


In [4]:
movies = pd.merge(
    movieMetadata,
    characterMetadata,
    on=["Wikipedia movie ID", "Freebase movie ID"],
    how="inner",
)
movies = pd.merge(movies, plotSummaries, on="Wikipedia movie ID", how="inner")
# merge with tvtropes
# movies = pd.merge(movies, tvtropes, on="Freebase character/actor map ID", how="inner")

20k movies don't have any characters

We will not merge name clusters because there is a baseline bias - we would only consider movies that have been successful and have sequels

In [5]:
len(movies), len(movieMetadata), len(movies.groupby("Wikipedia movie ID").count())
#movies = movies.groupby("Wikipedia movie ID")

(308485, 81741, 37779)

In [5]:
# Add these imports
import numpy as np
from sklearn.cluster import KMeans
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer

# Preprocess and vectorize the plot text
tfidf = TfidfVectorizer(
    max_features=1000,  # Limit to top 1000 terms
    stop_words="english",
    ngram_range=(1, 2),  # Consider both single words and bigrams
    min_df=5,  # Ignore terms that appear in less than 5 documents
)

# Create document-term matrix
plot_features = tfidf.fit_transform(movies["plot"])

# Reduce dimensionality (optional but recommended for better clustering)
svd = TruncatedSVD(n_components=100)
plot_features_reduced = svd.fit_transform(plot_features)

# Cluster the movies
n_clusters = 20  # You can adjust this number
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
movies["cluster"] = kmeans.fit_predict(plot_features_reduced)


# Analyze the clusters
def get_top_terms_per_cluster():
    # Get the cluster centers in terms of the original TF-IDF features
    original_space_centroids = svd.inverse_transform(kmeans.cluster_centers_)

    for cluster in range(n_clusters):
        top_indices = np.argsort(original_space_centroids[cluster])[
            -10:
        ]  # Top 10 terms
        top_terms = [tfidf.get_feature_names_out()[i] for i in top_indices]
        print(f"\nCluster {cluster} top terms:")
        print(", ".join(top_terms))


# Display results
get_top_terms_per_cluster()

[WinError 2] The system cannot find the file specified
  File "c:\Users\eddli\anaconda3\Lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
               ^^^^^^^^^^^^^^^
  File "c:\Users\eddli\anaconda3\Lib\subprocess.py", line 548, in run
    with Popen(*popenargs, **kwargs) as process:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\eddli\anaconda3\Lib\subprocess.py", line 1026, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "c:\Users\eddli\anaconda3\Lib\subprocess.py", line 1538, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^



Cluster 0 top terms:
gets, film, kill, man, killed, member, members, leader, police, gang

Cluster 1 top terms:
later, time, night, finds, goes, day, man, home, tells, house

Cluster 2 top terms:
crime, killed, detective, man, nick, officer, case, killer, murder, police

Cluster 3 top terms:
house, brother, wife, home, life, daughter, son, mother, father, family

Cluster 4 top terms:
son, house, men, time, father, love, ring, family, new, sam

Cluster 5 top terms:
characters, follows, man, director, set, young, movie, life, story, film

Cluster 6 top terms:
family, apartment, death, car, wife, tells, home, night, house, paul

Cluster 7 top terms:
man, world, home, tells, wife, family, life, new, father, jack

Cluster 8 top terms:
home, new, job, away, death, wife, car, black, film, harry

Cluster 9 top terms:
new, make, man, help, film, tells, job, adam, home, mike

Cluster 10 top terms:
friends, time, old, new, wife, town, school, man, life, young

Cluster 11 top terms:
new, police, 

In [6]:
print(len(movies))


308485


In [7]:
import wikipediaapi
import pandas as pd
import re

# Initialize Wikipedia API with a user-agent
wiki_wiki = wikipediaapi.Wikipedia(
    language='en',
    user_agent="MovieDataRetriever/1.0 (Contact: eddlimis@gmail.com)"
)


In [None]:
#movies = movies.groupby("Wikipedia movie ID")
#movies.head()



: 

In [11]:

# Function to retrieve box office revenue
def get_box_office(wiki_id):
    page = wiki_wiki.page(wiki_id)
    if page.exists():
        content = page.text
        # Use regex to find box office revenue details
        match = re.search(r'Box office\s*\n?\s*([^\n]+)', content)
        if match:
            print('done')
            return match.group(1).strip()
        else:
            return "Box office info not found"
    else:
        return "Page does not exist"

# Apply the function to fill in missing "Movie box office revenue" values
movies['Movie box office revenue'] = movies[:30].apply(
    lambda row: row['Movie box office revenue'] if pd.notna(row['Movie box office revenue']) else get_box_office(row['Wikipedia movie ID']),
    axis=1
)

# Display the DataFrame with updated box office revenue
display(movies)



Unnamed: 0,Wikipedia movie ID,Freebase movie ID,Movie name,Movie release date_x,Movie box office revenue,Movie runtime,Movie languages (Freebase ID:name tuples),Movie countries (Freebase ID:name tuples),Movie genres (Freebase ID:name tuples),Movie release date_y,...,Actor date of birth,Actor gender,Actor height (in meters),Actor ethnicity (Freebase ID),Actor name,Actor age at movie release,Freebase character/actor map ID,Freebase character ID,Freebase actor ID,plot
0,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science...",2001-08-24,...,1958-08-26,F,1.620,,Wanda De Jesus,42.0,/m/0bgchxw,/m/0bgcj3x,/m/03wcfv7,"Set in the second half of the 22nd century, th..."
1,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science...",2001-08-24,...,1974-08-15,F,1.780,/m/044038p,Natasha Henstridge,27.0,/m/0jys3m,/m/0bgchn4,/m/0346l4,"Set in the second half of the 22nd century, th..."
2,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science...",2001-08-24,...,1969-06-15,M,1.727,/m/0x67,Ice Cube,32.0,/m/0jys3g,/m/0bgchn_,/m/01vw26l,"Set in the second half of the 22nd century, th..."
3,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,Page does not exist,98.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science...",2001-08-24,...,1967-09-12,M,1.750,,Jason Statham,33.0,/m/02vchl6,/m/0bgchnq,/m/034hyc,"Set in the second half of the 22nd century, th..."
4,975900,/m/03vyhn,Ghosts of Mars,2001-08-24,Page does not exist,98.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science...",2001-08-24,...,1977-09-25,F,1.650,,Clea DuVall,23.0,/m/02vbb3r,/m/0bgchp9,/m/01y9xg,"Set in the second half of the 22nd century, th..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
308480,12476867,/m/02w7zz8,Spliced,2002,,86.0,"{""/m/02h40lc"": ""English Language""}","{""/m/0d060g"": ""Canada""}","{""/m/01jfsb"": ""Thriller"", ""/m/03npn"": ""Horror""...",2002,...,,,,,Billy Morton,,/m/0gchkcy,,/m/0gc4lfm,The movie is about a teenage girl who loves ho...
308481,12476867,/m/02w7zz8,Spliced,2002,,86.0,"{""/m/02h40lc"": ""English Language""}","{""/m/0d060g"": ""Canada""}","{""/m/01jfsb"": ""Thriller"", ""/m/03npn"": ""Horror""...",2002,...,1982-01-28,,,,Andrea Runge,19.0,/m/0gckh4f,,/m/0gbx_rk,The movie is about a teenage girl who loves ho...
308482,12476867,/m/02w7zz8,Spliced,2002,,86.0,"{""/m/02h40lc"": ""English Language""}","{""/m/0d060g"": ""Canada""}","{""/m/01jfsb"": ""Thriller"", ""/m/03npn"": ""Horror""...",2002,...,,F,,,Wendy Anderson,,/m/0gcp8fv,,/m/0gby01h,The movie is about a teenage girl who loves ho...
308483,12476867,/m/02w7zz8,Spliced,2002,,86.0,"{""/m/02h40lc"": ""English Language""}","{""/m/0d060g"": ""Canada""}","{""/m/01jfsb"": ""Thriller"", ""/m/03npn"": ""Horror""...",2002,...,,,,,Ariel Bastian,,/m/0gdkb51,,/m/0gdkb55,The movie is about a teenage girl who loves ho...
