In [31]:
import pandas as pd
import matplotlib.pyplot as plt
import pickle as p
import networkx as nx
import matplotlib.pyplot as plt
from ActorNetwork import ActorNetwork
import time
import numpy as np
import time, sys, pickle
from collections import defaultdict, deque

# Inclusive release-year window of the titles to keep.
min_year = 2020
max_year = 2026

# Load only the columns needed from the IMDb title table, everything as text.
# keep_default_na=False makes "\N" the ONLY missing-value marker; with the
# pandas defaults, genuine titles such as "NA", "NULL" or "nan" would be
# silently converted to NaN as well.
basics = pd.read_csv(
    "data/title.basics.tsv",
    sep="\t",
    dtype=str,
    na_values="\\N",
    keep_default_na=False,
    usecols=["tconst", "titleType", "primaryTitle", "originalTitle", "startYear"],
)

# Keep feature films and TV movies only.
basics = basics[basics["titleType"].isin(["movie", "tvMovie"])].copy()

# Non-numeric / missing years become NaN and are dropped by the range test
# below (NaN never satisfies between()).
basics["startYear"] = pd.to_numeric(basics["startYear"], errors="coerce")
basics = basics[basics["startYear"].between(min_year, max_year)].copy()

# The "*_norm" columns are currently plain copies of the raw titles
# (no normalisation is applied here).
basics["primary_norm"] = basics["primaryTitle"]
basics["original_norm"] = basics["originalTitle"]

print("IMDb basics rows after filter:", len(basics))


IMDb basics rows after filter: 134268


In [32]:
# -----------------------------
# Load IMDb AKAS (filtered)
# -----------------------------
# keep_default_na=False: only "\N" means "missing". Without it pandas also
# treats alternate titles like "NA", "NULL" or "nan" as NaN and they would
# never make it into the title index.
akas = pd.read_csv(
    "data/title.akas.tsv",
    sep="\t",
    dtype=str,
    na_values="\\N",
    keep_default_na=False,
    usecols=["titleId", "title"],
)

# Keep alternate titles only for the titles that survived the basics filter.
akas = akas[akas["titleId"].isin(set(basics["tconst"]))].copy()

# "aka_norm" is currently a plain copy of the raw alternate title.
akas["aka_norm"] = akas["title"]

print("IMDb akas rows after filter:", len(akas))

# -----------------------------
# Load CREW + NAMES
# -----------------------------
# keep_default_na=False: "\N" is the only missing-value marker, so values
# that merely look like pandas' default NA tokens (e.g. a name "NA") are
# kept as text instead of being turned into NaN.
crew = pd.read_csv(
    "data/title.crew.tsv",
    sep="\t",
    dtype=str,
    na_values="\\N",
    keep_default_na=False,
)
names = pd.read_csv(
    "data/name.basics.tsv",
    sep="\t",
    dtype=str,
    na_values="\\N",
    keep_default_na=False,
)

# "name_norm" is currently a plain copy of primaryName.
names["name_norm"] = names["primaryName"]

# nconst -> display name lookup.
name_map = dict(zip(names["nconst"], names["name_norm"]))

# Build tconst -> list of director names. Titles without directors get an
# empty list ("".split(",") yields [""] which is not a key of name_map);
# director ids missing from name_map are skipped.
crew["directors"] = crew["directors"].fillna("")
director_map = {
    tconst: [name_map[d] for d in dstr.split(",") if d in name_map]
    for tconst, dstr in zip(crew["tconst"], crew["directors"])
}

# -----------------------------
# FAST Title Index Construction
# -----------------------------
# Maps a title string to the set of title ids registered under it.
title_index = {}

def add(k, v):
    """Register id ``v`` under title ``k`` in the module-level ``title_index``.

    Missing titles are ignored: empty strings, ``None`` and NaN. NaN has to be
    tested explicitly because it is truthy, so a bare ``if k:`` would create a
    NaN key in the index for every title read as missing.

    Args:
        k: title text used as the index key.
        v: identifier to add to the set stored under ``k``.
    """
    if not k or pd.isna(k):
        return
    title_index.setdefault(k, set()).add(v)

# Feed every (title, id) pair into the index, in this order:
# primary titles, original titles, then alternate (AKA) titles.
title_sources = (
    (basics["primary_norm"], basics["tconst"]),
    (basics["original_norm"], basics["tconst"]),
    (akas["aka_norm"], akas["titleId"]),
)

for title_column, id_column in title_sources:
    for t, tc in zip(title_column, id_column):
        add(t, tc)

print("Title index size:", len(title_index))

IMDb akas rows after filter: 674704
Title index size: 284827


In [33]:
# Echo the crew frame as the cell's output to inspect its columns.
crew

Unnamed: 0,tconst,directors,writers
0,tt0000001,nm0005690,
1,tt0000002,nm0721526,
2,tt0000003,nm0721526,nm0721526
3,tt0000004,nm0721526,
4,tt0000005,nm0005690,
...,...,...,...
12183680,tt9916848,nm1485677,"nm9187127,nm1485677,nm9826385,nm9299459,nm1628284"
12183681,tt9916850,nm1485677,"nm9187127,nm1485677,nm9826385,nm1628284"
12183682,tt9916852,nm1485677,"nm9187127,nm1485677,nm9826385,nm9299459,nm1628284"
12183683,tt9916856,nm10538645,nm6951431


In [34]:
# Echo the names frame as the cell's output to inspect its columns.
names

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles,name_norm
0,nm0000001,Fred Astaire,1899,1987,"actor,miscellaneous,producer","tt0072308,tt0050419,tt0027125,tt0025164",Fred Astaire
1,nm0000002,Lauren Bacall,1924,2014,"actress,soundtrack,archive_footage","tt0037382,tt0075213,tt0038355,tt0117057",Lauren Bacall
2,nm0000003,Brigitte Bardot,1934,,"actress,music_department,producer","tt0057345,tt0049189,tt0056404,tt0054452",Brigitte Bardot
3,nm0000004,John Belushi,1949,1982,"actor,writer,music_department","tt0072562,tt0077975,tt0080455,tt0078723",John Belushi
4,nm0000005,Ingmar Bergman,1918,2007,"writer,director,actor","tt0050986,tt0069467,tt0083922,tt0050976",Ingmar Bergman
...,...,...,...,...,...,...,...
14768107,nm9993714,Romeo del Rosario,,,"animation_department,art_department","tt11657662,tt14069590,tt2455546",Romeo del Rosario
14768108,nm9993716,Essias Loberg,,,,,Essias Loberg
14768109,nm9993717,Harikrishnan Rajan,,,cinematographer,tt8736744,Harikrishnan Rajan
14768110,nm9993718,Aayush Nair,,,cinematographer,tt8736744,Aayush Nair


In [35]:
# Ten most frequent raw knownForTitles values. Going through repr() turns
# missing entries into the text 'nan', so they are counted as their own bucket
# instead of being dropped by value_counts().
names["knownForTitles"].map(repr).value_counts().head(10)


knownForTitles
nan             1733359
'tt0123338'        8235
'tt22014400'       7492
'tt4202558'        7290
'tt6168110'        6347
'tt0486535'        5115
'tt0441074'        4875
'tt0072584'        4760
'tt11874658'       4687
'tt0159881'        4377
Name: count, dtype: int64

In [46]:
# Keep only people that have a usable knownForTitles value:
# neither missing nor blank after stripping whitespace.
known_for = names["knownForTitles"]
has_known_titles = known_for.notna() & known_for.astype(str).str.strip().ne("")
names = names[has_known_titles]
names

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles,name_norm
0,nm0000001,Fred Astaire,1899,1987,"actor,miscellaneous,producer","tt0072308,tt0050419,tt0027125,tt0025164",Fred Astaire
1,nm0000002,Lauren Bacall,1924,2014,"actress,soundtrack,archive_footage","tt0037382,tt0075213,tt0038355,tt0117057",Lauren Bacall
2,nm0000003,Brigitte Bardot,1934,,"actress,music_department,producer","tt0057345,tt0049189,tt0056404,tt0054452",Brigitte Bardot
3,nm0000004,John Belushi,1949,1982,"actor,writer,music_department","tt0072562,tt0077975,tt0080455,tt0078723",John Belushi
4,nm0000005,Ingmar Bergman,1918,2007,"writer,director,actor","tt0050986,tt0069467,tt0083922,tt0050976",Ingmar Bergman
...,...,...,...,...,...,...,...
14768102,nm9993709,Lu Bevins,,,"producer,writer,director","tt17717854,tt11772904,tt11772812,tt11697102",Lu Bevins
14768106,nm9993713,Sambit Mishra,,,"writer,producer","tt20319332,tt27191658,tt10709066,tt15134202",Sambit Mishra
14768107,nm9993714,Romeo del Rosario,,,"animation_department,art_department","tt11657662,tt14069590,tt2455546",Romeo del Rosario
14768109,nm9993717,Harikrishnan Rajan,,,cinematographer,tt8736744,Harikrishnan Rajan


In [37]:
# Echo the filtered basics frame as the cell's output.
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,startYear,primary_norm,original_norm
61097,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,2020.0,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante
69136,tt0070596,movie,Socialist Realism,El realismo socialista,2023.0,Socialist Realism,El realismo socialista
76036,tt0077684,movie,Histórias de Combóios em Portugal,Histórias de Combóios em Portugal,2022.0,Histórias de Combóios em Portugal,Histórias de Combóios em Portugal
91042,tt0093119,movie,Grizzly II: Revenge,Grizzly II: The Predator,2020.0,Grizzly II: Revenge,Grizzly II: The Predator
95574,tt0097767,movie,Loading Ludwig,Loading Ludwig,2022.0,Loading Ludwig,Loading Ludwig
...,...,...,...,...,...,...,...
12177666,tt9914192,movie,No Gogó do Paulinho,No Gogó do Paulinho,2020.0,No Gogó do Paulinho,No Gogó do Paulinho
12178026,tt9914972,movie,Blind Ambition,Blind Ambition,2021.0,Blind Ambition,Blind Ambition
12178554,tt9916190,movie,Safeguard,Safeguard,2020.0,Safeguard,Safeguard
12178593,tt9916270,movie,Il talento del calabrone,Il talento del calabrone,2020.0,Il talento del calabrone,Il talento del calabrone


In [38]:
# List the distinct title types still present in basics.
basics["titleType"].unique()

array(['movie', 'tvMovie'], dtype=object)

In [39]:
# Narrow basics down to rows whose titleType is exactly "movie".
basics = basics.loc[basics["titleType"].eq("movie")]




In [40]:
# Echo basics again to check the result of the movie-only filter.
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,startYear,primary_norm,original_norm
61097,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,2020.0,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante
69136,tt0070596,movie,Socialist Realism,El realismo socialista,2023.0,Socialist Realism,El realismo socialista
76036,tt0077684,movie,Histórias de Combóios em Portugal,Histórias de Combóios em Portugal,2022.0,Histórias de Combóios em Portugal,Histórias de Combóios em Portugal
91042,tt0093119,movie,Grizzly II: Revenge,Grizzly II: The Predator,2020.0,Grizzly II: Revenge,Grizzly II: The Predator
95574,tt0097767,movie,Loading Ludwig,Loading Ludwig,2022.0,Loading Ludwig,Loading Ludwig
...,...,...,...,...,...,...,...
12177666,tt9914192,movie,No Gogó do Paulinho,No Gogó do Paulinho,2020.0,No Gogó do Paulinho,No Gogó do Paulinho
12178026,tt9914972,movie,Blind Ambition,Blind Ambition,2021.0,Blind Ambition,Blind Ambition
12178554,tt9916190,movie,Safeguard,Safeguard,2020.0,Safeguard,Safeguard
12178593,tt9916270,movie,Il talento del calabrone,Il talento del calabrone,2020.0,Il talento del calabrone,Il talento del calabrone


In [51]:
# Set of tconst ids of every row in basics typed as "movie"
# (a set gives constant-time membership tests).
movie_tconsts = set(basics[basics["titleType"].eq("movie")]["tconst"])
def known_titles_all_in_basics(kft):
    """Tell whether every id in a comma-separated list is in ``movie_tconsts``.

    Args:
        kft: comma-separated title ids, or a missing value.

    Returns:
        False for a missing value; otherwise True only when each id of the
        list belongs to the module-level ``movie_tconsts`` set.
    """
    if pd.isna(kft):
        return False
    # The id list is fully covered exactly when the movie set is a superset.
    return movie_tconsts.issuperset(kft.split(","))
# People whose known-for titles ALL belong to the selected movie set.
names_filtered = names.loc[names["knownForTitles"].map(known_titles_all_in_basics)]


In [52]:
# Echo the filtered people frame as the cell's output.
names_filtered

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles,name_norm
4507,nm0004527,Daniel J. Heffner,1956,,"producer,executive,production_manager","tt11301886,tt5834874,tt13655328,tt21807222",Daniel J. Heffner
7002,nm0007030,Dean Thompson,,,camera_department,"tt22687790,tt12789558,tt2382320,tt2049403",Dean Thompson
7010,nm0007038,Matthew A. Petrosky,,,"camera_department,cinematographer,producer","tt9777666,tt10872600,tt9114286,tt20969586",Matthew A. Petrosky
12968,nm0013335,Jorge Aldama,,,"actor,director,producer","tt36593465,tt36593056,tt36592997,tt36591278",Jorge Aldama
16798,nm0017385,Julio Aldama Zaizar,,,actor,"tt32276885,tt36593104,tt33333180,tt30182410",Julio Aldama Zaizar
...,...,...,...,...,...,...,...
14767982,nm9993573,Lakisha Louissaint,,,"writer,casting_department,director",tt10299418,Lakisha Louissaint
14768047,nm9993648,Kirby Devon,,,actor,tt4460424,Kirby Devon
14768075,nm9993681,Saiya Palmer,,,actress,tt8295580,Saiya Palmer
14768083,nm9993689,Chris Bailey,,,miscellaneous,tt31850564,Chris Bailey


In [47]:
#basics["startYear"]==2020
#movies2020 = basics[basics["startYear"]==2020]

In [48]:
#movies2020

In [50]:
#crew

In [1]:
# =========================
# BUILD MOVIE → ACTORS LOOKUP
# =========================
# Maps each title id to a deque of the nconst ids that list it in
# knownForTitles, in the row order of names_filtered.
# NOTE(review): names_filtered is not restricted by profession in this
# notebook, so the "actors" may include other crew - confirm intent.
movie_to_actors = defaultdict(deque)

# Iterate the two needed columns directly: iterrows() builds a Series per row
# and is far slower on a frame of this size, with the same result.
for nconst, known_for in zip(names_filtered["nconst"], names_filtered["knownForTitles"]):
    if pd.isna(known_for):
        continue
    for movie in known_for.split(","):
        movie_to_actors[movie].append(nconst)

NameError: name 'defaultdict' is not defined

In [2]:
# Preview a handful of entries only: echoing the whole mapping would write
# every movie and its full id queue into the notebook output.
{movie: list(people) for movie, people in list(movie_to_actors.items())[:5]}

NameError: name 'movie_to_actors' is not defined

In [None]:
# Feed every movie that has at least two people into a fresh ActorNetwork,
# reporting progress every 20000 movies added.
an = ActorNetwork()
counter = 0

for movie, actors in movie_to_actors.items():
    # Movies with a single person are skipped.
    if len(actors) <= 1:
        continue

    # Hand over a copy so the lookup's own deque is not shared with the network.
    an.addMovie(movie, deque(actors))
    counter += 1

    if not counter % 20000:
        print(f"{counter} movies processed")

In [None]:
# Echo the ActorNetwork object as the cell's output.
an