<a href="https://colab.research.google.com/github/gomescheelsy03/INST-414-0101-/blob/main/W5_in_class_exercise_CGomes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Name: Chelsy Gomes
# Instructor: Cody Buntain
# Course: INST414 (0101)
# Date: 10/03/2025
# Assignment: In-Class Exercise Week 5

Exercise 1. Finding Similar Actors based on Genre

In [2]:
from collections import defaultdict, Counter
import pandas as pd
import numpy as np
import json
from sklearn.metrics import DistanceMetric


In [3]:
# Lookup from actor id to display name, filled in while the movie file is read.
actor_name_map = {}
# Per-actor tally of genre appearances: actor id -> Counter keyed by genre.
actor_genre_counts = defaultdict(Counter)
# Every distinct genre seen in the file; later sorted to fix the feature-column order.
all_genres = set()


In [4]:
# Location of the movie dump; the loop below parses it as one JSON object per line.
file_path = "/content/imdb_movies_2000to2022.prolific.json"

# Decode explicitly as UTF-8 so actor names with non-ASCII characters are read
# the same way on every platform (matches the co-star loader further down).
with open(file_path, "r", encoding="utf-8") as in_file:
    for line in in_file:
        # A blank line (e.g. a trailing newline at end of file) is not valid
        # JSON; skip it instead of aborting the whole load.
        if not line.strip():
            continue

        movie = json.loads(line)
        # `or []` also covers keys that are present but null.
        genres = movie.get("genres", []) or []
        actors = movie.get("actors", []) or []  # list of [actor_id, actor_name]

        all_genres.update(genres)

        for actor_id, actor_name in actors:
            actor_name_map[actor_id] = actor_name
            # Keep the explicit loop: it only touches the defaultdict when the
            # movie has at least one genre, so actors that appear solely in
            # genre-less movies never get an all-zero row in the feature table.
            for g in genres:
                actor_genre_counts[actor_id][g] += 1


In [5]:
# Fix a deterministic feature order: one column per genre, alphabetically.
genres_sorted = sorted(all_genres)

# One row per actor id, one column per genre, cell = number of that actor's
# movies tagged with the genre. Passing `columns=` labels the features with
# the genre names instead of anonymous positions 0..n-1, so the frame is
# readable when inspected; the underlying values are unchanged.
df = pd.DataFrame.from_dict(
    {aid: [actor_genre_counts[aid].get(g, 0) for g in genres_sorted]
     for aid in actor_genre_counts},
    orient="index",
    columns=genres_sorted,
).astype(int)

# Display names aligned to df's row index; falls back to the raw id when an
# actor has no recorded name.
actor_names = pd.Series({aid: actor_name_map.get(aid, aid) for aid in df.index}, name="name")


In [6]:
# IMDb id of the query actor, Chris Hemsworth.
query_actor_id = "nm1165110"
# Stop early if this id has no row in the genre table, since the distance step looks it up in df.
if query_actor_id not in df.index:
    raise ValueError("Not found in the dataset")


In [7]:
# Straight-line (L2) distance between genre-count vectors.
dist = DistanceMetric.get_metric("euclidean")

# Full feature matrix (one row per actor) and the query actor's row.
# Selecting with a list keeps the query as a 2-D, single-row array, which is
# the shape `pairwise` expects.
all_vecs = df.values
query_vec = df.loc[[query_actor_id], :].values

# `pairwise` yields a 1 x n_actors matrix; flatten it to one distance per actor
# and label each distance with the corresponding actor id.
distances = dist.pairwise(query_vec, all_vecs).reshape(-1)
dist_series = pd.Series(data=distances, index=df.index)

# Remove the query actor (distance 0 to itself) and keep the ten closest others.
top10 = dist_series.drop(query_actor_id).sort_values(ascending=True).iloc[:10]


In [8]:
# Tabulate the nearest neighbours: display name, actor id and distance.
result = (
    pd.DataFrame(
        {
            "actor_name": actor_names.loc[top10.index].values,
            "actor_id": top10.index,
            "euclidean_distance": top10.values,
        }
    )
    .reset_index(drop=True)
)

# Build the headline from the query id rather than hard-coding the actor, so
# the report stays correct if `query_actor_id` is changed. Falls back to the
# raw id when no name was recorded for it.
query_actor_name = actor_name_map.get(query_actor_id, query_actor_id)
print(f"Top 10 actors most similar to {query_actor_name} ({query_actor_id}) by genre profile (Euclidean):")
print(result.to_string(index=False))


Top 10 actors most similar to Chris Hemsworth (nm1165110) by genre profile (Euclidean):
    actor_name  actor_id  euclidean_distance
    Tom Cruise nm0000129            5.477226
  Henry Cavill nm0147147            8.246211
 Tyrese Gibson nm0879085            8.831761
 Orlando Bloom nm0089217            9.055385
    Vin Diesel nm0004874            9.219544
Angelina Jolie nm0001401            9.219544
   Adrian Paul nm0001600            9.591663
        Jet Li nm0001472            9.591663
 Mark Dacascos nm0001092            9.797959
 Lorenzo Lamas nm0001444           10.049876


Extra Practice. Finding Similar Actors based on Co-stars

In [3]:
from collections import defaultdict, Counter
import pandas as pd
import numpy as np
import json
from sklearn.metrics import DistanceMetric


In [4]:

# Location of the movie data file that the following cell opens and parses line by line as JSON.
file_path = "/content/imdb_movies_2000to2022.prolific.json"


In [5]:
# actor id -> display name
actor_name_map = {}
# Symmetric co-appearance tally: costar_counts[a][b] is the number of movies
# in which actors a and b are both listed.
costar_counts  = defaultdict(Counter)

with open(file_path, "r", encoding="utf-8") as f:
    for line in f:
        # A blank line (e.g. a trailing newline at end of file) is not valid
        # JSON; skip it instead of aborting the whole load.
        if not line.strip():
            continue

        movie  = json.loads(line)
        actors = movie.get("actors", []) or []  # list of [actor_id, actor_name]

        for aid, aname in actors:
            actor_name_map[aid] = aname

        # De-duplicate the cast while keeping its order. Without this, an actor
        # listed twice for one movie would be counted as their own co-star and
        # would double-count every pairing with the rest of the cast.
        ids = list(dict.fromkeys(aid for aid, _ in actors))

        # Visit every unordered pair once and record it in both directions.
        for i, ai in enumerate(ids):
            for aj in ids[i + 1:]:
                costar_counts[ai][aj] += 1
                costar_counts[aj][ai] += 1


In [6]:
query_actor_id = "nm0424060"   # Scarlett Johansson

if query_actor_id not in costar_counts:
    raise ValueError("Scarlett Johansson not found in dataset.")

# Use Scarlett’s co-stars as features (sorted so the column order is deterministic)
query_costars = costar_counts[query_actor_id]  # safe: presence was checked above
features = sorted(query_costars)

# Scarlett’s vector, shaped as a single-row 2-D array for the pairwise distance call
query_vec = np.array([query_costars.get(f, 0) for f in features]).reshape(1, -1)

# Other actors’ vectors
rows = []
actor_ids = []
for aid in actor_name_map:
    if aid == query_actor_id:
        continue
    # Look up with .get() instead of indexing: indexing a defaultdict inserts
    # an empty Counter for every actor that has no co-stars, which silently
    # grows `costar_counts` and makes the "not in costar_counts" guard above
    # pass for such actors on later runs.
    aid_costars = costar_counts.get(aid, {})
    vec = [aid_costars.get(f, 0) for f in features]
    if any(vec):   # skip if no overlap at all
        rows.append(vec)
        actor_ids.append(aid)

all_vecs = np.array(rows)


In [7]:
# Straight-line (L2) distance between co-star count vectors.
dist = DistanceMetric.get_metric("euclidean")

# `pairwise` returns a 1 x n_candidates matrix; flatten it to one distance per
# candidate and label each entry with the candidate's actor id.
distances = dist.pairwise(query_vec, all_vecs).reshape(-1)
dist_series = pd.Series(data=distances, index=actor_ids)


In [8]:
# Ten candidates with the smallest distance to the query actor.
top10 = dist_series.sort_values(ascending=True).iloc[:10]

# Tabulate them: display name, actor id and distance, on a fresh 0..n-1 index.
result = pd.DataFrame(
    {
        "actor_name": [actor_name_map[neighbor_id] for neighbor_id in top10.index],
        "actor_id": top10.index,
        "euclidean_distance": top10.values,
    }
).reset_index(drop=True)

# Headline is derived from the query id, followed by the table without its row index.
print(f"Top 10 actors most similar to {actor_name_map[query_actor_id]} ({query_actor_id}) by co-star profile (Euclidean):")
print(result.to_string(index=False))


Top 10 actors most similar to Scarlett Johansson (nm0424060) by co-star profile (Euclidean):
       actor_name  actor_id  euclidean_distance
     Mark Ruffalo nm0749263           10.198039
Samuel L. Jackson nm0000168           10.583005
  Chris Hemsworth nm1165110           10.677078
Robert Downey Jr. nm0000375           10.862780
    Nicole Kidman nm0000173           10.862780
Michelle Monaghan nm1157358           10.908712
      Colin Firth nm0000147           10.908712
  Jake Gyllenhaal nm0350453           10.908712
      Chris Evans nm0262635           10.954451
        Ed Harris nm0000438           10.954451
