In [1]:
import json
import pandas as pd

# Career Type Classification

The goal of this notebook is to classify actors into different movie categories (Romance, Action, Comedy...) based on the movies they are most known for.

## Loading the data

We first load the dataset of the 10k most popular people from the TMDB database, the data was fetched in a previous notebook.

In [3]:
json_data = json.load(open('../Data/tmdb_resources/tmdb_actors_db.json'))
tmdb_actors = pd.json_normalize(json_data['results'], sep='_')

# The column 'known_for' is a list of dictionaries. For each row, we wish to only keep the movies known for, not the media types == tv
tmdb_actors['known_for'] = tmdb_actors['known_for'].apply(lambda x: [d for d in x if d['media_type'] == 'movie'])

tmdb_actors

Unnamed: 0,adult,gender,id,known_for_department,name,original_name,popularity,profile_path,known_for
0,False,0,3234630,Acting,Sangeeth Shobhan,Sangeeth Shobhan,226.892,/7Vox31bH7XmgPNJzMKGa4uGyjW8.jpg,"[{'adult': False, 'backdrop_path': '/jBnnkkXRZ..."
1,False,2,64,Acting,Gary Oldman,Gary Oldman,220.449,/hHP0769L7YrApkMKCOyb9cwgxBW.jpg,"[{'adult': False, 'backdrop_path': '/nMKdUUepR..."
2,False,1,3194176,Acting,Angeli Khang,Angeli Khang,199.449,/7vrTWF8PxQogF6o9ORZprYQoDOr.jpg,"[{'adult': False, 'backdrop_path': '/27bkw4o1z..."
3,False,1,1373737,Acting,Florence Pugh,Florence Pugh,176.589,/421cSReX2Fktldac8SyY2k0yLwY.jpg,"[{'adult': False, 'backdrop_path': '/aAM3cQmYG..."
4,False,2,976,Acting,Jason Statham,Jason Statham,162.466,/whNwkEQYWLFJA8ij0WyOOAD5xhQ.jpg,"[{'adult': False, 'backdrop_path': '/ysKahAEPP..."
...,...,...,...,...,...,...,...,...,...
9975,False,1,1288047,Acting,Alice Isaaz,Alice Isaaz,14.133,/yULL4NbQW3ymzB2lHcI7SVlb7dS.jpg,"[{'adult': False, 'backdrop_path': '/vzcJQORoL..."
9976,False,2,19540,Acting,Peter Cullen,Peter Cullen,14.133,/9Snf4fBUkk5MrAjqtNtgZRJYJbj.jpg,"[{'adult': False, 'backdrop_path': '/2vFuG6bWG..."
9977,False,1,18465,Acting,Mary Crosby,Mary Crosby,14.133,/xMa1A3XncRLo7i1ACUzsTUyFBqy.jpg,"[{'adult': False, 'backdrop_path': '/eCebbqmTs..."
9978,False,2,110665,Acting,Daisuke Namikawa,Daisuke Namikawa,14.131,/iw0X8oDutxaBAri3Ifga8nhdUJK.jpg,[]


## Actor Genre Dataframe

We now construct a new dataframe from the oaded dataset that will store the name of the actors along with their most associated genres

In [4]:
tmdb_actors_name = pd.DataFrame(tmdb_actors['name'].unique(), columns=['name'])

To do so, we inspect the json format of a 'known_for' entry in the dataset, this entry reveals metadata info of the movies the actor is most known for

In [5]:
tmdb_actors['known_for'].loc[0]

[{'adult': False,
  'backdrop_path': '/jBnnkkXRZ0pV3Tw31Z2ALO638wA.jpg',
  'id': 1187075,
  'title': 'MAD',
  'original_language': 'te',
  'original_title': 'MAD',
  'overview': 'Set in an engineering college and revolves around the antics of the students there, primarily the boys, who get a kick out of torturing the hostel warden.',
  'poster_path': '/nDpOmgBfQZwOpFBcgokQGqd74r1.jpg',
  'media_type': 'movie',
  'genre_ids': [35, 10749, 18],
  'popularity': 8.439,
  'release_date': '2023-10-06',
  'video': False,
  'vote_average': 7.0,
  'vote_count': 4},
 {'adult': False,
  'backdrop_path': '/d7jfcyPb5ZncLyhPFNjpuIeeZ1y.jpg',
  'id': 1119091,
  'title': 'Prema Vimanam',
  'original_language': 'te',
  'original_title': 'ప్రేమ విమానం',
  'overview': 'Two kids with a dream to board a flight cross paths with a young couple who must urgently catch the flight to start a new life.',
  'poster_path': '/9eljOANAd6HafUDdmp3xnmkpnt8.jpg',
  'media_type': 'movie',
  'genre_ids': [18, 35],
  'popu

From inspecting the entries, we see that we will need to extract the genre_ids, the title of the movies and their popularity, displayed as a list for each actor

In [6]:
# Extracting 'genre_ids' from 'known_for' and adding it as a list to the DataFrame
tmdb_actors_name['genre_ids'] = tmdb_actors['known_for'].apply(lambda x: [movie['genre_ids'] for movie in x] if x else [])
#Extracting 'title' from known_for and adding it as a list to the DataFrame
tmdb_actors_name['movies'] = tmdb_actors['known_for'].apply(lambda x: [movie['title'] for movie in x] if x else [])
#Extracting popularity from known_for and adding it as a list to the DataFrame
tmdb_actors_name['popularity'] = tmdb_actors['known_for'].apply(lambda x: [movie['popularity'] for movie in x] if x else [])


In [7]:
display(tmdb_actors_name)

Unnamed: 0,name,genre_ids,movies,popularity
0,Sangeeth Shobhan,"[[35, 10749, 18], [18, 35]]","[MAD, Prema Vimanam]","[8.439, 5.124]"
1,Gary Oldman,"[[18, 28, 80, 53], [28, 80, 18, 53], [18, 36]]","[The Dark Knight, The Dark Knight Rises, Darke...","[127.121, 78.615, 32.352]"
2,Angeli Khang,"[[18, 53], [18], [18, 10749]]","[Silip Sa Apoy, Selina's Gold, Eva]","[31.85, 26.984, 31.232]"
3,Florence Pugh,"[[27, 18, 9648], [28, 12, 878], [18, 10749]]","[Midsommar, Black Widow, Little Women]","[53.79, 89.461, 36.364]"
4,Jason Statham,"[[80, 35], [28, 878, 27], [28, 80, 53]]","[Snatch, The Meg, The Transporter]","[36.288, 101.3, 34.472]"
...,...,...,...,...
9968,Alice Isaaz,"[[16, 10749, 18]]",[Flavors of Youth],[17.76]
9969,Peter Cullen,"[[27], [18], [878, 53, 18, 80]]","[Hostel: Part III, Whip It, The Butterfly Effe...","[32.287, 12.522, 16.667]"
9970,Mary Crosby,"[[12, 35, 10751, 878], [18, 9648, 53]]","[Honey, I Blew Up the Kid, Missing]","[17.743, 12.992]"
9971,Daisuke Namikawa,"[[12, 18, 35], [35], [9648, 18, 35, 80]]","[Fear and Loathing in Las Vegas, The New Guy, ...","[26.421, 15.943, 14.843]"


We then build a dictionary to map the genre ids to their respective genre values

In [8]:
# Get the list of possible movie genres

import requests
headers = {
    "accept": "application/json",
    "Authorization": "Bearer eyJhbGciOiJIUzI1NiJ9.eyJhdWQiOiIyODI3M2I1YzE0MjJjOWI5YjA3ZTY2ZTczNjJjOWE2YiIsInN1YiI6IjY1NGQ1ZTc5MWFjMjkyN2IyZWJlYjZiNiIsInNjb3BlcyI6WyJhcGlfcmVhZCJdLCJ2ZXJzaW9uIjoxfQ.lqRBIip80XIdGrnJZQyZ2vY7jUWPfLYivERgw4-Ngik"
}

url_movie = "https://api.themoviedb.org/3/genre/movie/list?language=en"
response = requests.get(url_movie, headers=headers)
genres = response.json()['genres']



# Create a dictionary of genres with their ids as keys
genre_dict = {}
for genre in genres:
    genre_dict[genre['id']] = genre['name']

In [9]:
genre_dict

{28: 'Action',
 12: 'Adventure',
 16: 'Animation',
 35: 'Comedy',
 80: 'Crime',
 99: 'Documentary',
 18: 'Drama',
 10751: 'Family',
 14: 'Fantasy',
 36: 'History',
 27: 'Horror',
 10402: 'Music',
 9648: 'Mystery',
 10749: 'Romance',
 878: 'Science Fiction',
 10770: 'TV Movie',
 53: 'Thriller',
 10752: 'War',
 37: 'Western'}

We now compute the mean genre associated for each movie, we first do a non-weighted mean where we just count the number of appearance of each genre for each actor and output the ones with the highest count, then we do a weighted mean wehre we take into consideration the popularity of the movie.

### Non-weighted mean

In [10]:
# We first do a non-weighted count of the genres for each actor
def genre_count(genre_ids):
    #transform a list of list into a list
    genre_ids = [item for sublist in genre_ids for item in sublist]
    if (len(genre_ids) == 0):
        return []

    genre_count = {}
    for genre_id in genre_ids:
        if genre_id in genre_count:
            genre_count[genre_id] += 1
        else:
            genre_count[genre_id] = 1

    # Now return the genre with the highest count
    genre_count = {k: v for k, v in sorted(genre_count.items(), key=lambda item: item[1], reverse=True)}

    # If there is a tie, return them all
    max_count = max(genre_count.values())
    genre_count = {k: v for k, v in genre_count.items() if v == max_count}
    if len(genre_count) > 1:
        return [genre_dict[genre_id] for genre_id in genre_count.keys()]
    else:
        return [genre_dict[list(genre_count.keys())[0]]]

tmdb_actors_name['genre_mean'] = tmdb_actors_name['genre_ids'].apply(lambda x: genre_count(x))
tmdb_actors_name

Unnamed: 0,name,genre_ids,movies,popularity,genre_mean
0,Sangeeth Shobhan,"[[35, 10749, 18], [18, 35]]","[MAD, Prema Vimanam]","[8.439, 5.124]","[Comedy, Drama]"
1,Gary Oldman,"[[18, 28, 80, 53], [28, 80, 18, 53], [18, 36]]","[The Dark Knight, The Dark Knight Rises, Darke...","[127.121, 78.615, 32.352]",[Drama]
2,Angeli Khang,"[[18, 53], [18], [18, 10749]]","[Silip Sa Apoy, Selina's Gold, Eva]","[31.85, 26.984, 31.232]",[Drama]
3,Florence Pugh,"[[27, 18, 9648], [28, 12, 878], [18, 10749]]","[Midsommar, Black Widow, Little Women]","[53.79, 89.461, 36.364]",[Drama]
4,Jason Statham,"[[80, 35], [28, 878, 27], [28, 80, 53]]","[Snatch, The Meg, The Transporter]","[36.288, 101.3, 34.472]","[Crime, Action]"
...,...,...,...,...,...
9968,Alice Isaaz,"[[16, 10749, 18]]",[Flavors of Youth],[17.76],"[Animation, Romance, Drama]"
9969,Peter Cullen,"[[27], [18], [878, 53, 18, 80]]","[Hostel: Part III, Whip It, The Butterfly Effe...","[32.287, 12.522, 16.667]",[Drama]
9970,Mary Crosby,"[[12, 35, 10751, 878], [18, 9648, 53]]","[Honey, I Blew Up the Kid, Missing]","[17.743, 12.992]","[Adventure, Comedy, Family, Science Fiction, D..."
9971,Daisuke Namikawa,"[[12, 18, 35], [35], [9648, 18, 35, 80]]","[Fear and Loathing in Las Vegas, The New Guy, ...","[26.421, 15.943, 14.843]",[Comedy]


### Weighted Mean

In [11]:
def genre_weighted(genre_ids, popularity):
    #Multiply the genres by the popularity of their movie
    # Each sublist i gets multiplied by the popularity of the movie i
    genre_ids_mean = [[genre_id * popularity[i] for genre_id in sublist] for i, sublist in enumerate(genre_ids)]
    
    #transform a list of list into a list
    genre_ids_flatten = [item for sublist in genre_ids for item in sublist]
    if (len(genre_ids_flatten) == 0):
        return []

    genre_count = {}
    for list_id in range(len(genre_ids)):
        genre_movie = genre_ids[list_id]
        for id in range(len(genre_movie)):
            genre_id = genre_movie[id]
            weighted_genre_id = genre_ids_mean[list_id][id]
            if genre_id in genre_count:
                genre_count[genre_id] += weighted_genre_id
            else:
                genre_count[genre_id] = weighted_genre_id

    # Now return the genre with the highest count
    genre_count = {k: v for k, v in sorted(genre_count.items(), key=lambda item: item[1], reverse=True)}

    # If there is a tie, return them all
    max_count = max(genre_count.values())
    genre_count = {k: v for k, v in genre_count.items() if v == max_count}
    if len(genre_count) > 1:
        return [genre_dict[genre_id] for genre_id in genre_count.keys()]
    else:
        return [genre_dict[list(genre_count.keys())[0]]]


tmdb_actors_name['genre_mean_weighted']  = tmdb_actors_name.apply(lambda x: genre_weighted(x['genre_ids'], x['popularity']), axis=1)
    

In [12]:
display(tmdb_actors_name)

Unnamed: 0,name,genre_ids,movies,popularity,genre_mean,genre_mean_weighted
0,Sangeeth Shobhan,"[[35, 10749, 18], [18, 35]]","[MAD, Prema Vimanam]","[8.439, 5.124]","[Comedy, Drama]",[Romance]
1,Gary Oldman,"[[18, 28, 80, 53], [28, 80, 18, 53], [18, 36]]","[The Dark Knight, The Dark Knight Rises, Darke...","[127.121, 78.615, 32.352]",[Drama],[Crime]
2,Angeli Khang,"[[18, 53], [18], [18, 10749]]","[Silip Sa Apoy, Selina's Gold, Eva]","[31.85, 26.984, 31.232]",[Drama],[Romance]
3,Florence Pugh,"[[27, 18, 9648], [28, 12, 878], [18, 10749]]","[Midsommar, Black Widow, Little Women]","[53.79, 89.461, 36.364]",[Drama],[Mystery]
4,Jason Statham,"[[80, 35], [28, 878, 27], [28, 80, 53]]","[Snatch, The Meg, The Transporter]","[36.288, 101.3, 34.472]","[Crime, Action]",[Science Fiction]
...,...,...,...,...,...,...
9968,Alice Isaaz,"[[16, 10749, 18]]",[Flavors of Youth],[17.76],"[Animation, Romance, Drama]",[Romance]
9969,Peter Cullen,"[[27], [18], [878, 53, 18, 80]]","[Hostel: Part III, Whip It, The Butterfly Effe...","[32.287, 12.522, 16.667]",[Drama],[Science Fiction]
9970,Mary Crosby,"[[12, 35, 10751, 878], [18, 9648, 53]]","[Honey, I Blew Up the Kid, Missing]","[17.743, 12.992]","[Adventure, Comedy, Family, Science Fiction, D...",[Family]
9971,Daisuke Namikawa,"[[12, 18, 35], [35], [9648, 18, 35, 80]]","[Fear and Loathing in Las Vegas, The New Guy, ...","[26.421, 15.943, 14.843]",[Comedy],[Mystery]


Let's check with a know celebrity, Ryan Gosling should definitely be associated to Romance

In [13]:
# Display the genre_mean_weighted for Ryan Gosling
tmdb_actors_name[tmdb_actors_name['name'] == 'Ryan Gosling']

Unnamed: 0,name,genre_ids,movies,popularity,genre_mean,genre_mean_weighted
95,Ryan Gosling,"[[35, 18, 10749, 10402], [878, 18], [18, 53, 80]]","[La La Land, Blade Runner 2049, Drive]","[41.818, 113.999, 45.789]",[Drama],[Romance]


# Saving the resuts

In [16]:
# Save the results to a csv file
tmdb_actors_name.to_csv('../Data/preprocessed_data/actor_genre.csv', index=False)