In [1]:
import sys
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import math
from collections import Counter
import pandas as pd
import warnings
import json
import os
import scipy

# Silence library warnings for the whole notebook (seaborn is the noisy one)
warnings.filterwarnings(action="ignore")

# Make the modules living in the preprocessing folder importable
sys.path.append("src/preprocessing/")

from load_dataset import *

# Folder the refined tables are read from / written to.
SAVE_PATH = "./data/refined"
# makedirs (unlike mkdir) also creates the missing "./data" parent, and
# exist_ok=True makes the call a no-op when the folder is already there,
# so the cell can be re-run safely on a fresh checkout.
os.makedirs(SAVE_PATH, exist_ok=True)

# Load results of part 3

In [45]:
# Reload the three refined JSON tables stored under SAVE_PATH, in this order:
# movies, movie genres, characters with personas and genres.
movies, movie_genres, actors_characters_personas_genres = (
    pd.read_json(os.path.join(SAVE_PATH, table_file))
    for table_file in (
        "movies.json",
        "movie_genres.json",
        "characters_personas_genres.json",
    )
)

In [None]:
# Characters without personas were also used for the genres => a lot more occurrences, potentially outside the movie genres we have

In [46]:
# Idea: maybe compute the movie entropy (H) only over movies for which we have characters.
# Number of distinct movie ids in the movies table (a missing id counts as one value).
movies["freebase_id"].nunique(dropna=False)

44450

In [47]:
# Number of distinct movie ids that appear in the characters table
actors_characters_personas_genres["freebase_id"].nunique(dropna=False)

11220

In [44]:
# Inspect the movies table; the rendered output only shows the first and last rows
movies

Unnamed: 0,tconst,runtimeMinutes,genres,averageRating,numVotes,freebase_id,wiki_movie_id,movie_name,movie_release_date,box_office,movie_languages,movie_countries,movie_genres,plot_summary
0,tt0000009,45,[Romance],5.3,207,/m/02q23xk,10109752,Miss Jerry,1894-10-09,,{'/m/06ppq': 'Silent film'},{'/m/09c7w0': 'United States of America'},"{'/m/02hmvc': 'Short Film', '/m/06ppq': 'Silen...",After finding out that her father is suffering...
1,tt0000147,100,"[Documentary, News, Sport]",5.3,484,/m/0czdh_n,28703057,The Corbett-Fitzsimmons Fight,1897-05-22,100000.0,{},{},{'/m/01z02hx': 'Sports'},The film no longer exists in its entirety; how...
2,tt0000574,70,"[Action, Adventure, Biography]",6.0,855,/m/0120y4,142995,The Story of the Kelly Gang,1906-12-26,,{'/m/02h40lc': 'English Language'},{'/m/0chghy': 'Australia'},"{'/m/0lsxr': 'Crime Fiction', '/m/06ppq': 'Sil...",The Story of the Kelly Gangs tone is of sorrow...
3,tt0000591,90,[Drama],5.0,21,/m/07s67rr,4849466,L'Enfant prodigue,1907-06-20,,{'/m/064_8sq': 'French Language'},{'/m/0f8l9c': 'France'},"{'/m/02hmvc': 'Short Film', '/m/06ppq': 'Silen...",
4,tt0000679,120,"[Adventure, Fantasy]",5.1,68,/m/0fgfyy,5954041,The Fairylogue and Radio-Plays,1908-09-24,,{'/m/02h40lc': 'English Language'},{'/m/09c7w0': 'United States of America'},"{'/m/06ppq': 'Silent film', '/m/01g6gs': 'Blac...",
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44471,tt9098234,77,"[Comedy, Musical, Mystery]",6.0,16,/m/07kc8js,24081363,Moskal-Charivnyk,1995,,"{'/m/0cjk9': 'Ukrainian Language', '/m/06b_j':...",{'/m/07t21': 'Ukraine'},"{'/m/04t36': 'Musical', '/m/01z4y': 'Comedy'}",The story takes place in Ukraine at the start ...
44472,tt9244928,119,[Drama],8.1,9,/m/02r1f94,11142347,Unarchigal,1976,,"{'/m/0999q': 'Malayalam Language', '/m/07c9s':...",{'/m/03rk0': 'India'},{'/m/07s9rl0': 'Drama'},"The movie deals with Selvam , an 18 year old s..."
44473,tt9330112,120,"[Drama, Mystery]",7.2,90,/m/04mzlrp,19620290,Ninaithale Inikkum,2009-09-04,,{'/m/07c9s': 'Tamil Language'},{'/m/03rk0': 'India'},"{'/m/02n4kr': 'Mystery', '/m/07s9rl0': 'Drama'}",The young college-going Shiva is extremely en...
44474,tt9401672,77,[Documentary],6.3,9,/m/04184zz,17093635,Criminals Gone Wild,2008,,{'/m/02h40lc': 'English Language'},{'/m/09c7w0': 'United States of America'},"{'/m/0hj3n07': 'Culture & Society', '/m/0hj3n9...",


In [26]:
# Retrieve global genre distribution: one occurrence per (movie, genre) pair.
all_movie_genres = movies["genres"].copy()
# explode() yields NaN for movies whose genre list is empty or missing; drop
# those so that "no genre" is not counted as a genre of its own (it would
# otherwise add a spurious category and inflate the entropy below).
frequency_genre = dict(
    Counter(all_movie_genres.explode().dropna().reset_index(drop=True).to_list())
)

# Entropy of movie genres (scipy normalizes the raw counts to probabilities
# and uses the natural logarithm by default).
global_entropy_genres = scipy.stats.entropy(list(frequency_genre.values()))

In [27]:
def compute_genre_mutual_information(actor_genres_frac, global_entropy=None):
    """Score how concentrated an actor's genre distribution is.

    The score is the normalized entropy reduction
    ``(H_global - H_actor) / H_global``, where ``H_actor`` is the entropy of
    ``actor_genres_frac``. It is a per-actor quantity, not a mutual
    information averaged over all actors.

    Interpretation:
      * ``1``  -> all the weight sits on a single genre;
      * ``0``  -> the actor's distribution is as spread out as the reference;
      * ``<0`` -> the actor's distribution has a higher entropy than the
        reference (possible when the actor vector spans more categories than
        the reference distribution -- TODO confirm both use the same genre set).

    Parameters
    ----------
    actor_genres_frac : sequence of float
        Non-negative genre weights for one actor. They need not sum to 1:
        ``scipy.stats.entropy`` normalizes them.
    global_entropy : float, optional
        Reference entropy. Defaults to the notebook-level
        ``global_entropy_genres``.

    Returns
    -------
    float
        The normalized entropy reduction, or ``nan`` when the vector is
        empty, has no positive mass, or contains missing values (the
        distribution is then undefined).
    """
    if global_entropy is None:
        global_entropy = global_entropy_genres

    genre_weights = np.asarray(actor_genres_frac, dtype=float)
    # "not (sum > 0)" also catches a NaN sum. Returning NaN explicitly avoids
    # relying on a 0/0 inside scipy, and stops an empty vector from being
    # scored as perfectly polarized.
    if genre_weights.size == 0 or not genre_weights.sum() > 0:
        return np.nan

    entropy_genres_given_actor = scipy.stats.entropy(genre_weights)
    return (global_entropy - entropy_genres_given_actor) / global_entropy

In [28]:
# Score every character row from its genre-frequency vector
actors_characters_personas_genres["genre_mu"] = (
    actors_characters_personas_genres["genres_freq"]
    .map(compute_genre_mutual_information)
)

In [34]:
# Hide the rows whose score is exactly 1, then rank the rest by ascending score
score_is_not_one = actors_characters_personas_genres["genre_mu"] != 1
actors_characters_personas_genres.loc[score_is_not_one].sort_values(by="genre_mu")

Unnamed: 0,freebase_map_id,movie_name,token_occurences,estimated_trope,trope_distrib,wiki_movie_id,freebase_id,release_date,character_name,actor_birth,actor_gender,actor_height,actor_ethnicity,actor_name,release_actor_age,freebase_character_id,freebase_actor_id,genres_freq,genre_mu
3042,/m/04hv9dv,The Majestic,9,28,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",302491,/m/01s1wm,2001-12-11,Harry Trimble,1928-06-20,M,1.854,/m/048z7l,Martin Landau,73.0,/m/0gkztpd,/m/01mqnr,"[0.0, 0.0, 0.0331753555, 0.0047393365, 0.0, 0....",-0.559777
3041,/m/064z08s,The Aryan Couple,6,40,"[0.0, 0.0, 0.0799999982, 0.019999999600000002,...",22516313,/m/05ztgyf,2004-12-10,Joseph Krauzenberg,1928-06-20,M,1.854,/m/048z7l,Martin Landau,76.0,/m/0h351lm,/m/01mqnr,"[0.0, 0.0, 0.0331753555, 0.0047393365, 0.0, 0....",-0.559777
3038,/m/059snlg,Without Warning,6,21,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",17379289,/m/043qtrq,1980-11-26,Fred 'Sarge' Dobbs,1928-06-20,M,1.854,/m/048z7l,Martin Landau,52.0,/m/0h351mc,/m/01mqnr,"[0.0, 0.0, 0.0331753555, 0.0047393365, 0.0, 0....",-0.559777
3043,/m/0c5mvkk,Alone in the Dark,3,33,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.019999999600000002...",4083099,/m/0bh4wr,1982-11-12,Byron 'Preacher' Sutcliff,1928-06-20,M,1.854,/m/048z7l,Martin Landau,54.0,/m/0h351qs,/m/01mqnr,"[0.0, 0.0, 0.0331753555, 0.0047393365, 0.0, 0....",-0.559777
3044,/m/0jwhy0,Cleopatra,3,30,"[0.0799999982, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0...",4954774,/m/0cwy47,1963-06-12,Rufio,1928-06-20,M,1.854,/m/048z7l,Martin Landau,34.0,/m/0h35rdv,/m/01mqnr,"[0.0, 0.0, 0.0331753555, 0.0047393365, 0.0, 0....",-0.559777
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27481,/m/0g4z86r,The Vow,3,23,"[0.0, 0.0, 0.0700000003, 0.1099999994, 0.0, 0....",33010153,/m/0g4z1x8,1946-07-29,"Aleksandr, her eldest son",1899-10-22,M,,,Nikolay Bogolyubov,,/m/0g4z82w,/m/04jnb0l,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.782704
4199,/m/0k27jd3,Doraemon: Nobita and the Steel Troops,21,34,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",10487338,/m/02qflz1,1986-03-15,Nobita Nobi,1935-10-02,F,,,Noriko Ohara,50.0,/m/0h57kr6,/m/0k_q34,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.800672
18554,/m/0h7dr6w,Chowringhee,3,3,"[0.0, 0.0, 0.1599999964, 0.6499999762, 0.0, 0....",15790575,/m/03nt_7g,1968,Sujata,1944-12-30,F,,,Anjana Bhowmick,23.0,/m/0h7dr6y,/m/04ybp5_,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",
29490,/m/02h7x3x,Don Giovanni,3,2,"[0.0, 0.0, 0.6499999762, 0.0900000036, 0.0, 0....",5878592,/m/0fbfhr,1979-11-06,Donna Anna,1938-10-27,F,,,Edda Moser,41.0,/m/02nwdd_,/m/069pqm,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",


In [37]:
# Close to 1, very polarized.
# 4199 is the index *label* of the row shown in the sorted table (genre_mu ~ 0.80),
# so look it up by label: positional .iloc[4199] only lands on the same row
# when the index is a gap-free 0..n-1 range.
actors_characters_personas_genres.loc[4199, "genres_freq"]

[0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.1153846154,
 0.0384615385,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0