In [1]:
import sys
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import math
from collections import Counter
import pandas as pd
import warnings
import json
import os
import scipy

# Allow python to import modules from the preprocessing folder
# (the path is relative, so it resolves against the current working directory)
sys.path.append("src/preprocessing/")
# No category is given, so this silences every warning for the rest of the session
warnings.filterwarnings("ignore") # For seaborn

# NOTE(review): wildcard import — the names it brings into scope are not visible from this notebook;
# it must stay after the sys.path.append above, which is what makes the module importable
from load_dataset import *

# Folder holding the refined tables read by this notebook
SAVE_PATH = "./data/refined"
# makedirs(exist_ok=True) also creates missing parent folders (e.g. ./data) and is a
# no-op when the folder already exists, so the cell is safe to re-run
os.makedirs(SAVE_PATH, exist_ok=True)

# Load results of part 3

In [50]:
def _read_refined(filename):
    """Read one JSON table stored under SAVE_PATH into a DataFrame."""
    return pd.read_json(os.path.join(SAVE_PATH, filename))


# Movie-level tables
movies = _read_refined("movies.json")
movie_genres = _read_refined("movie_genres.json")

# Character-level tables; the genre frequencies were computed from the first one,
# not from the (smaller) personas tables
actors_characters = _read_refined("characters.json")
actors_characters_personas = _read_refined("characters_personas.json")
actors_characters_personas_genres = _read_refined("characters_personas_genres.json")

In [63]:
# The genre frequency vectors come from the full character collection, which is much
# larger than the subset that has personas. The movie entropy baseline is therefore
# built from the movies of that full collection, not from the personas subset.
movies_from_characters = actors_characters["wiki_movie_id"].unique()
has_characters = movies["wiki_movie_id"].isin(movies_from_characters)
movies_with_characters = movies[has_characters]

In [69]:
# Retrieve global genre distribution
all_movie_genres = movies_with_characters["genres"].copy()
# explode() yields one row per (movie, genre) pair, but emits NaN for movies whose genre
# list is empty or missing; drop those so "no genre" is not counted as a genre of its own
frequency_genre = dict(Counter(all_movie_genres.explode().dropna().to_list()))

# Global entropy of movie genres: scipy normalises the raw counts into a probability
# distribution before computing the entropy (natural logarithm by default)
global_entropy_genres = scipy.stats.entropy(list(frequency_genre.values()))

In [70]:
def compute_genre_mutual_information(actor_genres_frac, baseline_entropy=None):
    """Return the normalised entropy reduction of one actor's genre distribution.

    The score is ``(H_baseline - H_actor) / H_baseline``:
    it equals 1 when the actor's distribution has zero entropy (a single genre),
    0 when the actor's entropy equals the baseline, and is negative when the
    actor's entropy exceeds the baseline.

    Parameters
    ----------
    actor_genres_frac : sequence of float
        Genre weights for one actor. ``scipy.stats.entropy`` normalises them,
        so they do not have to sum to 1.
    baseline_entropy : float, optional
        Entropy used as the reference. Defaults to the notebook-level
        ``global_entropy_genres`` so existing single-argument calls keep working.
        Must be non-zero, since it is used as the divisor.

    Returns
    -------
    float
        The normalised entropy reduction described above.
    """
    if baseline_entropy is None:
        # Looked up at call time so re-running the baseline cell is picked up
        baseline_entropy = global_entropy_genres
    entropy_genres_given_actor = scipy.stats.entropy(actor_genres_frac)
    return (baseline_entropy - entropy_genres_given_actor) / baseline_entropy

In [74]:
# Score every row of the final collection (the one that has personas).
# A genre_mu close to 1 means the actor is very polarized.
genre_freq_vectors = actors_characters_personas_genres["genres_freq"]
actors_characters_personas_genres["genre_mu"] = genre_freq_vectors.apply(
    compute_genre_mutual_information
)

### Inspecting how well this metric reflects polarization

In [81]:
# Keep only rows with more than 10 movies, ordered by ascending genre_mu
has_enough_movies = actors_characters_personas_genres["total_movies"] > 10
sorted_mu = (
    actors_characters_personas_genres[has_enough_movies]
    .sort_values("genre_mu")
    .copy()
)

In [85]:
# One row per actor name (the first occurrence is kept), renumbered from 0
sorted_mu_no_dup = (
    sorted_mu
    .drop_duplicates(subset="actor_name")
    .reset_index(drop=True)
)

In [115]:
roi = -1  # Position of the inspected row; -1 is the last one (highest genre_mu)
inspected_row = sorted_mu_no_dup.iloc[roi]
genre_names = movie_genres[0].to_list()
# Pair each frequency with the genre name at the same position
inspected_genres = list(zip(inspected_row["genres_freq"], genre_names))
for elem in inspected_genres:
    # Only show genres with a non-zero frequency
    if elem[0] != 0:
        print(elem)

(0.1153846154, 'Animation')
(0.0384615385, 'Anime')
(0.8461538462, 'Japanese Movies')
