In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm

In [3]:
from IPython.display import clear_output

In [4]:
from sklearn.cluster import AgglomerativeClustering, KMeans

In [5]:
from utils.clustering import get_lda_clusters, get_vocab, word_topics_clustering, sort_meaningful, get_trf_clusters, topic_count
from utils.clustering_evaluation import get_characters_with_tv_trop_info, variation_of_information, group_labels_by_clusters

  from .autonotebook import tqdm as notebook_tqdm


# Clustering methods comparison

We use Variation of Information between our clusters and golden clusters from TV Tropes as suggested in [Learning Latent Personas of Film Characters](http://www.cs.cmu.edu/~dbamman/pubs/pdf/bamman+oconnor+smith.acl13.pdf). This way we can compare our methods' performance with the original method performance.

### LDA based clustering

In [6]:
characters_attributes =  pd.read_csv(
    'data/character_attributes.csv', 
    index_col=0,
    converters={
        "adj": lambda x: x.strip("[]").replace("'","").split(", "), # need this to read list columns from csv
        "active": lambda x: x.strip("[]").replace("'","").split(", "),
        "patient": lambda x: x.strip("[]").replace("'","").split(", ")
        }
    )
# select only the characters who have at least 3 liguistic features
characters_attributes = sort_meaningful(characters_attributes, 3)

characters_attributes.head()

Unnamed: 0,wiki_id,character,adj,active,patient
0,31186339,Peeta Mellark,[son],"[take, reveal, mean, form, present, beg, tell]",[force]
1,31186339,Cato,[],[kill],"[encounter, wound, shoot]"
2,31186339,Katniss,[],"[take, survive, drop, warn, run, shoot, presen...","[give, find, torment, spare, force, tell, warn]"
3,31186339,Rue,[die],"[draw, care, draw, trap]","[hear, stab, comfort, kill]"
4,31186339,Seneca Crane,[Gamemaker],"[change, lock]","[summon, convince]"


In [7]:
characters_to_check, tv_tropes = get_characters_with_tv_trop_info(characters_attributes)

In [8]:
agglomerative_clusters_n = [25, 50, 100]
n_components = [25, 50, 100]

configs = {}
config_base = {'characters': characters_to_check, 'min_freq': 5, 'max_freq':0.9}

for alg_n in agglomerative_clusters_n:
    for n in n_components:
        config = config_base.copy()
        config['clustering_algo'] = AgglomerativeClustering(n_clusters=alg_n, metric='cosine', linkage='complete')
        config['n_components'] = n
        configs[f'{alg_n} topics, {n} archetypes'] = config

results_lda = {}
for k, config in configs.items():
    clusters = get_lda_clusters(**config)
    results_lda[k] = variation_of_information(group_labels_by_clusters(clusters), tv_tropes)
    print(k, f'VI = {results_lda[k]}')

clear_output(wait=True)
results_lda

{'25 topics, 25 archetypes': 6.2466797099799205,
 '25 topics, 50 archetypes': 5.721030072070011,
 '25 topics, 100 archetypes': 5.2078476164448135,
 '50 topics, 25 archetypes': 6.3319648575061835,
 '50 topics, 50 archetypes': 5.666473591455292,
 '50 topics, 100 archetypes': 4.976150571899939,
 '100 topics, 25 archetypes': 6.349160556531042,
 '100 topics, 50 archetypes': 5.607356379575675,
 '100 topics, 100 archetypes': 4.898739702618948}

Note, that the results are even better (K=100, P=100, 5.42 in the paper and 4.9 here) than the results from the [paper](http://www.cs.cmu.edu/~dbamman/pubs/pdf/bamman+oconnor+smith.acl13.pdf). That could indicate that using word2vec embeddings and Agglomerative clustering of the words to topics might be better suited for dividing the words into topics for the purpose of personas extraction.

### BERT based clustering

In [15]:
characters_with_trf_emb =  pd.read_csv(
    'data/trf_embeddings_for_labeled_characters.csv', 
    index_col=0,
    converters={
        "emb": lambda x: [float(k) for k in x.strip("[]").replace("'","").split(", ")]
        }
    )
# Leave only those, who we compared on the previous step
characters_with_trf_emb = characters_with_trf_emb[characters_with_trf_emb['wiki_id'].isin(characters_to_check['wiki_id'].values)]

characters_with_trf_emb.head()

Unnamed: 0,wiki_id,character,emb
0,6002183,Horton,"[1.0488402843475342, 0.3811729848384857, 0.645..."
1,6002183,Ned McDodd,"[-0.5622232556343079, -0.2521360516548157, -0...."
2,6002183,JoJo,"[-0.7999439835548401, -0.4102073311805725, 0.6..."
3,6002183,Sally,"[0.7803803086280823, -0.7006192207336426, 0.32..."
4,6002183,LaRue,"[-0.038770418614149094, 0.219954714179039, -0...."


In [16]:
characters_to_check_trf, tv_tropes = get_characters_with_tv_trop_info(characters_with_trf_emb)

In [17]:
results_trf = {}
for n in n_components:
    k = f'{n} archetypes, agglomerative clustering'
    agglomerative = AgglomerativeClustering(n_clusters=n, metric='euclidean', linkage='complete')
    clusters = get_trf_clusters(characters_to_check_trf, agglomerative)
    results_trf[k] = variation_of_information(group_labels_by_clusters(clusters), tv_tropes)
    print(k, f'VI = {results_trf[k]}')

    k = f'{n} archetypes, kmeans clustering'
    kmeans = KMeans(n_clusters=n)
    clusters = get_trf_clusters(characters_to_check_trf, kmeans)
    results_trf[k] = variation_of_information(group_labels_by_clusters(clusters), tv_tropes)
    print(k, f'VI = {results_trf[k]}')

clear_output(wait=True)
results_trf

{'25 archetypes, agglomerative clustering': 6.126084917715467,
 '25 archetypes, kmeans clustering': 6.116164230254499,
 '50 archetypes, agglomerative clustering': 5.5954496618130625,
 '50 archetypes, kmeans clustering': 5.484842314253758,
 '100 archetypes, agglomerative clustering': 4.936886689856702,
 '100 archetypes, kmeans clustering': 4.809396999118923}

The results of BERT embeddings based clustering are also better than the results from the paper. Still, obtaining these embeddings is slow, and the difference between this and previous method is not that big, so we will stick to the faster and more interpretable LDA based method.

# Loading the data

For the character names and linguistic features extraction pipeline, please refer to `extract_character_attributes.ipynb`. And for the clusterization pipeline as well as the different clustering methods comparison refer to `clustering.ipynb`. For our initial analysis we will use 60 clusters.

In [18]:
characters =  pd.read_csv(
    'data/character_clusters.csv', 
    index_col=0,
    converters={
        "adj": lambda x: x.strip("[]").replace("'","").split(", "), # need this to read list columns from csv
        "active": lambda x: x.strip("[]").replace("'","").split(", "),
        "patient": lambda x: x.strip("[]").replace("'","").split(", ")
        }
    )
characters.head()

Unnamed: 0,wiki_id,character,adj,active,patient,cluster
0,31186339,Peeta Mellark,[son],"[take, reveal, mean, form, present, beg, tell]",[force],29
1,31186339,Cato,[],[kill],"[encounter, wound, shoot]",55
2,31186339,Katniss,[],"[take, survive, drop, warn, run, shoot, presen...","[give, find, torment, spare, force, tell, warn]",2
3,31186339,Rue,[die],"[draw, care, draw, trap]","[hear, stab, comfort, kill]",1
4,31186339,Seneca Crane,[Gamemaker],"[change, lock]","[summon, convince]",1


In [32]:
print(f"In the clustered characters dataframe there are {len(characters)} characters from {len(set(characters['wiki_id'].values))} movies")

In the clustered characters dataframe there are 74842 characters from 25664 movies


In [20]:
movies = pd.read_csv(
    'data/MovieSummaries/movie.metadata.tsv', 
    sep='\t', 
    names=['wiki_id', 'freebase_id', 'title', 'release_date', 'revenue', 'runtime', 'languages', 'countries', 'genres']
)

In [33]:
characters_and_movies = characters.merge(movies, how='left', on='wiki_id')
characters_and_movies = characters_and_movies[characters_and_movies['revenue'].notna()]

print(f"In the clustered characters with movie metadata dataframe there are {len(characters_and_movies)} characters from {len(set(characters_and_movies['wiki_id'].values))} movies with the revenue data")

In the clustered characters with movie metadata dataframe there are 23911 characters from 6280 movies with the revenue data


In [31]:
characters_and_movies.sample(5)

Unnamed: 0,wiki_id,character,adj,active,patient,cluster,freebase_id,title,release_date,revenue,runtime,languages,countries,genres
45359,3771410,Clete Ferguson,[],"[begin, try, try]",[stalk],35,/m/09_3yl,Revenge of the Creature,1955-05-11,1100000.0,82.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/0cq22z7"": ""Sci-Fi Horror"", ""/m/03npn"": ""H..."
64902,985304,Lana Marcus,[friend],"[enter, have, have, leave]",[],12,/m/03wrqt,Deadly Blessing,1981-08-14,8279042.0,103.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/03npn"": ""Horror""..."
35853,20937181,Sheryl,[man],"[go, convince, go, kidnap, encounter, force, d...","[chase, find, find, mutilate, tell]",56,/m/05b4__s,Timber Falls,2007,680299.0,100.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01q03"": ""Cult"", ""/m/03npn"": ""Horror"", ""/m..."
7051,133574,Bill Daggett,"[sheriff, little, gunfighter, little, little, ...","[give, disarm, beat, arrive, have, assemble, c...","[leave, stop]",41,/m/0_92w,Unforgiven,1992-08-03,159157447.0,131.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/0hfjk"": ""Western"", ""/m/07s9rl0"": ""Drama""}"
40599,3439529,June Ellis,[patient],[die],[befriend],34,/m/09cgnx,The Doctor,1991-07-24,38120905.0,122.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/07s9rl0"": ""Drama""}"


In [29]:
characters_and_movies[characters_and_movies['title'].str.contains("Batman")][['title', 'character', 'cluster']]

Unnamed: 0,title,character,cluster
6348,Superman/Batman: Apocalypse,Superman,42
6349,Superman/Batman: Apocalypse,Batman,42
6350,Superman/Batman: Apocalypse,Barda,30
6351,Superman/Batman: Apocalypse,Kara,42
6352,Superman/Batman: Apocalypse,Darkseid,23
...,...,...,...
69321,Batman Forever,Riddler,42
69322,Batman Forever,Batman,42
69323,Batman Forever,Edward Nygma,56
69324,Batman Forever,Bruce Wayne,41


We can notice, that, probably, cluster number 42 is the cluster of super-heroes.

# Initial analysis of the actors' success

# Genre prediction using clusters

# Revenue prediction using clusters