In [1]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# reloaded before each cell runs, so edits to local modules take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm

In [3]:
from sklearn.cluster import AgglomerativeClustering

In [4]:
from utils.clustering import get_lda_clusters, sort_meaningful, get_trf_clusters

In [5]:
def _parse_list_cell(cell):
    """Rebuild a list of strings from its stringified form in the CSV.

    Surrounding square brackets are stripped, single quotes are removed and
    the remainder is split on ", ".

    NOTE: an empty list cell ("[]") comes back as [''] (a single empty
    string), not as an empty list.
    """
    return cell.strip("[]").replace("'","").split(", ")


# Columns that hold stringified lists and need the converter above.
LIST_COLUMNS = ("adj", "active", "patient")

characters = pd.read_csv(
    'data/character_attributes_lemmatized.csv',
    index_col=0,
    converters={column: _parse_list_cell for column in LIST_COLUMNS},
)
characters.head()

Unnamed: 0,wiki_id,character,adj,active,patient
0,31186339,Snow,[],"[summon, consider]",[]
1,31186339,Haymitch Abernathy,[],[warn],[]
2,31186339,Seneca Crane,[gamemaker],"[change, lock]","[summon, convince]"
3,31186339,Rue,[die],"[draw, care, draw, trap]","[hear, stab, comfort, kill]"
4,31186339,Primrose Everdeen,[old],[choose],[]


### Clusters from tv_tropes

In [6]:
import json

# Every line is read as "<trope>\t<JSON object>": the text before the first
# tab is the trope name, the rest is parsed as JSON and tagged with that trope.
tropes_list = []
with open('data/MovieSummaries/tvtropes.clusters.txt', 'r') as f:
    for line in f:
        tab_pos = line.index('\t')
        record = json.loads(line[tab_pos:])
        record['trope'] = line[:tab_pos]
        tropes_list.append(record)

topres_df = pd.DataFrame(tropes_list)
topres_df.head()


# The metadata file is read as header-less TSV, so column names are given here.
movie_columns = [
    'wiki_id', 'freebase_id', 'title', 'release_date', 'revenue',
    'runtime', 'languages', 'countries', 'genres',
]
movies = pd.read_csv('data/MovieSummaries/movie.metadata.tsv', sep='\t', names=movie_columns)

# Attach wiki ids by matching the trope's movie name against the metadata title.
# A title shared by several films produces one row per matching wiki_id.
topres_df = topres_df.merge(movies, how='left', left_on='movie', right_on='title')
topres_df = topres_df[['char', 'movie', 'trope', 'wiki_id']]
topres_df

Unnamed: 0,char,movie,trope,wiki_id
0,Professor Philip Brainard,Flubber,absent_minded_professor,1344253
1,Professor Keenbean,Richie Rich,absent_minded_professor,1486573
2,Dr. Reinhardt Lane,The Shadow,absent_minded_professor,14143328
3,Dr. Reinhardt Lane,The Shadow,absent_minded_professor,29760749
4,Dr. Reinhardt Lane,The Shadow,absent_minded_professor,5991505
...,...,...,...,...
691,Morgan Earp,Tombstone,young_gun,525113
692,Colorado Ryan,Rio Bravo,young_gun,81100
693,Tom Sawyer,The League of Extraordinary Gentlemen,young_gun,4138142
694,William H. 'Billy the Kid' Bonney,Young Guns II,young_gun,675213


In [7]:
# Pair each trope row with every character row that shares its wiki_id, then
# discard rows that contain any missing value (e.g. no matching character).
tropes_and_clusters = topres_df.merge(characters, on='wiki_id', how='left')
tropes_and_clusters = tropes_and_clusters.dropna()
tropes_and_clusters

Unnamed: 0,char,movie,trope,wiki_id,character,adj,active,patient
0,Professor Philip Brainard,Flubber,absent_minded_professor,1344253,Philip Brainard,[professor],"[develop, manage]",[]
1,Professor Philip Brainard,Flubber,absent_minded_professor,1344253,Sara defeat Wilson,[],[],[]
2,Professor Philip Brainard,Flubber,absent_minded_professor,1344253,Chester Hoenicker,[],"[send, discover]","[confront, defeat]"
3,Professor Philip Brainard,Flubber,absent_minded_professor,1344253,Sara Jean Reynolds,[],"[return, confront, defeat]","[steal, win]"
4,Professor Philip Brainard,Flubber,absent_minded_professor,1344253,Philip dumps,[say],"[approach, discover, go, flunk, awaken, dump, ...","[lead, persuade]"
...,...,...,...,...,...,...,...,...
5940,Jake,Silverado,young_gun,2087781,Cobb,[sheriff],[],"[defy, kill]"
5941,Jake,Silverado,young_gun,2087781,Augie,[nephew],[],[]
5942,Jake,Silverado,young_gun,2087781,Mal,[cowboy],"[find, reunite]",[]
5943,Jake,Silverado,young_gun,2087781,Paden,"[man, nephew]","[ride, jail, kill, stay]",[aid]


In [8]:
def same_name(names1, names2):
    """Check, position by position, whether a name from ``names2`` occurs in ``names1``.

    Parameters
    ----------
    names1, names2 : objects exposing ``.values`` (e.g. pandas Series)
        Aligned sequences of strings; ``names2`` must be at least as long
        as ``names1``.

    Returns
    -------
    list of bool
        One flag per element of ``names1``: True when ``names2[i]`` is a
        substring of ``names1[i]``.
    """
    containers = names1.values
    candidates = names2.values
    return [candidates[pos] in containers[pos] for pos in range(len(containers))]

# Keep only the rows whose extracted character name is contained in the
# tvtropes character name, i.e. rows describing the trope-labelled character.
name_matches = same_name(tropes_and_clusters['char'], tropes_and_clusters['character'])
tropes_and_clusters = tropes_and_clusters[name_matches]
tropes_and_clusters

Unnamed: 0,char,movie,trope,wiki_id,character,adj,active,patient
0,Professor Philip Brainard,Flubber,absent_minded_professor,1344253,Philip Brainard,[professor],"[develop, manage]",[]
18,Professor Keenbean,Richie Rich,absent_minded_professor,1486573,Keenbean,"[scientist, smellmaster, RoboBee]","[listen, state]","[confront, rescue]"
27,Dr. Reinhardt Lane,The Shadow,absent_minded_professor,5991505,Reinhardt,"[father, scientist]",[disable],[rescue]
39,Dr. Harold Medford,Them!,absent_minded_professor,605676,Harold,[team],[],[]
41,Daniel Jackson,Stargate,absent_minded_professor,28327,Daniel Jackson,"[professor, chance]","[accept, translate, deduce, reveal, go, realiz...","[offer, give, kill]"
...,...,...,...,...,...,...,...,...
5879,Morgan Earp,Tombstone,young_gun,525113,Morgan,[],"[wound, kill]",[]
5898,Colorado Ryan,Rio Bravo,young_gun,81100,Colorado Ryan,[gunslinger],"[decline, prove, offer, visit, step, throw, ag...",[]
5912,Tom Sawyer,The League of Extraordinary Gentlemen,young_gun,4138142,Tom Sawyer,[],[],[]
5922,William H. 'Billy the Kid' Bonney,Young Guns II,young_gun,675213,Billy the Kid,[],"[become, meet, find, pose, mention, come, refu...","[pardon, find]"


In [9]:
# Columns handed to the clustering pipeline, plus the trope label for evaluation.
kept_columns = ['character', 'adj', 'active', 'patient', 'trope', 'wiki_id', 'movie']
# NOTE(review): sort_meaningful comes from utils.clustering; presumably it keeps
# or orders characters by attribute count using the threshold 3 - confirm.
characters_to_check = sort_meaningful(tropes_and_clusters[kept_columns], 3)
characters_to_check

Unnamed: 0,character,adj,active,patient,trope,wiki_id,movie
0,Philip Brainard,[professor],"[develop, manage]",[],absent_minded_professor,1344253,Flubber
1,Keenbean,"[scientist, smellmaster, RoboBee]","[listen, state]","[confront, rescue]",absent_minded_professor,1486573,Richie Rich
2,Reinhardt,"[father, scientist]",[disable],[rescue],absent_minded_professor,5991505,The Shadow
3,Harold,[team],[],[],absent_minded_professor,605676,Them!
4,Daniel Jackson,"[professor, chance]","[accept, translate, deduce, reveal, go, realiz...","[offer, give, kill]",absent_minded_professor,28327,Stargate
...,...,...,...,...,...,...,...
419,Horton,[teacher],"[find, find, resolve, give, refuse, recover, h...","[name, confront, offer]",warrior_poet,6002183,Horton Hears a Who!
420,Morgan,[],"[wound, kill]",[],young_gun,525113,Tombstone
421,Colorado Ryan,[gunslinger],"[decline, prove, offer, visit, step, throw, ag...",[],young_gun,81100,Rio Bravo
422,Billy the Kid,[],"[become, meet, find, pose, mention, come, refu...","[pardon, find]",young_gun,675213,Young Guns II


# Clustering evaluation

We want to see how well our clustering algorithm performs.

In [10]:
from math import log

def group_labels_by_clusters(clusters):
    """Group element positions by their cluster label.

    Parameters
    ----------
    clusters : 1-D array-like
        Cluster label of every element (any type accepted by ``np.unique``).

    Returns
    -------
    list of list of int
        One list per distinct label, in the sorted order produced by
        ``np.unique``; each inner list holds the positions in ``clusters``
        carrying that label. Empty input yields an empty list.
    """
    distinct, cluster_ids = np.unique(clusters, return_inverse=True)
    # Size the result from the distinct labels rather than np.max(...), which
    # raises ValueError on empty input.
    groups = [[] for _ in range(len(distinct))]
    for position, cluster_id in enumerate(cluster_ids):
        groups[cluster_id].append(position)
    return groups

def variation_of_information(X, Y):
    """Compute the variation of information (in bits) between two clusterings.

    Parameters
    ----------
    X, Y : list of list
        Clusterings given as lists of clusters, each cluster being a list of
        hashable element ids. The total element count is taken from ``X``.

    Returns
    -------
    float
        ``|sum_{x,y} r * (log2(r / p) + log2(r / q))|`` where ``p = |x| / n``,
        ``q = |y| / n`` and ``r = |x & y| / n``. Returns 0.0 when ``X``
        contains no elements.
    """
    n = float(sum(len(x) for x in X))
    if n == 0.0:
        # No elements at all: avoid dividing by zero below.
        return 0.0

    # Build each membership set once instead of once per (x, y) pair.
    x_sets = [set(x) for x in X]
    y_sets = [set(y) for y in Y]

    sigma = 0.0
    for x, x_set in zip(X, x_sets):
        p = len(x) / n
        for y, y_set in zip(Y, y_sets):
            q = len(y) / n
            r = len(x_set & y_set) / n
            # Disjoint pairs contribute nothing (and log(0) is undefined).
            if r > 0.0:
                sigma += r * (log(r / p, 2) + log(r / q, 2))
    return abs(sigma)

In [11]:
# Reference partition: row positions of characters_to_check grouped by trope label.
tv_tropes = group_labels_by_clusters(characters_to_check['trope'].values)

In [12]:
# Number of distinct trope labels among the retained characters.
len(tv_tropes)

71

In [13]:
agglomerative_clusters_n = [25, 50, 100, 200]
n_components = [25, 50, 100]

# Arguments shared by every get_lda_clusters call.
config_base = {'characters': characters_to_check, 'min_freq': 3, 'max_freq': 1.0}

# One configuration per (agglomerative cluster count, n_components) pair; the
# labels call these "topics" and "archetypes" respectively.
configs = {
    f'{topic_count} topics, {archetype_count} archetypes': {
        **config_base,
        'clustering_algo': AgglomerativeClustering(
            n_clusters=topic_count, metric='cosine', linkage='complete'
        ),
        'n_components': archetype_count,
    }
    for topic_count in agglomerative_clusters_n
    for archetype_count in n_components
}

# Score every configuration against the tvtropes reference partition.
results = {}
for label in configs:
    predicted = get_lda_clusters(**configs[label])
    results[label] = variation_of_information(group_labels_by_clusters(predicted), tv_tropes)
    print(label, f'VI = {results[label]}')

vocabulary extraction
vocabulary extraction DONE
topics clustering


100%|██████████| 327/327 [00:00<00:00, 644216.73it/s]


topics clustering DONE
topics count


100%|██████████| 424/424 [00:00<00:00, 12422.27it/s]

topics count DONE
LDA





25 topics, 25 archetypes VI = 6.245318740419193
vocabulary extraction
vocabulary extraction DONE
topics clustering


100%|██████████| 327/327 [00:00<00:00, 1090252.31it/s]


topics clustering DONE
topics count


100%|██████████| 424/424 [00:00<00:00, 12817.00it/s]

topics count DONE
LDA





25 topics, 50 archetypes VI = 5.822569919688307
vocabulary extraction
vocabulary extraction DONE
topics clustering


100%|██████████| 327/327 [00:00<00:00, 627705.91it/s]


topics clustering DONE
topics count


100%|██████████| 424/424 [00:00<00:00, 12302.22it/s]

topics count DONE
LDA





25 topics, 100 archetypes VI = 5.421960057286387
vocabulary extraction
vocabulary extraction DONE
topics clustering


100%|██████████| 327/327 [00:00<00:00, 1155465.38it/s]


topics clustering DONE
topics count


100%|██████████| 424/424 [00:00<00:00, 12707.02it/s]

topics count DONE
LDA





50 topics, 25 archetypes VI = 6.319223026919285
vocabulary extraction
vocabulary extraction DONE
topics clustering


100%|██████████| 327/327 [00:00<00:00, 643008.63it/s]


topics clustering DONE
topics count


100%|██████████| 424/424 [00:00<00:00, 12580.36it/s]

topics count DONE
LDA





50 topics, 50 archetypes VI = 5.769318448246155
vocabulary extraction
vocabulary extraction DONE
topics clustering


100%|██████████| 327/327 [00:00<00:00, 637331.51it/s]


topics clustering DONE
topics count


100%|██████████| 424/424 [00:00<00:00, 12637.22it/s]

topics count DONE
LDA





50 topics, 100 archetypes VI = 5.148159135812935
vocabulary extraction
vocabulary extraction DONE
topics clustering


100%|██████████| 327/327 [00:00<00:00, 616421.31it/s]


topics clustering DONE
topics count


100%|██████████| 424/424 [00:00<00:00, 12303.08it/s]

topics count DONE
LDA





100 topics, 25 archetypes VI = 6.370545154521971
vocabulary extraction
vocabulary extraction DONE
topics clustering


100%|██████████| 327/327 [00:00<00:00, 1146770.41it/s]


topics clustering DONE
topics count


100%|██████████| 424/424 [00:00<00:00, 12906.96it/s]

topics count DONE
LDA





100 topics, 50 archetypes VI = 5.73939834088372
vocabulary extraction
vocabulary extraction DONE
topics clustering


100%|██████████| 327/327 [00:00<00:00, 648481.04it/s]


topics clustering DONE
topics count


100%|██████████| 424/424 [00:00<00:00, 12374.13it/s]

topics count DONE
LDA





100 topics, 100 archetypes VI = 4.969093711080894
vocabulary extraction
vocabulary extraction DONE
topics clustering


100%|██████████| 327/327 [00:00<00:00, 1064032.12it/s]


topics clustering DONE
topics count


100%|██████████| 424/424 [00:00<00:00, 12643.15it/s]

topics count DONE
LDA





200 topics, 25 archetypes VI = 6.374610053850584
vocabulary extraction
vocabulary extraction DONE
topics clustering


100%|██████████| 327/327 [00:00<00:00, 1186451.04it/s]


topics clustering DONE
topics count


100%|██████████| 424/424 [00:00<00:00, 12760.81it/s]

topics count DONE
LDA





200 topics, 50 archetypes VI = 5.709258109531297
vocabulary extraction
vocabulary extraction DONE
topics clustering


100%|██████████| 327/327 [00:00<00:00, 1162319.84it/s]


topics clustering DONE
topics count


100%|██████████| 424/424 [00:00<00:00, 12817.74it/s]

topics count DONE
LDA





200 topics, 100 archetypes VI = 4.9701142105634295


In [14]:
# Show the variation-of-information score recorded for each configuration.
results

{'25 topics, 25 archetypes': 6.245318740419193,
 '25 topics, 50 archetypes': 5.822569919688307,
 '25 topics, 100 archetypes': 5.421960057286387,
 '50 topics, 25 archetypes': 6.319223026919285,
 '50 topics, 50 archetypes': 5.769318448246155,
 '50 topics, 100 archetypes': 5.148159135812935,
 '100 topics, 25 archetypes': 6.370545154521971,
 '100 topics, 50 archetypes': 5.73939834088372,
 '100 topics, 100 archetypes': 4.969093711080894,
 '200 topics, 25 archetypes': 6.374610053850584,
 '200 topics, 50 archetypes': 5.709258109531297,
 '200 topics, 100 archetypes': 4.9701142105634295}

Note that our results are even better than those reported in the [paper](http://www.cs.cmu.edu/~dbamman/pubs/pdf/bamman+oconnor+smith.acl13.pdf): for K=100, P=100 the variation of information is 5.42 in the paper and 4.97 here (lower is better).

## BERT based clustering

In [15]:
def _parse_embedding(cell):
    """Rebuild a list of floats from its stringified form in the CSV.

    Surrounding square brackets are stripped, single quotes are removed, the
    remainder is split on ", " and every piece is converted with ``float``.
    """
    pieces = cell.strip("[]").replace("'","").split(", ")
    return [float(piece) for piece in pieces]


characters_with_trf_emb = pd.read_csv(
    'data/trf_embeddings_for_labeled_characters.csv',
    index_col=0,
    converters={"emb": _parse_embedding},
)
characters_with_trf_emb.head()

Unnamed: 0,wiki_id,character,emb
0,6002183,Horton,"[1.0488402843475342, 0.3811729848384857, 0.645..."
1,6002183,Ned McDodd,"[-0.5622232556343079, -0.2521360516548157, -0...."
2,6002183,JoJo,"[-0.7999439835548401, -0.4102073311805725, 0.6..."
3,6002183,Sally,"[0.7803803086280823, -0.7006192207336426, 0.32..."
4,6002183,LaRue,"[-0.038770418614149094, 0.219954714179039, -0...."


In [16]:
# Pair each labelled character with every embedding row that shares its
# wiki_id, then drop rows with any missing value.
tropes_and_clusters = characters_to_check.merge(characters_with_trf_emb, on='wiki_id', how='left')
tropes_and_clusters = tropes_and_clusters.dropna()

# character_x comes from characters_to_check, character_y from the embeddings
# table; keep only the rows where both refer to the same character name.
is_same_character = tropes_and_clusters['character_x'] == tropes_and_clusters['character_y']
tropes_and_clusters = tropes_and_clusters[is_same_character]

characters_to_check_trf = tropes_and_clusters[['character_y', 'emb', 'trope', 'wiki_id', 'movie']]
characters_to_check_trf

Unnamed: 0,character_y,emb,trope,wiki_id,movie
0,Philip Brainard,"[0.17131628096103668, -0.22005470097064972, -0...",absent_minded_professor,1344253,Flubber
14,Keenbean,"[-0.31679657101631165, 0.075782410800457, 0.06...",absent_minded_professor,1486573,Richie Rich
25,Reinhardt,"[-0.2164442092180252, 0.4853280186653137, 0.41...",absent_minded_professor,5991505,The Shadow
33,Harold,"[-0.010978280566632748, 0.2072685807943344, 0....",absent_minded_professor,605676,Them!
38,Daniel Jackson,"[-0.2861030399799347, 0.06075410172343254, 0.3...",absent_minded_professor,28327,Stargate
...,...,...,...,...,...
4106,Horton,"[1.0488402843475342, 0.3811729848384857, 0.645...",warrior_poet,6002183,Horton Hears a Who!
4114,Morgan,"[0.8440001606941223, -0.34702691435813904, -0....",young_gun,525113,Tombstone
4137,Colorado Ryan,"[-0.8454567193984985, -0.2715449929237366, -0....",young_gun,81100,Rio Bravo
4149,Billy the Kid,"[0.6221001148223877, 0.06010516732931137, 0.58...",young_gun,675213,Young Guns II


In [17]:
# Reference partition for the embedding-based run: row positions of
# characters_to_check_trf grouped by trope label.
tv_tropes_trf = group_labels_by_clusters(characters_to_check_trf['trope'].values)

In [18]:
# Score the embedding-based clustering for each value in n_components
# against the tvtropes reference partition.
results_trf = {}
for archetype_count in n_components:
    label = f'{archetype_count} archetypes'
    predicted = get_trf_clusters(characters_to_check_trf, archetype_count)
    score = variation_of_information(group_labels_by_clusters(predicted), tv_tropes_trf)
    results_trf[label] = score
    print(label, f'VI = {results_trf[label]}')

25 archetypes VI = 6.338304025432505
50 archetypes VI = 5.828470180691933
100 archetypes VI = 5.034504432300139


This is still better than the result reported in the paper, but worse than the previous (LDA-based) method. Moreover, extracting the embeddings is very slow.