In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm
import torch

In [2]:
from utils.transformer_character_embeddings import embeddings_from_text

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
plots =  pd.read_csv(
    'data/MovieSummaries/plot_summaries.txt', 
    sep='\t', 
    names=['wiki_id', 'plot']
)

plots['plot'] = plots['plot'].apply(lambda x: ' '.join(x.split()))

plots.head(5)

Unnamed: 0,wiki_id,plot
0,23890098,"Shlykov, a hard-working taxi driver and Lyosha..."
1,31186339,The nation of Panem consists of a wealthy Capi...
2,20663735,Poovalli Induchoodan is sentenced for six year...
3,2231378,"The Lemon Drop Kid , a New York City swindler,..."
4,595909,Seventh-day Adventist Church pastor Michael Ch...


## Extract cluster labels from tvtropes

In [4]:
import json

tropes_list=[]
with open('data/MovieSummaries/tvtropes.clusters.txt', 'r') as f:
    s = f.readline()
    while s:
        trope = s[:s.index('\t')]
        character = json.loads(s[s.index('\t'): ])
        character['trope'] = trope
        tropes_list.append(character)
        s = f.readline()
topres_df = pd.DataFrame(tropes_list)
topres_df.head()

Unnamed: 0,char,movie,id,actor,trope
0,Professor Philip Brainard,Flubber,/m/0jy9q0,Robin Williams,absent_minded_professor
1,Professor Keenbean,Richie Rich,/m/02vchl3,Michael McShane,absent_minded_professor
2,Dr. Reinhardt Lane,The Shadow,/m/0k6fkc,Ian McKellen,absent_minded_professor
3,Dr. Harold Medford,Them!,/m/0k6_br,Edmund Gwenn,absent_minded_professor
4,Daniel Jackson,Stargate,/m/0k3rhh,James Spader,absent_minded_professor


In [5]:
movies = pd.read_csv(
    'data/MovieSummaries/movie.metadata.tsv', 
    sep='\t', 
    names=['wiki_id', 'freebase_id', 'title', 'release_date', 'revenue', 'runtime', 'languages', 'countries', 'genres']
)
topres_df = topres_df.merge(movies, how='left', left_on='movie', right_on='title')[['char', 'movie', 'trope', 'wiki_id']]
topres_df

Unnamed: 0,char,movie,trope,wiki_id
0,Professor Philip Brainard,Flubber,absent_minded_professor,1344253
1,Professor Keenbean,Richie Rich,absent_minded_professor,1486573
2,Dr. Reinhardt Lane,The Shadow,absent_minded_professor,14143328
3,Dr. Reinhardt Lane,The Shadow,absent_minded_professor,29760749
4,Dr. Reinhardt Lane,The Shadow,absent_minded_professor,5991505
...,...,...,...,...
691,Morgan Earp,Tombstone,young_gun,525113
692,Colorado Ryan,Rio Bravo,young_gun,81100
693,Tom Sawyer,The League of Extraordinary Gentlemen,young_gun,4138142
694,William H. 'Billy the Kid' Bonney,Young Guns II,young_gun,675213


In [6]:
plots_with_cluster_labels = plots[plots['wiki_id'].isin(topres_df['wiki_id'].values)]

## BERT Embeddings

In [8]:
character_list = []

for index, row in tqdm([row for row in plots_with_cluster_labels.iterrows()]):
    plot = row['plot'] 
    character_emb = embeddings_from_text(plot)

    for name in character_emb.keys():
        character_list.append(
            {
                'wiki_id': row['wiki_id'],
                'character': name,
                'emb': character_emb[name].tolist(),
            }
        )

character_df = pd.DataFrame(character_list)
character_df.head()

  0%|          | 0/466 [00:00<?, ?it/s]

100%|██████████| 466/466 [31:48<00:00,  4.09s/it]


Unnamed: 0,wiki_id,character,emb
0,6002183,Horton,"[1.0488402843475342, 0.3811729848384857, 0.645..."
1,6002183,Ned McDodd,"[-0.5622232556343079, -0.2521360516548157, -0...."
2,6002183,JoJo,"[-0.7999439835548401, -0.4102073311805725, 0.6..."
3,6002183,Sally,"[0.7803803086280823, -0.7006192207336426, 0.32..."
4,6002183,LaRue,"[-0.038770418614149094, 0.219954714179039, -0...."


In [9]:
character_df.to_csv('data/trf_embeddings_for_labeled_characters.csv')

## Clustering

In [435]:
from sklearn.cluster import AgglomerativeClustering, KMeans, AffinityPropagation
X = np.array(character_df['emb'].values.tolist())
clustering = AgglomerativeClustering(n_clusters=120, metric='euclidean', linkage='complete').fit(X)
clustering.labels_

array([ 90,   0,   9, ...,   3, 103,  19])

In [436]:
character_df['cluster'] = clustering.labels_

In [437]:
character_df.groupby('cluster').size().var()

4593.06862745098

In [438]:
tropes_and_clusters = topres_df.merge(character_df, how='left', left_on='wiki_id', right_on='wiki_id').dropna()
tropes_and_clusters

Unnamed: 0,char,movie,trope,wiki_id,character,emb,cluster
0,Professor Philip Brainard,Flubber,absent_minded_professor,1344253,Philip Brainard,"[0.17131628096103668, -0.22005470097064972, -0...",21.0
1,Professor Philip Brainard,Flubber,absent_minded_professor,1344253,Sara Jean Reynolds,"[0.037948835641145706, -0.46521881222724915, 0...",19.0
2,Professor Philip Brainard,Flubber,absent_minded_professor,1344253,Sara defeat Wilson,"[0.562729001045227, 0.2940244674682617, 1.1099...",67.0
3,Professor Philip Brainard,Flubber,absent_minded_professor,1344253,Philip dumps,"[-0.2966870963573456, 0.5965226888656616, 1.20...",90.0
4,Professor Philip Brainard,Flubber,absent_minded_professor,1344253,Wilson Croft,"[0.6437947750091553, 0.1472240835428238, 0.628...",34.0
...,...,...,...,...,...,...,...
5762,Jake,Silverado,young_gun,2087781,Ethan McKendrick,"[0.27263399958610535, -0.7422829866409302, -0....",2.0
5763,Jake,Silverado,young_gun,2087781,Ray Baker,"[-0.25491079688072205, 0.18189920485019684, 0....",4.0
5764,Jake,Silverado,young_gun,2087781,Ezra,"[0.7891530394554138, 0.3878965377807617, -0.81...",17.0
5765,Jake,Silverado,young_gun,2087781,Cobb,"[0.11774065345525742, -0.28131282329559326, 0....",61.0


In [439]:
tropes_and_clusters[tropes_and_clusters['char']==tropes_and_clusters['character']]

Unnamed: 0,char,movie,trope,wiki_id,character,emb,cluster
41,Daniel Jackson,Stargate,absent_minded_professor,28327,Daniel Jackson,"[-0.2861030399799347, 0.06075410172343254, 0.3...",61.0
124,Han,Enter the Dragon,arrogant_kungfu_guy,10193,Han,"[0.02808583341538906, -0.014965079724788666, 0...",69.0
129,Johnny Lawrence,The Karate Kid,arrogant_kungfu_guy,91133,Johnny Lawrence,"[0.3227328956127167, -0.14889095723628998, 1.0...",67.0
161,Pai Mei,Kill Bill Volume 2,arrogant_kungfu_guy,525270,Pai Mei,"[0.28408095240592957, -0.15003634989261627, 0....",6.0
175,Apollo Creed,Rocky,arrogant_kungfu_guy,45772,Apollo Creed,"[0.4457775354385376, -0.37654444575309753, -0....",77.0
...,...,...,...,...,...,...,...
5559,Katsumoto,The Last Samurai,warrior_poet,228274,Katsumoto,"[-0.020103439688682556, -0.23334017395973206, ...",9.0
5664,T. E. Lawrence,Lawrence of Arabia,warrior_poet,43452,T. E. Lawrence,"[0.5024977922439575, 0.06926974654197693, 0.34...",21.0
5718,Colorado Ryan,Rio Bravo,young_gun,81100,Colorado Ryan,"[-0.8454567193984985, -0.2715449929237366, -0....",4.0
5733,Tom Sawyer,The League of Extraordinary Gentlemen,young_gun,4138142,Tom Sawyer,"[0.049022164195775986, -0.12225841730833054, 0...",61.0


In [440]:
tropes_and_clusters[tropes_and_clusters['char']==tropes_and_clusters['character']].groupby('cluster')['trope'].agg([set, lambda x: len(set(x)), lambda x: len(list(x))]).head(40)

Unnamed: 0_level_0,set,<lambda_0>,<lambda_1>
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,"{egomaniac_hunter, corrupt_corporate_executive}",2,2
1.0,{stupid_crooks},1,1
2.0,{master_swordsman},1,1
4.0,"{evil_prince, young_gun, corrupt_corporate_exe...",4,4
6.0,"{gadgeteer_genius, hitman_with_a_heart, arroga...",3,3
9.0,"{loveable_rogue, officer_and_a_gentleman, warr...",7,7
11.0,"{ophelia, chanteuse}",2,2
13.0,{master_swordsman},1,1
17.0,"{charmer, loveable_rogue}",2,4
19.0,"{granola_person, bounty_hunter, crazy_jealous_...",6,7


We can see, that the clusters don't match completely, but we have some promising clusters in which we see similar tropes.