In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm
import torch

In [2]:
from utils.transformer_character_embeddings import embeddings_from_text

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Read the tab-separated plot summaries; the column names are supplied here
# via `names`, so every line of the file is treated as data.
plots = pd.read_csv(
    'data/MovieSummaries/plot_summaries.txt',
    names=['wiki_id', 'plot'],
    sep='\t',
)

# Collapse each run of whitespace inside a plot into a single space and strip
# leading/trailing whitespace (str.split() with no argument does both).
plots['plot'] = plots['plot'].map(lambda text: ' '.join(text.split()))

plots.head(5)

Unnamed: 0,wiki_id,plot
0,23890098,"Shlykov, a hard-working taxi driver and Lyosha..."
1,31186339,The nation of Panem consists of a wealthy Capi...
2,20663735,Poovalli Induchoodan is sentenced for six year...
3,2231378,"The Lemon Drop Kid , a New York City swindler,..."
4,595909,Seventh-day Adventist Church pastor Michael Ch...


In [4]:
# Plot text of the row at position 1 (the second row) of `plots`.
input_text = plots['plot'].iloc[1]

In [6]:
# Build one record per (movie, character) pair for the first N_PLOTS plots.
N_PLOTS = 100  # number of leading plot summaries to run through the embedder

# Slice BEFORE iterating: materialising `plots.iterrows()` for the whole frame
# and then keeping the first rows builds a Series for every row for nothing.
plots_subset = plots.head(N_PLOTS)

character_list = []

# Iterating the two columns directly avoids the per-row Series that iterrows()
# creates; `total` is passed because zip() has no length for tqdm to read.
for wiki_id, plot in tqdm(
    zip(plots_subset['wiki_id'], plots_subset['plot']),
    total=len(plots_subset),
):
    # Mapping of character name -> embedding for this plot; each embedding is
    # converted with .tolist() so it is stored as a plain Python list.
    character_emb = embeddings_from_text(plot)

    character_list.extend(
        {
            'wiki_id': wiki_id,
            'character': name,
            'emb': embedding.tolist(),
        }
        for name, embedding in character_emb.items()
    )

# Explicit columns keep the schema stable even when no character was found.
character_df = pd.DataFrame(character_list, columns=['wiki_id', 'character', 'emb'])
character_df.head()

100%|██████████| 100/100 [05:00<00:00,  3.01s/it]


Unnamed: 0,wiki_id,character,emb
0,31186339,Primrose Everdeen,"[-0.30263614654541016, -0.6969847679138184, 0...."
1,31186339,Katniss,"[-0.19846008718013763, -0.7502771019935608, 0...."
2,31186339,Peeta Mellark,"[-0.10818104445934296, -0.3706214129924774, -0..."
3,31186339,Haymitch Abernathy,"[-0.1719408631324768, -0.26161321997642517, -0..."
4,31186339,Caesar Flickerman,"[0.8750091791152954, -0.30963605642318726, -0...."


In [7]:
# Persist the character embeddings (the DataFrame index is written as the
# first, unnamed column — pandas' default).
# NOTE(review): the file name says "1000" but the loop that builds
# `character_df` only processes the first 100 plots — confirm which is intended.
character_df.to_csv(path_or_buf='data/trf_embeddings_1000.csv', index=True)

In [156]:
from sklearn.cluster import AgglomerativeClustering

# Stack the per-character embedding lists into one 2-D array (one row per character).
X = np.array(character_df['emb'].tolist())

# No fixed cluster count: clusters are merged with complete linkage on cosine
# distance until the linkage distance reaches the 0.75 threshold.
clustering = AgglomerativeClustering(
    n_clusters=None,
    distance_threshold=0.75,
    metric='cosine',
    linkage='complete',
)
clustering.fit(X)

# Cluster label assigned to each row of `character_df`.
clustering.labels_

array([28,  4,  4, 16,  2,  0,  4,  9, 11, 31, 31, 31,  7, 31, 31, 31, 31,
        7,  7,  7,  7, 31, 31, 31, 30, 16, 16,  0, 11,  6, 11,  1,  1,  0,
        2,  0, 30,  2,  2,  0,  0,  0,  1,  0, 13, 28,  0, 13, 13, 28,  0,
        0,  0, 32, 11,  0, 25, 32,  4, 30,  0,  0,  0,  0,  1,  0,  0,  1,
        1,  1,  1,  1,  5,  3, 24,  3, 13, 24,  2, 24,  6, 24,  8,  8,  8,
       21, 30,  1, 13,  0,  1,  1,  8, 30, 12,  2,  4, 30,  0,  0,  4, 13,
       11,  9,  2,  0,  9,  3,  3, 13, 13,  0,  0,  1, 15, 15, 18, 23, 16,
        0,  1, 18, 18, 30, 28, 30, 32,  4, 11, 28, 20,  7,  7,  7,  7,  7,
        7,  2,  7,  7,  7,  7,  7,  7,  7, 16,  7,  7, 16, 16,  7,  7, 16,
        8, 24, 24, 24, 24, 13,  0, 13,  0,  9, 25,  0,  0, 30,  9, 13, 28,
       28, 12, 23,  3,  0, 13,  0,  0, 11, 11, 15, 15, 16, 15, 15, 32, 15,
       15,  4, 29,  4, 13,  3, 28, 18, 27, 27, 16, 27, 27,  2, 27, 27,  2,
       27, 27, 16,  2, 27, 27, 27, 27, 25, 27,  2,  2, 16, 27,  2, 16, 12,
       19, 19, 13, 19, 19

In [157]:
# Names of the characters that were assigned to cluster 0.
character_df.loc[clustering.labels_ == 0, 'character']

5                      Rue
27                 Charley
33     Michael Chamberlain
35             Alex Thomas
39            Jack Baldwin
              ...         
450              Madeleine
451                Bernard
460             Abbas Khan
473       Anton Gorodetsky
513                    Soy
Name: character, Length: 68, dtype: object

Cluster of mostly women's names:

In [158]:
# Names of the characters that were assigned to cluster 1.
character_df.loc[clustering.labels_ == 1, 'character']

31                 Azaria
32                  Lindy
42                 Dahlia
64                  Dotty
67                 Simona
68                   Eman
69                  Lubos
70                  Goran
71              Frantisek
87       Shelia Landreaux
90                   Zora
91                 Salena
113       Arabella Simone
120                 Addie
273           Odile Deray
281              Narcissa
306                  Jada
315               Quincey
370      Sergio Constanza
386               Jeannie
391          Hester Adams
405                  Cera
415    Miriam fires Velma
420                 Bobbi
423                 Malou
483                 Wanda
485                   Jan
486                   Eva
495           Thara Kurup
Name: character, dtype: object

Cluster of names with a trailing possessive 's or a verb fused to the name, e.g. "Thomas jumps" (a preprocessing bug in name extraction):

In [159]:
# Names of the characters that were assigned to cluster 2.
character_df.loc[clustering.labels_ == 2, 'character']

4          Caesar Flickerman
34              Chamberlains
37             Alex Thomas's
38              Thomas jumps
78          Ari Ben Canaan's
95              Shelia Kid's
104                  Lowells
137            Shambu Dada’s
200                Cao Cao's
203                Liu Bei's
207            Zhuge Liang's
214              Jiang Gan's
215                Zhou Yu's
218              Zhuge walks
230             Betty Boop's
267             Mud Buddha's
272                  milkman
287    Seth Michael Donsky's
313                  Husband
351            Ellen Brody's
375                Wasserman
381                  Woodmen
396         Stephen bargains
499       Sukumaran committs
Name: character, dtype: object

Mostly animated characters:

In [162]:
# Names of the characters that were assigned to cluster 4.
character_df.loc[clustering.labels_ == 4, 'character']

1            Katniss
2      Peeta Mellark
6               Cato
58      Buzz Buzzard
96           Pee Wee
100          Pee-Wee
127            Lumpy
188       Daffy Duck
190            Porky
229             Koko
231            Bimbo
232       Betty Boop
263          Dee Dee
269            Porky
270           Dobbin
304         Baby Boy
339          Wile E.
347           Hoagie
401             Mutt
404       Littlefoot
448            Pudgy
Name: character, dtype: object

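This cluster is just strange: it mixes real and legendary figures (Charlie Chaplin, Santa Claus) with entities that are not characters at all (Auschwitz, Gestapo):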

In [166]:
# Names of the characters that were assigned to cluster 6.
character_df.loc[clustering.labels_ == 6, 'character']

29         Santa Claus
80           Auschwitz
338    Charlie Chaplin
482          Auschwitz
484            Gestapo
Name: character, dtype: object