In [5]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity

In [60]:
from utils.character_attributes_extraction import attributes2vec, word2vec

In [58]:
characters =  pd.read_csv(
    'data/character_attributes.csv', 
    index_col=0,
    converters={
        "adj": lambda x: x.strip("[]").replace("'","").split(", "), # need this to read list columns from csv
        "active": lambda x: x.strip("[]").replace("'","").split(", "),
        "patient": lambda x: x.strip("[]").replace("'","").split(", ")
        }
    )
characters.head()

Unnamed: 0,wiki_id,character,adj,active,patient
0,31186339,Snow,[],"[summons, considers]",[]
1,31186339,Haymitch Abernathy,[],[warns],[]
2,31186339,Seneca Crane,[Gamemaker],"[changes, locked]","[summons, convince]"
3,31186339,Rue,[dying],"[draws, cares, draw, trapped]","[hears, stab, comforts, killing]"
4,31186339,Primrose Everdeen,[old],[chosen],[]


In [65]:
characters['active_vec'] = characters['active'].apply(lambda x: np.array([word2vec(a).tolist() for a in x]).mean(axis=0))

In [24]:
def set_distance(a):
    n = len(a)
    if n > 0:
        res_a = cosine_similarity(a).sum() / n ** 2
    else:
        res_a = 0
        
    return res_a

attr_dist = []
active_dist = []
patient_dist = []
for i, r1 in tqdm([r for r in characters.iterrows()]):
    attr_1, active_1, patient_1 = attributes2vec(r1)
    
    attr_dist.append(set_distance(attr_1))
    active_dist.append(set_distance(active_1))
    patient_dist.append(set_distance(patient_1))

  0%|          | 0/300 [00:00<?, ?it/s]

100%|██████████| 300/300 [00:07<00:00, 38.74it/s]


In [25]:
def set_distance(a, b):
    n, m = len(a), len(b)
    if n * m > 0:
        res = cosine_similarity(a, b).sum() / n / m
    else:
        res = 0
        
    return res

def distance(r1, r2):
    attr_1, active_1, patient_1 = attributes2vec(r1)
    attr_2, active_2, patient_2 = attributes2vec(r2)

    return set_distance(attr_1, attr_2) + set_distance(active_1, active_2) + set_distance(patient_1, patient_2)

In [26]:
characters_distance = np.zeros((len(characters), len(characters)))
for i, r1 in tqdm([r for r in characters.iterrows()]):
    for j, r2 in characters.iterrows():
        if i < j:
            characters_distance[i][j] = distance(r1, r2)
            characters_distance[j][i] = distance(r1, r2)

  0%|          | 0/300 [00:00<?, ?it/s]

100%|██████████| 300/300 [54:53<00:00, 10.98s/it]  


In [27]:
for i in range(300):
    for j in range(300):
        characters_distance[i][j] = attr_dist[i] + attr_dist[j] + active_dist[i] + active_dist[j] + patient_dist[i] + patient_dist[j] - 2 * characters_distance[i][j]

In [66]:
characters['active_vec']

0         [-2.1264500617980957, -0.03609500825405121, -0...
1         [3.6321001052856445, -0.7956299781799316, 3.76...
2         [1.0319049954414368, 2.2294501066207886, -1.43...
3         [-1.6292250752449036, 1.3702874928712845, -1.6...
4         [-4.070700168609619, 3.1312999725341797, -1.24...
                                ...                        
205001    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
205002    [0.8045874238014221, 0.49913502112030983, -1.3...
205003    [-3.1821000576019287, 1.176800012588501, -3.21...
205004    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
205005    [1.0667750239372253, 1.558899998664856, -0.098...
Name: active_vec, Length: 205006, dtype: object

In [69]:
import json

tropes_list=[]
with open('data/MovieSummaries/tvtropes.clusters.txt', 'r') as f:
    s = f.readline()
    while s:
        trope = s[:s.index('\t')]
        character = json.loads(s[s.index('\t'): ])
        character['trope'] = trope
        tropes_list.append(character)
        s = f.readline()
topres_df = pd.DataFrame(tropes_list)
topres_df.head()

Unnamed: 0,char,movie,id,actor,trope
0,Professor Philip Brainard,Flubber,/m/0jy9q0,Robin Williams,absent_minded_professor
1,Professor Keenbean,Richie Rich,/m/02vchl3,Michael McShane,absent_minded_professor
2,Dr. Reinhardt Lane,The Shadow,/m/0k6fkc,Ian McKellen,absent_minded_professor
3,Dr. Harold Medford,Them!,/m/0k6_br,Edmund Gwenn,absent_minded_professor
4,Daniel Jackson,Stargate,/m/0k3rhh,James Spader,absent_minded_professor


In [70]:
movies = pd.read_csv(
    'data/MovieSummaries/movie.metadata.tsv', 
    sep='\t', 
    names=['wiki_id', 'freebase_id', 'title', 'release_date', 'revenue', 'runtime', 'languages', 'countries', 'genres']
)
topres_df = topres_df.merge(movies, how='left', left_on='movie', right_on='title')[['char', 'movie', 'trope', 'wiki_id']]
topres_df

Unnamed: 0,char,movie,trope,wiki_id
0,Professor Philip Brainard,Flubber,absent_minded_professor,1344253
1,Professor Keenbean,Richie Rich,absent_minded_professor,1486573
2,Dr. Reinhardt Lane,The Shadow,absent_minded_professor,14143328
3,Dr. Reinhardt Lane,The Shadow,absent_minded_professor,29760749
4,Dr. Reinhardt Lane,The Shadow,absent_minded_professor,5991505
...,...,...,...,...
691,Morgan Earp,Tombstone,young_gun,525113
692,Colorado Ryan,Rio Bravo,young_gun,81100
693,Tom Sawyer,The League of Extraordinary Gentlemen,young_gun,4138142
694,William H. 'Billy the Kid' Bonney,Young Guns II,young_gun,675213


In [71]:
characters_with_cluster_labels = characters[characters['wiki_id'].isin(topres_df['wiki_id'].values)]

In [85]:
from sklearn.cluster import AgglomerativeClustering
X = np.array(characters_with_cluster_labels['active_vec'].values.tolist()) + 1e-9 # constant for numerical stability
clustering = AgglomerativeClustering(n_clusters=100, metric='cosine', linkage='complete').fit(X)
clustering.labels_

array([92, 32, 67, ..., 90,  0,  0])

In [86]:
characters_with_cluster_labels['cluster'] = clustering.labels_

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  characters_with_cluster_labels['cluster'] = clustering.labels_


In [87]:
characters_with_cluster_labels.groupby('cluster').size()

cluster
0     476
1      37
2       7
3      31
4       5
     ... 
95     94
96      5
97      7
98      2
99      5
Length: 100, dtype: int64

In [88]:
tropes_and_clusters = topres_df.merge(characters_with_cluster_labels, how='left', left_on='wiki_id', right_on='wiki_id').dropna()
tropes_and_clusters

Unnamed: 0,char,movie,trope,wiki_id,character,adj,active,patient,active_vec,cluster
0,Professor Philip Brainard,Flubber,absent_minded_professor,1344253,Philip Brainard,[professor],"[developing, manages]",[],"[1.3284500241279602, 0.4241199791431427, 0.309...",20.0
1,Professor Philip Brainard,Flubber,absent_minded_professor,1344253,Sara defeat Wilson,[],[],[],"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",67.0
2,Professor Philip Brainard,Flubber,absent_minded_professor,1344253,Chester Hoenicker,[],"[sends, discovers]","[confront, defeat]","[1.7295499444007874, -0.5240299999713898, 1.79...",8.0
3,Professor Philip Brainard,Flubber,absent_minded_professor,1344253,Sara Jean Reynolds,[],"[return, confront, defeat]","[steal, win]","[-1.3484200437863667, 0.8088666598002116, -0.7...",41.0
4,Professor Philip Brainard,Flubber,absent_minded_professor,1344253,Philip dumps,[saying],"[approached, discovers, goes, flunked, awakens...","[leading, persuade]","[0.48791268738833343, 0.09813999181443994, 0.1...",0.0
...,...,...,...,...,...,...,...,...,...,...
5940,Jake,Silverado,young_gun,2087781,Cobb,[sheriff],[],"[defy, kills]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",67.0
5941,Jake,Silverado,young_gun,2087781,Augie,[nephew],[],[],"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",67.0
5942,Jake,Silverado,young_gun,2087781,Mal,[cowboy],"[finds, reunite]",[],"[2.3090500235557556, 0.9622804950922728, -0.15...",8.0
5943,Jake,Silverado,young_gun,2087781,Paden,"[man, nephew]","[ride, jailed, kills, stays]",[aids],"[-0.7538174688816071, 2.6196600273251534, -3.8...",14.0


In [89]:
tropes_and_clusters = tropes_and_clusters[tropes_and_clusters['char']==tropes_and_clusters['character']]
tropes_and_clusters

Unnamed: 0,char,movie,trope,wiki_id,character,adj,active,patient,active_vec,cluster
41,Daniel Jackson,Stargate,absent_minded_professor,28327,Daniel Jackson,"[professor, chance]","[accepts, translates, deduces, reveals, go, re...","[offers, gives, kill]","[-0.8129565224051476, 0.7743979714810848, -1.1...",0.0
124,Han,Enter the Dragon,arrogant_kungfu_guy,10193,Han,"[mysterious, infuriated]","[suspected, allow, runs, warns, ends, summons,...",[tells],"[1.0476499835357946, 1.135257041629623, -0.778...",16.0
132,Johnny Lawrence,The Karate Kid,arrogant_kungfu_guy,91133,Johnny Lawrence,[boyfriend],"[defeats, torment, advances, looked, looks, ti...","[tripping, giving, directs]","[0.3809028353009905, 0.5147542783192226, 0.157...",8.0
164,Pai Mei,Kill Bill Volume 2,arrogant_kungfu_guy,525270,Pai Mei,[],"[refused, ridicules]",[poisoned],"[-0.5073399990797043, 1.8759400248527527, -1.6...",39.0
182,Apollo Creed,Rocky,arrogant_kungfu_guy,45772,Apollo Creed,[],"[comes, Meets, says, take, sustains]","[beat, embarrassing]","[3.386649966239929, -0.28600339144468306, -0.6...",66.0
...,...,...,...,...,...,...,...,...,...,...
5727,Katsumoto,The Last Samurai,warrior_poet,228274,Katsumoto,[],"[reminded, offers, refuses, asks, dies]",[frees],"[2.21120745614171, 0.940552008152008, 0.537811...",54.0
5851,T. E. Lawrence,Lawrence of Arabia,warrior_poet,43452,T. E. Lawrence,"[himself, dejected]","[killed, meets, ignores, proposes, turns, pers...",[],"[-0.1014499742951658, 1.319502744409773, -1.66...",0.0
5898,Colorado Ryan,Rio Bravo,young_gun,81100,Colorado Ryan,[gunslinger],"[declines, proven, offers, visits, steps, thro...",[],"[2.0096672541861023, 0.8191714457103184, -0.60...",95.0
5912,Tom Sawyer,The League of Extraordinary Gentlemen,young_gun,4138142,Tom Sawyer,[],[],[],"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",67.0


In [97]:
tropes_and_clusters[['character', 'active', 'trope', 'cluster']][tropes_and_clusters['cluster']==20]


Unnamed: 0,character,active,trope,cluster
888,Robert Angier,"[blames, becomes, incensed, sabotages, argue, ...",byronic_hero,20.0
924,Loki,"[discovers, seizes, sends, betrays, allows]",byronic_hero,20.0
3289,Beth,"[intervenes, notices, defend]",granola_person,20.0
3715,John Lee,"[seeks, intends]",hitman_with_a_heart,20.0
4043,Peter Blood,"[summoned, transported, purchased, continues, ...",loveable_rogue,20.0
4187,Archibald Cunningham,"[kills, captures, presents]",master_swordsman,20.0
4192,Madmartigan,"[offers, prepares, battle, killing]",master_swordsman,20.0
4793,Aaron Hallam,"[captured, states, manages, evades]",pupil_turned_to_evil,20.0
5658,Loki,"[discovers, seizes, sends, betrays, allows]",trickster,20.0
