In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm

In [None]:
from IPython.display import clear_output

In [None]:
from sklearn.cluster import AgglomerativeClustering, KMeans, DBSCAN
from sklearn import metrics
from sklearn.decomposition import LatentDirichletAllocation

In [None]:
from utils.clustering import get_lda_clusters, get_vocab, word_topics_clustering, sort_meaningful, get_trf_clusters, topic_count

In [None]:
from utils.clustering import get_lda_clusters, get_vocab, word_topics_clustering, sort_meaningful, get_trf_clusters, topic_count

In [None]:
def _parse_list_column(cell):
    """Turn the string form of a word list, as stored in the CSV, back into a list.

    Surrounding brackets and every single quote are removed and the remainder
    is split on ", ". Note that the text "[]" therefore comes back as [''].
    """
    return cell.strip("[]").replace("'", "").split(", ")


# The list-valued columns need the converter to be read back from CSV as lists.
list_columns = ("adj", "active", "patient")
characters = pd.read_csv(
    'data/character_attributes.csv',
    index_col=0,
    converters={column: _parse_list_column for column in list_columns},
)
characters.head()

In [None]:
# Post-process the character table with the project helper `sort_meaningful`.
# NOTE(review): the meaning of the argument 3 is defined in utils.clustering and is
# not visible here — presumably a minimum number of describing words; confirm.
# This cell overwrites `characters`, so re-running it applies the helper again.
characters = sort_meaningful(characters, 3)

In [None]:
# Number of rows in `characters` after the `sort_meaningful` call.
len(characters)

### Clusters from tv_tropes

In [None]:
import json

# Each non-empty line of the file has the form "<trope>\t<JSON object>"; the JSON
# object is loaded as a dict and the trope name is added to it under 'trope'.
tropes_list = []
# Explicit encoding so parsing does not depend on the platform's default codec.
with open('data/MovieSummaries/tvtropes.clusters.txt', 'r', encoding='utf-8') as f:
    for line in f:
        if not line.strip():
            continue  # tolerate blank lines, e.g. an extra newline at the end of the file
        # Split on the first tab only; a line without a tab raises ValueError.
        trope, payload = line.split('\t', 1)
        character = json.loads(payload)
        character['trope'] = trope
        tropes_list.append(character)
topres_df = pd.DataFrame(tropes_list)
topres_df.head()


# Column names supplied explicitly for the movie metadata TSV.
movie_columns = [
    'wiki_id', 'freebase_id', 'title', 'release_date', 'revenue',
    'runtime', 'languages', 'countries', 'genres',
]
movies = pd.read_csv('data/MovieSummaries/movie.metadata.tsv', sep='\t', names=movie_columns)

# Left join on the movie title to attach a wiki_id to every trope row; rows whose
# movie name matches no title keep a missing wiki_id.
tropes_with_movies = pd.merge(topres_df, movies, how='left', left_on='movie', right_on='title')
topres_df = tropes_with_movies[['char', 'movie', 'trope', 'wiki_id']]
topres_df

In [None]:
# Pair every trope-labelled row with the rows of `characters` that share its wiki_id,
# then discard any row that still contains a missing value.
tropes_joined = pd.merge(topres_df, characters, how='left', on='wiki_id')
tropes_and_clusters = tropes_joined.dropna()
tropes_and_clusters

In [None]:
def same_name(names1, names2):
    """Flag, position by position, whether the entry of ``names2`` occurs in the entry of ``names1``.

    Parameters
    ----------
    names1, names2 : objects exposing ``.values`` (e.g. pandas Series)
        Compared by position, not by index label.

    Returns
    -------
    list of bool
        One flag per element of ``names1``: ``names2[i] in names1[i]``.
    """
    containers = names1.values
    candidates = names2.values
    return [candidates[pos] in container for pos, container in enumerate(containers)]

# Keep only the rows whose 'character' value occurs inside the 'char' value of the same row.
name_matches = same_name(tropes_and_clusters['char'], tropes_and_clusters['character'])
tropes_and_clusters = tropes_and_clusters[name_matches]

In [None]:
# Columns kept for the evaluation, with a fresh 0..n-1 index.
evaluation_columns = ['character', 'adj', 'active', 'patient', 'trope', 'wiki_id', 'movie']
characters_to_check = tropes_and_clusters.loc[:, evaluation_columns].reset_index(drop=True)
characters_to_check

# Clustering evaluation

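We want to see how well our clustering algorithm performs compared to the original algorithm proposed in the paper [Learning Latent Personas of Film Characters](http://www.cs.cmu.edu/~dbamman/pubs/pdf/bamman+oconnor+smith.acl13.pdf). As the measure we use the variation of information (VI) between our clusters and the TV Tropes labels; lower values mean closer agreement.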

In [None]:
from math import log

def group_labels_by_clusters(clusters):
    """Group item positions by their cluster label.

    Parameters
    ----------
    clusters : 1-D sequence of labels
        ``clusters[i]`` is the label of item ``i``; labels may be of any type
        that ``np.unique`` can sort (ints, strings, ...).

    Returns
    -------
    list of list of int
        One inner list per distinct label, in the sorted order produced by
        ``np.unique``, holding the positions of the items with that label.
        An empty input yields an empty list.
    """
    unique_labels, inverse = np.unique(clusters, return_inverse=True)
    # Sizing by the number of distinct labels (rather than max(inverse) + 1)
    # gives the same result and also works when the input is empty.
    groups = [[] for _ in range(len(unique_labels))]
    for position, cluster_id in enumerate(inverse):
        groups[cluster_id].append(position)
    return groups

def variation_of_information(X, Y):
    """Variation of information (in bits) between two clusterings.

    Parameters
    ----------
    X, Y : iterable of collections
        Each clustering is given as a collection of clusters, each cluster
        being a collection of item identifiers (the format returned by
        ``group_labels_by_clusters``). The total item count is taken from
        ``X``; the two clusterings are assumed to cover the same items.

    Returns
    -------
    float
        ``|sum_{x,y} r * (log2(r / p) + log2(r / q))|`` with ``p = |x| / n``,
        ``q = |y| / n`` and ``r = |x & y| / n``. Returns 0.0 when there are
        no items at all.
    """
    n = float(sum(len(x) for x in X))
    if n == 0:
        # Nothing to compare; also avoids dividing by zero below.
        return 0.0

    # Build each cluster's size and set once instead of once per (x, y) pair.
    y_parts = [(len(y), set(y)) for y in Y]

    sigma = 0.0
    for x in X:
        p = len(x) / n
        x_set = set(x)
        for y_size, y_set in y_parts:
            q = y_size / n
            r = len(x_set & y_set) / n
            if r > 0.0:
                sigma += r * (log(r / p, 2) + log(r / q, 2))
    return abs(sigma)

In [None]:
# Reference grouping: row positions of `characters_to_check` grouped by their 'trope' value.
tv_tropes = group_labels_by_clusters(characters_to_check['trope'].values)

In [None]:
# Number of distinct tropes among the characters being evaluated.
len(tv_tropes)

In [None]:
# Grid of settings: n_clusters of the agglomerative clustering ("topics" in the
# labels) crossed with n_components ("archetypes" in the labels).
agglomerative_clusters_n = [25, 50, 100]
n_components = [25, 50, 100]

config_base = {'characters': characters_to_check, 'min_freq': 5, 'max_freq': 0.9}

# One set of get_lda_clusters keyword arguments per grid point, keyed by a readable label.
configs = {}
for alg_n in agglomerative_clusters_n:
    for n in n_components:
        configs[f'{alg_n} topics, {n} archetypes'] = dict(
            config_base,
            clustering_algo=AgglomerativeClustering(n_clusters=alg_n, metric='cosine', linkage='complete'),
            n_components=n,
        )

# Score every configuration by its variation of information against the trope grouping.
results_lda = {}
for label, lda_kwargs in configs.items():
    predicted_groups = group_labels_by_clusters(get_lda_clusters(**lda_kwargs))
    results_lda[label] = variation_of_information(predicted_groups, tv_tropes)
    print(label, f'VI = {results_lda[label]}')

# Clear the per-configuration printout once the results dictionary is displayed.
clear_output(wait=True)
results_lda

Note that our results are even better than those reported in the [paper](http://www.cs.cmu.edu/~dbamman/pubs/pdf/bamman+oconnor+smith.acl13.pdf): for K=100, P=100 the paper reports a VI of 5.42, while we obtain 4.9 (lower is better). This could indicate that using word2vec embeddings and agglomerative clustering to group the words into topics might be better suited to persona extraction.

## BERT based clustering

In [None]:
def _parse_embedding(cell):
    """Parse the string form of a list of numbers, as stored in the CSV, into a list of floats."""
    return [float(value) for value in cell.strip("[]").replace("'", "").split(", ")]


characters_with_trf_emb = pd.read_csv(
    'data/trf_embeddings_for_labeled_characters.csv',
    index_col=0,
    converters={"emb": _parse_embedding},
)
characters_with_trf_emb.head()

In [None]:
# Join the labelled characters with the embedding table on wiki_id and drop incomplete rows.
trf_joined = pd.merge(characters_to_check, characters_with_trf_emb, how='left', on='wiki_id').dropna()
# Both tables carry a 'character' column (suffixed _x / _y by the merge); keep rows where they agree.
same_character = trf_joined['character_x'] == trf_joined['character_y']
tropes_and_clusters = trf_joined[same_character]
characters_to_check_trf = tropes_and_clusters[['character_y', 'emb', 'trope', 'wiki_id', 'movie']]

In [None]:
# Reference grouping for the embedding experiment: row positions of
# `characters_to_check_trf` grouped by their 'trope' value.
tv_tropes_trf = group_labels_by_clusters(characters_to_check_trf['trope'].values)

In [None]:
# Evaluate clustering of the character embeddings for each number of archetypes,
# once with agglomerative clustering and once with k-means.
results_trf = {}
for n in n_components:
    # Fresh estimators for every n. KMeans initialisation is random, so it is
    # seeded to make the reported VI reproducible between runs.
    algorithms = {
        'agglomerative': AgglomerativeClustering(n_clusters=n, metric='euclidean', linkage='complete'),
        'kmeans': KMeans(n_clusters=n, random_state=0),
    }
    for algo_name, algo in algorithms.items():
        k = f'{n} archetypes, {algo_name} clustering'
        clusters = get_trf_clusters(characters_to_check_trf, algo)
        results_trf[k] = variation_of_information(group_labels_by_clusters(clusters), tv_tropes_trf)
        print(k, f'VI = {results_trf[k]}')

# Clear the per-run printout once the results dictionary is displayed.
clear_output(wait=True)
results_trf

The BERT-based clustering is still better than the results from the paper, but extracting BERT embeddings is very slow, so we will stick to the previous (LDA-based) method.

# Find optimal number of clusters

In [None]:
def medoid(vec):
    """Return the element of ``vec`` with the smallest total L1 distance to all elements.

    Parameters
    ----------
    vec : indexable collection of numeric vectors (e.g. a 2-D numpy array)

    Returns
    -------
    The element ``vec[i]`` minimising the sum over ``j`` of
    ``sum(|vec[i] - vec[j]|)``; the first such element on ties.
    """
    count = len(vec)
    dist_matrix = np.zeros((count, count))
    # Fill the symmetric pairwise-distance matrix from its upper triangle.
    for first in range(count):
        for second in range(first + 1, count):
            l1_distance = np.sum(np.abs(vec[first] - vec[second]))
            dist_matrix[first, second] = l1_distance
            dist_matrix[second, first] = l1_distance
    totals = dist_matrix.sum(axis=0)
    return vec[np.argmin(totals)]

def unsupervised_evaluation(features, labels):
    """Score a clustering without ground truth.

    Parameters
    ----------
    features : array of feature vectors, one row per sample
        Indexed with a boolean mask, so a numpy array is assumed — TODO confirm
        against what ``get_lda_clusters`` returns.
    labels : array of cluster labels, one per row of ``features``

    Returns
    -------
    (wss, sil_score)
        ``wss`` is the sum, over clusters, of squared differences between the
        cluster's members and the cluster's medoid (as computed by ``medoid``);
        ``sil_score`` is ``sklearn.metrics.silhouette_score(features, labels)``.
    """
    wss = 0
    for label in np.unique(labels):
        members = features[labels == label]
        centre = np.array(medoid(members))
        # Squared deviation of every member from the cluster's medoid.
        wss += np.sum((members - centre) ** 2)

    sil_score = metrics.silhouette_score(features, labels)
    return wss, sil_score

In [None]:
# Evaluate on a random subset of 1000 characters; the fixed seed keeps the subset
# (and therefore the scores computed from it) identical between runs.
characters_for_eval = characters.sample(1000, random_state=0).reset_index(drop=True)

In [None]:
# Clustering estimator handed to get_lda_clusters for every run of the sweep.
clustering_algo = AgglomerativeClustering(n_clusters=200, metric='cosine', linkage='complete')

# Candidate values 10, 15, ..., 100 and the scores collected for each of them.
k_values = np.arange(10, 101, 5)
wsss, silhouettes = [], []
min_wss_idx = 0

for n_archetypes in tqdm(k_values):
    y, X = get_lda_clusters(characters_for_eval, 5, 0.9, clustering_algo, n_archetypes, return_topic_counts=True)
    wss, silhouette = unsupervised_evaluation(X, y)
    wsss.append(wss)
    silhouettes.append(silhouette)
    # Drop the output printed during this iteration before the next one starts.
    clear_output(wait=True)

In [None]:
# Side-by-side curves of both scores against the number of clusters.
fig, (ax_wss, ax_sil) = plt.subplots(1, 2, figsize=(12, 4))

ax_wss.plot(k_values, wsss)
ax_wss.set_title("WSS scores")
ax_wss.set_xlabel("Number of clusters")
ax_wss.set_ylabel("WSS")

ax_sil.plot(k_values, silhouettes)
ax_sil.set_title("Silhouette scores")
ax_sil.set_xlabel("Number of clusters")
ax_sil.set_ylabel("Silhouette score")

# Keep the axis labels of the two panels from overlapping.
fig.tight_layout()
plt.show()

We can see that 60 clusters provide a relatively good combination of silhouette and WSS scores while still being manageable to interpret.

In [None]:
# Number of clusters chosen from the WSS / silhouette curves; used as the
# n_components of the final LDA fit.
optimal_k = 60

In [None]:
%%script false --no-raise-error
# NOTE: the `%%script false` magic above hands this cell to the `false` command
# instead of Python, so the cell is effectively disabled; remove the magic to run it.
# We want to save components of lda and word clusters (topic_dict) as well as clustering

# Build the vocabulary and its vectors (arguments 5 and 0.9 as in the evaluation runs).
vocab, vocab_vectors = get_vocab(characters, 5, 0.9)
# Cluster the vocabulary with `clustering_algo`; the result is kept as `topic_dict`.
topic_dict = word_topics_clustering(vocab, vocab_vectors, clustering_algo)
# Per-character counts derived from `topic_dict`; these are the LDA input.
counts = topic_count(characters, topic_dict)
# Seeded LDA with `optimal_k` components, fitted on the counts.
lda = LatentDirichletAllocation(
        n_components=optimal_k, random_state=0
).fit(counts)

# Assign each character to the LDA component with the highest value in its transform row.
characters['cluster'] = lda.transform(counts).argmax(axis=1)

In [None]:
%%script false --no-raise-error
# Disabled cell (run through `false` instead of Python): writes the `characters`
# table to CSV when enabled.

characters.to_csv('data/character_clusters.csv')

In [None]:
%%script false --no-raise-error
# Disabled cell (run through `false` instead of Python).
# Invert `topic_dict` (key -> topic value) into {str(topic): [keys...]} and save it as JSON.
topic_to_words = {}

for k, v in topic_dict.items():
    # Topic values are stringified so they can serve as JSON object keys.
    topic_to_words.setdefault(str(v), []).append(k)

# The context manager closes the file even if serialisation fails.
with open("data/words_by_topic.json", 'w') as f:
    json.dump(topic_to_words, f)

In [None]:
%%script false --no-raise-error

# Disabled cell (run through `false` instead of Python): stores the fitted LDA
# `components_` array in NumPy's .npy format.
np.save('data/lda_components.npy', lda.components_)