In [None]:
# Automatically reload imported modules before each cell executes, so edits to the
# project's `utils` package are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm

In [None]:
from IPython.display import clear_output

In [None]:
from sklearn.cluster import AgglomerativeClustering, KMeans

In [None]:
from utils.clustering import get_lda_clusters, get_vocab, word_topics_clustering, sort_meaningful, get_trf_clusters, topic_count
from utils.clustering_evaluation import get_characters_with_tv_trop_info, variation_of_information, group_labels_by_clusters

# Clustering methods comparison

We use the Variation of Information (VI) between our clusters and the gold-standard clusters from TV Tropes, as suggested in [Learning Latent Personas of Film Characters](http://www.cs.cmu.edu/~dbamman/pubs/pdf/bamman+oconnor+smith.acl13.pdf). This way we can compare the performance of our methods with that of the original method. VI is a distance between two clusterings, so lower values mean closer agreement with the gold clusters.

### LDA based clustering

In [None]:
def parse_list_column(cell):
    """Parse a list column that was written to CSV as its string representation.

    Strips the surrounding brackets and all single quotes, then splits on ", ".
    An empty list ("[]") yields [] rather than [''], so a missing feature type
    is not counted as one (empty-string) feature by the filter below.
    """
    stripped = cell.strip("[]").replace("'", "")
    return stripped.split(", ") if stripped else []


characters_attributes = pd.read_csv(
    'data/character_attributes.csv',
    index_col=0,
    # need this to read list columns from csv
    converters={column: parse_list_column for column in ("adj", "active", "patient")},
)
# select only the characters who have at least 3 linguistic features
characters_attributes = sort_meaningful(characters_attributes, 3)

characters_attributes.head()

In [None]:
# Returns two values: the characters to evaluate and `tv_tropes`, which is passed to
# variation_of_information below as the reference clustering.
# Presumably only characters with a TV Tropes label are kept — verify in utils.clustering_evaluation.
characters_to_check, tv_tropes = get_characters_with_tv_trop_info(characters_attributes)

In [None]:
# Grid to evaluate: number of clusters for the agglomerative algorithm (labelled
# "topics") crossed with the number of components (labelled "archetypes").
agglomerative_clusters_n = [25, 50, 100]
n_components = [25, 50, 100]

# Arguments shared by every run; each grid point adds its own clustering
# algorithm and component count on top of these.
config_base = {'characters': characters_to_check, 'min_freq': 5, 'max_freq':0.9}
configs = {
    f'{n_topics} topics, {n_archetypes} archetypes': {
        **config_base,
        'clustering_algo': AgglomerativeClustering(n_clusters=n_topics, metric='cosine', linkage='complete'),
        'n_components': n_archetypes,
    }
    for n_topics in agglomerative_clusters_n
    for n_archetypes in n_components
}

# Score every configuration against the TV Tropes reference clustering.
results_lda = {}
for label, lda_kwargs in configs.items():
    lda_clusters = get_lda_clusters(**lda_kwargs)
    vi_score = variation_of_information(group_labels_by_clusters(lda_clusters), tv_tropes)
    results_lda[label] = vi_score
    print(label, f'VI = {vi_score}')

# Drop the per-run progress lines and show only the final summary dict.
clear_output(wait=True)
results_lda

Note that the results are even better than those reported in the [paper](http://www.cs.cmu.edu/~dbamman/pubs/pdf/bamman+oconnor+smith.acl13.pdf): for K=100, P=100 the paper reports a VI of 5.42, while we obtain 4.9 (lower is better). This could indicate that using word2vec embeddings and agglomerative clustering to group the words into topics is better suited for persona extraction.

### BERT based clustering

In [None]:
def _parse_embedding(cell):
    """Turn an embedding stored in the CSV as a stringified list into a list of floats."""
    return list(map(float, cell.strip("[]").replace("'", "").split(", ")))


trf_embeddings_all = pd.read_csv(
    'data/trf_embeddings_for_labeled_characters.csv',
    index_col=0,
    converters={"emb": _parse_embedding},
)

# Keep only the rows whose wiki_id also occurs among the characters evaluated in the previous step.
# NOTE(review): a later cell counts distinct wiki_id values as movies, so this may keep
# every character of those movies rather than exactly the compared characters — confirm.
compared_wiki_ids = characters_to_check['wiki_id'].values
characters_with_trf_emb = trf_embeddings_all[trf_embeddings_all['wiki_id'].isin(compared_wiki_ids)]

characters_with_trf_emb.head()

In [None]:
# Same TV Tropes lookup as in the LDA section, applied to the embedding dataframe.
# NOTE: this rebinds `tv_tropes`; re-running the LDA evaluation cell after this one
# would score against this value instead of the one computed for the LDA characters.
characters_to_check_trf, tv_tropes = get_characters_with_tv_trop_info(characters_with_trf_emb)

In [None]:
# k-means starts from random centroids; a fixed seed keeps the reported VI scores
# reproducible across Restart & Run All.
KMEANS_SEED = 42

# For every archetype count, cluster the embeddings with both algorithms and score
# each clustering against the TV Tropes reference clustering.
results_trf = {}
for n in n_components:
    algorithms = {
        'agglomerative': AgglomerativeClustering(n_clusters=n, metric='euclidean', linkage='complete'),
        'kmeans': KMeans(n_clusters=n, random_state=KMEANS_SEED),
    }
    for algo_name, algo in algorithms.items():
        k = f'{n} archetypes, {algo_name} clustering'
        clusters = get_trf_clusters(characters_to_check_trf, algo)
        results_trf[k] = variation_of_information(group_labels_by_clusters(clusters), tv_tropes)
        print(k, f'VI = {results_trf[k]}')

# Drop the per-run progress lines and show only the final summary dict.
clear_output(wait=True)
results_trf

The results of the BERT-embedding-based clustering are also better than the results from the paper. Still, obtaining these embeddings is slow, and the difference between this method and the previous one is small, so we will stick to the faster and more interpretable LDA-based method.

# Loading the data

For the pipeline that extracts character names and linguistic features, please refer to `extract_character_attributes.ipynb`. For the clustering pipeline, as well as the comparison of the different clustering methods, refer to `clustering.ipynb`. For our initial analysis we will use 60 clusters.

In [None]:
# adj / active / patient were written to the CSV as stringified lists; the shared
# converter strips the brackets and quotes and splits on ", " to recover the items.
# NOTE(review): an empty list ("[]") is read back as [''] rather than [].
list_column_converters = dict.fromkeys(
    ("adj", "active", "patient"),
    lambda raw: raw.strip("[]").replace("'", "").split(", "),
)
characters = pd.read_csv(
    'data/character_clusters.csv',
    index_col=0,
    converters=list_column_converters,
)
characters.head()

In [None]:
# Report the size of the clustered dataset: rows are characters, distinct wiki_id values are movies.
n_characters = len(characters)
n_movies = len(set(characters['wiki_id'].values))
print(f"In the clustered characters dataframe there are {n_characters} characters from {n_movies} movies")

In [None]:
# Column names are passed explicitly, so pandas treats the first row of the file as
# data (the TSV is assumed to have no header row).
movie_columns = [
    'wiki_id',
    'freebase_id',
    'title',
    'release_date',
    'revenue',
    'runtime',
    'languages',
    'countries',
    'genres',
]
movies = pd.read_csv('data/MovieSummaries/movie.metadata.tsv', sep='\t', names=movie_columns)

In [None]:
# Attach movie metadata to every character (left join on wiki_id), then keep only
# the rows whose movie has a known revenue.
characters_and_movies = (
    characters
    .merge(movies, how='left', on='wiki_id')
    .dropna(subset=['revenue'])
)

n_characters_with_revenue = len(characters_and_movies)
n_movies_with_revenue = len(set(characters_and_movies['wiki_id'].values))
print(f"In the clustered characters with movie metadata dataframe there are {n_characters_with_revenue} characters from {n_movies_with_revenue} movies with the revenue data")

In [None]:
# Preview a few random rows; the fixed random_state makes the displayed sample
# identical on every Restart & Run All.
characters_and_movies.sample(5, random_state=42)

In [None]:
# Look up the clusters assigned to characters from movies with "Batman" in the title.
# na=False treats a missing title as "no match" instead of producing an NA entry in
# the mask, which boolean indexing would reject.
is_batman_movie = characters_and_movies['title'].str.contains("Batman", na=False)
characters_and_movies.loc[is_batman_movie, ['title', 'character', 'cluster']]

We can see that cluster 42 is probably the superhero cluster.

# Initial analysis of the actors' success

# Genre prediction using clusters

# Revenue prediction using clusters