In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm
import seaborn as sns

In [None]:
import ast
import json

from IPython.display import clear_output

In [None]:
from sklearn.cluster import AgglomerativeClustering, KMeans

In [None]:
from utils.clustering import get_lda_clusters, get_vocab, word_topics_clustering, sort_meaningful, get_trf_clusters, topic_count
from utils.clustering_evaluation import get_characters_with_tv_trop_info, variation_of_information, group_labels_by_clusters

# Clustering methods comparison

We use Variation of Information between our clusters and golden clusters from TV Tropes as suggested in [Learning Latent Personas of Film Characters](http://www.cs.cmu.edu/~dbamman/pubs/pdf/bamman+oconnor+smith.acl13.pdf). This way we can compare our methods' performance with the original method performance.

### LDA based clustering
For the character names and linguistic features extraction pipeline, please refer to `extract_character_attributes.ipynb`.

In [None]:
def _parse_list_column(cell):
    """Turn the csv text of a stringified list (e.g. "['a', 'b']") back into a list of strings.

    The brackets and single quotes are stripped and the rest is split on ", ".
    An empty list ("[]") therefore comes back as [''], not as [].
    """
    return cell.strip("[]").replace("'", "").split(", ")


# The three list-valued columns are stored as text in the csv and are parsed
# back into lists while the file is read.
characters_attributes = pd.read_csv(
    'data/character_attributes.csv',
    index_col=0,
    converters={column: _parse_list_column for column in ("adj", "active", "patient")},
)
# keep only the characters who have at least 3 linguistic features
characters_attributes = sort_meaningful(characters_attributes, 3)

characters_attributes.head()

In [None]:
# Look up the TV Tropes information for our characters.
# characters_to_check is later passed to get_lda_clusters as the characters to cluster,
# tv_tropes is later passed to variation_of_information as the reference to score against.
# NOTE(review): the exact structure of both return values is defined in utils.clustering_evaluation — confirm there.
characters_to_check, tv_tropes = get_characters_with_tv_trop_info(characters_attributes)

In [None]:
# Grid of settings to compare.
agglomerative_clusters_n = [25, 50, 100]  # n_clusters of the agglomerative clustering ("topics" in the result keys)
n_components = [25, 50, 100]  # n_components passed to get_lda_clusters ("archetypes" in the result keys)

# Arguments shared by every run.
config_base = {'characters': characters_to_check, 'min_freq': 5, 'max_freq':0.9}

# One configuration per (topics, archetypes) pair, keyed by a readable label.
configs = {
    f'{n_topics} topics, {n_archetypes} archetypes': {
        **config_base,
        'clustering_algo': AgglomerativeClustering(n_clusters=n_topics, metric='cosine', linkage='complete'),
        'n_components': n_archetypes,
    }
    for n_topics in agglomerative_clusters_n
    for n_archetypes in n_components
}

# Cluster the characters with every configuration and score the clusters
# against the TV Tropes reference with the Variation of Information.
results_lda = {}
for label, run_config in configs.items():
    clusters = get_lda_clusters(**run_config)
    score = variation_of_information(group_labels_by_clusters(clusters), tv_tropes)
    results_lda[label] = score
    print(label, f'VI = {score}')

# Replace the progress lines with the final dictionary of scores.
clear_output(wait=True)
results_lda

Note that the results are even better than the results from the [paper](http://www.cs.cmu.edu/~dbamman/pubs/pdf/bamman+oconnor+smith.acl13.pdf): for K=100, P=100 the Variation of Information is 5.42 in the paper and 4.9 here (lower is better). This could indicate that using word2vec embeddings and agglomerative clustering of the words might be better suited for dividing the words into topics for the purpose of persona extraction.

### BERT based clustering

For the embedding extraction see `utils/archive/transformer_embeddings.ipynb`

In [None]:
def _parse_embedding(cell):
    """Parse the csv text of an embedding (a stringified list of numbers) into a list of floats."""
    pieces = cell.strip("[]").replace("'", "").split(", ")
    return [float(piece) for piece in pieces]


characters_with_trf_emb = pd.read_csv(
    'data/trf_embeddings_for_labeled_characters.csv',
    index_col=0,
    converters={"emb": _parse_embedding},
)
# Keep only the rows whose wiki_id also occurs among the characters evaluated in the previous step
evaluated_wiki_ids = characters_to_check['wiki_id'].values
characters_with_trf_emb = characters_with_trf_emb[characters_with_trf_emb['wiki_id'].isin(evaluated_wiki_ids)]

characters_with_trf_emb.head()

In [None]:
# Same TV Tropes lookup as for the LDA features, now for the characters that have a transformer embedding.
# Note: this rebinds tv_tropes, replacing the value that was used for the LDA evaluation.
characters_to_check_trf, tv_tropes = get_characters_with_tv_trop_info(characters_with_trf_emb)

In [None]:
# k-means starts from random centroids; a fixed seed makes the reported scores
# reproducible after a kernel restart.
KMEANS_SEED = 0

# Variation of Information of the embedding-based clusters, keyed by a readable label.
results_trf = {}
for n in n_components:
    # Both algorithms get the same number of clusters (archetypes) so that their scores are comparable.
    agglomerative = AgglomerativeClustering(n_clusters=n, metric='euclidean', linkage='complete')
    kmeans = KMeans(n_clusters=n, random_state=KMEANS_SEED)

    for algo_name, algo in (('agglomerative', agglomerative), ('kmeans', kmeans)):
        k = f'{n} archetypes, {algo_name} clustering'
        clusters = get_trf_clusters(characters_to_check_trf, algo)
        results_trf[k] = variation_of_information(group_labels_by_clusters(clusters), tv_tropes)
        print(k, f'VI = {results_trf[k]}')

# Replace the progress lines with the final dictionary of scores.
clear_output(wait=True)
results_trf

The results of the BERT-embedding-based clustering are also better than the results from the paper. Still, obtaining these embeddings is slow, and the difference between this method and the previous one is not that big, so we will stick to the faster and more interpretable LDA-based method.

**Comparison Table**
|                        |Paper K=100, P=100| LDA-based clustering  K=100, P=100| BERT-based kmeans clustering P=100|
|------------------------|------------------|-----------------------------------|-----------------------------------|
|Variation of Information|              5.42|                               4.90|                              ~4.80|


# Loading the data

For the clustering pipeline, as well as the comparison of the different clustering methods, refer to `clustering.ipynb`. For our initial analysis we will use 50 clusters.

In [None]:
# The list columns are stored in the csv as text such as "['a', 'b']"; the converter
# strips the brackets and quotes and splits on ", " to get the list back
# (an empty list "[]" is read back as ['']).
list_columns = ["adj", "active", "patient"]
characters = pd.read_csv(
    'data/character_clusters.csv',
    index_col=0,
    converters={
        column: (lambda text: text.strip("[]").replace("'", "").split(", "))
        for column in list_columns
    },
)
characters.head()

In [None]:
# Size of the clustered data: number of rows and number of distinct wiki_id values
n_characters = len(characters)
n_movies = len(set(characters['wiki_id'].values))
print(f"In the clustered characters dataframe there are {n_characters} characters from {n_movies} movies")

In [None]:
# names= is given without header=, so pandas treats every line of the file as
# data and uses these labels as the column names.
movie_columns = [
    'wiki_id', 'freebase_id', 'title', 'release_date', 'revenue',
    'runtime', 'languages', 'countries', 'genres',
]
movies = pd.read_csv('data/MovieSummaries/movie.metadata.tsv', sep='\t', names=movie_columns)

In [None]:
# Attach the movie metadata to every character (left join on wiki_id) and keep
# only the rows for which the revenue is known.
characters_and_movies = (
    characters
    .merge(movies, how='left', on='wiki_id')
    .loc[lambda joined: joined['revenue'].notna()]
)

n_characters_with_revenue = len(characters_and_movies)
n_movies_with_revenue = len(set(characters_and_movies['wiki_id'].values))
print(f"In the clustered characters with movie metadata dataframe there are {n_characters_with_revenue} characters from {n_movies_with_revenue} movies with the revenue data")

In [None]:
# Random peek at a few rows of the joined frame (no random_state is set, so the rows differ between runs)
characters_and_movies.sample(3)

In [None]:
# Spot check: which clusters did the characters of the Batman movies end up in?
# na=False counts a missing title as "no match"; without it a single NaN title
# puts NaN into the boolean mask and the row selection raises a ValueError.
is_batman_movie = characters_and_movies['title'].str.contains("Batman", na=False)
characters_and_movies.loc[is_batman_movie, ['title', 'character', 'cluster']]

We can notice that cluster number 42 is probably the cluster of superheroes.

## Clusters interpretability
For now we won't give particular names to each cluster. Instead, we show how to use the data from the Latent Dirichlet Allocation model to understand the meaning of each cluster: we can look at the most important topics (groups of words) for each cluster and conclude what the role of a character from that cluster is.

In [None]:
# Words of every topic; the keys are topic indices written as strings.
# The file is opened in a with-block so that the handle is closed right after loading.
with open('data/words_by_topic.json', 'r') as topics_file:
    topics_dict = json.load(topics_file)

# Weight matrix of the LDA model: row i is read below as the weights of cluster i.
lda_components = np.load('data/lda_components.npy')

In [None]:
# The arithmetic below reads the columns of lda_components in blocks of 200:
# block 1 -> active verbs, block 2 -> patient verbs, anything else -> attributes.
# NOTE(review): presumably 200 is the number of word topics per feature type — confirm against the clustering pipeline.
feature_type_by_block = {1: 'active verb', 2: 'patient verb'}

# Show the strongest topic for every 10th cluster.
for i in range(0, 50, 10):
    idx = lda_components[i].argmax()
    block, topic = divmod(int(idx), 200)
    feature_type = feature_type_by_block.get(block, 'attribute')
    print(f'For the cluster {i}, the most important topic is {feature_type} from')
    print(topics_dict[str(topic)])
    print()

For example, we can see that the characters in cluster 0 are the ones who move a lot, the characters in cluster 10 are usually someone's relatives, and the characters in cluster 40 are authority figures. Further on, we will look at the top topics for each cluster to interpret the common traits of the characters in one cluster.

# Initial analysis of the actors' success

In [None]:
# Character/actor metadata of the movies.
# names= is given without header=, so pandas treats every line of the file as data
# and uses these labels as the column names.
# '.', '..', '...' and '....' are placeholder labels; the visible part of this notebook
# never accesses those columns by name.
# NOTE(review): what those columns contain is not visible here — check the dataset README before using them.
actors = pd.read_csv(
    'data/MovieSummaries/character.metadata.tsv', 
    sep='\t', 
    names=['wiki_id', 'freebase_id', 'release_date', 'character', 'date_of_birth', 'sex', 'height', '.','actor','age','character_map','..','...','....']
)
actors.head()

In [None]:
def map_dict_to_list(x):
    """Return the values of a dict that is stored as text, e.g. '{"/m/1": "English"}' -> ['English'].

    The text is parsed with ast.literal_eval, which only accepts Python
    literals, so content coming from the data file is never executed as code
    (plain eval would run it). A value that is already a list is returned
    unchanged, which keeps the conversion cell safe to run more than once.

    Raises ValueError or SyntaxError when the text is not a valid literal.
    """
    if isinstance(x, list):
        return x
    return list(ast.literal_eval(x).values())
# Replace the dict-as-text cells of these columns by the list of the dict values
for column in ['languages', 'countries', 'genres']:
    movies[column] = movies[column].apply(map_dict_to_list)
movies.head()

### Determine which actors bring in the most money


In [None]:
# Join every cast entry with the metadata of its movie
df_merged = actors.merge(movies, on=['wiki_id'], how='inner')

# Drop the movies without box office revenue
df_merged = df_merged.dropna(subset=['revenue'])

# If an actor is listed more than once for the same movie (e.g. for several
# characters), count that movie only once so its revenue is not summed twice.
actor_movies = df_merged.drop_duplicates(subset=['wiki_id', 'actor'])

# Per actor: summed box office revenue and number of movies
actor_totalRevenue = actor_movies.groupby(['actor'])['revenue'].agg(['sum', 'count']).reset_index()
actor_totalRevenue.columns = ['actor', 'bo_revenue', 'Actor Count']

# Highest total revenue first
actor_revenue_sort = actor_totalRevenue.sort_values(by='bo_revenue', ascending=False)

# Keep only the 20 actors with the highest total revenue
actor_top20 = actor_revenue_sort.head(20)

# Attach languages and sex from the cast entries; the join yields one row per
# cast entry, so only the first row of every actor is kept
top20_actor_data = actor_top20.merge(df_merged[['actor', 'languages','sex']], on='actor', how='left')
top20_actor_data = top20_actor_data.drop_duplicates(subset = ['actor'])
top20_actor_data.head()

In [None]:
# Plot the total revenue of the top actors, in ranking order
actor_names = top20_actor_data['actor']
revenues_total = top20_actor_data['bo_revenue']

plt.figure(figsize=(12, 6))
plt.plot(actor_names, revenues_total)
plt.xlabel('Actor Names')
plt.ylabel('Total Box Office Revenue')
# top20_actor_data holds 20 actors, so the title says 20 (it used to say 10)
plt.title('Top 20 Actors by Box Office Revenue for the Movies They Acted In')
plt.xticks(rotation=45, ha='right')  # Rotate the actor names for better readability

plt.tight_layout()
plt.show()



In the data above, we are ranking the actors according to the sum of the box office revenue from the movies in which they acted, without considering whether it was a first, second, or episodic role.

### Study the main lead actors obtaining the highest revenue

Let's consider the weight for every role as a share of words in the plot that is related to the specific character. Then, let's make the simple assumption that all revenue is created thanks to the characters from the plot and calculate how much money corresponds to every actor's role.

In [None]:
def calculater_importance(x):
    """Count the purely alphabetic words attached to one character.

    x is a row-like mapping with the list-valued fields "adj", "active" and
    "patient". Entries for which str.isalpha() is false (for instance the
    empty string) are not counted. Returns the count as an int.
    """
    feature_columns = ("adj", "active", "patient")
    return sum(word.isalpha() for column in feature_columns for word in x[column])

# Importance of a character = number of alphabetic words among its adj/active/patient features
characters['importance'] = characters.apply(calculater_importance, axis=1)

# .copy() makes this an independent frame: new columns are assigned to it later,
# and assigning to a bare column selection triggers pandas' SettingWithCopyWarning.
characters_with_importance = characters[['wiki_id', 'character', 'importance']].copy()

In [None]:
# Total importance of all characters that share a wiki_id, repeated on every row of that group
movie_importance_total = characters_with_importance.groupby('wiki_id')['importance'].transform('sum')
characters_with_importance['importance_share'] = characters_with_importance['importance'] / movie_importance_total
# A character is flagged as important when its share is at least 0.2
characters_with_importance['is_important'] = characters_with_importance['importance_share'].ge(0.2)
characters_with_importance.sample(10)

In [None]:
# Cast entries of the movies with revenue, reduced to the columns needed here
df_charac_actor = df_merged[['wiki_id', 'character','actor','revenue']]

# Attach the importance columns to every cast entry; entries without a matching
# (wiki_id, character) pair keep NaN in those columns
character_actor = df_charac_actor.merge(characters_with_importance, on=['wiki_id', 'character'], how='left')

# Keep only the entries flagged as important (NaN is not equal to True, so unmatched entries are dropped too)
main_character = character_actor[character_actor['is_important'].eq(True)]

# Per actor: summed box office revenue ('sum') and number of such entries ('count'),
# ordered by the summed revenue, highest first
main_charac_mergedBOR = main_character.groupby(['actor'])['revenue'].agg(['sum', 'count']).reset_index()
main_charac_sort = main_charac_mergedBOR.sort_values(by='sum', ascending=False)

main_charac_top20 = main_charac_sort.head(20)
main_charac_top20.head()

In [None]:
# Plot the summed revenue of the top 20 actors in main roles
main_charac_top20_names = main_charac_top20['actor']
main_charac_revenue = main_charac_top20['sum']

plt.figure(figsize=(12, 6))
plt.plot(main_charac_top20_names,main_charac_revenue)
plt.xlabel('Actor Names')
plt.ylabel('Box Office Revenue')
# The plotted column is the per-actor sum of revenue, so the title says "total" (it used to say "average")
plt.title('Top 20 Actors by Total Box Office Revenue of Their Main Roles')
plt.xticks(rotation=45, ha='right')  # Rotate the actor names for better readability

plt.show()

The list of top actors has changed, and the actors now look like those we would expect to be successful (subjectively, they are generally more famous than the previous top).

### Study the languages used in the top 100 films / Which languages drive the highest revenue?

In [None]:
# Movies ranked by revenue, one row per movie (df_merged repeats a movie for every cast entry)
film_sort = (
    df_merged[['countries','languages','revenue','wiki_id']]
    .drop_duplicates(subset = ['wiki_id'])
    .sort_values(by='revenue', ascending=False)
)
top100film = film_sort.head(100)

# One row per (movie, language) pair
df_split_languages = top100film.explode('languages')

# Number of top-100 movies per language
language_counts = df_split_languages['languages'].value_counts()
df_language_counts = pd.DataFrame({'Language': language_counts.index, 'Count': language_counts.values})

# Bar chart on a log scale, every bar annotated with its count
ax = df_language_counts.plot.bar(x='Language', y='Count', figsize=(12, 6), logy=True)
ax.set_title('How many movies in the given language are in top 100 movies by revenue')
ax.set_xlabel('Language')
ax.set_ylabel('Count (logscale)')
plt.xticks(rotation=45, ha='right')
ax.bar_label(ax.containers[0], label_type='edge')

plt.show()

Here we can see a reasonable pattern: English is almost everywhere, followed by widely spoken languages like Spanish and French, and then by many languages that are used in only one movie.

### Study the genre of movies that earn the highest revenue.

In [None]:
# One row per movie: df_merged repeats a movie for every cast entry, so repeated wiki_ids are dropped
df_clean_split_genres = df_merged[['title','genres','revenue','wiki_id']].drop_duplicates(subset = ['wiki_id'])

# One row per (movie, genre) pair
df_split_genre = df_clean_split_genres.explode('genres')[['title', 'revenue', 'genres']]

# The 10 genres with the largest number of movies
genre_sizes = df_split_genre.groupby('genres').size().reset_index()
top_genres = genre_sizes.sort_values(by=0, ascending=False).head(10)['genres'].values

# Median revenue of the movies of every top genre (despite its name, the frame holds medians, not totals)
in_top_genres = df_split_genre['genres'].isin(top_genres)
genre_totalRevenue = df_split_genre[in_top_genres].groupby('genres')['revenue'].median().reset_index()

# Highest median revenue first
genre_totalRevenue = genre_totalRevenue.sort_values(by='revenue', ascending=False)

genre = genre_totalRevenue['genres']
genreRevenue = genre_totalRevenue['revenue']

# Plot the median revenue per genre
plt.plot(genre, genreRevenue)
plt.xlabel('Genre')
plt.ylabel('Box office revenue')
plt.title('Top 10 popular Genres sorted by median Box Office Revenue')
plt.xticks(rotation=45, ha='right')  # Rotate the genre names for better readability
plt.show()

# Genre-cluster correlation

In [None]:
# Characters joined with the metadata of their movie, limited to the rows that
# have genre information and to the columns needed for the genre analysis
genre_analysis_columns = ['wiki_id', 'title', 'character', 'cluster', 'genres']
characters_and_movies_with_genres = (
    characters
    .merge(movies, how='left', on='wiki_id')
    .loc[lambda joined: joined['genres'].notna(), genre_analysis_columns]
)
characters_and_movies_with_genres.sample(5)

In [None]:
# One row per (character, genre) pair, restricted to the genres listed in top_genres
characters_by_genre = characters_and_movies_with_genres.explode('genres')
characters_and_movies_with_genres = characters_by_genre[characters_by_genre['genres'].isin(top_genres)]
characters_and_movies_with_genres.sample(5)

In [None]:
# Contingency table: rows are clusters, columns are genres, cells are the number
# of rows with that (cluster, genre) pair; pairs that never occur are NaN
cluster_genre_table = (
    characters_and_movies_with_genres[['cluster', 'genres']]
    .groupby(['cluster', 'genres'])
    .size()
    .reset_index()
    .pivot(index='cluster', columns='genres', values=0)
)
cluster_genre_table

In [None]:
from scipy.stats import chi2_contingency

# Chi-square test of independence between the cluster and the genre of a character,
# for the two genres compared below.
compared_genres = ['Romantic comedy', 'Thriller']

# A (cluster, genre) pair that never occurs is NaN in the pivot table; it means
# "no characters", i.e. a count of 0. Left as NaN it would turn the p-value into NaN.
genre_counts = cluster_genre_table[compared_genres].fillna(0)
# A cluster with no characters in either genre gives zero expected frequencies,
# which chi2_contingency rejects with a ValueError, so such rows are left out.
table = genre_counts[genre_counts.sum(axis=1) > 0].values
res = chi2_contingency(table)
print(f'pvalue of the test with H0: cluster distribution is the same in Romantic comedy and Thriller films: {res[1]}')

# Revenue prediction using clusters