In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm
import seaborn as sns

In [4]:
from IPython.display import clear_output
import json

In [5]:
from sklearn.cluster import AgglomerativeClustering, KMeans

In [None]:
from utils.clustering import get_lda_clusters, get_vocab, word_topics_clustering, sort_meaningful, get_trf_clusters, topic_count
from utils.clustering_evaluation import get_characters_with_tv_trop_info, variation_of_information, group_labels_by_clusters

# Clustering methods comparison

We use Variation of Information between our clusters and golden clusters from TV Tropes as suggested in [Learning Latent Personas of Film Characters](http://www.cs.cmu.edu/~dbamman/pubs/pdf/bamman+oconnor+smith.acl13.pdf). This way we can compare our methods' performance with the original method performance.

### LDA based clustering
For the character names and linguistic features extraction pipeline, please refer to `extract_character_attributes.ipynb`.

In [None]:
characters_attributes =  pd.read_csv(
    'data/character_attributes.csv', 
    index_col=0,
    converters={
        "adj": lambda x: x.strip("[]").replace("'","").split(", "), # need this to read list columns from csv
        "active": lambda x: x.strip("[]").replace("'","").split(", "),
        "patient": lambda x: x.strip("[]").replace("'","").split(", ")
        }
    )
# select only the characters who have at least 3 liguistic features
characters_attributes = sort_meaningful(characters_attributes, 3)

characters_attributes.head()

In [None]:
characters_to_check, tv_tropes = get_characters_with_tv_trop_info(characters_attributes)

In [None]:
agglomerative_clusters_n = [25, 50, 100]
n_components = [25, 50, 100]

configs = {}
config_base = {'characters': characters_to_check, 'min_freq': 5, 'max_freq':0.9}

for alg_n in agglomerative_clusters_n:
    for n in n_components:
        config = config_base.copy()
        config['clustering_algo'] = AgglomerativeClustering(n_clusters=alg_n, metric='cosine', linkage='complete')
        config['n_components'] = n
        configs[f'{alg_n} topics, {n} archetypes'] = config

results_lda = {}
for k, config in configs.items():
    clusters = get_lda_clusters(**config)
    results_lda[k] = variation_of_information(group_labels_by_clusters(clusters), tv_tropes)
    print(k, f'VI = {results_lda[k]}')

clear_output(wait=True)
results_lda

Note, that the results are even better (K=100, P=100, 5.42 in the paper and 4.9 here) than the results from the [paper](http://www.cs.cmu.edu/~dbamman/pubs/pdf/bamman+oconnor+smith.acl13.pdf). That could indicate that using word2vec embeddings and Agglomerative clustering of the words to topics might be better suited for dividing the words into topics for the purpose of personas extraction.

### BERT based clustering

For the embedding extraction see `utils/archive/transformer_embeddings.ipynb`

In [None]:
characters_with_trf_emb =  pd.read_csv(
    'data/trf_embeddings_for_labeled_characters.csv', 
    index_col=0,
    converters={
        "emb": lambda x: [float(k) for k in x.strip("[]").replace("'","").split(", ")]
        }
    )
# Leave only those, who we compared on the previous step
characters_with_trf_emb = characters_with_trf_emb[characters_with_trf_emb['wiki_id'].isin(characters_to_check['wiki_id'].values)]

characters_with_trf_emb.head()

In [None]:
characters_to_check_trf, tv_tropes = get_characters_with_tv_trop_info(characters_with_trf_emb)

In [None]:
results_trf = {}
for n in n_components:
    k = f'{n} archetypes, agglomerative clustering'
    agglomerative = AgglomerativeClustering(n_clusters=n, metric='euclidean', linkage='complete')
    clusters = get_trf_clusters(characters_to_check_trf, agglomerative)
    results_trf[k] = variation_of_information(group_labels_by_clusters(clusters), tv_tropes)
    print(k, f'VI = {results_trf[k]}')

    k = f'{n} archetypes, kmeans clustering'
    kmeans = KMeans(n_clusters=n)
    clusters = get_trf_clusters(characters_to_check_trf, kmeans)
    results_trf[k] = variation_of_information(group_labels_by_clusters(clusters), tv_tropes)
    print(k, f'VI = {results_trf[k]}')

clear_output(wait=True)
results_trf

The results of BERT embeddings based clustering are also better than the results from the paper. Still, obtaining these embeddings is slow, and the difference between this and previous method is not that big, so we will stick to the faster and more interpretable LDA based method.

# Loading the data

For the clusterization pipeline as well as the different clustering methods comparison refer to `clustering.ipynb`. For our initial analysis we will use 50 clusters.

In [None]:
characters =  pd.read_csv(
    'data/character_clusters.csv', 
    index_col=0,
    converters={
        "adj": lambda x: x.strip("[]").replace("'","").split(", "), # need this to read list columns from csv
        "active": lambda x: x.strip("[]").replace("'","").split(", "),
        "patient": lambda x: x.strip("[]").replace("'","").split(", ")
        }
    )
characters.head()

In [None]:
print(f"In the clustered characters dataframe there are {len(characters)} characters from {len(set(characters['wiki_id'].values))} movies")

In [None]:
movies = pd.read_csv(
    'data/MovieSummaries/movie.metadata.tsv', 
    sep='\t', 
    names=['wiki_id', 'freebase_id', 'title', 'release_date', 'revenue', 'runtime', 'languages', 'countries', 'genres']
)

In [None]:
characters_and_movies = characters.merge(movies, how='left', on='wiki_id')
characters_and_movies = characters_and_movies[characters_and_movies['revenue'].notna()]

print(f"In the clustered characters with movie metadata dataframe there are {len(characters_and_movies)} characters from {len(set(characters_and_movies['wiki_id'].values))} movies with the revenue data")

In [None]:
characters_and_movies.sample(3)

In [None]:
characters_and_movies[characters_and_movies['title'].str.contains("Batman")][['title', 'character', 'cluster']]

We can notice, that, probably, cluster number 42 is the cluster of super-heroes.

## Clusters interpretability
For now we won't give particular names to each cluster. But we show, how to use data from Latent Dirichlet Allocation model to understand what is the meaning of each cluster. We can look at the most important topics (groups of words) for each cluster and conclude, what is the role of the character form a particular cluster. 

In [None]:
topics_dict = json.load(open('data/words_by_topic.json', 'r'))
lda_components = np.load('data/lda_components.npy')

In [None]:
for i in range(0, 50, 10):
    idx = lda_components[i].argmax()
    feature_type = 'attribute'
    if idx // 200 == 1:
        feature_type = 'active verb'
    elif idx // 200 == 2:
        feature_type = 'patient verb'
    print(f'For the cluster {i}, the most important topic is {feature_type} from')
    print(topics_dict[str(idx % 200)])
    print()

For example, we can see, that characters in cluster 0 are the onces who move a lot, while in cluster 10 characters are usually someones relative and characters in cluster 40 are some authority figures. Further, we will look at the top topics for each cluster to interpret, what are the common traits of the characters in one cluster.

# Initial analysis of the actors' success

In [None]:
actors = pd.read_csv(
    'data/MovieSummaries/character.metadata.tsv', 
    sep='\t', 
    names=['wiki_id', 'freebase_id', 'release_date', 'character', 'date_of_birth', 'sex', 'height', '.','actor','age','character_map','..','...','....']
)
actors.head()

In [None]:
map_dict_to_list = lambda x: [value for key, value in eval(x).items()]
movies['languages'] = movies['languages'].apply(map_dict_to_list)
movies['countries'] = movies['countries'].apply(map_dict_to_list)
movies['genres'] = movies['genres'].apply(map_dict_to_list)
movies.head()

### Determine which actors bring in the most money


In [None]:
# Merge the two datas
df_merged = actors.merge(movies, on=['wiki_id'], how='inner')

# Clean data of movies without box offic revenue
df_merged = df_merged.dropna(subset=['revenue'])

# Determine the actor and sum the box office revenue
actor_totalRevenue = df_merged.groupby(['actor'])['revenue'].agg(['sum', 'count']).reset_index()
actor_totalRevenue.columns = ['actor', 'bo_revenue', 'Actor Count']

# Sort the actor_revenue DataFrame in descending order
actor_revenue_sort = actor_totalRevenue.sort_values(by='bo_revenue', ascending=False)

#Extract only the top 20 best actors
actor_top20 = actor_revenue_sort.head(20)

# Merge the top 20 actors with original datas
top20_actor_data = actor_top20.merge(df_merged[['actor', 'languages','sex']], on='actor', how='left')
top20_actor_data = top20_actor_data.drop_duplicates(subset = ['actor'])
top20_actor_data.head()

In [None]:
# Plot the data above
actor_names = top20_actor_data['actor']
revenues_total = top20_actor_data['bo_revenue']

plt.figure(figsize=(12, 6))
plt.plot(actor_names, revenues_total)
plt.xlabel('Actor Names')
plt.ylabel('Total Box Office Revenue')
plt.title('Top 10 Actors by Box Office Revenue')
plt.xticks(rotation=45, ha='right')  # Rotate the actor names for better readability

plt.tight_layout()
plt.show()



In the data above, we are ranking the actors according to the sum of box office revenue of the movies they acted in.

### Study the main lead actors obtaining the highest revenue

In [None]:
def calculater_importance(x):
    a = 0
    for w in x["adj"]:
        a += w.isalpha()
    for w in x["active"]:
        a += w.isalpha()
    for w in x["patient"]:
        a += w.isalpha()
    return a

characters['importance'] = characters.apply(calculater_importance, axis=1)

characters_with_importance = characters[['wiki_id', 'character', 'importance']]

In [None]:
characters_with_importance['importance_share'] = characters_with_importance['importance'] / characters_with_importance.groupby('wiki_id')['importance'].transform('sum')
characters_with_importance['is_important'] = characters_with_importance['importance_share'] >= 0.2
characters_with_importance.sample(10)

In [None]:
df_charac_actor = df_merged[['wiki_id', 'character','actor','revenue']]

# Include characters and actors together in the same dataset
character_actor = df_charac_actor.merge(characters_with_importance, on=['wiki_id', 'character'], how='left')

# Merge only the main charac of movies\n",
main_character = character_actor[character_actor['is_important'] == True]

#Add up the Box office revenue and count the occurance of movie appearance
main_charac_mergedBOR = main_character.groupby(['actor'])['revenue'].agg(['sum', 'count']).reset_index()

main_charac_sort = main_charac_mergedBOR.sort_values(by='sum', ascending=False)

main_charac_top20 = main_charac_sort.head(20)
main_charac_top20.head()

In [None]:
# Plot the charts for main actors\n",
main_charac_top20_names = main_charac_top20['actor']
main_charac_revenue = main_charac_top20['sum']

plt.figure(figsize=(12, 6))
plt.plot(main_charac_top20_names,main_charac_revenue)
plt.xlabel('Actor Names')
plt.ylabel('Box Office Revenue')
plt.title('Top 20 Main Characters by Box average Office Revenue')
plt.xticks(rotation=45, ha='right')  # Rotate the actor names for better readability

plt.show()

The top of the actors changed, and we see, that the actors look like those who we would expect to be successful (generally subjectively more famous than the previous top).

### Male or female perform better in generating box office revenue?

In [None]:
gender_actor_data = actor_revenue_sort.merge(actors[['actor', 'sex']], on='actor', how='left')
gender_actor_data = gender_actor_data.drop_duplicates(subset = ['actor'])
gender_actor_data.head()

In [None]:
# Box plot for revenue between genders
plt.figure(figsize=(10, 10))
gender_boxplot = sns.boxplot(x="sex", y="bo_revenue", data=gender_actor_data.loc[gender_actor_data['sex'].isin(['M','F'])])

# Set labels
plt.ylabel("Movie box office revenue($)")
plt.title("Box office revenue between genders")
plt.ylim(0, 500000000)
          
#Show plot
plt.tight_layout()
plt.show()

# Given the variance and the 25th percentile, 75th percentile and medan, male actors drive higher box office revenue."

### Study the language used in top 50 films / Which language drive the highest revenue?

In [None]:
# Extract the top 100 movies
film_sort = df_merged[['countries','languages','revenue','wiki_id']]
film_sort = film_sort.drop_duplicates(subset = ['wiki_id'])
film_sort = film_sort.sort_values(by='revenue', ascending=False)
top100film = film_sort.head(100)

# Split the language into new data frames\
df_split_languages = top100film.explode('languages')

# Count the occurance of the languages
language_counts = df_split_languages['languages'].value_counts()
df_language_counts = pd.DataFrame({'Language': language_counts.index, 'Count': language_counts.values})

# Plot the data
df_language_counts.plot(kind='bar', x='Language', y='Count', figsize=(12, 6),log=True)
plt.title('How many movies in the given language are in top 100 movies by revenue')
plt.xlabel('Language')
plt.ylabel('Count (logscale)')
plt.xticks(rotation=45, ha='right')
plt.show()

### Study the genre of movies that earn the highest revenue.

In [None]:
#Clean data to remove repeated movies
df_clean_split_genres = df_merged[['title','genres','revenue','wiki_id']]
df_clean_split_genres = df_clean_split_genres.drop_duplicates(subset = ['wiki_id'])

# Split the genre into new data frames
df_split_genre = df_clean_split_genres.explode('genres')[['title', 'revenue', 'genres']]

# Find top genres
top_genres = df_split_genre.groupby('genres').size().reset_index().sort_values(by=0, ascending=False)
top_genres = top_genres.head(10)['genres'].values

# Determine the actor and sum the box office revenue
genre_totalRevenue = df_split_genre[df_split_genre['genres'].isin(top_genres)].groupby('genres')['revenue'].median().reset_index()

# Sort the BOR in ascending order
genre_totalRevenue = genre_totalRevenue.sort_values(by='revenue', ascending=False)

genre = genre_totalRevenue['genres']
genreRevenue = genre_totalRevenue['revenue']

#Plot the charts
plt.plot(genre, genreRevenue)
plt.xlabel('Genre')
plt.ylabel('Box office revenue')
plt.title('Top 10 popular Genres sorted by median Box Office Revenue')
plt.xticks(rotation=45, ha='right')  # Rotate the actor names for better readability

# Genre prediction using clusters

# Revenue prediction using clusters

First, We'll make a table of what archetypal characters each film contains.

In [25]:
colnames = ["wiki_id","Freebase movie ID"," Movie name","Movie release date", "MovieBoxOfficeRevenue","Movie runtime","Movie languages","Movie countries","Movie genres" ]
df_movies = pd.read_csv("data/MovieSummaries/movie.metadata.tsv", sep='\t',names= colnames)

In [26]:
# read data
df_clusters = pd.read_csv("data/character_clusters.csv")
df_clusters = df_clusters.drop(df_clusters.columns[[0,3,4,5]], axis=1) # dropping unneccesary columns


In [27]:
# merging datasets
df_merged = pd.merge(df_clusters, df_movies, on='wiki_id')

In [46]:
#add column for each archetypes. And they will take binary number.
for i in range(1,51):
    df_merged['archetype{}'.format(i)]=df_merged['cluster'].map(lambda x: 1 if x== i else 0)

In [29]:
# if the movie include the archetype[i], then the column archetype[i] will take 1 , otherwise 0.
def dummy(s):
    if sum(s)>0:
        return 1
    else:
        return 0

# this is just an opperation for getting Box office value after a groupby operation.
def boxoffice(s):
    return sum(s)/len(s)

string= ''    
for i in range(1,51):
    string = string + '\'archetype' +str(i) +'\':dummy,'
string = string[:-1]
print(string)

'archetype1':dummy,'archetype2':dummy,'archetype3':dummy,'archetype4':dummy,'archetype5':dummy,'archetype6':dummy,'archetype7':dummy,'archetype8':dummy,'archetype9':dummy,'archetype10':dummy,'archetype11':dummy,'archetype12':dummy,'archetype13':dummy,'archetype14':dummy,'archetype15':dummy,'archetype16':dummy,'archetype17':dummy,'archetype18':dummy,'archetype19':dummy,'archetype20':dummy,'archetype21':dummy,'archetype22':dummy,'archetype23':dummy,'archetype24':dummy,'archetype25':dummy,'archetype26':dummy,'archetype27':dummy,'archetype28':dummy,'archetype29':dummy,'archetype30':dummy,'archetype31':dummy,'archetype32':dummy,'archetype33':dummy,'archetype34':dummy,'archetype35':dummy,'archetype36':dummy,'archetype37':dummy,'archetype38':dummy,'archetype39':dummy,'archetype40':dummy,'archetype41':dummy,'archetype42':dummy,'archetype43':dummy,'archetype44':dummy,'archetype45':dummy,'archetype46':dummy,'archetype47':dummy,'archetype48':dummy,'archetype49':dummy,'archetype50':dummy


In [30]:
df_moviearchetypes = df_merged.groupby('wiki_id').agg({'MovieBoxOfficeRevenue':boxoffice,'archetype1':dummy,'archetype2':dummy,'archetype3':dummy,'archetype4':dummy,'archetype5':dummy,'archetype6':dummy,'archetype7':dummy,'archetype8':dummy,'archetype9':dummy,'archetype10':dummy,'archetype11':dummy,'archetype12':dummy,'archetype13':dummy,'archetype14':dummy,'archetype15':dummy,'archetype16':dummy,'archetype17':dummy,'archetype18':dummy,'archetype19':dummy,'archetype20':dummy,'archetype21':dummy,'archetype22':dummy,'archetype23':dummy,'archetype24':dummy,'archetype25':dummy,'archetype26':dummy,'archetype27':dummy,'archetype28':dummy,'archetype29':dummy,'archetype30':dummy,'archetype31':dummy,'archetype32':dummy,'archetype33':dummy,'archetype34':dummy,'archetype35':dummy,'archetype36':dummy,'archetype37':dummy,'archetype38':dummy,'archetype39':dummy,'archetype40':dummy,'archetype41':dummy,'archetype42':dummy,'archetype43':dummy,'archetype44':dummy,'archetype45':dummy,'archetype46':dummy,'archetype47':dummy,'archetype48':dummy,'archetype49':dummy,'archetype50':dummy})

In [47]:
#drop the raw without Box Office value
df_moviearchetypes = df_moviearchetypes.dropna(subset=['MovieBoxOfficeRevenue'])

In [48]:
#table will look like this
df_moviearchetypes.sample(10)

Unnamed: 0_level_0,MovieBoxOfficeRevenue,archetype1,archetype2,archetype3,archetype4,archetype5,archetype6,archetype7,archetype8,archetype9,...,archetype41,archetype42,archetype43,archetype44,archetype45,archetype46,archetype47,archetype48,archetype49,archetype50
wiki_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3037944,18.369109,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
464189,15.201805,0,0,0,0,0,0,1,0,0,...,0,0,1,0,0,0,0,0,0,0
30818824,17.753526,1,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4856981,15.048241,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
51888,19.97708,0,0,0,0,0,0,1,0,0,...,0,0,0,0,1,0,1,0,0,0
2251161,17.093348,0,0,0,0,0,0,0,0,0,...,0,1,1,0,0,0,0,0,0,0
8481,17.278085,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
28880684,18.777314,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1474328,18.165999,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
921301,14.391003,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Now we got the table.
Our purpose is to create a regression prediction model with each archetype as a variable.

In [34]:
import statsmodels.formula.api as smf

In [35]:
#Apply log to boxoffice
df_moviearchetypes['MovieBoxOfficeRevenue'] = df_moviearchetypes['MovieBoxOfficeRevenue'].apply(np.log)
df_moviearchetypes.head()

Unnamed: 0_level_0,MovieBoxOfficeRevenue,archetype1,archetype2,archetype3,archetype4,archetype5,archetype6,archetype7,archetype8,archetype9,...,archetype41,archetype42,archetype43,archetype44,archetype45,archetype46,archetype47,archetype48,archetype49,archetype50
wiki_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3217,16.883694,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3333,17.727534,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3746,17.31624,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
3837,18.598827,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3947,15.961585,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [49]:
#The number of archetypes
n = 50

model_str = "MovieBoxOfficeRevenue ~ "
for i in range(1,n+1):
    model_str += "C(archetype" + str(i) + ")+"

model_str_without_interaction = model_str.strip("+")

for i in range(1,n):
    for j in range(i+1, n+1):
        model_str += "C(archetype" + str(i) + "):C(archetype" + str(j) + ")+"

model_str = model_str.strip("+")


In [50]:
# Declare the model
mod = smf.ols(formula = model_str_without_interaction, data = df_moviearchetypes)

In [51]:
# Fit the model (adding a random seed ensuring consistency)
np.random.seed(2)
res = mod.fit()

In [52]:
print(res.summary())

                              OLS Regression Results                             
Dep. Variable:     MovieBoxOfficeRevenue   R-squared:                       0.108
Model:                               OLS   Adj. R-squared:                  0.101
Method:                    Least Squares   F-statistic:                     15.38
Date:                   Fri, 17 Nov 2023   Prob (F-statistic):          6.86e-119
Time:                           15:55:29   Log-Likelihood:                -13066.
No. Observations:                   6280   AIC:                         2.623e+04
Df Residuals:                       6230   BIC:                         2.657e+04
Df Model:                             49                                         
Covariance Type:               nonrobust                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
Inte

R-squared is very bad. But we can still select which of those archetypes might have an effect on box office revenue.

Then,we will add interaction terms of each two archetypes and try again.

In [53]:
mod = smf.ols(formula = model_str, data = df_moviearchetypes)

In [56]:
# Fit the model 
res = mod.fit()

In [57]:
print(res.summary())

                              OLS Regression Results                             
Dep. Variable:     MovieBoxOfficeRevenue   R-squared:                       0.273
Model:                               OLS   Adj. R-squared:                  0.097
Method:                    Least Squares   F-statistic:                     1.553
Date:                   Fri, 17 Nov 2023   Prob (F-statistic):           8.79e-25
Time:                           15:56:22   Log-Likelihood:                -12423.
No. Observations:                   6280   AIC:                         2.729e+04
Df Residuals:                       5056   BIC:                         3.555e+04
Df Model:                           1223                                         
Covariance Type:               nonrobust                                         
                                              coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------

R-squared is still not good but  better than the last one. We can conclude evaluating the combination of archetypes are effective.