In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm
import seaborn as sns
from IPython.display import clear_output
import json

In [None]:
from utils.imdb_ratings import movies_with_imdb_rating
from utils.cluster_interpretation import plot_topic_distribution

In [None]:
np.random.seed(2)

# What isn't included in this notebook

This project required a lot of preprocessing, which is an interesting task, but is not related to the research questions. In this notebook we will focus on the research questions only.

For extracting characters and their attributes from the plot texts, refer to `extract_character_attributes.ipynb`.

For the clustering method, please refer to `clustering.ipynb`; there you can find the comparison of methods and the pipeline for character clustering.

# Load the data

In [None]:
def _parse_list_column(raw):
    """Parse a stringified list cell (e.g. "['brave', 'old']") into a list of strings.

    An empty cell or "[]" yields an empty list instead of [''], so that len()
    of the parsed value reflects the real number of items.
    """
    inner = raw.strip("[]").replace("'", "")
    return inner.split(", ") if inner else []


# Characters with their assigned cluster. The list-valued columns are stored
# as strings in the CSV and have to be parsed back into lists on load.
characters = pd.read_csv(
    'data/character_clusters.csv',
    index_col=0,
    converters={
        "adj": _parse_list_column,
        "active": _parse_list_column,
        "patient": _parse_list_column,
    },
)

# Movie metadata (tab-separated, no header row in the file)
movies = pd.read_csv(
    'data/MovieSummaries/movie.metadata.tsv', 
    sep='\t', 
    names=['wiki_id', 'freebase_id', 'title', 'release_date', 'revenue', 'runtime', 'languages', 'countries', 'genres']
)

# Character/actor metadata (tab-separated, no header row in the file).
# The dot-named columns are placeholders for fields this notebook never references.
actors = pd.read_csv(
    'data/MovieSummaries/character.metadata.tsv', 
    sep='\t', 
    names=['wiki_id', 'freebase_id', 'release_date', 'character', 'date_of_birth', 'sex', 'height', '.','actor','age','character_map','..','...','....']
)

In [None]:
# CPI table; discount_revenue() looks up the 'cpi' value through the 'year' column.
cpi_data = pd.read_csv('data/cpi_data.csv', )
cpi_data.head()

In [None]:
def same_name(names1, names2):
    """Flag, position by position, whether names1[i] is contained in names2[i].

    Both arguments must expose `.values` (e.g. pandas Series). The check uses
    Python's `in`, so for strings it is a substring test.

    Returns a list of booleans with one entry per element of names1.
    """
    left = names1.values
    right = names2.values
    return [left[idx] in right[idx] for idx in range(len(left))]


# Pair every clustered character with the cast entries of the same movie.
# Both frames have a 'character' column, so the merge yields character_x
# (from `characters`) and character_y (from `actors`); cast rows without a
# character name are dropped.
_merged = characters.merge(actors, how='left', left_on='wiki_id', right_on='wiki_id').dropna(subset=['character_y'])

# Keep only pairs where the extracted character name occurs in the cast character name
_name_matches = same_name(_merged['character_x'], _merged['character_y'])

# Build the result as a fresh frame (rename + column selection + copy) instead of
# assigning into a filtered slice, so later cells can add columns without
# SettingWithCopyWarning.
actors_and_characters = (
    _merged[_name_matches]
    .rename(columns={'character_x': 'character'})
    [['character', 'actor', 'cluster', 'wiki_id', 'release_date', 'date_of_birth', 'sex', 'height', 'age', 'adj', 'active', 'patient']]
    .copy()
)
actors_and_characters.sample(5)

In [None]:
def discount_revenue(year, revenue):
    """Rescale `revenue` by the CPI of `year`: revenue / cpi * 100.

    The CPI is read from the notebook-level `cpi_data` frame (columns 'year'
    and 'cpi'). Years that are not present there use a CPI of 100, which
    leaves the revenue unchanged.
    """
    cpi_for_year = cpi_data.loc[cpi_data['year'] == year, 'cpi'].values
    cpi = cpi_for_year[0] if len(cpi_for_year) > 0 else 100
    return (revenue /  cpi)*100


def map_dict_to_list(x):
    """Evaluate a dict literal stored as text and return its values as a list."""
    # NOTE(review): eval() executes arbitrary code found in the cell; only use
    # on trusted files (a literal-only parser would be safer).
    return list(eval(x).values())


def release_year(x):
    """Extract the numeric year from a Series of 'YYYY', 'YYYY-MM' or 'YYYY-MM-DD' strings."""
    without_day = x.str.replace(r'-\d{2}-\d{2}$', '', regex=True)
    return pd.to_numeric(without_day.str.replace(r'-\d{2}$', '', regex=True))


# The three columns hold dicts stored as text; keep only the dict values
movies['languages'] = movies['languages'].map(map_dict_to_list)
movies['countries'] = movies['countries'].map(map_dict_to_list)
movies['genres'] = movies['genres'].map(map_dict_to_list)

movies["release_year"] = release_year(movies['release_date'])
# Years not above 1800 are shifted by 1000
# (presumably repairs mistyped dates such as 1010 -> 2010; TODO confirm)
movies["release_year"] = movies['release_year'].where(movies['release_year'] > 1800, movies['release_year'] + 1000)

movies['discounted_revenue'] = movies.apply(lambda row: discount_revenue(row.release_year, row.revenue), axis=1)

movies.head()

In [None]:
%%script false --no-raise-error
# Script takes time to run, so we will use saved version instead
movies_with_rating = movies_with_imdb_rating(movies)

In [None]:
# Saved result of movies_with_imdb_rating(movies); the dict-valued columns
# are parsed with the same converter that is used for `movies`.
movies_with_rating = pd.read_csv(
    'data/movies_with_rating.csv', 
    index_col=0,
    converters={
        "languages": map_dict_to_list,
        "countries": map_dict_to_list,
        "genres": map_dict_to_list
        }
    )

movies_with_rating['release_year'] = release_year(movies_with_rating['release_date'])
# Apply the same release-year correction as for `movies` (years not above 1800
# are shifted by 1000), so both frames agree and the CPI lookup below sees the
# corrected year.
movies_with_rating['release_year'] = movies_with_rating['release_year'].apply(lambda x: x if x > 1800 else x + 1000)
movies_with_rating['discounted_revenue'] = movies_with_rating.apply(lambda x: discount_revenue(x.release_year, x.revenue), axis=1)

movies_with_rating.head()

In [None]:
# Plot summaries: tab-separated, no header row, keyed by wiki_id
plots = pd.read_csv('data/MovieSummaries/plot_summaries.txt', sep='\t', names=['wiki_id', 'plot'])

# Right join keeps every plot summary, including those without a metadata row
movies_and_plots = pd.merge(movies, plots, how='right', left_on='wiki_id', right_on='wiki_id')
num_plot = movies_and_plots['wiki_id'].nunique(dropna=False)

In [None]:
# Inner join: one row per clustered character, enriched with its movie's metadata
characters_and_movies = pd.merge(characters, movies, left_on='wiki_id', right_on='wiki_id')
# Number of distinct movies that have at least one clustered character
num_char = characters_and_movies['wiki_id'].nunique(dropna=False)

### First look at the data

In [None]:
# Collect the headline counts first, then emit them in one go
summary_lines = [
    f"Number of movies: {len(movies)}",
    f"Number of movies with revenue: {movies['revenue'].notna().sum()}",
    f"Number of movies with rating: {len(movies_with_rating)}",
    f"Number of movies with rating and revenue: {movies_with_rating['revenue'].notna().sum()}",
    f"Number of movies with plot: {num_plot}",
    f"Number of movies, where we find archetypes: {num_char}",
    f"Number of actors with the characters who have an archetype: {len(actors_and_characters)}",
]
print("\n".join(summary_lines))

In [None]:
import plotly.graph_objects as go

# Bar chart: all movies -> movies with a plot summary -> movies with archetypes
categories_movie_plot = ["Total Movies", "Movies with Plot", "Movies with Archetypes"]
movie_values = [len(movies), num_plot, num_char]

layout_options = {
    "title": "Statistics of Movies with Plots and Archetypes",
    "xaxis_title": "Categories",
    "yaxis_title": "Number of Movies",
}

fig = go.Figure(data=[go.Bar(x=categories_movie_plot, y=movie_values)])
fig.update_layout(**layout_options)
fig.show()

In [None]:
print(f"Number of characters with archetypes: {len(characters)}")
# `actors` has one row per cast entry and the same actor recurs across movies
# (the notebook later groups by actor and counts films), so count distinct
# actor names rather than rows.
print(f"Number of actors: {actors['actor'].nunique()}")
print(f"Number of actors with the characters who have an archetype: {len(actors_and_characters)}")

In [None]:
# wiki_ids of rated movies that also report a revenue
rated_with_revenue_ids = movies_with_rating.loc[movies_with_rating['revenue'].notna(), 'wiki_id']
in_rated_with_revenue = actors_and_characters['wiki_id'].isin(rated_with_revenue_ids)
print(f"Number of actors with the characters who have an archetype in the movies with revenue and rating: {len(actors_and_characters[in_rated_with_revenue])}")

### What are the countries of production

In [None]:
# Movies per production country; a movie counts once for each of its countries
rated_with_revenue = movies_with_rating[movies_with_rating['revenue'].notna()]
coutries_distr = movies.explode('countries').groupby('countries').size()
coutries_distr_with_rating = movies_with_rating.explode('countries').groupby('countries').size()
coutries_distr_with_rating_and_revenue = rated_with_revenue.explode('countries').groupby('countries').size()

# Union of the 20 most frequent countries of each of the three subsets
top_all = coutries_distr.sort_values(ascending=False).head(20).index.to_list()
top_rated = coutries_distr_with_rating.sort_values(ascending=False).head(20).index.to_list()
top_rated_revenue = coutries_distr_with_rating_and_revenue.sort_values(ascending=False).head(20).index.to_list()
coutries = list(set(top_all + top_rated + top_rated_revenue))

# Order the countries by their overall movie count and align the other two series to it
coutries_distr = coutries_distr.loc[coutries].sort_values(ascending=True)
coutries = coutries_distr.index.to_list() 
coutries_distr_with_rating = coutries_distr_with_rating.loc[coutries]
coutries_distr_with_rating_and_revenue = coutries_distr_with_rating_and_revenue.loc[coutries]

plt.figure(figsize=(12, 5))
plt.title('Top of movie production countries')

# The three bar sets are drawn on top of each other, largest subset first
plt.barh(coutries_distr.index, coutries_distr.values, label='all movies')
plt.barh(coutries_distr_with_rating.index, coutries_distr_with_rating.values, label='movies with rating')
plt.barh(coutries_distr_with_rating_and_revenue.index, coutries_distr_with_rating_and_revenue.values, label='movies with rating and revenue')

plt.xscale('log')
plt.legend()

plt.show()

We can notice that most of the movies in the dataset were made in the US. Moreover, we have much less data for movies with revenue, and this data isn't distributed proportionally to the overall number of movies produced in each country.

### What is the historical distribution

In [None]:
movies_per_year = movies.groupby('release_year').size()
movies_per_year.plot(figsize=(15, 5), title='Number of released movies', label='number of released movies')
plt.xticks(np.arange(1890, 2021, 7))

# (start, end, label, color); color None keeps matplotlib's default
historical_periods = [
    (1914, 1918, 'World War I', None),
    (1929, 1939, 'Great Depression', 'green'),
    (1939, 1945, 'World War II', None),
    (1961.2, 1961.3, 'First space flight', 'purple'),
    (2007, 2008, 'Global Financial Crisis', 'green'),
]
for period_start, period_end, period_label, period_color in historical_periods:
    span_options = {'alpha': 0.3, 'label': period_label}
    if period_color is not None:
        span_options['color'] = period_color
    plt.axvspan(period_start, period_end, **span_options)

plt.legend()

plt.show()

We don't have much data before the 1910s or after 2012.

# Clusters interpretation
To interpret clusters, we can use the function `plot_topic_distribution` to see the topics with the largest probabilities to be in the cluster.

In [None]:
plot_topic_distribution(42)

# Historical trends

In [None]:
# Rows of characters_and_movies (one per clustered character) per release year;
# years with fewer than 15 rows are discarded.
movies_count = (
    characters_and_movies
    .groupby('release_year')
    .size()
    .reset_index(name='movie_count')
    .loc[lambda counts: counts['movie_count'] >= 15]
)
movies_count.plot(x='release_year', y='movie_count')
plt.yscale('log')

Comment: We decided to analyze trends only where there is a stable abundance of data, so we remove movies before 1932 and from the last two years (2013-2014). For further analysis we select important clusters (by relative popularity or by changes in popularity), but this selection is skewed by the years with little data, since those give a very high proportion to every cluster. The early clusters would therefore appear very significant despite that not being the case (e.g. if there are only a handful of movies, the archetype distribution is not very interesting). For this reason the filtered subset is used not only for the plots, but also for the cluster ranking.

In [None]:
# Restrict to the years kept in movies_count, then build a year x cluster table of character counts
in_kept_years = characters_and_movies['release_year'].isin(movies_count['release_year'])
archetype_counts = (
    characters_and_movies[in_kept_years]
    .groupby(['release_year', 'cluster'])
    .size()
    .reset_index(name='character_count')
    .pivot(index='release_year', columns='cluster', values='character_count')
    .fillna(0)  # cluster absent in a year -> zero characters
)
archetype_counts.plot(legend=False)

In [None]:
# Divide each year's counts by that year's total, so every row sums to 1
yearly_totals = archetype_counts.values.sum(axis=1, keepdims=True)
normalized_archetype_counts = archetype_counts / yearly_totals
normalized_archetype_counts.plot(legend=False)
plt.yscale('log')

### Top archetypes

- By the highest sum of normalized frequency (popularity)
- By the biggest range in normalized frequency (changes in popularity)

In [None]:
# Rank clusters by their normalized frequency summed over all years; keep the 10 largest
cluster_popularity = normalized_archetype_counts.sum(axis=0)
top_clusters = cluster_popularity.sort_values(ascending=False).head(10).index.values
top_clusters_archetype_counts = normalized_archetype_counts.loc[:, top_clusters]
top_clusters_archetype_counts

In [None]:
# Raw yearly series of the selected clusters
tick_years = top_clusters_archetype_counts.index[::5]

top_clusters_archetype_counts.plot(figsize=(12, 6))
plt.yscale("log")
plt.xticks(tick_years, rotation=45, ha='right')
plt.xlim([1931, 2013])
plt.xlabel('')
plt.ylabel('Normalized character count')
plt.title('Normalized character counts by cluster: subset 1')
plt.grid(True)
plt.show()

# -----------------------------------
n = 10 # sliding average window size
averaging_kernel = np.ones(n) / n
half_window = round(n / 2)
# 'valid' convolution yields len - n + 1 points; this slice of the year
# index has the same length
window_years = top_clusters_archetype_counts.index[half_window:-(n - half_window) + 1]

plt.figure(figsize=(12, 6))

# One smoothed line per cluster
for cluster in top_clusters:
    smoothed = np.convolve(top_clusters_archetype_counts[cluster], averaging_kernel, mode='valid')
    plt.plot(window_years, smoothed, label=f'Cluster {cluster}', marker='', linewidth=0.7)

plt.yscale("log")
plt.xticks(tick_years, rotation=45, ha='right')
plt.xlim([1931, 2013])
plt.xlabel('')
plt.ylabel('Normalized character count')
plt.title(f'Normalized character counts by cluster: subset 1. Sliding average (n={n})')
plt.legend()
plt.grid(True)
plt.show()

#### Interpretation

In [None]:
def print_cluster_info(n):
    """Print example characters of cluster `n` and plot its topic distribution.

    The examples are up to five rows of `characters_and_movies` that belong to
    the cluster and whose movie revenue exceeds 5e8, ordered by revenue
    (highest first); only the title and character name are printed.
    """
    print('Cluster: ', n)
    in_cluster = characters_and_movies['cluster'] == n
    high_revenue = characters_and_movies['revenue'] > 5e8
    top = characters_and_movies[in_cluster & high_revenue]
    top = top.sort_values(by='revenue', ascending=False).iloc[:5]
    print(top.loc[:, ['title', 'character']])
    plot_topic_distribution(n)

In [None]:
for cluster in top_clusters[:5]:
    print_cluster_info(cluster)

We can notice that the three most popular archetypes are all different kinds of protagonists and their close allies.

In [None]:
# Rank clusters by the range (max - min over the years) of their normalized
# frequency; keep the 10 largest
frequency_range = normalized_archetype_counts.apply(np.ptp)
top_diff_clusters = frequency_range.sort_values(ascending=False).head(10).index.values
top_clusters_archetype_counts = normalized_archetype_counts.loc[:, top_diff_clusters]
top_clusters_archetype_counts

In [None]:
# Raw yearly series of the clusters with the biggest range in popularity (subset 2)
top_clusters_archetype_counts.plot(figsize=(12, 6))
plt.yscale("log")
plt.xticks(top_clusters_archetype_counts.index[::5], rotation=45, ha='right')
plt.xlim([1931, 2013])
plt.xlabel('')
plt.ylabel('Normalized character count')
plt.title('Normalized character counts by cluster: subset 2')
plt.grid(True)
plt.show()

# -----------------------------------
n = 10 # sliding average window size

plt.figure(figsize=(12, 6))

# Iterate over clusters and plot a line for each
for cluster in top_diff_clusters:
    x = top_clusters_archetype_counts[cluster]
    # 'valid' convolution returns len(x) - n + 1 averaged points
    x_avg = np.convolve(x, np.ones(n)/n, mode='valid')
    y = top_clusters_archetype_counts.index
    # slice of the year index with the same number of points as x_avg
    y_1 = y[round(n/2):-(n-round(n/2))+1]
    plt.plot(y_1, x_avg, label=f'Cluster {cluster}', marker='', linewidth=0.7)

plt.yscale("log")
plt.xticks(top_clusters_archetype_counts.index[::5], rotation=45, ha='right')
plt.xlim([1931, 2013])
plt.xlabel('')
plt.ylabel('Normalized character count')
# This figure shows subset 2; the title previously said "subset 1" (copy-paste slip)
plt.title(f'Normalized character counts by cluster: subset 2. Sliding average (n={n})')
plt.legend()
plt.grid(True)
plt.show()

#### Interpretation

In [None]:
for cluster in top_diff_clusters[:5]:
    print_cluster_info(cluster)

The archetypes that changed the most in popularity are mostly side characters.

# Cultural preference

We are interested in cultural preferences in more modern times, so we will look only at data from the 21st century. We will also use only the first country in each movie's list of production countries.

In [None]:
# Characters from movies released after 2000. Take an explicit copy: the
# 'countries' column is overwritten below, and writing into a filtered slice
# of characters_and_movies triggers SettingWithCopyWarning.
char_movies_countries = characters_and_movies[characters_and_movies['release_year'] > 2000].copy()
# Keep only the first listed production country; empty lists become 'Unknown'
char_movies_countries['countries'] = char_movies_countries['countries'].apply(lambda x: x[0] if x else 'Unknown')

# Number of characters per (country, cluster) pair
char_movies_countries = char_movies_countries.groupby(['countries', 'cluster']).size().reset_index(name='character_count')

char_movies_countries = char_movies_countries[['countries', 'cluster', 'character_count']]

In [None]:
# top production countries
char_movies_countries.groupby('countries')['character_count'].sum().sort_values(ascending=False)[:50].index

There are quite a lot of films with unknown country of production.

In [None]:
# Ten countries with the most characters. 'Unknown' is removed before taking
# the top 10, so this yields exactly ten real countries and does not raise
# when 'Unknown' is outside the leading group.
top_countries = (
    char_movies_countries
    .groupby('countries')['character_count']
    .sum()
    .drop('Unknown', errors='ignore')
    .sort_values(ascending=False)[:10]
    .index
)
top_countries

In [None]:
# Keep the top countries only, then reshape to a country x cluster table of counts
is_top_country = char_movies_countries['countries'].isin(top_countries)
char_movies_countries = char_movies_countries.loc[is_top_country]
archetype_by_country = (
    char_movies_countries
    .pivot(index='countries', columns='cluster', values='character_count')
    .fillna(0)  # cluster absent in a country -> zero characters
)
archetype_by_country

In [None]:
char_movies_countries.groupby('countries')['character_count'].sum().plot.barh(x='countries')

As we can see, we have much more data on American films, so we will normalize the data. After that, let's look at the distribution of the global top 5 archetypes.

In [None]:
# Divide each country's counts by that country's total, so every row sums to 1
per_country_totals = archetype_by_country.values.sum(axis=1, keepdims=True)
normalized_archetype_by_country = archetype_by_country / per_country_totals

normalized_archetype_by_country[top_clusters[:5]].plot.barh(figsize=(7, 10), title='Distribution of the top 5 clusters in top 10 countries')

It's easy to notice the difference between these countries. For example, the most popular types of protagonists in India and Hong Kong differ from those in the United Kingdom and Spain. In the first group, the most popular protagonists are those who act and achieve something, while in the second group the most popular protagonists communicate and travel more.

Now we can see that the distribution of the archetypes differs between countries. But was this also the case in the 20th century?

In [None]:
# Same pipeline as for the 21st century, now for movies released before 2001.
# Take an explicit copy: the 'countries' column is overwritten below, and
# writing into a filtered slice triggers SettingWithCopyWarning.
char_movies_countries = characters_and_movies[characters_and_movies['release_year'] < 2001].copy()
# Keep only the first listed production country; empty lists become 'Unknown'
char_movies_countries['countries'] = char_movies_countries['countries'].apply(lambda x: x[0] if x else 'Unknown')

# Number of characters per (country, cluster) pair
char_movies_countries = char_movies_countries.groupby(['countries', 'cluster']).size().reset_index(name='character_count')

char_movies_countries = char_movies_countries[['countries', 'cluster', 'character_count']]

# Reuse the top countries selected on the 21st-century data for comparability
char_movies_countries = char_movies_countries[char_movies_countries['countries'].isin(top_countries)]
archetype_by_country = char_movies_countries.pivot(index='countries', columns='cluster', values='character_count').fillna(0)

# Row-normalize so each country's cluster shares sum to 1
normalized_archetype_by_country = (archetype_by_country)/(archetype_by_country.values.sum(1).reshape(-1, 1))

normalized_archetype_by_country[top_clusters[:5]].plot.barh(figsize=(7, 10), title='Distribution of the top 5 clusters in top 10 countries in the 20th century')

We can notice the shift that happened from the 20th to the 21st century, from more achieving to more communicating protagonists.

# Movie success based on the archetypes

### Linear model for revenue prediction

To determine the importance of the archetypes for a movie's success, we can build a linear model and identify the most important archetypes based on their coefficients and p-values.

In [None]:
# One row per (movie, log revenue) for movies that have a discounted revenue
has_revenue = characters_and_movies['discounted_revenue'].notna()
cluster_and_revenue = (
    characters_and_movies.loc[has_revenue, ['wiki_id', 'discounted_revenue']]
    .assign(log_revenue=lambda frame: np.log(frame['discounted_revenue']))
    [['wiki_id', 'log_revenue']]
    .drop_duplicates()
    .reset_index(drop=True)
)

revenues = cluster_and_revenue['log_revenue'].values
wiki_ids = cluster_and_revenue['wiki_id'].values

plt.hist(revenues, bins=50)
plt.title('Log revenue histogram')
plt.show()

In [None]:
import plotly.graph_objects as go

# Bar chart: how many movies have revenue, rating, or both
categories_movie_plot = ["Movies with Revenue", "Movies with Rating", "Movies with Rating and Revenue"]
movie_values = [
    movies['revenue'].notna().sum(),
    len(movies_with_rating),
    movies_with_rating['revenue'].notna().sum(),
]

layout_options = {
    "title": "Statistics of Movies with Revenue and Rating",
    "xaxis_title": "Categories",
    "yaxis_title": "Number of Movies",
}

fig = go.Figure(data=[go.Bar(x=categories_movie_plot, y=movie_values)])
fig.update_layout(**layout_options)
fig.show()

In [None]:
# Distinct cluster ids per movie, computed in a single pass instead of
# re-filtering characters_and_movies once for every movie.
clusters_by_movie = characters_and_movies.groupby('wiki_id')['cluster'].unique()

# Multi-hot matrix: clusters[i, c] == 1 when movie wiki_ids[i] has a character in cluster c
clusters = np.zeros((len(cluster_and_revenue), 50)) #initializing the clusters
for i, wiki_id in enumerate(wiki_ids):
    clusters[i, clusters_by_movie.loc[wiki_id]] = 1


cluster_revenue_data = pd.DataFrame(clusters, columns=[f'archetype_{i}' for i in np.arange(50)])
# Both frames carry a 0..n-1 index (cluster_and_revenue was reset), so this aligns row by row
cluster_revenue_data['log_revenue'] = cluster_and_revenue['log_revenue']
cluster_revenue_data

In [None]:
import statsmodels.formula.api as smf
import statsmodels.api as sm

In [None]:
n = 50

# Main effects: one categorical term per archetype indicator
main_effects = [f"C(archetype_{i})" for i in range(n)]

# Pairwise interactions between every unordered pair of archetypes (i < j)
pairwise_interactions = [
    f"C(archetype_{i}):C(archetype_{j})"
    for i in range(n)
    for j in range(i + 1, n)
]

model_str_without_interaction = "log_revenue ~ " + "+".join(main_effects)
model_str = "log_revenue ~ " + "+".join(main_effects + pairwise_interactions)

In [None]:
mod = smf.ols(formula = model_str_without_interaction, data = cluster_revenue_data)
res_without_interaction = mod.fit()
res_without_interaction.summary().tables[0]

In [None]:
mod = smf.ols(formula = model_str, data = cluster_revenue_data)
res = mod.fit()
res.summary().tables[0]

Based on the significant improvement in R-squared metric, we can say that interactions between archetypes are important.

Next, let's look at anova results to determine the important archetypes and interactions.

In [None]:
sm.stats.anova_lm(res, robust='hc3').sort_values('PR(>F)')[:50]

# Actors success based on the archetypes

In [None]:
movies_rating_revenue = (
    movies_with_rating[['wiki_id', 'discounted_revenue', 'averageRating']]
    .dropna(subset=['discounted_revenue'])
    .reset_index(drop=True)
)
# From here on the 'discounted_revenue' column holds the *log* of the discounted revenue
movies_rating_revenue['discounted_revenue'] = np.log(movies_rating_revenue['discounted_revenue'])

# Min-max scale the log revenue to the range 0..10
log_revenue = movies_rating_revenue['discounted_revenue']
lowest, highest = np.min(log_revenue), np.max(log_revenue)
movies_rating_revenue['norm_log_revenue'] = (log_revenue - lowest) * 10/ (highest - lowest)
movies_rating_revenue

In [None]:
movies_rating_revenue[['norm_log_revenue', 'averageRating']].plot.hist(alpha=0.3, bins=30)

In [None]:
from scipy.stats import pearsonr

pearsonr(movies_rating_revenue['norm_log_revenue'].values, movies_rating_revenue['averageRating'].values)

We can see that there is a statistically significant correlation, even though it's not very large. We will use the sum of the normalized log revenue and the rating of the film as the metric for success.

In [None]:
movies_rating_revenue['success'] = movies_rating_revenue['averageRating'] + movies_rating_revenue['norm_log_revenue']
movies_rating_revenue['success'].plot.hist(alpha=0.3, bins=30)

In [None]:
# Importance = number of extracted attributes (adjectives + active verbs + patient verbs)
# of the character, as a share of the total over the characters of the same movie
attribute_counts = sum(actors_and_characters[column].apply(len) for column in ('adj', 'active', 'patient'))
actors_and_characters['importance'] = attribute_counts
per_movie_totals = actors_and_characters.groupby('wiki_id')['importance'].transform('sum')
actors_and_characters['importance'] = actors_and_characters['importance'] / per_movie_totals
actors_and_characters

In [None]:
# Attach each movie's success score to its actor/character rows (inner join on wiki_id)
success_by_movie = movies_rating_revenue[['wiki_id', 'success']]
kept_columns = ['actor', 'cluster', 'date_of_birth', 'sex', 'height', 'age', 'importance', 'success']
actors_and_characters_with_success = pd.merge(
    actors_and_characters,
    success_by_movie,
    left_on='wiki_id',
    right_on='wiki_id',
)[kept_columns]
actors_and_characters_with_success.sample(10)

In [None]:
# A role contributes the movie's success weighted by the character's importance
actors_and_characters_with_success['weighted_success'] = (
    actors_and_characters_with_success['importance'] * actors_and_characters_with_success['success']
)

# Per actor: total weighted success, number of rows, list of clusters played,
# plus the last sex / date_of_birth value seen
per_actor_aggregations = {
    'weighted_success': ['sum', 'size'],
    'cluster': list,
    'sex': 'last',
    'date_of_birth': 'last',
}
top_actors = (
    actors_and_characters_with_success
    .groupby('actor')
    .agg(per_actor_aggregations)
    .sort_values(('weighted_success',  'sum'), ascending=False)
)

In [None]:
# Only look at the actors with at least 5 films
top_actors.columns = ['sum_success', 'num_films', 'clusters', 'sex', 'date_of_birth']
top_actors = top_actors.reset_index()
top_actors = top_actors[top_actors['num_films'] > 4]
top_actors

In [None]:
top_actors['year_of_birth'] = top_actors['date_of_birth'].apply(lambda x: int(x[:4]))

In [None]:
plt.figure(figsize=(12, 7))
sum_success_plot = sns.scatterplot(data=top_actors[:50], x='actor', y='sum_success', hue='num_films')
plt.xticks(rotation=90)
plt.show()

#### Archetype number and actor's success

In [None]:
# Number of distinct archetypes an actor played, and its ratio to the number of films
top_actors['cluster_number'] = top_actors['clusters'].map(lambda played: len(set(played)))
top_actors['clusters_to_films_ratio'] = top_actors['cluster_number'].div(top_actors['num_films'])
top_actors

In [None]:
plt.hist(top_actors['clusters_to_films_ratio'], bins=20)
print('Median of the cluster to film ratio: ', np.median(top_actors['clusters_to_films_ratio']))

In [None]:
top_actors['many_archetypes'] = top_actors['clusters_to_films_ratio'].apply(lambda x: int(x > 0.70))

Now let's perform causal analysis.

In [None]:
norm_top_actors_data = top_actors[['sum_success', 'many_archetypes', 'year_of_birth', 'sex', 'num_films']].reset_index(drop=True)
# Encode sex as 1 for 'F', 0 otherwise
norm_top_actors_data['sex'] =  norm_top_actors_data['sex'].apply(lambda x: int(x=='F'))
# Standardize the continuous covariates to zero mean and unit variance.
# The parentheses matter: `x - mean / std` only subtracts the constant mean/std
# and leaves the column on its original scale.
norm_top_actors_data['year_of_birth'] = (norm_top_actors_data['year_of_birth'] - norm_top_actors_data['year_of_birth'].mean()) / norm_top_actors_data['year_of_birth'].std()
norm_top_actors_data['num_films'] = (norm_top_actors_data['num_films'] - norm_top_actors_data['num_films'].mean()) / norm_top_actors_data['num_films'].std()


# Logistic regression of the treatment indicator on the observed covariates
mod = smf.logit(formula='many_archetypes ~  year_of_birth + sex + num_films', data=norm_top_actors_data)
res = mod.fit()

# Extract the estimated propensity scores
norm_top_actors_data['Propensity_score'] = res.predict()

print(res.summary())

In [None]:
import networkx as nx


def get_similarity(propensity_score1, propensity_score2):
    '''Return the similarity of two propensity scores: 1 minus their absolute difference.'''
    distance = np.abs(propensity_score1 - propensity_score2)
    return 1 - distance

plays_many_archetypes = norm_top_actors_data['many_archetypes'] == 1
treatment_df = norm_top_actors_data[plays_many_archetypes]
control_df = norm_top_actors_data[norm_top_actors_data['many_archetypes'] == 0]

G = nx.Graph()

# Connect every control actor with every treated actor; the edge weight is
# the similarity of their propensity scores
for control_id, control_score in control_df['Propensity_score'].items():
    for treatment_id, treatment_score in treatment_df['Propensity_score'].items():
        G.add_edge(control_id, treatment_id, weight=get_similarity(control_score, treatment_score))

# Pair control and treated actors so that the total similarity is maximal
matching = nx.max_weight_matching(G)

In [None]:
# Each element of `matching` is a pair of graph nodes; the nodes are index
# labels of norm_top_actors_data (they come from iterrows()).
matched = [i[0] for i in list(matching)] + [i[1] for i in list(matching)]
# Select by label (.loc), not by position (.iloc): the ids are index labels,
# which equal positions only while the frame keeps a fresh 0..n-1 index.
balanced_norm_top_actors_data = norm_top_actors_data.loc[matched]
balanced_norm_top_actors_data

In [None]:
# Split the matched sample back into its two groups
in_treatment_group = balanced_norm_top_actors_data['many_archetypes'] == 1
in_control_group = balanced_norm_top_actors_data['many_archetypes'] == 0
treated = balanced_norm_top_actors_data.loc[in_treatment_group]
control = balanced_norm_top_actors_data.loc[in_control_group]

# Overlay the success distributions of both groups
ax = sns.histplot(treated['sum_success'], kde=True, stat='density', color='blue', label='many_archetypes')
ax = sns.histplot(control['sum_success'], kde=True, stat='density', color='orange', label='not many_archetypes')
ax.set(title='Succes distribution comparison', xlabel='sum success', ylabel='density')
plt.legend()
plt.show()

In [None]:
from scipy.stats import ttest_ind

ttest_ind(treated['sum_success'],control['sum_success'], alternative='less')

Now we can say that actors who have played fewer archetypes are statistically significantly more successful than those who have played a wider variety of archetypes.