In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm
import seaborn as sns
from IPython.display import clear_output
import json

In [3]:
from utils.imdb_ratings import movies_with_imdb_rating
from utils.cluster_interpretation import plot_topic_distribution

# What isn't included in this notebook

This project required a lot of preprocessing, which is an interesting task, but is not related to the research questions. In this notebook we will focus on the research questions only.

For extracting characters and their attributes from the plot texts, refer to `extract_character_attributes.ipynb`.

For the clustering method please refer to `clustering.ipynb`, there you can find the methods comparison and the pipeline for characters clustering.

# Load the data

In [None]:
characters =  pd.read_csv(
    'data/character_clusters.csv', 
    index_col=0,
    converters={
        "adj": lambda x: x.strip("[]").replace("'","").split(", "), # need this to read list columns from csv
        "active": lambda x: x.strip("[]").replace("'","").split(", "),
        "patient": lambda x: x.strip("[]").replace("'","").split(", ")
        }
    )

movies = pd.read_csv(
    'data/MovieSummaries/movie.metadata.tsv', 
    sep='\t', 
    names=['wiki_id', 'freebase_id', 'title', 'release_date', 'revenue', 'runtime', 'languages', 'countries', 'genres']
)

actors = pd.read_csv(
    'data/MovieSummaries/character.metadata.tsv', 
    sep='\t', 
    names=['wiki_id', 'freebase_id', 'release_date', 'character', 'date_of_birth', 'sex', 'height', '.','actor','age','character_map','..','...','....']
)

In [None]:
cpi_data = pd.read_csv('data/cpi_data.csv', )
cpi_data.head()

In [None]:
def same_name(names1, names2):
    names1 = names1.values
    names2 = names2.values
    flag = []
    for i in range(len(names1)):
        flag.append(names1[i] in names2[i])
    return flag


actors_and_characters = characters.merge(actors, how='left', left_on='wiki_id', right_on='wiki_id').dropna(subset=['character_y'])

actors_and_characters = actors_and_characters[same_name(actors_and_characters['character_x'], actors_and_characters['character_y'])]
actors_and_characters['character'] = actors_and_characters['character_x']
actors_and_characters = actors_and_characters.drop(columns=['character_x', 'character_y'])
actors_and_characters = actors_and_characters[['character', 'actor', 'cluster', 'wiki_id', 'release_date', 'date_of_birth', 'sex', 'height', 'age', 'adj', 'active', 'patient']]
actors_and_characters.sample(5)

In [None]:
def discount_revenue(year, revenue):
    if year in cpi_data['year']:
        cpi = cpi_data[cpi_data['year'] == year]['cpi'][0]
    else:
        cpi = 1
    return (revenue /  cpi)*100


map_dict_to_list = lambda x: [value for key, value in eval(x).items()]
release_year = lambda x: pd.to_numeric(x.str.replace(r'-\d{2}-\d{2}$', '', regex=True).str.replace(r'-\d{2}$', '', regex=True))

movies['languages'] = movies['languages'].apply(map_dict_to_list)
movies['countries'] = movies['countries'].apply(map_dict_to_list)
movies['genres'] = movies['genres'].apply(map_dict_to_list)

movies["release_year"] = release_year(movies['release_date'])
movies["release_year"] = movies['release_year'].apply(lambda x: x if x > 1800 else x + 1000)

movies['discounted_revenue'] = movies.apply(lambda x: discount_revenue(x.release_year, x.revenue), axis=1)

movies.head()

In [None]:
%%script false --no-raise-error
# Script takes time to run, so we will use saved version instead
movies_with_rating = movies_with_imdb_rating(movies)

In [None]:
movies_with_rating = pd.read_csv(
    'data/movies_with_rating.csv', 
    index_col=0,
    converters={
        "languages": map_dict_to_list,
        "countries": map_dict_to_list,
        "genres": map_dict_to_list
        }
    )
    
movies_with_rating['release_year'] = release_year(movies_with_rating['release_date'])
movies_with_rating['discounted_revenue'] = movies_with_rating.apply(lambda x: discount_revenue(x.release_year, x.revenue), axis=1)

movies_with_rating.head()

### First look at the data

In [None]:
print(f"Number of movies: {len(movies)}")
print(f"Number of movies with revenue: {movies['revenue'].notna().sum()}")
print(f"Number of movies with rating: {len(movies_with_rating)}")
print(f"Number of movies with rating and revenue: {movies_with_rating['revenue'].notna().sum()}")

In [None]:
print(f"Number of characters with archetypes: {len(characters)}")
print(f"Number of actors: {len(actors)}")
print(f"Number of actors with the characters who have an archetype: {len(actors_and_characters)}")

In [None]:
print(f"Number of actors with the characters who have an archetype in the movies with revenue and rating: {len(actors_and_characters[actors_and_characters['wiki_id'].isin(movies_with_rating[movies_with_rating['revenue'].notna()]['wiki_id'])])}")

In [None]:
coutries_distr = movies.explode('countries').groupby('countries').size()
coutries_distr_with_rating = movies_with_rating.explode('countries').groupby('countries').size()
coutries_distr_with_rating_and_revenue = movies_with_rating[movies_with_rating['revenue'].notna()].explode('countries').groupby('countries').size()

coutries = list(set(
    coutries_distr.sort_values(ascending=False)[:20].index.to_list() 
    + coutries_distr_with_rating.sort_values(ascending=False)[:20].index.to_list() 
    + coutries_distr_with_rating_and_revenue.sort_values(ascending=False)[:20].index.to_list()))

coutries_distr = coutries_distr.loc[coutries].sort_values(ascending=True)
coutries = coutries_distr.index.to_list() 
coutries_distr_with_rating = coutries_distr_with_rating.loc[coutries]
coutries_distr_with_rating_and_revenue = coutries_distr_with_rating_and_revenue.loc[coutries]

plt.figure(figsize=(12, 5))
plt.title('Top of movie production countries')

plt.barh(coutries_distr.index, coutries_distr.values, label='all movies')
plt.barh(coutries_distr_with_rating.index, coutries_distr_with_rating.values, label='movies with rating')
plt.barh(coutries_distr_with_rating_and_revenue.index, coutries_distr_with_rating_and_revenue.values, label='movies with rating and revenue')

plt.xscale('log')
plt.legend()

plt.show()

We can notice that most of the movies in the dataset are made in the US, moreover, we have much less data for movies with revenue and this data is't distributed prportionally to the overall number of movies produced in the country.

In [None]:
movies.groupby('release_year').size().plot(figsize=(15, 5), title='Number of released movies', label='number of released movies')
plt.xticks(np.arange(1890, 2021, 7))

plt.axvspan(1914, 1918, alpha=0.3, label='World War I')
plt.axvspan(1929, 1939, alpha=0.3, label='Great Depression', color='green')
plt.axvspan(1939, 1945, alpha=0.3, label='World War II')
plt.axvspan(1961.2, 1961.3, alpha=0.3, label='First space flight', color='purple')
plt.axvspan(2007, 2008, alpha=0.3, label='Global Financial Crisis', color='green')

plt.legend()

plt.show()

We don't have much data before 1910-s and after 2012.

# Clusters interpretation
To interpret clusters, we can use the function `plot_topic_distribution` to see the topics with the largest probabilities to be in the cluster.

In [None]:
plot_topic_distribution(42)

# Historycal trends

In [None]:
characters_and_movies = characters.merge(movies, left_on='wiki_id', right_on='wiki_id')

In [None]:
movies_count = characters_and_movies.groupby('release_year').size().reset_index(name='movie_count')
movies_count = movies_count[movies_count['movie_count'] >= 15]
movies_count.plot(x='release_year', y='movie_count')
plt.yscale('log')

Comment: We decide to analyze trends where there is a stable abundance of data, and remove movies before 1932 and tha last two years (2013-2014). For further analysis we are selecting important clusters (by relative popularity or changes in popularity) but this selection is skewed by the years where there is little data since that gives a very high proportion for every cluster. So the early clusters will appear very significant despite that not being the case (if e.g. there are only a handful of movies, the archetype distribution is not very interesting). Therefore the filtered subset is used, not only for plot, but also for cluster ranking.

In [None]:
archetype_counts = characters_and_movies[characters_and_movies['release_year'].isin(movies_count['release_year'])].groupby(['release_year', 'cluster']).size().reset_index(name='character_count')
archetype_counts = archetype_counts.pivot(index='release_year', columns='cluster', values='character_count').fillna(0)
archetype_counts.plot(legend=False)

In [None]:
normalized_archetype_counts = (archetype_counts)/(archetype_counts.values.sum(1).reshape(-1, 1))
normalized_archetype_counts.plot(legend=False)
plt.yscale('log')

### Top archetypes

- By the highest sum of normalized frequency (popularity)
- By the biggest range in normalized frequency (changes in popularity)

In [None]:
# sum of normalized frequency

top_clusters = normalized_archetype_counts.sum(0).sort_values(ascending=False)[:10].index.values
top_clusters_archetype_counts = normalized_archetype_counts[top_clusters]
top_clusters_archetype_counts

In [None]:
top_clusters_archetype_counts.plot(figsize=(12, 6))
plt.yscale("log")
plt.xticks(top_clusters_archetype_counts.index[::5], rotation=45, ha='right')
plt.xlim([1931, 2013])
plt.xlabel('')
plt.ylabel('Normalized character count')
plt.title('Normalized character counts by cluster: subset 1')
plt.grid(True)
plt.show()

# -----------------------------------
n = 10 # sliding average window size

plt.figure(figsize=(12, 6))

# Iterate over clusters and plot a line for each
for cluster in top_clusters:
    x = top_clusters_archetype_counts[cluster]
    x_avg = np.convolve(x, np.ones(n)/n, mode='valid')
    y = top_clusters_archetype_counts.index
    y_1 = y[round(n/2):-(n-round(n/2))+1]
    plt.plot(y_1, x_avg, label=f'Cluster {cluster}', marker='', linewidth=0.7)

plt.yscale("log")
plt.xticks(top_clusters_archetype_counts.index[::5], rotation=45, ha='right')
plt.xlim([1931, 2013])
plt.xlabel('')
plt.ylabel('Normalized character count')
plt.title(f'Normalized character counts by cluster: subset 1. Sliding average (n={n})')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
def print_cluster_info(n):
    print('Cluster: ', n)
    top = characters_and_movies[(characters_and_movies['cluster'] == n) & (characters_and_movies['revenue'] > 5e8)]
    top = top.sort_values(by='revenue', ascending=False).head(5)
    print(top[['title', 'character']])
    plot_topic_distribution(n)

In [None]:
print_cluster_info(19)
print_cluster_info(12)
print_cluster_info(42)

In [None]:
# biggest range in normalized frequency

top_clusters = normalized_archetype_counts.apply(np.ptp).sort_values(ascending=False)[:10].index.values
top_clusters_archetype_counts = normalized_archetype_counts[top_clusters]
top_clusters_archetype_counts

In [None]:
top_clusters_archetype_counts.plot(figsize=(12, 6))
plt.yscale("log")
plt.xticks(top_clusters_archetype_counts.index[::5], rotation=45, ha='right')
plt.xlim([1931, 2013])
plt.xlabel('')
plt.ylabel('Normalized character count')
plt.title('Normalized character counts by cluster: subset 2')
plt.grid(True)
plt.show()

# -----------------------------------
n = 10 # sliding average window size

plt.figure(figsize=(12, 6))

# Iterate over clusters and plot a line for each
for cluster in top_clusters:
    x = top_clusters_archetype_counts[cluster]
    x_avg = np.convolve(x, np.ones(n)/n, mode='valid')
    y = top_clusters_archetype_counts.index
    y_1 = y[round(n/2):-(n-round(n/2))+1]
    plt.plot(y_1, x_avg, label=f'Cluster {cluster}', marker='', linewidth=0.7)

plt.yscale("log")
plt.xticks(top_clusters_archetype_counts.index[::5], rotation=45, ha='right')
plt.xlim([1931, 2013])
plt.xlabel('')
plt.ylabel('Normalized character count')
plt.title(f'Normalized character counts by cluster: subset 1. Sliding average (n={n})')
plt.legend()
plt.grid(True)
plt.show()

#### Interpretation

In [None]:
print_cluster_info(10)
print_cluster_info(5)
print_cluster_info(38)

# Cultural preference

# Movie success based on the archetypes

# Actors success based on the archetypes

First, We'll make a table of what archetypal characters each film contains.

In [4]:
colnames = ["wiki_id","Freebase movie ID"," Movie name","Movie release date", "MovieBoxOfficeRevenue","Movie runtime","Movie languages","Movie countries","Movie genres" ]
df_movies = pd.read_csv("data/MovieSummaries/movie.metadata.tsv", sep='\t',names= colnames)

In [5]:
# read data
df_clusters = pd.read_csv("data/character_clusters.csv")
df_clusters = df_clusters.drop(df_clusters.columns[[0,3,4,5]], axis=1) # dropping unneccesary columns


In [6]:
# merging datasets
df_merged = pd.merge(df_clusters, df_movies, on='wiki_id')

In [7]:
#add column for each archetypes. And they will take binary number.
for i in range(1,51):
    df_merged['archetype{}'.format(i)]=df_merged['cluster'].map(lambda x: 1 if x== i else 0)

In [8]:
# if the movie include the archetype[i], then the column archetype[i] will take 1 , otherwise 0.
def dummy(s):
    if sum(s)>0:
        return 1
    else:
        return 0

# this is just an opperation for getting Box office value after a groupby operation.
def boxoffice(s):
    return sum(s)/len(s)

string= ''    
for i in range(1,51):
    string = string + '\'archetype' +str(i) +'\':dummy,'
string = string[:-1]

In [9]:
df_moviearchetypes = df_merged.groupby('wiki_id').agg({'MovieBoxOfficeRevenue':boxoffice,'archetype1':dummy,'archetype2':dummy,'archetype3':dummy,'archetype4':dummy,'archetype5':dummy,'archetype6':dummy,'archetype7':dummy,'archetype8':dummy,'archetype9':dummy,'archetype10':dummy,'archetype11':dummy,'archetype12':dummy,'archetype13':dummy,'archetype14':dummy,'archetype15':dummy,'archetype16':dummy,'archetype17':dummy,'archetype18':dummy,'archetype19':dummy,'archetype20':dummy,'archetype21':dummy,'archetype22':dummy,'archetype23':dummy,'archetype24':dummy,'archetype25':dummy,'archetype26':dummy,'archetype27':dummy,'archetype28':dummy,'archetype29':dummy,'archetype30':dummy,'archetype31':dummy,'archetype32':dummy,'archetype33':dummy,'archetype34':dummy,'archetype35':dummy,'archetype36':dummy,'archetype37':dummy,'archetype38':dummy,'archetype39':dummy,'archetype40':dummy,'archetype41':dummy,'archetype42':dummy,'archetype43':dummy,'archetype44':dummy,'archetype45':dummy,'archetype46':dummy,'archetype47':dummy,'archetype48':dummy,'archetype49':dummy,'archetype50':dummy})

In [10]:
#drop the raw without Box Office value
df_moviearchetypes = df_moviearchetypes.dropna(subset=['MovieBoxOfficeRevenue'])

In [11]:
#table will look like this
df_moviearchetypes.sample(10)

Unnamed: 0_level_0,MovieBoxOfficeRevenue,archetype1,archetype2,archetype3,archetype4,archetype5,archetype6,archetype7,archetype8,archetype9,...,archetype41,archetype42,archetype43,archetype44,archetype45,archetype46,archetype47,archetype48,archetype49,archetype50
wiki_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
28984353,11647000.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5273390,12974636.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
157481,116000000.0,0,1,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
1031573,12313323.0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
27111227,105648706.0,0,0,0,0,0,0,0,0,0,...,1,1,0,0,0,0,0,0,0,0
21381088,11546932.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
74862,4517000.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1233576,216614388.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2932836,16192320.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
434738,267005.0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Now we got the data.

Our goal is to create a linear regression model with each archetype as a variable to predict the log of the box office revenue.

We will use linear regression model and least square method to fit it.



In [12]:
import statsmodels.formula.api as smf

In [13]:
#Apply log to boxoffice
df_moviearchetypes['MovieBoxOfficeRevenue'] = df_moviearchetypes['MovieBoxOfficeRevenue'].apply(np.log)

In [14]:
#The number of archetypes
n = 50

model_str = "MovieBoxOfficeRevenue ~ "
for i in range(1,n+1):
    model_str += "C(archetype" + str(i) + ")+"

model_str_without_interaction = model_str.strip("+")

for i in range(1,n):
    for j in range(i+1, n+1):
        model_str += "C(archetype" + str(i) + "):C(archetype" + str(j) + ")+"

model_str = model_str.strip("+")


In [15]:
# Declare the model
mod = smf.ols(formula = model_str_without_interaction, data = df_moviearchetypes)

In [16]:
# Fit the model (adding a random seed ensuring consistency)
np.random.seed(2)
res = mod.fit()

In [17]:
res.summary().tables[0]


0,1,2,3
Dep. Variable:,MovieBoxOfficeRevenue,R-squared:,0.108
Model:,OLS,Adj. R-squared:,0.101
Method:,Least Squares,F-statistic:,15.38
Date:,"Tue, 19 Dec 2023",Prob (F-statistic):,6.86e-119
Time:,14:30:40,Log-Likelihood:,-13066.0
No. Observations:,6280,AIC:,26230.0
Df Residuals:,6230,BIC:,26570.0
Df Model:,49,,
Covariance Type:,nonrobust,,


R-squared is very low.
The model is not accurate.
But we can still select which of those archetypes might have an effect on box office revenue by considering the p-values.

Then we will add interaction terms and try again.

In [18]:
#Declare the model
mod = smf.ols(formula = model_str, data = df_moviearchetypes)

In [19]:
# Fit the model 
res = mod.fit()

In [20]:
res.summary().tables[0]

0,1,2,3
Dep. Variable:,MovieBoxOfficeRevenue,R-squared:,0.273
Model:,OLS,Adj. R-squared:,0.097
Method:,Least Squares,F-statistic:,1.553
Date:,"Tue, 19 Dec 2023",Prob (F-statistic):,8.789999999999999e-25
Time:,14:30:56,Log-Likelihood:,-12423.0
No. Observations:,6280,AIC:,27290.0
Df Residuals:,5056,BIC:,35550.0
Df Model:,1223,,
Covariance Type:,nonrobust,,


R-squared is still low but  better than the last one. We can conclude that revenue prediction by archetype combination could be prospective.

In [66]:
# It would be nice if someone could implement regression with three archetypes interaction term. In that case, we cannot use "smf.ols" anymore because of the number of parameters.

## Global sensitivity analysis

Now we try to find out important archetypes without assuming linear model.  We will do variance-based analysis.

If we have N variables of archetypes, box office revenue will be like $f(x_1,x_2\dotsi x_N,\epsilon)$.\
Note that each variable of $x_1,x_2\dotsi x_N$ will take $0$ or $1$ and $\epsilon$ denotes effects of other variables. And we assume $\epsilon$ is independent of other variables.


f can be decomposed uniquely in a following way.
\begin{align}
    f(\boldsymbol{x}) &= f_\phi + \sum_{1<=i<=N}f_i(x_i)+ \sum_{1<=i<j<=N}f_{i,j}(x_i,x_j)+\sum_{1<=i<j<k<=N}f_{i,j,k}(x_i,x_j,x_k)+\dotsi + f_\epsilon(\epsilon)\nonumber\\
    &= \sum_{u\subset\{1,\dotsi,N\}}f_u(\boldsymbol{x}_u) + f_\epsilon(\epsilon)\\
    f_\phi &= \mathbb{E}[f] \nonumber\\
    f_j(x_j) &= \mathbb{E}[f|x_j]-f_\phi \nonumber\\
    f_{i,j}(x_i,x_j) &= \mathbb{E}[f|x_i,x_j]-f_i(x_i)-f_j(x_j) - f_\phi \nonumber \\
    &\vdots \nonumber
\end{align}
This have good properties.
+ $\mathbb{E}[f_u] = 0$ when $u\subset \{1,\dotsi,N\}$ and $u\neq \phi$
+ $\mathbb{C}[f_u,f_v]=0$ when $u,v\subset \{1,\dotsi,N\}$ and $u\neq v$

Using these properties we can prove the following fact.
\begin{equation}
    \mathbb{V}[f] = \sum_{u\subset\{1,\dotsi,N\}}\mathbb{V}[f_u] + \mathbb{V}[f_\epsilon]\\
\end{equation}

We can see ,for example, $\mathbb{V}[f_1]$ as an effect of archetype 1 itself and $\mathbb{V}[f_{1,2}]$ as an  effect of archetype 1and archetype 2.

Now we estimate$ \mathbb{V}[f_u]$.
The basic ideas is that if archetype sets $u\subset\{1,\dotsi,N\}$ is important, then the variance will be small when you fix $\boldsymbol{x}_u$.\
We will resample from data and do Monte-Carlo estimation.

Let $  \boldsymbol{x} = (\boldsymbol{x}_{u}, \boldsymbol{x}_{-u})$

In [66]:
# modify this later

$\hat{\mathbb{V}[f_u]} = \frac{1}{N}\sum_{n=1}^N f(\boldsymbol{x}_{-u}^{(n,1)},\boldsymbol{x}_{u}^{(n)})f(\boldsymbol{x}_{-u}^{(n,2)},\boldsymbol{x}_{u}^{(n)}) - \hat{\mu}^{(1)}  \hat{\mu}^{(2)}$


where $(\boldsymbol{x}_{u}^{(n)})_{n\in[1:N]}$
is an i.i.d. sample with the distribution of $\boldsymbol{x}_{u}$
and where
$\boldsymbol{x}_{-u}^{(n,1)}$and $\boldsymbol{x}_{-u}^{(n,2)}$
conditionally to $\boldsymbol{x}_{u}$ are independent with the distribution
of $\boldsymbol{x}_{-u}$ conditionally to $\boldsymbol{x}_{u} = \boldsymbol{x}_{u}^{(n)}$\\
$\hat{\mu}$ is an estimation of $\mathbb{E}[f]$
 

In [64]:
N=1000

In [31]:
musquare = df_moviearchetypes.sample(N,replace=True).mean()["MovieBoxOfficeRevenue"] *  df_moviearchetypes.sample(N,replace=True).mean()["MovieBoxOfficeRevenue"]

In [33]:
First_Order_Variance =np.zeros(51)

In [53]:
for i in range(1,51):
    variance = 0
    for j in range (N):
        sample_u1 = df_moviearchetypes.sample(1)
        sample_u2 =  df_moviearchetypes[df_moviearchetypes["archetype"+str(i)] == int(sample_u1["archetype"+str(i)])].sample(1)
        variance += float(sample_u1["MovieBoxOfficeRevenue"])* float(sample_u2["MovieBoxOfficeRevenue"])
    variance = variance / N-musquare
    print("variance of archetype"+str(i)+ ":",variance)


variance of archetype1: 0.5351473757099257
variance of archetype2: 1.0253526388780756
variance of archetype3: 0.482157889426162
variance of archetype4: 0.09263158522617232
variance of archetype5: -0.7343832539433492
variance of archetype6: 0.2885012579785098
variance of archetype7: -0.564543107984889
variance of archetype8: 0.3356120813805319
variance of archetype9: -0.543109643362925
variance of archetype10: 0.5556218414711793
variance of archetype11: 0.7279874148010208
variance of archetype12: 0.02198422673660616
variance of archetype13: 0.04017146373911373
variance of archetype14: -0.5894218075685558
variance of archetype15: 0.6052226355710104
variance of archetype16: 0.04062695747552425
variance of archetype17: 0.39916996832818086
variance of archetype18: 0.7971355037618082
variance of archetype19: 0.4339910117784598
variance of archetype20: 0.06470905004687211
variance of archetype21: 0.6418715203192278


KeyboardInterrupt: 

In [65]:
for i in range(1,50):
    for j in range(i+1,51):
        variance = 0
        for k in range (N):
            sample_u1 = df_moviearchetypes.sample(1)
            sample_u2 =  df_moviearchetypes[ (df_moviearchetypes["archetype"+str(i)] == int(sample_u1["archetype"+str(i)]) ) & (df_moviearchetypes["archetype"+str(j)] == int(sample_u1["archetype"+str(j)]) )].sample(1)
            variance += float(sample_u1["MovieBoxOfficeRevenue"])* float(sample_u2["MovieBoxOfficeRevenue"])
        variance = variance / N - musquare
        print("variance of archetype"+str(i)+";"+str(j)+ ":",variance)


variance of archetype1;2: 0.5907112778327814
variance of archetype1;3: 1.2670019285209264
variance of archetype1;4: -2.2119972090181363
variance of archetype1;5: 0.8758552480368849
variance of archetype1;6: 0.5942124222127632
variance of archetype1;7: 0.0994114885908175
variance of archetype1;8: -4.350592083745028
variance of archetype1;9: -0.21193617869346326
variance of archetype1;10: 0.40431840122590756
variance of archetype1;11: -1.0237386156005073
variance of archetype1;12: -1.7591856645370854
variance of archetype1;13: 2.5959443883609765
variance of archetype1;14: -0.9429249915636433
variance of archetype1;15: -1.9728112381641267
variance of archetype1;16: -1.2582149304878953
variance of archetype1;17: 0.4920602396436493
variance of archetype1;18: -0.8715193063255811
variance of archetype1;19: -0.8036296428593914
variance of archetype1;20: 0.5607470482593158
variance of archetype1;21: -3.150038128474023
variance of archetype1;22: 2.1474459755730777
variance of archetype1;23: 0.83

KeyboardInterrupt: 

 We can tell if $\hat{\mathbb{V}[f_u]}$ is big, archetype sets $u$ might have significant effect on revenue wheter positively or negatively.