In [1]:
import pandas as pd
import numpy as np
from itertools import combinations
from pyvis.network import Network
from collections import defaultdict
from sklearn.preprocessing import MinMaxScaler
import networkx as nx
from community import community_louvain
import matplotlib.pyplot as plt

In [2]:
df_movie = pd.read_pickle('../data/df_movie.pkl')

In [3]:
# Group by 'Movie_name' and aggregate 'Actor_name', 'Director' and 'Composer' into a single list
grouped_df = df_movie.groupby(['Movie_name', 'Movie_release']).agg({
    'Actor_name': lambda x: list(set(x.dropna())),
    'Director': lambda x: list(set(x.dropna())),
    'Composer': lambda x: list(set(x.dropna()))
}).reset_index()

# Combine all lists into a single list for each movie
grouped_df['All_Credits'] = grouped_df.apply(lambda row: list(set(row['Actor_name'] + row['Director'] + row['Composer'])), axis=1)

# Drop the individual columns
grouped_df = grouped_df.drop(['Actor_name', 'Director', 'Composer'], axis=1)

In [4]:
grouped_df

Unnamed: 0,Movie_name,Movie_release,All_Credits
0,'Til There Was You,1997.0,"[Terence Blanchard, Craig Bierko, Nina Foch, S..."
1,(500) Days of Summer,2009.0,"[Geoffrey Arend, Lexy Hulme, Yvette Nicole Bro..."
2,*batteries not included,1987.0,"[Wendy Schaal, Jessica Tandy, Macintyre Dixon,..."
3,...And Justice for All,1979.0,"[Craig T. Nelson, John Forsythe, Jeffrey Tambo..."
4,10,1979.0,"[Dee Wallace-Stone, Sam J. Jones, James Noble,..."
...,...,...,...
6815,Zoom,2006.0,"[Tim Allen, Michael Cassidy, Jane Hajduk, Asht..."
6816,Zoot Suit,1981.0,"[Robert Beltran, Tony Plana, Edward James Olmo..."
6817,¡Three Amigos!,1986.0,"[Alfonso Arau, Tony Plana, Steve Martin, Chevy..."
6818,Æon Flux,2005.0,"[Pete Postlethwaite, Caroline Chikezie, Yangzo..."


In [5]:
grouped_df['All_pairs'] = grouped_df.apply(lambda row: list(combinations(sorted(set(row['All_Credits'])), 2)), axis=1)


In [6]:
grouped_df.drop(['All_Credits'], axis=1, inplace=True)
grouped_df

Unnamed: 0,Movie_name,Movie_release,All_pairs
0,'Til There Was You,1997.0,"[(Alice Drummond, Christine Ebersole), (Alice ..."
1,(500) Days of Summer,2009.0,"[(Chloë Moretz, Clark Gregg), (Chloë Moretz, G..."
2,*batteries not included,1987.0,"[(Dennis Boutsikaris, Doris Belack), (Dennis B..."
3,...And Justice for All,1979.0,"[(Al Pacino, Christine Lahti), (Al Pacino, Cra..."
4,10,1979.0,"[(Blake Edwards, Bo Derek), (Blake Edwards, Br..."
...,...,...,...
6815,Zoom,2006.0,"[(Alexis Bledel, Ashton Moio), (Alexis Bledel,..."
6816,Zoot Suit,1981.0,"[(Abel Franco, Charles Aidman), (Abel Franco, ..."
6817,¡Three Amigos!,1986.0,"[(Alfonso Arau, Chevy Chase), (Alfonso Arau, J..."
6818,Æon Flux,2005.0,"[(Amelia Warner, Betty Okino), (Amelia Warner,..."


In [7]:
df_all_pairs = grouped_df[['Movie_name', 'Movie_release', 'All_pairs']].copy()
df_all_pairs = df_all_pairs.explode('All_pairs')
df_all_pairs = df_all_pairs.reset_index(drop=True)
df_all_pairs

Unnamed: 0,Movie_name,Movie_release,All_pairs
0,'Til There Was You,1997.0,"(Alice Drummond, Christine Ebersole)"
1,'Til There Was You,1997.0,"(Alice Drummond, Craig Bierko)"
2,'Til There Was You,1997.0,"(Alice Drummond, Dylan McDermott)"
3,'Til There Was You,1997.0,"(Alice Drummond, Jeanne Tripplehorn)"
4,'Til There Was You,1997.0,"(Alice Drummond, Jennifer Aniston)"
...,...,...,...
758120,Æon Flux,2005.0,"(Pete Postlethwaite, Yangzom Brauen)"
758121,Æon Flux,2005.0,"(Sophie Okonedo, Stuart Townsend)"
758122,Æon Flux,2005.0,"(Sophie Okonedo, Yangzom Brauen)"
758123,Æon Flux,2005.0,"(Stuart Townsend, Yangzom Brauen)"


In [8]:
df_all_pairs = df_all_pairs.merge(df_movie[['Movie_name', 'Movie_release', 'Movie_revenue', 'Movie_rating']], on=['Movie_name', 'Movie_release'], how='left')
df_all_pairs.drop_duplicates(inplace=True)
df_all_pairs.dropna(inplace=True)
df_all_pairs.reset_index(drop=True, inplace=True)
df_all_pairs

Unnamed: 0,Movie_name,Movie_release,All_pairs,Movie_revenue,Movie_rating
0,'Til There Was You,1997.0,"(Alice Drummond, Christine Ebersole)",3525125.0,4.8
1,'Til There Was You,1997.0,"(Alice Drummond, Craig Bierko)",3525125.0,4.8
2,'Til There Was You,1997.0,"(Alice Drummond, Dylan McDermott)",3525125.0,4.8
3,'Til There Was You,1997.0,"(Alice Drummond, Jeanne Tripplehorn)",3525125.0,4.8
4,'Til There Was You,1997.0,"(Alice Drummond, Jennifer Aniston)",3525125.0,4.8
...,...,...,...,...,...
665659,Æon Flux,2005.0,"(Pete Postlethwaite, Stuart Townsend)",52304001.0,5.4
665660,Æon Flux,2005.0,"(Pete Postlethwaite, Yangzom Brauen)",52304001.0,5.4
665661,Æon Flux,2005.0,"(Sophie Okonedo, Stuart Townsend)",52304001.0,5.4
665662,Æon Flux,2005.0,"(Sophie Okonedo, Yangzom Brauen)",52304001.0,5.4


In [9]:
df_all_pairs['Credit_1'] = df_all_pairs.apply(lambda row: row['All_pairs'][0], axis=1)
df_all_pairs['Credit_2'] = df_all_pairs.apply(lambda row: row['All_pairs'][1], axis=1)
df_all_pairs

Unnamed: 0,Movie_name,Movie_release,All_pairs,Movie_revenue,Movie_rating,Credit_1,Credit_2
0,'Til There Was You,1997.0,"(Alice Drummond, Christine Ebersole)",3525125.0,4.8,Alice Drummond,Christine Ebersole
1,'Til There Was You,1997.0,"(Alice Drummond, Craig Bierko)",3525125.0,4.8,Alice Drummond,Craig Bierko
2,'Til There Was You,1997.0,"(Alice Drummond, Dylan McDermott)",3525125.0,4.8,Alice Drummond,Dylan McDermott
3,'Til There Was You,1997.0,"(Alice Drummond, Jeanne Tripplehorn)",3525125.0,4.8,Alice Drummond,Jeanne Tripplehorn
4,'Til There Was You,1997.0,"(Alice Drummond, Jennifer Aniston)",3525125.0,4.8,Alice Drummond,Jennifer Aniston
...,...,...,...,...,...,...,...
665659,Æon Flux,2005.0,"(Pete Postlethwaite, Stuart Townsend)",52304001.0,5.4,Pete Postlethwaite,Stuart Townsend
665660,Æon Flux,2005.0,"(Pete Postlethwaite, Yangzom Brauen)",52304001.0,5.4,Pete Postlethwaite,Yangzom Brauen
665661,Æon Flux,2005.0,"(Sophie Okonedo, Stuart Townsend)",52304001.0,5.4,Sophie Okonedo,Stuart Townsend
665662,Æon Flux,2005.0,"(Sophie Okonedo, Yangzom Brauen)",52304001.0,5.4,Sophie Okonedo,Yangzom Brauen


In [10]:
columns_inf = ['year', 'amount','inflation rate']
inflation = pd.read_table('../data/inflation_data.csv', header=None, names=columns_inf,sep=',')
inflation = inflation.drop(index=0)

#From https://www.officialdata.org/us/inflation/1888?amount=1

value_in_2023 = [32.39,33.44,33.81,33.81,33.81,34.19,35.78,36.63,36.63,37.07,
                   37.07,37.07,36.63,36.20,35.78,34.96,34.57,34.96,34.19,32.73,
                   33.44,33.81,32.39,32.39,31.72,31.08,30.77,30.46,28.23,24.04,
                   20.38,17.78,15.38,17.19,18.31,17.99,17.99,17.58,17.38,17.68,
                   17.99,17.99,18.42,20.24,22.46,23.67,22.96,22.46,22.13,21.37,
                   21.82,22.13,21.98,20.93,18.88,17.78,17.48,17.09,15.78,13.80,
                   12.77,12.93,12.77,11.83,11.61,11.52,11.44,11.48,11.31,10.95,
                   10.65,10.57,10.39,10.29,10.19,10.05,9.92,9.77,9.50,9.21,8.84,
                   8.38,7.93,7.60,7.36,6.93,6.24,5.72,5.41,5.08,4.72,4.24,3.73,
                   3.38,3.19,3.09,2.96,2.86,2.81,2.71,2.60,2.48,2.35,2.26,2.19,
                   2.13,2.08,2.02,1.96,1.92,1.89,1.85,1.79,1.74,1.71,1.67,1.63,
                   1.58,1.53,1.48,1.43,1.43,1.41,1.37,1.34,1.32,1.30,1.30,1.28,
                   1.26,1.22,1.20,1.19,1.14,1.05,1]

inflation["Inflation Factor for 2023"] = value_in_2023
inflation["year"] = inflation["year"].astype(float)

df_all_pairs['Inflation Factor for 2023'] = df_all_pairs['Movie_release'].map(inflation.set_index('year')['Inflation Factor for 2023'])
df_all_pairs['2023 valued revenue'] = df_all_pairs['Movie_revenue'] * df_all_pairs['Inflation Factor for 2023']

df_all_pairs = df_all_pairs.sort_values(by=['2023 valued revenue'],ascending = False)
df_all_pairs.head(5)


Unnamed: 0,Movie_name,Movie_release,All_pairs,Movie_revenue,Movie_rating,Credit_1,Credit_2,Inflation Factor for 2023,2023 valued revenue
609896,Titanic,1997.0,"(Bernard Fox, Nicholas Cascone)",2185372000.0,7.9,Bernard Fox,Nicholas Cascone,1.92,4195915000.0
610051,Titanic,1997.0,"(Gloria Stuart, Victor Garber)",2185372000.0,7.9,Gloria Stuart,Victor Garber,1.92,4195915000.0
610038,Titanic,1997.0,"(Frances Fisher, Victor Garber)",2185372000.0,7.9,Frances Fisher,Victor Garber,1.92,4195915000.0
610039,Titanic,1997.0,"(Gloria Stuart, Ioan Gruffudd)",2185372000.0,7.9,Gloria Stuart,Ioan Gruffudd,1.92,4195915000.0
610040,Titanic,1997.0,"(Gloria Stuart, James Cameron)",2185372000.0,7.9,Gloria Stuart,James Cameron,1.92,4195915000.0


In [11]:
df_pairs = pd.read_pickle('../data/df_pairs.pkl')
df_pairs

Unnamed: 0,Movie_name,Movie_release,Actor_pairs,Movie_revenue,Movie_rating,Actor1,Actor2,Age_difference,Film_count_difference,Average_revenue_difference,First_film,First_film_for_one,Number_of_films_together,Same_genre,Genre
0,'Til There Was You,1997.0,"(Alice Drummond, Christine Ebersole)",3525125.0,4.8,Alice Drummond,Christine Ebersole,25.0,3,3.612798e+07,False,False,0,False,
1,'Til There Was You,1997.0,"(Alice Drummond, Craig Bierko)",3525125.0,4.8,Alice Drummond,Craig Bierko,37.0,5,1.416641e+07,False,False,0,False,
2,'Til There Was You,1997.0,"(Alice Drummond, Dylan McDermott)",3525125.0,4.8,Alice Drummond,Dylan McDermott,34.0,1,1.315054e+07,False,False,0,False,
3,'Til There Was You,1997.0,"(Alice Drummond, Jeanne Tripplehorn)",3525125.0,4.8,Alice Drummond,Jeanne Tripplehorn,36.0,2,1.241967e+08,False,False,0,False,
4,'Til There Was You,1997.0,"(Alice Drummond, Jennifer Aniston)",3525125.0,4.8,Alice Drummond,Jennifer Aniston,41.0,3,4.310507e+07,False,False,0,False,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
552165,Æon Flux,2005.0,"(Pete Postlethwaite, Stuart Townsend)",52304001.0,5.4,Pete Postlethwaite,Stuart Townsend,27.0,8,6.262462e+07,False,False,0,True,Thriller
552166,Æon Flux,2005.0,"(Pete Postlethwaite, Yangzom Brauen)",52304001.0,5.4,Pete Postlethwaite,Yangzom Brauen,34.0,13,5.939269e+07,False,True,0,True,Thriller
552167,Æon Flux,2005.0,"(Sophie Okonedo, Stuart Townsend)",52304001.0,5.4,Sophie Okonedo,Stuart Townsend,,2,6.540344e+07,False,False,0,True,Thriller
552168,Æon Flux,2005.0,"(Sophie Okonedo, Yangzom Brauen)",52304001.0,5.4,Sophie Okonedo,Yangzom Brauen,,3,6.217151e+07,False,True,0,True,Thriller


In [12]:
df_all_pairs = df_all_pairs.merge(df_pairs[['Movie_name', 'Movie_release', 'Actor_pairs', 'Genre']], left_on=['Movie_name', 'Movie_release', 'All_pairs'], right_on=['Movie_name', 'Movie_release', 'Actor_pairs'], how='left')

In [13]:
df_all_pairs.drop(['Actor_pairs'], axis=1, inplace=True)
df_all_pairs

Unnamed: 0,Movie_name,Movie_release,All_pairs,Movie_revenue,Movie_rating,Credit_1,Credit_2,Inflation Factor for 2023,2023 valued revenue,Genre
0,Titanic,1997.0,"(Bernard Fox, Nicholas Cascone)",2.185372e+09,7.9,Bernard Fox,Nicholas Cascone,1.92,4.195915e+09,
1,Titanic,1997.0,"(Gloria Stuart, Victor Garber)",2.185372e+09,7.9,Gloria Stuart,Victor Garber,1.92,4.195915e+09,
2,Titanic,1997.0,"(Frances Fisher, Victor Garber)",2.185372e+09,7.9,Frances Fisher,Victor Garber,1.92,4.195915e+09,
3,Titanic,1997.0,"(Gloria Stuart, Ioan Gruffudd)",2.185372e+09,7.9,Gloria Stuart,Ioan Gruffudd,1.92,4.195915e+09,
4,Titanic,1997.0,"(Gloria Stuart, James Cameron)",2.185372e+09,7.9,Gloria Stuart,James Cameron,1.92,4.195915e+09,
...,...,...,...,...,...,...,...,...,...,...
666165,Quid Pro Quo,2008.0,"(James Frain, Nick Stahl)",1.186400e+04,6.1,James Frain,Nick Stahl,1.43,1.696552e+04,Thriller
666166,Quid Pro Quo,2008.0,"(James Frain, Matthew Carey)",1.186400e+04,6.1,James Frain,Matthew Carey,1.43,1.696552e+04,Thriller
666167,Quid Pro Quo,2008.0,"(James Frain, Mark Mothersbaugh)",1.186400e+04,6.1,James Frain,Mark Mothersbaugh,1.43,1.696552e+04,
666168,Quid Pro Quo,2008.0,"(James Frain, Kate Burton)",1.186400e+04,6.1,James Frain,Kate Burton,1.43,1.696552e+04,Thriller


In [14]:
unique_directors = df_movie['Director'].dropna().unique()
director_genres = {}

for director in unique_directors:
    genres = df_movie[df_movie['Director'] == director]['Main_genre'].value_counts()
    if len(genres) == 0:
        director_genres[director] = 'None'
    else:
        director_genres[director] = genres.index[0]


In [15]:
unique_composers = df_movie['Composer'].dropna().unique()
composer_genres = {}

for composer in unique_composers:
    genres = df_movie[df_movie['Composer'] == composer]['Main_genre'].value_counts()
    if len(genres) == 0:
        composer_genres[composer] = 'None'
    else:
        composer_genres[composer] = genres.index[0]


In [16]:
df_all_pairs['Genre'] = df_all_pairs.apply(lambda row: row['Genre'] if not pd.isna(row['Genre']) else director_genres[row['Credit_1']] if row['Credit_1'] in director_genres else director_genres[row['Credit_2']] if row['Credit_2'] in director_genres else composer_genres[row['Credit_1']] if row['Credit_1'] in composer_genres else composer_genres[row['Credit_2']], axis=1)

In [17]:
df_all_pairs

Unnamed: 0,Movie_name,Movie_release,All_pairs,Movie_revenue,Movie_rating,Credit_1,Credit_2,Inflation Factor for 2023,2023 valued revenue,Genre
0,Titanic,1997.0,"(Bernard Fox, Nicholas Cascone)",2.185372e+09,7.9,Bernard Fox,Nicholas Cascone,1.92,4.195915e+09,
1,Titanic,1997.0,"(Gloria Stuart, Victor Garber)",2.185372e+09,7.9,Gloria Stuart,Victor Garber,1.92,4.195915e+09,
2,Titanic,1997.0,"(Frances Fisher, Victor Garber)",2.185372e+09,7.9,Frances Fisher,Victor Garber,1.92,4.195915e+09,
3,Titanic,1997.0,"(Gloria Stuart, Ioan Gruffudd)",2.185372e+09,7.9,Gloria Stuart,Ioan Gruffudd,1.92,4.195915e+09,
4,Titanic,1997.0,"(Gloria Stuart, James Cameron)",2.185372e+09,7.9,Gloria Stuart,James Cameron,1.92,4.195915e+09,
...,...,...,...,...,...,...,...,...,...,...
666165,Quid Pro Quo,2008.0,"(James Frain, Nick Stahl)",1.186400e+04,6.1,James Frain,Nick Stahl,1.43,1.696552e+04,Thriller
666166,Quid Pro Quo,2008.0,"(James Frain, Matthew Carey)",1.186400e+04,6.1,James Frain,Matthew Carey,1.43,1.696552e+04,Thriller
666167,Quid Pro Quo,2008.0,"(James Frain, Mark Mothersbaugh)",1.186400e+04,6.1,James Frain,Mark Mothersbaugh,1.43,1.696552e+04,Crime Fiction
666168,Quid Pro Quo,2008.0,"(James Frain, Kate Burton)",1.186400e+04,6.1,James Frain,Kate Burton,1.43,1.696552e+04,Thriller


In [18]:
df2 = df_all_pairs.copy()


#Filter the years to have only the films from 1980 to 1985 first
df2 = df2[(df2['Movie_release'] >= 1980) & (df2['Movie_release'] <= 2023)]


# Step 1: Create a mapping DataFrame for 'Actor_pairs' to 'Actor1', 'Actor2', and 'Genre'
actor_pairs_mapping = df2[['All_pairs', 'Credit_1', 'Credit_2', 'Genre']].drop_duplicates()

# Step 2: Grouping by 'Actor_pairs' and calculating the required metrics along with including 'Genre'
grouped_df = df2.groupby('All_pairs').agg(
    Average_Movie_revenue=pd.NamedAgg(column='2023 valued revenue', aggfunc='mean'),
    Average_Movie_rating=pd.NamedAgg(column='Movie_rating', aggfunc='mean'),
    Count=pd.NamedAgg(column='Movie_name', aggfunc='count')
)

# Reset index in the grouped DataFrame
grouped_df.reset_index(inplace=True)

# Step 3: Merge the aggregated DataFrame with the mapping DataFrame
# Note: The merge may result in multiple rows per actor pair if they have multiple genres.
final_df = pd.merge(grouped_df, actor_pairs_mapping, on='All_pairs')

final_df

Unnamed: 0,All_pairs,Average_Movie_revenue,Average_Movie_rating,Count,Credit_1,Credit_2,Genre
0,"(50 Cent, Adewale Akinnuoye-Agbaje)",7.337919e+07,5.4,1,50 Cent,Adewale Akinnuoye-Agbaje,
1,"(50 Cent, Al Pacino)",1.097485e+08,6.0,1,50 Cent,Al Pacino,
2,"(50 Cent, Alan Blumenfeld)",1.097485e+08,6.0,1,50 Cent,Alan Blumenfeld,Crime Fiction
3,"(50 Cent, Alan Rosenberg)",1.097485e+08,6.0,1,50 Cent,Alan Rosenberg,
4,"(50 Cent, Ambyr Childers)",3.619071e+06,5.6,1,50 Cent,Ambyr Childers,Crime Fiction
...,...,...,...,...,...,...,...
568382,"(Zhenwei Wang, Zhiheng Wang)",5.063677e+08,6.2,1,Zhenwei Wang,Zhiheng Wang,Action/Adventure
568383,"(Zoe Saldana, Zulay Henao)",9.952805e+07,6.2,1,Zoe Saldana,Zulay Henao,
568384,"(Zoe Saldana, Óscar Jaenada)",4.142541e+07,6.2,1,Zoe Saldana,Óscar Jaenada,Thriller
568385,"(Zoe Saldana, Željko Ivanek)",1.540308e+07,8.6,1,Zoe Saldana,Željko Ivanek,Thriller


In [19]:
# Filter to only keep real duos
duos = final_df[final_df['Count'] >=3]

# Creating a copy of the DataFrame slice
duos_standardized = duos.copy()

# Initialize the StandardScaler
standard_scaler = MinMaxScaler()

# Selecting the columns to be normalized
cols_to_normalize = ['Average_Movie_revenue', 'Average_Movie_rating']

# Applying normalization to the selected columns
duos_standardized[cols_to_normalize] = standard_scaler.fit_transform(duos_standardized[cols_to_normalize])

def round_down_to_nearest_05(number):
    return np.floor(number / 0.05) * 0.05

duos_standardized['Average_Movie_revenue'] = duos_standardized['Average_Movie_revenue'].apply(round_down_to_nearest_05)


rating_stand = duos_standardized.sort_values(by=["Average_Movie_rating","Average_Movie_revenue"], ascending= False)


revenue_stand = duos_standardized.copy()
revenue_stand = duos_standardized.sort_values(by=["Average_Movie_revenue","Average_Movie_rating"], ascending= False)


rating_stand.reset_index(drop=True, inplace=True)
rating_stand['rank'] = rating_stand.index + 1   # Adding 1 to start the ranking from 1

revenue_stand.reset_index(drop=True, inplace=True)
revenue_stand['rank'] = revenue_stand.index + 1   # Adding 1 to start the ranking from 1


for i in range(1, len(rating_stand)):
    # Check if the current row has the same speed and mass as the previous row
    if (rating_stand.loc[i, 'Average_Movie_revenue'] == rating_stand.loc[i-1, 'Average_Movie_revenue']) and (rating_stand.loc[i, 'Average_Movie_rating'] == rating_stand.loc[i-1, 'Average_Movie_rating']):
        # Update the rank to be the same as the previous row
        rating_stand.loc[i, 'rank'] = rating_stand.loc[i-1, 'rank']
    
for i in range(1, len(revenue_stand)):
    # Check if the current row has the same speed and mass as the previous row
    if (revenue_stand.loc[i, 'Average_Movie_revenue'] == revenue_stand.loc[i-1, 'Average_Movie_revenue']) and (revenue_stand.loc[i, 'Average_Movie_rating'] == rating_stand.loc[i-1, 'Average_Movie_rating']):
        # Update the rank to be the same as the previous row
        revenue_stand.loc[i, 'rank'] = revenue_stand.loc[i-1, 'rank']

length = len(rating_stand)

rating_stand['rank_ratio']  = (length - (rating_stand['rank']-1))/ length
revenue_stand['rank_ratio']  = (length - (revenue_stand['rank']-1))/ length

def interpolate_color(ratio,start_rgb,end_rgb):

    # Linearly interpolate each color component
    r = start_rgb[0] + (end_rgb[0] - start_rgb[0]) * ratio
    g = start_rgb[1] + (end_rgb[1] - start_rgb[1]) * ratio
    b = start_rgb[2] + (end_rgb[2] - start_rgb[2]) * ratio

    return (r/255, g/255, b/255)

# Function to transform x to y and create a tuple
def transform(x):
    if x >= 0.5:
        start_rgb = (112,85,137)
        end_rgb = (229, 83, 159)
        y = (x - 0.5) * 2
        return interpolate_color(y,start_rgb,end_rgb)
    else:
        y = np.abs((x - 0.5) * 2)
        start_rgb = (57,35,35)
        end_rgb = (112,85,137)        
        return interpolate_color(y,start_rgb,end_rgb)

# Apply the transformation
rating_stand['Color'] = rating_stand['rank_ratio'].apply(transform)
revenue_stand['Color'] = revenue_stand['rank_ratio'].apply(transform)

In [20]:
rating_stand

Unnamed: 0,All_pairs,Average_Movie_revenue,Average_Movie_rating,Count,Credit_1,Credit_2,Genre,rank,rank_ratio,Color
0,"(Andy Serkis, Billy Boyd)",1.00,1.000000,3,Andy Serkis,Billy Boyd,Fantasy Adventure,1,1.000000,"(0.8980392156862745, 0.3254901960784314, 0.623..."
1,"(Andy Serkis, Cate Blanchett)",1.00,1.000000,3,Andy Serkis,Cate Blanchett,,1,1.000000,"(0.8980392156862745, 0.3254901960784314, 0.623..."
2,"(Andy Serkis, Christopher Lee)",1.00,1.000000,3,Andy Serkis,Christopher Lee,,1,1.000000,"(0.8980392156862745, 0.3254901960784314, 0.623..."
3,"(Andy Serkis, David Wenham)",1.00,1.000000,3,Andy Serkis,David Wenham,Fantasy Adventure,1,1.000000,"(0.8980392156862745, 0.3254901960784314, 0.623..."
4,"(Andy Serkis, Dominic Monaghan)",1.00,1.000000,3,Andy Serkis,Dominic Monaghan,Fantasy Adventure,1,1.000000,"(0.8980392156862745, 0.3254901960784314, 0.623..."
...,...,...,...,...,...,...,...,...,...,...
3111,"(David Mann, Tamela Mann)",0.05,0.329843,3,David Mann,Tamela Mann,,3112,0.001605,"(0.4385234966900753, 0.33270407007475644, 0.53..."
3112,"(David Mann, Tyler Perry)",0.05,0.329843,3,David Mann,Tyler Perry,,3112,0.001605,"(0.4385234966900753, 0.33270407007475644, 0.53..."
3113,"(Diane Paloma Eskenazi, Jim Cummings)",0.25,0.272251,3,Diane Paloma Eskenazi,Jim Cummings,Adventure,3114,0.000963,"(0.43880037252384907, 0.3329557753781872, 0.53..."
3114,"(Alan McRae, Victor Wong)",0.00,0.272251,3,Alan McRae,Victor Wong,,3115,0.000642,"(0.438938810440736, 0.3330816280299026, 0.5367..."


In [21]:
def rgb_to_hex(rgb):
    return '#{:02x}{:02x}{:02x}'.format(int(rgb[0]*255), int(rgb[1]*255), int(rgb[2]*255))

# Function to compute average color
def average_color(colors):
    avg = np.mean(colors, axis=0)
    return rgb_to_hex(avg)


In [22]:
# Initialize PyVis Network with white background
net = Network(notebook=True, 
              cdn_resources="remote", 
              bgcolor="#f9f4e3",  # Change to white background
              font_color="black",  # Adjust font color for visibility
              height="calc(100vh - 83px)",
              select_menu=True,)

# Set network options
net.set_options("""
const options = {
  "physics": {
    "forceAtlas2Based": {
      "gravitationalConstant": -84,
      "centralGravity": 0.09,
      "springLength": 20,
      "springConstant": 0.035
    },
    "minVelocity": 0.18,
    "solver": "forceAtlas2Based"
  }
}
""")

# Counting edges for each node and storing edge colors
edge_count = defaultdict(int)
edge_colors = defaultdict(list)
node_roles = defaultdict(str)

for _, row in rating_stand.iterrows():
    edge_count[row['Credit_1']] += 1
    edge_count[row['Credit_2']] += 1
    color = row['Color']
    edge_colors[row['Credit_1']].append(color)
    edge_colors[row['Credit_2']].append(color)
    
    if row['Credit_1'] in unique_directors:
        node_roles[row['Credit_1']] = 'director'
    elif row['Credit_1'] in unique_composers:
        node_roles[row['Credit_1']] = 'composer'
    else:
        node_roles[row['Credit_1']] = 'actor'

    # Check if Credit_2 is a director, composer, or actor
    if row['Credit_2'] in unique_directors:
        node_roles[row['Credit_2']] = 'director'
    elif row['Credit_2'] in unique_composers:
        node_roles[row['Credit_2']] = 'composer'
    else:
        node_roles[row['Credit_2']] = 'actor'

# Add nodes to the network with size based on edge count and color based on average edge color
for node, count in edge_count.items():
    avg_color = average_color(edge_colors[node])

    if node_roles[node] == 'director':
        net.add_node(node, size=count*4+1, color=avg_color, shape='triangle', label=node, title='Director')
    elif node_roles[node] == 'composer':
        net.add_node(node, size=count*2+1, color=avg_color, shape='diamond', label=node, title='Composer' )
    else:
        net.add_node(node, size=count*2+1, color=avg_color, label=node, title='Actor')  # Adjust size scaling factor as needed

# Add edges to the network with color conversion
for _, row in rating_stand.iterrows():
    color_hex = rgb_to_hex(row['Color'])
    net.add_edge(row['Credit_1'], row['Credit_2'], value=row['Count']*2, color=color_hex)

net.save_graph('networks/actors_directors_composers.html')

In [23]:
G = nx.Graph()

for _, row in rating_stand.iterrows():
    #if (row['Credit_1'] in unique_directors or row['Credit_1'] in unique_composers) and (row['Credit_2'] in unique_directors or row['Credit_2'] in unique_composers):
      G.add_edge(row['Credit_1'], row['Credit_2'], weight=row['Count']*2)

# Perform Louvain community detection
partition = community_louvain.best_partition(G)

# Initialize PyVis Network with white background
net = Network(notebook=True, 
              cdn_resources="remote", 
              bgcolor="#f9f4e3",  # Change to white background
              font_color="black",  # Adjust font color for visibility
              height="calc(100vh - 158px)",
              select_menu=True,
              filter_menu=True)

# Set network options
net.set_options("""
const options = {
  "physics": {
    "forceAtlas2Based": {
      "gravitationalConstant": -84,
      "centralGravity": 0.09,
      "springLength": 20,
      "springConstant": 0.035
    },
    "minVelocity": 0.18,
    "solver": "forceAtlas2Based"
  }
}
""")

# Assign community IDs to nodes in the PyVis Network
for node, community_id in partition.items():
    if node_roles[node] == 'director':
        net.add_node(node, size=edge_count[node]*2+1, color=rgb_to_hex(plt.cm.tab20(community_id)[:3]), shape='triangle', label=node, title='Director')
    elif node_roles[node] == 'composer':
        net.add_node(node, size=edge_count[node]*2+1, color=rgb_to_hex(plt.cm.tab20(community_id)[:3]), shape='diamond', label=node, title='Composer' )
    else:
        net.add_node(node, size=edge_count[node]*2+1, color=rgb_to_hex(plt.cm.tab20(community_id)[:3]), label=node, title='Actor')

# Add edges to the network with color conversion
for _, row in rating_stand.iterrows():
    #if (row['Credit_1'] in unique_directors or row['Credit_1'] in unique_composers) and (row['Credit_2'] in unique_directors or row['Credit_2'] in unique_composers):
      net.add_edge(row['Credit_1'], row['Credit_2'], value=row['Count']*2)

net.save_graph('networks/actors_directors_composers_community.html')

In [24]:
df_pairs['Inflation Factor for 2023'] = df_pairs['Movie_release'].map(inflation.set_index('year')['Inflation Factor for 2023'])
df_pairs['2023 valued revenue'] = df_pairs['Movie_revenue'] * df_pairs['Inflation Factor for 2023']

df_pairs = df_pairs.sort_values(by=['2023 valued revenue'],ascending = False)

In [25]:
df2 = df_pairs.copy()


#Filter the years to have only the films from 1980 to 1985 first
df2 = df2[(df2['Movie_release'] >= 1980) & (df2['Movie_release'] <= 2020)]


# Step 1: Create a mapping DataFrame for 'Actor_pairs' to 'Actor1', 'Actor2', and 'Genre'
actor_pairs_mapping = df2[['Actor_pairs', 'Actor1', 'Actor2', 'Genre']].drop_duplicates()

# Step 2: Grouping by 'Actor_pairs' and calculating the required metrics along with including 'Genre'
grouped_df = df2.groupby('Actor_pairs').agg(
    Average_Movie_revenue=pd.NamedAgg(column='2023 valued revenue', aggfunc='mean'),
    Average_Movie_rating=pd.NamedAgg(column='Movie_rating', aggfunc='mean'),
    Count=pd.NamedAgg(column='Movie_name', aggfunc='count')
)

# Reset index in the grouped DataFrame
grouped_df.reset_index(inplace=True)

# Step 3: Merge the aggregated DataFrame with the mapping DataFrame
# Note: The merge may result in multiple rows per actor pair if they have multiple genres.
final_df = pd.merge(grouped_df, actor_pairs_mapping, on='Actor_pairs')

final_df

Unnamed: 0,Actor_pairs,Average_Movie_revenue,Average_Movie_rating,Count,Actor1,Actor2,Genre
0,"(50 Cent, Adewale Akinnuoye-Agbaje)",7.337919e+07,5.4,1,50 Cent,Adewale Akinnuoye-Agbaje,
1,"(50 Cent, Al Pacino)",1.097485e+08,6.0,1,50 Cent,Al Pacino,
2,"(50 Cent, Alan Blumenfeld)",1.097485e+08,6.0,1,50 Cent,Alan Blumenfeld,Crime Fiction
3,"(50 Cent, Alan Rosenberg)",1.097485e+08,6.0,1,50 Cent,Alan Rosenberg,
4,"(50 Cent, Ambyr Childers)",3.619071e+06,5.6,1,50 Cent,Ambyr Childers,Crime Fiction
...,...,...,...,...,...,...,...
474803,"(Zhenwei Wang, Zhiheng Wang)",5.063677e+08,6.2,1,Zhenwei Wang,Zhiheng Wang,Action/Adventure
474804,"(Zoe Saldana, Zulay Henao)",9.952805e+07,6.2,1,Zoe Saldana,Zulay Henao,
474805,"(Zoe Saldana, Óscar Jaenada)",4.142541e+07,6.2,1,Zoe Saldana,Óscar Jaenada,Thriller
474806,"(Zoe Saldana, Željko Ivanek)",1.540308e+07,8.6,1,Zoe Saldana,Željko Ivanek,Thriller


In [27]:
# Filter to only keep real duos
duos = final_df[final_df['Count'] >=3]

# Creating a copy of the DataFrame slice
duos_standardized = duos.copy()

# Initialize the StandardScaler
standard_scaler = MinMaxScaler()

# Selecting the columns to be normalized
cols_to_normalize = ['Average_Movie_revenue', 'Average_Movie_rating']

# Applying normalization to the selected columns
duos_standardized[cols_to_normalize] = standard_scaler.fit_transform(duos_standardized[cols_to_normalize])

def round_down_to_nearest_05(number):
    return np.floor(number / 0.05) * 0.05

duos_standardized['Average_Movie_revenue'] = duos_standardized['Average_Movie_revenue'].apply(round_down_to_nearest_05)


rating_stand = duos_standardized.sort_values(by=["Average_Movie_rating","Average_Movie_revenue"], ascending= False)


revenue_stand = duos_standardized.copy()
revenue_stand = duos_standardized.sort_values(by=["Average_Movie_revenue","Average_Movie_rating"], ascending= False)


rating_stand.reset_index(drop=True, inplace=True)
rating_stand['rank'] = rating_stand.index + 1   # Adding 1 to start the ranking from 1

revenue_stand.reset_index(drop=True, inplace=True)
revenue_stand['rank'] = revenue_stand.index + 1   # Adding 1 to start the ranking from 1


for i in range(1, len(rating_stand)):
    # Check if the current row has the same speed and mass as the previous row
    if (rating_stand.loc[i, 'Average_Movie_revenue'] == rating_stand.loc[i-1, 'Average_Movie_revenue']) and (rating_stand.loc[i, 'Average_Movie_rating'] == rating_stand.loc[i-1, 'Average_Movie_rating']):
        # Update the rank to be the same as the previous row
        rating_stand.loc[i, 'rank'] = rating_stand.loc[i-1, 'rank']
    
for i in range(1, len(revenue_stand)):
    # Check if the current row has the same speed and mass as the previous row
    if (revenue_stand.loc[i, 'Average_Movie_revenue'] == revenue_stand.loc[i-1, 'Average_Movie_revenue']) and (revenue_stand.loc[i, 'Average_Movie_rating'] == rating_stand.loc[i-1, 'Average_Movie_rating']):
        # Update the rank to be the same as the previous row
        revenue_stand.loc[i, 'rank'] = revenue_stand.loc[i-1, 'rank']

length = len(rating_stand)

rating_stand['rank_ratio']  = (length - (rating_stand['rank']-1))/ length
revenue_stand['rank_ratio']  = (length - (revenue_stand['rank']-1))/ length

def interpolate_color(ratio,start_rgb,end_rgb):

    # Linearly interpolate each color component
    r = start_rgb[0] + (end_rgb[0] - start_rgb[0]) * ratio
    g = start_rgb[1] + (end_rgb[1] - start_rgb[1]) * ratio
    b = start_rgb[2] + (end_rgb[2] - start_rgb[2]) * ratio

    return (r/255, g/255, b/255)

# Function to transform x to y and create a tuple
def transform(x):
    if x >= 0.5:
        start_rgb = (112,85,137)
        end_rgb = (229, 83, 159)
        y = (x - 0.5) * 2
        return interpolate_color(y,start_rgb,end_rgb)
    else:
        y = np.abs((x - 0.5) * 2)
        start_rgb = (57,35,35)
        end_rgb = (112,85,137)        
        return interpolate_color(y,start_rgb,end_rgb)

# Apply the transformation
rating_stand['Color'] = rating_stand['rank_ratio'].apply(transform)
revenue_stand['Color'] = revenue_stand['rank_ratio'].apply(transform)

In [28]:
G = nx.Graph()

for _, row in rating_stand.iterrows():
    #if (row['Credit_1'] in unique_directors or row['Credit_1'] in unique_composers) and (row['Credit_2'] in unique_directors or row['Credit_2'] in unique_composers):
      G.add_edge(row['Actor1'], row['Actor2'], weight=row['Count']*2)

# Perform Louvain community detection
partition_without_directors_composers = community_louvain.best_partition(G)

In [31]:
from scipy.stats import ttest_ind

# Extract community assignments for both scenarios
communities_with_directors_and_composers = list(partition.values())
communities_without_directors_and_composers = list(partition_without_directors_composers.values())

# Perform t-test
t_statistic, p_value = ttest_ind(communities_with_directors_and_composers, communities_without_directors_and_composers)

# Print results
print(f'T-Statistic: {t_statistic}')
print(f'P-Value: {p_value}')

T-Statistic: 4.632597140333145
P-Value: 3.821808716680829e-06


Adding directors and composers lead to a significant change in the collaborative network as shown by the small p-value. Directors may collaborate with a broader range of individuals, including other directors, actors, and composers, leading to a different network structure. Composers also tend to have a different pattern of collaboration as they are essentially tied to one or a few directors, while directors create a bridge between different directors-composers pairs. Some actors also take the role of directors, hence they might connect with different actors than in the actors only network.

However, even though the network structure changes, there are not many changes regarding the success within the network. Successful regroupments of individuals are mostly similar between the two networks. We do find differences when looking at smaller regroupments, where more link appears. The relation between director Steven Spielberg and composer John Williams is vertuous. They have collaborated on many movies together and seem to have a great rank ratio.