In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import math
from statsmodels.stats import diagnostic
import statsmodels.stats as st
from scipy import stats
from itertools import combinations
import networkx as nx
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from pandas.plotting import scatter_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, auc, roc_curve
import missingno as msno
import ast
from collections import Counter
import statsmodels.regression.recursive_ls as rls
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from pyvis.network import Network

%load_ext autoreload
%autoreload 2

Read the file previously computed for the collaborations of actors

In [18]:
# Load the actor-pair collaborations table saved by an earlier step.
# NOTE(review): unpickling can execute arbitrary code, so only load pickles
# produced by this project. This path is relative to '../data/' whereas the
# inflation CSV further down is read from 'data/' -- confirm both resolve from
# the notebook's working directory.
df1 = pd.read_pickle('../data/df_pairs.pkl')

### Convert every movie's revenue into its 2023 real value
We take inflation into account: for example, 1 dollar in 1888 is worth 32.39 dollars in 2023.

In [19]:
# Load the yearly inflation table. The file is parsed without a header row, so
# its first line lands at index 0 and is discarded right away (presumably the
# CSV's own header line -- TODO confirm).
columns_inf = ['year', 'amount', 'inflation rate']
inflation = pd.read_csv(
    'data/inflation_data.csv',
    sep=',',
    header=None,
    names=columns_inf,
).drop(index=0)

# Multipliers turning one dollar of a given year into 2023 dollars, one entry
# per remaining row of `inflation`, in row order.
# From https://www.officialdata.org/us/inflation/1888?amount=1
value_in_2023 = [
    32.39, 33.44, 33.81, 33.81, 33.81, 34.19, 35.78, 36.63, 36.63, 37.07,
    37.07, 37.07, 36.63, 36.20, 35.78, 34.96, 34.57, 34.96, 34.19, 32.73,
    33.44, 33.81, 32.39, 32.39, 31.72, 31.08, 30.77, 30.46, 28.23, 24.04,
    20.38, 17.78, 15.38, 17.19, 18.31, 17.99, 17.99, 17.58, 17.38, 17.68,
    17.99, 17.99, 18.42, 20.24, 22.46, 23.67, 22.96, 22.46, 22.13, 21.37,
    21.82, 22.13, 21.98, 20.93, 18.88, 17.78, 17.48, 17.09, 15.78, 13.80,
    12.77, 12.93, 12.77, 11.83, 11.61, 11.52, 11.44, 11.48, 11.31, 10.95,
    10.65, 10.57, 10.39, 10.29, 10.19, 10.05, 9.92, 9.77, 9.50, 9.21, 8.84,
    8.38, 7.93, 7.60, 7.36, 6.93, 6.24, 5.72, 5.41, 5.08, 4.72, 4.24, 3.73,
    3.38, 3.19, 3.09, 2.96, 2.86, 2.81, 2.71, 2.60, 2.48, 2.35, 2.26, 2.19,
    2.13, 2.08, 2.02, 1.96, 1.92, 1.89, 1.85, 1.79, 1.74, 1.71, 1.67, 1.63,
    1.58, 1.53, 1.48, 1.43, 1.43, 1.41, 1.37, 1.34, 1.32, 1.30, 1.30, 1.28,
    1.26, 1.22, 1.20, 1.19, 1.14, 1.05, 1,
]

inflation["Inflation Factor for 2023"] = value_in_2023
# The year column was read as text (it shared a column with the dropped first
# row); cast it so it can be matched against `Movie_release`.
inflation["year"] = inflation["year"].astype(float)

# Look up each movie's factor by release year, then scale its revenue.
factor_by_year = inflation.set_index('year')['Inflation Factor for 2023']
df1['Inflation Factor for 2023'] = df1['Movie_release'].map(factor_by_year)
df1['2023 valued revenue'] = df1['Movie_revenue'] * df1['Inflation Factor for 2023']

# Keep only the inflation-adjusted revenue and order by it, highest first.
df1 = (
    df1
    .sort_values(by=['2023 valued revenue'], ascending=False)
    .drop(columns=['Movie_revenue', 'Inflation Factor for 2023'])
)

### In this part we create the dataframe that will be the basis of our network
First we keep only the movies released within the period of interest; then, for each pair of actors, we compute the average rating, the average revenue and the number of films they made together. You can change the time range below to explore the collaborations of a different period.

In [21]:
# Inclusive range of release years whose collaborations are analysed.
# Both bounds can be edited to explore another period.
start_of_period = 1980
end_of_period = 1995

# Keep only the films released inside [start_of_period, end_of_period];
# the copy keeps df1 untouched by anything done to df2 afterwards.
df2 = df1[df1['Movie_release'].between(start_of_period, end_of_period)].copy()


# Step 1: one row per distinct ('Actor_pairs', 'Actor1', 'Actor2', 'Genre') combination
actor_pairs_mapping = df2[['Actor_pairs', 'Actor1', 'Actor2', 'Genre']].drop_duplicates()

# Step 2: per actor pair, average inflation-adjusted revenue, average rating
# and number of films (non-null 'Movie_name' entries) made together
grouped_df = (
    df2.groupby('Actor_pairs')
    .agg(
        Average_Movie_revenue=('2023 valued revenue', 'mean'),
        Average_Movie_rating=('Movie_rating', 'mean'),
        Count=('Movie_name', 'count'),
    )
    .reset_index()
)

# Step 3: Merge the aggregated DataFrame with the mapping DataFrame
# Note: The merge may result in multiple rows per actor pair if they have multiple genres.
final_df = pd.merge(grouped_df, actor_pairs_mapping, on='Actor_pairs')

final_df

Unnamed: 0,Actor_pairs,Average_Movie_revenue,Average_Movie_rating,Count,Actor1,Actor2,Genre
0,"(A Martinez, Bryan Larkin)",3.807152e+07,5.7,1,A Martinez,Bryan Larkin,Black comedy
1,"(A Martinez, Ed Begley, Jr.)",3.807152e+07,5.7,1,A Martinez,"Ed Begley, Jr.",
2,"(A Martinez, Elisebeth Peters)",3.807152e+07,5.7,1,A Martinez,Elisebeth Peters,Black comedy
3,"(A Martinez, Linda Hunt)",3.807152e+07,5.7,1,A Martinez,Linda Hunt,
4,"(A Martinez, Maria Pitillo)",3.807152e+07,5.7,1,A Martinez,Maria Pitillo,Black comedy
...,...,...,...,...,...,...,...
130326,"(Yano Anaya, Zack Ward)",5.961890e+07,7.9,1,Yano Anaya,Zack Ward,
130327,"(Yaphet Kotto, Zoë Tamerlis Lund)",2.961426e+07,6.3,1,Yaphet Kotto,Zoë Tamerlis Lund,Thriller
130328,"(Yoko Shimada, Yoshio Harada)",1.335152e+07,5.9,1,Yoko Shimada,Yoshio Harada,Action/Adventure
130329,"(Yusaku Matsuda, Yuya Uchida)",3.328459e+08,7.8,1,Yusaku Matsuda,Yuya Uchida,Thriller


## Create the ranking system for the collaborations 

We have decided that only collaborations occurring at least three times will be featured in our analysis. This criterion is based on the rationale that in the movie industry, a partnership isn't typically recognized as a significant "duo" if they have only collaborated once or twice. Our focus is on observing the development of these collaborations over time and how they form clusters, reflecting either successful or less favorable outcomes. Initially, our network will encompass films released from 1980 to 1995. Subsequently, we will extend the timeline up to 2010, and finally, we will consider a broad range from 1980 to 2023, resulting in a more intricate network.

The collaborations are assessed using two primary ranking criteria. The first method ranks duos based on the average rating of their joint projects. In cases where ratings are identical, the average revenue is used as a secondary factor to differentiate the ranks. Duos with the same values in both criteria will receive the same rank. The second method prioritizes ranking based on revenue, followed by average ratings. The duo's position in these rankings will be indicative of their performance level and will influence their representation in the network graph.


In [22]:
from sklearn.preprocessing import MinMaxScaler


# Only pairs with at least three films together are treated as real duos.
duos = final_df.loc[final_df['Count'] >= 3]

# Independent copy, so the rescaling below leaves `duos` unchanged.
duos_standardized = duos.copy()

# Min-max scaler with default settings (the variable keeps its historical
# name even though this is not a StandardScaler).
standard_scaler = MinMaxScaler()

# Columns that get rescaled.
cols_to_normalize = ['Average_Movie_revenue', 'Average_Movie_rating']

# Fit the scaler on the duos and overwrite both columns with the scaled values.
scaled_values = standard_scaler.fit_transform(duos_standardized[cols_to_normalize])
duos_standardized[cols_to_normalize] = scaled_values

def round_down_to_nearest_05(number):
    """Round ``number`` down to a multiple of 0.05.

    The quotient ``number / 0.05`` is snapped to 9 decimals before flooring,
    so that binary floating-point noise does not push exact multiples one
    bucket down (``0.15 / 0.05`` evaluates to ``2.9999999999999996``, which a
    plain floor would turn into 0.10 instead of 0.15).

    Args:
        number: A float, or a NumPy array of floats.

    Returns:
        The largest multiple of 0.05 not exceeding ``number`` (within the
        snapping tolerance), as a NumPy float or array.
    """
    step = 0.05
    whole_steps = np.floor(np.round(number / step, 9))
    return whole_steps * step

# Bucket the scaled revenue into 0.05-wide bins so that near-equal revenues tie.
duos_standardized['Average_Movie_revenue'] = duos_standardized['Average_Movie_revenue'].apply(round_down_to_nearest_05)


def _rank_sorted_duos(frame, sort_columns):
    """Sort ``frame`` best-first on ``sort_columns`` and add a 1-based 'rank'.

    Consecutive rows whose 'Average_Movie_revenue' and 'Average_Movie_rating'
    are both equal share the rank of the first row of that run; the next
    distinct row gets its own position (1, 2, 2, 4, ...).

    Args:
        frame: DataFrame holding both 'Average_Movie_*' columns.
        sort_columns: Column names, primary criterion first.

    Returns:
        A new DataFrame, sorted in descending order, with a fresh 0..n-1 index
        and an integer 'rank' column.
    """
    ranked = frame.sort_values(by=sort_columns, ascending=False).reset_index(drop=True)
    keys = ranked[['Average_Movie_revenue', 'Average_Movie_rating']]
    # True where a row differs from the row just above it *in the same frame*
    # (the first row always differs, since it is compared with NaN).
    starts_new_rank = keys.ne(keys.shift()).any(axis=1)
    positions = pd.Series(ranked.index + 1, index=ranked.index)
    # Tied rows inherit the position of the first row of their run.
    ranked['rank'] = positions.where(starts_new_rank).ffill().astype('int64')
    return ranked


# Ranking 1: rating first, revenue as tie-breaker.
rating_stand = _rank_sorted_duos(duos_standardized, ["Average_Movie_rating", "Average_Movie_revenue"])

# Ranking 2: revenue first, rating as tie-breaker.
revenue_stand = _rank_sorted_duos(duos_standardized, ["Average_Movie_revenue", "Average_Movie_rating"])

length = len(rating_stand)

# 1.0 for rank 1, decreasing towards 1/length for the last rank.
rating_stand['rank_ratio'] = (length - (rating_stand['rank'] - 1)) / length
revenue_stand['rank_ratio'] = (length - (revenue_stand['rank'] - 1)) / length

def interpolate_color(ratio,start_rgb,end_rgb):
    """Linearly blend two RGB colors.

    Args:
        ratio: Blend position; 0 yields ``start_rgb`` and 1 yields ``end_rgb``.
        start_rgb: (r, g, b) with channels on the 0-255 scale.
        end_rgb: (r, g, b) with channels on the 0-255 scale.

    Returns:
        Tuple (r, g, b) of the blended channels, each divided by 255.
    """
    return tuple(
        (start_rgb[channel] + (end_rgb[channel] - start_rgb[channel]) * ratio) / 255
        for channel in range(3)
    )

def transform(x):
    """Map a rank ratio to a color on a brown -> purple -> pink gradient.

    The gradient is continuous: 0 gives brown, 0.5 gives purple and 1 gives
    pink, so better-ranked duos (ratio close to 1) are pinker and worse-ranked
    ones (ratio close to 0) are browner.

    Args:
        x: Rank ratio, expected between 0 and 1.

    Returns:
        Tuple (r, g, b) with channels divided by 255, as produced by
        ``interpolate_color``.
    """
    brown = (57,35,35)
    purple = (112,85,137)
    pink = (229, 83, 159)
    if x >= 0.5:
        # Upper half: purple at 0.5, pink at 1.
        return interpolate_color((x - 0.5) * 2, purple, pink)
    # Lower half: brown at 0, purple when approaching 0.5.
    return interpolate_color(x * 2, brown, purple)
    
# Give every duo, in both rankings, the RGB color matching its rank ratio.
for ranked_duos in (rating_stand, revenue_stand):
    ranked_duos['Color'] = ranked_duos['rank_ratio'].apply(transform)

## Creating the networks between the actors

This is the legend of our graph : 

Nodes: These symbolize the actors. The size of a node correlates with the number of unique collaborations an actor has engaged in.

Edges: These signify collaborations between pairs of actors. The thickness of an edge reflects the frequency of collaborations between the actors involved.

Color Scheme: This aspect denotes the success level of collaborations. Collaborations are then organized based on two criteria: rating and film revenue. The ranking determines the color of the edge, with higher ranks resulting in pinker edges and lower ranks leading to browner edges. For an actor (node), their color is a composite of the shades from all the edges (collaborations) they have participated in.

In [23]:
def rgb_to_hex(rgb):
    """Convert an RGB color with channels on the 0-1 scale to '#rrggbb'.

    Channels are rounded to the nearest integer on the 0-255 scale (truncating
    would turn e.g. 254.7 into 254) and clamped to [0, 255] so the result is
    always exactly six hexadecimal digits.

    Args:
        rgb: Indexable holding at least three numeric channels; only the
            first three are used.

    Returns:
        Lower-case hexadecimal color string such as '#392323'.
    """
    channels = [min(255, max(0, int(round(float(value) * 255)))) for value in rgb[:3]]
    return '#{:02x}{:02x}{:02x}'.format(*channels)

# Function to compute average color
def average_color(colors):
    """Return the hex string of the channel-wise mean of ``colors``.

    Args:
        colors: Sequence of RGB triples with channels on the 0-1 scale.

    Returns:
        The averaged color formatted by ``rgb_to_hex``.
    """
    mean_rgb = np.asarray(colors).mean(axis=0)
    return rgb_to_hex(mean_rgb)

First we do the rating based network : 

In [24]:
from pyvis.network import Network
from collections import defaultdict


# PyVis canvas: beige background, dark-brown labels, node-selection menu on.
net = Network(
    notebook=True,
    cdn_resources="remote",
    bgcolor="#e9dcc8",
    font_color="#392323",
    height="calc(100vh - 83px)",
    select_menu=True,
)

# Physics configuration passed to the network as a raw options string.
net.set_options("""
const options = {
  "physics": {
    "forceAtlas2Based": {
      "gravitationalConstant": -84,
      "centralGravity": 0.09,
      "springLength": 20,
      "springConstant": 0.035
    },
    "minVelocity": 0.18,
    "solver": "forceAtlas2Based"
  }
}
""")

# Per actor: how many duo rows they appear in, and the colors of those duos.
edge_count = defaultdict(int)
edge_colors = defaultdict(list)

for first_actor, second_actor, duo_color in zip(
    rating_stand['Actor1'], rating_stand['Actor2'], rating_stand['Color']
):
    for actor in (first_actor, second_actor):
        edge_count[actor] += 1
        edge_colors[actor].append(duo_color)

# One node per actor: size grows with the number of duos, color is the mean
# of the colors of the actor's duos.
for actor, n_duos in edge_count.items():
    net.add_node(actor, size=n_duos*5+1, color=average_color(edge_colors[actor]))

# One edge per duo row: width value from the number of shared films, color
# from the duo's rank.
for first_actor, second_actor, duo_color, n_films in zip(
    rating_stand['Actor1'], rating_stand['Actor2'], rating_stand['Color'], rating_stand['Count']
):
    net.add_edge(first_actor, second_actor, value=n_films*5, color=rgb_to_hex(duo_color))

# Write the HTML page and display it
net.show("rating_network_1980_1995.html")



rating_network_1980_1995.html


Then we do the revenue based one : 

In [25]:
from pyvis.network import Network
from collections import defaultdict

# rgb_to_hex and average_color are defined once, in the helper cell above the
# rating network; they are deliberately not redefined here so that a single
# definition is in effect for both networks.

# PyVis canvas: beige background, dark-brown labels, node-selection menu on.
net = Network(
    notebook=True,
    cdn_resources="remote",
    bgcolor="#e9dcc8",
    font_color="#392323",
    height="calc(100vh - 83px)",
    select_menu=True,
)

# Physics configuration passed to the network as a raw options string.
net.set_options("""
const options = {
  "physics": {
    "forceAtlas2Based": {
      "gravitationalConstant": -84,
      "centralGravity": 0.09,
      "springLength": 20,
      "springConstant": 0.035
    },
    "minVelocity": 0.18,
    "solver": "forceAtlas2Based"
  }
}
""")

# Per actor: how many duo rows they appear in, and the colors of those duos.
edge_count = defaultdict(int)
edge_colors = defaultdict(list)

for first_actor, second_actor, duo_color in zip(
    revenue_stand['Actor1'], revenue_stand['Actor2'], revenue_stand['Color']
):
    for actor in (first_actor, second_actor):
        edge_count[actor] += 1
        edge_colors[actor].append(duo_color)

# One node per actor: size grows with the number of duos, color is the mean
# of the colors of the actor's duos.
for actor, n_duos in edge_count.items():
    net.add_node(actor, size=n_duos*5+1, color=average_color(edge_colors[actor]))

# One edge per duo row, colored by the duo's revenue-based rank.
# NOTE(review): the rating network uses Count*5 as edge value while this one
# uses Count*2+3 -- confirm the difference is intentional.
for first_actor, second_actor, duo_color, n_films in zip(
    revenue_stand['Actor1'], revenue_stand['Actor2'], revenue_stand['Color'], revenue_stand['Count']
):
    net.add_edge(first_actor, second_actor, value=n_films*2+3, color=rgb_to_hex(duo_color))

# Write the HTML page and display it
net.show("revenue_network_1980_1995.html")


revenue_network_1980_1995.html


(Some HTML and CSS modifications for a prettier graph)

In [27]:
def modify_html(file_path, old_text, new_text, css, font_link, old_color_str, new_color):
    """Patch a generated HTML page in place.

    Three edits are applied to the file at ``file_path``:

    1. every occurrence of ``old_text`` is replaced by ``new_text``;
    2. every occurrence of ``old_color_str`` is replaced by
       ``"background: <new_color>;"``;
    3. ``font_link`` followed by ``<style>css</style>`` is inserted right
       before ``</head>``. This step is skipped when that exact snippet is
       already in the page, so running the function again on the same file
       does not stack duplicate ``<link>``/``<style>`` tags.

    The file is read and written as ISO-8859-1, a codec that can decode any
    byte sequence. The replacement strings must themselves be encodable in
    ISO-8859-1, otherwise the write raises ``UnicodeEncodeError``.

    Args:
        file_path: Path of the HTML file to rewrite.
        old_text: Text to look for.
        new_text: Replacement for ``old_text``.
        css: CSS rules wrapped in a ``<style>`` element.
        font_link: HTML snippet placed just before the ``<style>`` element.
        old_color_str: CSS declaration to look for.
        new_color: Color value used in the replacement declaration.

    Returns:
        None.
    """
    with open(file_path, 'r', encoding='iso-8859-1') as file:
        html_content = file.read()

    # Replace the old option text with the new text
    modified_html = html_content.replace(old_text, new_text)

    # Change the loading bar color
    new_color_str = f"background: {new_color};"
    modified_html = modified_html.replace(old_color_str, new_color_str)

    # Insert the font link and custom CSS once; a page patched earlier with
    # the same snippet is left as is.
    head_content = f"{font_link}<style>{css}</style>"
    if head_content not in modified_html:
        modified_html = modified_html.replace('</head>', f'{head_content}</head>')

    with open(file_path, 'w', encoding='iso-8859-1') as file:
        file.write(modified_html)

# NOTE(review): `html_file_path` is assigned here but not read anywhere in
# this cell; the pages actually patched are the two listed further down.
html_file_path = "rating_network.html"

# Text searched for in the generated pages, and what it is replaced with
old_option_text = "Select a Node by ID"
new_option_text = "Get the truth about your favorite actor......"

# <link> tag loading the Poppins font from Google Fonts
font_link = '<link href="https://fonts.googleapis.com/css2?family=Poppins&display=swap" rel="stylesheet">'

# CSS applying Poppins to the page body and to .vis-network
custom_css = """
body, .vis-network {
    font-family: 'Poppins', sans-serif;
}
"""

# Loading bar color: declaration to look for, and the color that replaces it
old_color_str = "background: rgb(0, 173, 246);"
new_color = "rgba(210, 180, 140, 0.39)"

# Pages to patch: the rating network first, then the revenue network
html_file_path_rating = "rating_network_1980_1995.html"
html_file_path_revenue = "revenue_network_1980_1995.html"

for network_page in (html_file_path_rating, html_file_path_revenue):
    modify_html(network_page, old_option_text, new_option_text, custom_css, font_link, old_color_str, new_color)

