In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import math
from statsmodels.stats import diagnostic
import statsmodels.stats as st
from scipy import stats
from itertools import combinations
import networkx as nx
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from pandas.plotting import scatter_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, auc, roc_curve
import missingno as msno
import ast
from collections import Counter
import statsmodels.regression.recursive_ls as rls
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from pyvis.network import Network

%load_ext autoreload
%autoreload 2

In [2]:
df = pd.read_pickle('../data/df_movie.pkl')
df.head()

Unnamed: 0,Movie_name,Movie_release,Movie_revenue,Movie_runtime,Movie_languages,Movie_countries,Main_genre,Sec_Genre,Movie_rating,Producer,...,Actor_name,Actor_age_release,Inflation Factor for 2023,2023 valued revenue,Avg_revenue_per_film_at_release,Longevity,Number_of_film_at_release,Avg_rating_per_film_at_release,First_film,Actor_main_genre
403088,The Fox and the Hound,1981.0,63456988.0,83.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}",Adventure,Children's/Family,,,...,'Squeeks' the Caterpillar,,3.38,214484600.0,63456990.0,0.0,1,,True,Adventure
400285,Miss March,2009.0,4591629.0,90.0,"{""/m/05zjd"": ""Portuguese Language"", ""/m/02h40l...","{""/m/09c7w0"": ""United States of America""}",Road movie,Sex comedy,,,...,40 Glocc,29.0,1.43,6566029.0,4591629.0,0.0,1,,True,Road movie
71882,Get Rich or Die Tryin',2005.0,46442528.0,117.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}",Crime Fiction,Hip hop movies,5.4,Jimmy Iovine,...,50 Cent,30.0,1.58,73379190.0,46442530.0,0.0,1,5.4,True,Crime Fiction
419847,Home of the Brave,2006.0,499620.0,106.0,"{""/m/0jzc"": ""Arabic Language"", ""/m/02h40lc"": ""...","{""/m/09c7w0"": ""United States of America"", ""/m/...",Drama,War film,5.6,,...,50 Cent,31.0,1.53,764418.6,23471070.0,1.0,2,5.5,False,Crime Fiction
126916,Righteous Kill,2008.0,76747202.0,100.0,"{""/m/06b_j"": ""Russian Language"", ""/m/02h40lc"":...","{""/m/09c7w0"": ""United States of America""}",Thriller,Crime Fiction,6.0,,...,50 Cent,33.0,1.43,109748500.0,41229780.0,3.0,3,5.666667,False,Crime Fiction


In [3]:
df1 = pd.read_pickle('../data/df_pairs.pkl')
df1.head()

Unnamed: 0,Movie_name,Movie_release,Actor_pairs,Movie_revenue,Movie_rating,Actor1,Actor2,Age_difference,Film_count_difference,Average_revenue_difference,First_film,First_film_for_one,Number_of_films_together,Same_genre,Genre
0,'Til There Was You,1997.0,"(Alice Drummond, Christine Ebersole)",3525125.0,4.8,Alice Drummond,Christine Ebersole,25.0,3,36127980.0,False,False,0,False,
1,'Til There Was You,1997.0,"(Alice Drummond, Craig Bierko)",3525125.0,4.8,Alice Drummond,Craig Bierko,37.0,5,14166410.0,False,False,0,False,
2,'Til There Was You,1997.0,"(Alice Drummond, Dylan McDermott)",3525125.0,4.8,Alice Drummond,Dylan McDermott,34.0,1,13150540.0,False,False,0,False,
3,'Til There Was You,1997.0,"(Alice Drummond, Jeanne Tripplehorn)",3525125.0,4.8,Alice Drummond,Jeanne Tripplehorn,36.0,2,124196700.0,False,False,0,False,
4,'Til There Was You,1997.0,"(Alice Drummond, Jennifer Aniston)",3525125.0,4.8,Alice Drummond,Jennifer Aniston,41.0,3,43105070.0,False,False,0,False,


In [4]:
df2 = df1.copy()

#Filter the years to have only the films from 1980 to 1985 first
df2 = df2[(df2['Movie_release'] >= 1980) & (df2['Movie_release'] <= 2020)]


# Step 1: Create a mapping DataFrame for 'Actor_pairs' to 'Actor1', 'Actor2', and 'Genre'
actor_pairs_mapping = df2[['Actor_pairs', 'Actor1', 'Actor2', 'Genre']].drop_duplicates()

# Step 2: Grouping by 'Actor_pairs' and calculating the required metrics along with including 'Genre'
grouped_df = df2.groupby('Actor_pairs').agg(
    Average_Movie_revenue=pd.NamedAgg(column='Movie_revenue', aggfunc='mean'),
    Average_Movie_rating=pd.NamedAgg(column='Movie_rating', aggfunc='mean'),
    Count=pd.NamedAgg(column='Movie_name', aggfunc='count')
)

# Reset index in the grouped DataFrame
grouped_df.reset_index(inplace=True)

# Step 3: Merge the aggregated DataFrame with the mapping DataFrame
# Note: The merge may result in multiple rows per actor pair if they have multiple genres.
final_df = pd.merge(grouped_df, actor_pairs_mapping, on='Actor_pairs')

final_df

Unnamed: 0,Actor_pairs,Average_Movie_revenue,Average_Movie_rating,Count,Actor1,Actor2,Genre
0,"(50 Cent, Adewale Akinnuoye-Agbaje)",46442528.0,5.4,1,50 Cent,Adewale Akinnuoye-Agbaje,
1,"(50 Cent, Al Pacino)",76747202.0,6.0,1,50 Cent,Al Pacino,
2,"(50 Cent, Alan Blumenfeld)",76747202.0,6.0,1,50 Cent,Alan Blumenfeld,Crime Fiction
3,"(50 Cent, Alan Rosenberg)",76747202.0,6.0,1,50 Cent,Alan Rosenberg,
4,"(50 Cent, Ambyr Childers)",2566717.0,5.6,1,50 Cent,Ambyr Childers,Crime Fiction
...,...,...,...,...,...,...,...
474803,"(Zhenwei Wang, Zhiheng Wang)",359126022.0,6.2,1,Zhenwei Wang,Zhiheng Wang,Action/Adventure
474804,"(Zoe Saldana, Zulay Henao)",70587268.0,6.2,1,Zoe Saldana,Zulay Henao,
474805,"(Zoe Saldana, Óscar Jaenada)",29379723.0,6.2,1,Zoe Saldana,Óscar Jaenada,Thriller
474806,"(Zoe Saldana, Željko Ivanek)",11494838.0,8.6,1,Zoe Saldana,Željko Ivanek,Thriller


In [5]:
from sklearn.preprocessing import MinMaxScaler


# Filter to only keep real duos
duos = final_df[final_df['Count'] >=3]

# Creating a copy of the DataFrame slice
duos_standardized = duos.copy()

# Initialize the StandardScaler
standard_scaler = MinMaxScaler()

# Selecting the columns to be normalized
cols_to_normalize = ['Average_Movie_revenue', 'Average_Movie_rating']

# Applying normalization to the selected columns
duos_standardized[cols_to_normalize] = standard_scaler.fit_transform(duos_standardized[cols_to_normalize])

def round_down_to_nearest_05(number):
    return np.floor(number / 0.05) * 0.05

duos_standardized['Average_Movie_revenue'] = duos_standardized['Average_Movie_revenue'].apply(round_down_to_nearest_05)


rating_stand = duos_standardized.sort_values(by=["Average_Movie_rating","Average_Movie_revenue"], ascending= False)


revenue_stand = duos_standardized.copy()
revenue_stand = duos_standardized.sort_values(by=["Average_Movie_revenue","Average_Movie_rating"], ascending= False)


rating_stand.reset_index(drop=True, inplace=True)
rating_stand['rank'] = rating_stand.index + 1   # Adding 1 to start the ranking from 1

revenue_stand.reset_index(drop=True, inplace=True)
revenue_stand['rank'] = revenue_stand.index + 1   # Adding 1 to start the ranking from 1


for i in range(1, len(rating_stand)):
    # Check if the current row has the same speed and mass as the previous row
    if (rating_stand.loc[i, 'Average_Movie_revenue'] == rating_stand.loc[i-1, 'Average_Movie_revenue']) and (rating_stand.loc[i, 'Average_Movie_rating'] == rating_stand.loc[i-1, 'Average_Movie_rating']):
        # Update the rank to be the same as the previous row
        rating_stand.loc[i, 'rank'] = rating_stand.loc[i-1, 'rank']
    
for i in range(1, len(revenue_stand)):
    # Check if the current row has the same speed and mass as the previous row
    if (revenue_stand.loc[i, 'Average_Movie_revenue'] == revenue_stand.loc[i-1, 'Average_Movie_revenue']) and (revenue_stand.loc[i, 'Average_Movie_rating'] == rating_stand.loc[i-1, 'Average_Movie_rating']):
        # Update the rank to be the same as the previous row
        revenue_stand.loc[i, 'rank'] = revenue_stand.loc[i-1, 'rank']

length = len(rating_stand)

rating_stand['rank_ratio']  = (length - (rating_stand['rank']-1))/ length
revenue_stand['rank_ratio']  = (length - (revenue_stand['rank']-1))/ length

# Function to transform x to y and create a tuple
def transform(x):
    if x >= 0.5:
        y = (x - 0.5) * 2  
        return (0, y, 0.3)
    else:
        y = np.abs((x - 0.5) * 2)
        return (y, 0, 0.3)

# Apply the transformation
rating_stand['Color'] = rating_stand['rank_ratio'].apply(transform)
revenue_stand['Color'] = revenue_stand['rank_ratio'].apply(transform)

rating_stand

Unnamed: 0,Actor_pairs,Average_Movie_revenue,Average_Movie_rating,Count,Actor1,Actor2,Genre,rank,rank_ratio,Color
0,"(Andy Serkis, Billy Boyd)",1.00,1.000000,3,Andy Serkis,Billy Boyd,Fantasy Adventure,1,1.000000,"(0, 1.0, 0.3)"
1,"(Andy Serkis, Cate Blanchett)",1.00,1.000000,3,Andy Serkis,Cate Blanchett,,1,1.000000,"(0, 1.0, 0.3)"
2,"(Andy Serkis, Christopher Lee)",1.00,1.000000,3,Andy Serkis,Christopher Lee,,1,1.000000,"(0, 1.0, 0.3)"
3,"(Andy Serkis, David Wenham)",1.00,1.000000,3,Andy Serkis,David Wenham,Fantasy Adventure,1,1.000000,"(0, 1.0, 0.3)"
4,"(Andy Serkis, Dominic Monaghan)",1.00,1.000000,3,Andy Serkis,Dominic Monaghan,Fantasy Adventure,1,1.000000,"(0, 1.0, 0.3)"
...,...,...,...,...,...,...,...,...,...,...
2172,"(Loretta Devine, Regina Hall)",0.00,0.116906,4,Loretta Devine,Regina Hall,,2172,0.002756,"(0.994487827285255, 0, 0.3)"
2173,"(Anthony Anderson, Loretta Devine)",0.00,0.115108,3,Anthony Anderson,Loretta Devine,Crime Fiction,2174,0.001837,"(0.99632521819017, 0, 0.3)"
2174,"(David Mann, Tamela Mann)",0.05,0.079137,3,David Mann,Tamela Mann,,2175,0.001378,"(0.9972439136426274, 0, 0.3)"
2175,"(David Mann, Tyler Perry)",0.05,0.079137,3,David Mann,Tyler Perry,,2175,0.001378,"(0.9972439136426274, 0, 0.3)"


In [6]:
import networkx as nx
import community as community_louvain
import numpy as np
from collections import defaultdict

# Create a new NetworkX graph
G = nx.Graph()

# Add nodes and edges from the DataFrame
for _, row in rating_stand.iterrows():
    G.add_edge(row['Actor1'], row['Actor2'], weight=row['Count'])

# Detect communities (clusters) using the Louvain method
partition = community_louvain.best_partition(G, weight='weight')

# Group nodes by their cluster
clusters = defaultdict(list)
for node, cluster_id in partition.items():
    clusters[cluster_id].append(node)

# Filter clusters with more than 5 nodes
large_clusters = {k: v for k, v in clusters.items() if len(v) > 5}

# Analysis for each large cluster
cluster_averages = {}

for cluster_id, nodes in large_clusters.items():
    # Extract rows from DataFrame that belong to the current cluster
    cluster_rows = rating_stand[(rating_stand['Actor1'].isin(nodes)) | (rating_stand['Actor2'].isin(nodes))]

    # Calculate average rank for the cluster
    average_rank = cluster_rows['rank'].mean() if not cluster_rows.empty else None

    # Store the results
    cluster_averages[cluster_id] = {
        'actors': nodes,
        'average_rank': average_rank
    }

# Sort clusters by average rank in ascending order
sorted_cluster_averages = dict(sorted(cluster_averages.items(), key=lambda item: item[1]['average_rank']))

# List to store the cluster analysis results
cluster_analysis = []

# Add cluster analysis results to the list
for cluster_id, info in sorted_cluster_averages.items():
    cluster_analysis.append({
        'cluster_id': cluster_id,
        'actors': info['actors'],
        'average_rank': info['average_rank']
    })

# Print the cluster analysis results
for cluster in cluster_analysis:
    print(f"Cluster {cluster['average_rank']}: {cluster['actors']}")

Cluster 386.95714285714286: ['Andy Serkis', 'Billy Boyd', 'Cate Blanchett', 'Christopher Lee', 'David Wenham', 'Dominic Monaghan', 'Elijah Wood', 'Hugo Weaving', 'Ian McKellen', 'John Rhys-Davies', 'Liv Tyler', 'Orlando Bloom', 'Sala Baker', 'Sean Astin', 'Sean Bean', 'Viggo Mortensen', 'Peter Jackson', 'Ian Holm', 'Brad Dourif', 'Hugh Jackman', 'Rebecca Romijn', 'Aaron Stanford', 'Anna Paquin', 'Bruce Davison', 'Famke Janssen', 'Halle Berry', 'James Marsden', 'Patrick Stewart', 'Jacek Koman', 'Richard Roxburgh', 'Brent Spiner', 'Gates McFadden', 'Jonathan Frakes', 'LeVar Burton', 'Marina Sirtis', 'Michael Dorn', 'Danny Mann', 'Miriam Margolyes', 'Magda Szubanski', 'Gretchen Mol']
Cluster 437.3171206225681: ['Gary Oldman', 'Alfred Enoch', 'Timothy Spall', 'Bonnie Wright', 'Daniel Radcliffe', 'David Bradley', 'Devon Murray', 'Emma Watson', 'Geraldine Somerville', 'James Phelps', 'Jamie Waylett', 'Joshua Herdman', 'Maggie Smith', 'Mark Williams', 'Matthew David Lewis', 'Michael Gambon', 

# **MOVIES GENRES**

In [7]:
import itertools
from collections import defaultdict, Counter
from tqdm import tqdm

# Create a dictionary to store movie genre counts and average rank for each cluster
cluster_info = defaultdict(lambda: {'movie_genre_counts': defaultdict(int), 'total_movie_count': 0, 'average_rank': 0})

for cluster_id, actors in tqdm(large_clusters.items(), desc='Processing clusters'):
    # Find all unique pairs of actors within this cluster
    actor_pairs = list(itertools.combinations(actors, 2))
    
    # A set to keep track of movies we've already counted for this cluster to avoid duplicates
    movies_counted = set()
    
    for actor1, actor2 in actor_pairs:
        # Get the movies both actors appeared in together
        movies_actor1 = set(df[df['Actor_name'] == actor1]['Movie_name'])
        movies_actor2 = set(df[df['Actor_name'] == actor2]['Movie_name'])
        movies_together = movies_actor1.intersection(movies_actor2)
        
        # Filter out movies that have already been counted for this cluster
        unique_movies_together = movies_together - movies_counted
        movies_counted.update(unique_movies_together)
        
        # Count the genres from these movies
        for movie in unique_movies_together:
            movie_genres = df[df['Movie_name'] == movie]['Main_genre'].unique()
            for genre in movie_genres:
                cluster_info[cluster_id]['movie_genre_counts'][genre] += 1
    
    # Update the total movie count for the cluster
    cluster_info[cluster_id]['total_movie_count'] = sum(cluster_info[cluster_id]['movie_genre_counts'].values())
    
    # Calculate average rank for the cluster using movie genres
    cluster_rows = rating_stand[rating_stand['Actor1'].isin(actors) & rating_stand['Actor2'].isin(actors)]
    average_rank = cluster_rows['rank'].mean() if not cluster_rows.empty else None
    cluster_info[cluster_id]['average_rank'] = average_rank

# Sort the clusters by average rank in ascending order
sorted_clusters = sorted(cluster_info.items(), key=lambda item: item[1]['average_rank'] if item[1]['average_rank'] is not None else float('inf'))

# Convert the counts to ratios and merge small genres into "Other Genres"
for cluster_id, info in cluster_info.items():
    total_movies = info['total_movie_count']
    other_genres_count = 0

    if total_movies > 0:
        # Calculate ratios and find genres to be merged
        genres_to_merge = []
        for genre, count in info['movie_genre_counts'].items():
            ratio = count / total_movies
            if ratio < 0.02:  # If the ratio is less than 2%
                other_genres_count += count
                genres_to_merge.append(genre)

        # Remove genres that are merged into "Other Genres"
        for genre in genres_to_merge:
            del info['movie_genre_counts'][genre]

        # If there are any 'Other Genres', add them to the dictionary
        if other_genres_count > 0:
            info['movie_genre_counts']['Other Genres'] = other_genres_count

        # Finally, convert the counts to ratios
        for genre in info['movie_genre_counts']:
            info['movie_genre_counts'][genre] /= total_movies

# Now you can print the sorted clusters with their genre ratios and average rank
for cluster_id, info in sorted_clusters:
    print(f"Cluster {cluster_id}:")
    print(f"  Average Rank: {info['average_rank']}")
    if info['total_movie_count'] > 0:
        for genre, ratio in info['movie_genre_counts'].items():
            print(f"  {genre}: {ratio:.2%}")  # Updated to display as percentage
    print()

Processing clusters: 100%|██████████| 24/24 [02:10<00:00,  5.43s/it]

Cluster 0:
  Average Rank: 306.89230769230767
  Fantasy Adventure: 8.16%
  Adventure: 14.29%
  Monster movie: 2.04%
  Thriller: 26.53%
  Parody: 10.20%
  Horror: 2.04%
  Biography: 2.04%
  Period piece: 2.04%
  Romantic drama: 4.08%
  Fantasy: 2.04%
  Jukebox musical: 2.04%
  Action: 2.04%
  Romantic comedy: 2.04%
  Science Fiction: 4.08%
  Alien Film: 2.04%
  Mystery: 2.04%
  Sports: 2.04%
  Sword and Sandal: 2.04%
  LGBT: 2.04%
  Crime Fiction: 6.12%

Cluster 2:
  Average Rank: 431.99604743083006
  Thriller: 18.06%
  Drama: 4.17%
  Fantasy Adventure: 5.56%
  Crime Fiction: 9.72%
  Adventure: 11.11%
  Computer Animation: 2.78%
  Costume drama: 4.17%
  Romantic comedy: 9.72%
  Family Film: 4.17%
  Science Fiction: 6.94%
  Romantic drama: 2.78%
  Other Genres: 20.83%

Cluster 3:
  Average Rank: 768.2708333333334
  Coming of age: 2.86%
  History: 2.86%
  Crime Fiction: 11.43%
  Buddy film: 2.86%
  Adventure: 25.71%
  Romantic comedy: 20.00%
  Adventure Comedy: 2.86%
  Thriller: 5.71%
  P




### **Dashapp code but cannot be used for the website**

In [8]:
import dash
from dash import dcc, html, Input, Output
import plotly.graph_objs as go

# Assuming `cluster_info` is updated as per your latest structure...

# Initialize the Dash app
app = dash.Dash(__name__)

# Sort clusters by average rank in ascending order and prepare data for plotting
sorted_cluster_ids = sorted(cluster_info, key=lambda x: cluster_info[x]['average_rank'] or float('inf'))
cluster_ids = [f"Cluster {cluster_id}" for cluster_id in sorted_cluster_ids]
average_ranks = [round(cluster_info[cluster_id]['average_rank']) for cluster_id in sorted_cluster_ids]

# Create the bar chart
bar_chart = go.Figure(data=[
    go.Bar(x=cluster_ids, y=average_ranks, name='Average Rank')
])

# App layout with the pie chart to the right of the bar chart
app.layout = html.Div([
    dcc.Graph(id='bar-chart', figure=bar_chart),
    dcc.Graph(id='pie-chart'),
], style={'display': 'flex'})

# Callback for updating pie-chart based on clicked bar
@app.callback(
    Output('pie-chart', 'figure'),
    [Input('bar-chart', 'clickData')]
)
def display_click_data(clickData):
    if clickData:
        # Extract the cluster id from the clicked point
        cluster_id = int(clickData['points'][0]['x'].split()[1])
        # Update the dictionary key to 'movie_genre_counts'
        genres = list(cluster_info[cluster_id]['movie_genre_counts'].keys())
        counts = list(cluster_info[cluster_id]['movie_genre_counts'].values())
        # Normalize the counts to ratios
        total = sum(counts)
        ratios = [count / total for count in counts]
        # Create the pie chart based on the selected cluster
        pie_chart = go.Figure(data=[go.Pie(labels=genres, values=ratios)])
        return pie_chart
    else:
        # If no bar is clicked yet, return an empty figure
        return go.Figure()

# Save the bar chart as an HTML file   
bar_chart.write_html("../plots/plot_movies_q5.html")

# Run the Dash app
if __name__ == '__main__':
    app.run_server(debug=True)

## **This code can be used for website**

In [9]:
import ipywidgets as widgets
import plotly.graph_objs as go
from IPython.display import display

# Assuming `cluster_info` is updated as per your latest structure...

# Prepare data for plotting
sorted_cluster_ids = sorted(cluster_info, key=lambda x: cluster_info[x]['average_rank'] or float('inf'))
cluster_ids = [f"Cluster {cluster_id}" for cluster_id in sorted_cluster_ids]
average_ranks = [round(cluster_info[cluster_id]['average_rank']) for cluster_id in sorted_cluster_ids]

# Create the bar chart
bar_chart = go.Figure(data=[
    go.Bar(x=cluster_ids, y=average_ranks, name='Average Rank')
])
bar_chart.update_layout(clickmode='event+select')

# Initialize the pie chart
pie_chart = go.FigureWidget()

# Function to update pie chart based on selected bar
def update_pie_chart(change):
    cluster_id_str = change['new']  # Corrected to access the 'new' value from the change dictionary
    if cluster_id_str:
        cluster_id = int(cluster_id_str.split()[1])
        genres = list(cluster_info[cluster_id]['movie_genre_counts'].keys())
        counts = list(cluster_info[cluster_id]['movie_genre_counts'].values())

        # Normalize the counts to ratios
        total = sum(counts)
        ratios = [count / total for count in counts]

        # Update the pie chart
        pie_chart.data = []
        pie_chart.add_trace(go.Pie(labels=genres, values=ratios))

# Interactive widget for selecting bar
bar_select = widgets.Dropdown(
    options=cluster_ids,
    description='Select Cluster:',
    disabled=False,
)
bar_select.observe(update_pie_chart, names='value')

# Display widgets and initial charts
display(bar_select)
display(bar_chart)
display(pie_chart)

Dropdown(description='Select Cluster:', options=('Cluster 0', 'Cluster 2', 'Cluster 3', 'Cluster 13', 'Cluster…

FigureWidget({
    'data': [], 'layout': {'template': '...'}
})

In [29]:
import scipy.stats as stats
from statistics import median
import math

# Initialize a list to store the results
t_test_results = []

# Get a list of all unique genres across all clusters
all_genres = set()
for info in cluster_info.values():
    all_genres.update(info['movie_genre_counts'].keys())  # Make sure this key matches your data structure

# Perform T-tests for each genre
for genre in all_genres:
    genre_ratios = []
    average_ranks = []

    # Extract genre ratio and average rank for each cluster
    for cluster_id, info in cluster_info.items():
        if info['total_movie_count'] > 0 and genre in info['movie_genre_counts']:  # Adjust the key here as well
            genre_ratio = info['movie_genre_counts'][genre]  # Adjust the key here as well
            genre_ratios.append(genre_ratio)
            average_ranks.append(info['average_rank'])

    # Split data based on median genre ratio
    median_ratio = median(genre_ratios)
    high_ratio_ranks = [rank for ratio, rank in zip(genre_ratios, average_ranks) if ratio > median_ratio]
    low_ratio_ranks = [rank for ratio, rank in zip(genre_ratios, average_ranks) if ratio <= median_ratio]

    # Perform a T-test
    t_stat, p_value = stats.ttest_ind(high_ratio_ranks, low_ratio_ranks, nan_policy='omit')

    # Check for NaN values
    if not math.isnan(t_stat) and not math.isnan(p_value):
        t_test_results.append((genre, t_stat, p_value))

# Print the non-NaN T-test results for each genre
for genre, t_stat, p_value in t_test_results:
    print(f"Genre: {genre}")
    print(f"  T-statistic: {t_stat}, P-value: {p_value}")
    print()

Genre: Children's/Family
  T-statistic: 0.31108397462319104, P-value: 0.7682882721761619

Genre: Anti-war
  T-statistic: -0.21715728148527474, P-value: 0.8482255070493716

Genre: Sports
  T-statistic: -0.8043495033589552, P-value: 0.438233666774269

Genre: Biographical film
  T-statistic: 0.007845715891245164, P-value: 0.9941157885405132

Genre: Comedy-drama
  T-statistic: -0.049311666992489976, P-value: 0.9651525635523521

Genre: Buddy film
  T-statistic: -0.24011451565207556, P-value: 0.818234732107234

Genre: Tragedy
  T-statistic: -1.0609603545071324, P-value: 0.36655548035759805

Genre: Cult
  T-statistic: -0.35592369661680395, P-value: 0.7311010183707939

Genre: Romantic drama
  T-statistic: -0.4556507171292711, P-value: 0.6646663524008103

Genre: Swashbuckler films
  T-statistic: -0.42372302484359975, P-value: 0.7448500902699474

Genre: LGBT
  T-statistic: 0.7519323092202886, P-value: 0.4678762021112801

Genre: Satire
  T-statistic: 2.0768769701781022, P-value: 0.285671677236012

# **ACTORS GENRES**

In [18]:
# Create a dictionary to store genre counts and average rank for each cluster
cluster_info = defaultdict(lambda: {'genre_counts': defaultdict(int), 'total_genre_count': 0, 'average_rank': 0})

for cluster_id, nodes in large_clusters.items():
    # For each actor in the cluster
    for actor in nodes:
        # Find the main genre of the actor
        main_genres = df[df['Actor_name'] == actor]['Actor_main_genre'].unique()
        # Count each genre
        for genre in main_genres:
            cluster_info[cluster_id]['genre_counts'][genre] += 1
            cluster_info[cluster_id]['total_genre_count'] += 1

    # Calculate average rank for the cluster
    cluster_rows = rating_stand[(rating_stand['Actor1'].isin(nodes)) | (rating_stand['Actor2'].isin(nodes))]
    average_rank = cluster_rows['rank'].mean() if not cluster_rows.empty else None
    cluster_info[cluster_id]['average_rank'] = average_rank

# Sort the clusters by average rank in ascending order
sorted_clusters = sorted(cluster_info.items(), key=lambda item: item[1]['average_rank'] if item[1]['average_rank'] is not None else float('inf'))

# Print the sorted clusters with their genre ratios and average rank
for cluster_id, info in sorted_clusters:
    print(f"Cluster {cluster_id}:")
    print(f"  Average Rank: {info['average_rank']}")
    if info['total_genre_count'] > 0:
        for genre, count in info['genre_counts'].items():
            ratio = count / info['total_genre_count']
            print(f"  {genre}: {ratio:.2f}")
    print()

Cluster 0:
  Average Rank: 386.95714285714286
  Fantasy Adventure: 0.20
  Thriller: 0.62
  Crime Fiction: 0.07
  Drama: 0.03
  Adventure: 0.07

Cluster 2:
  Average Rank: 437.3171206225681
  Thriller: 0.34
  Fantasy Adventure: 0.45
  Costume drama: 0.02
  Adventure: 0.04
  Science Fiction: 0.02
  Crime Fiction: 0.04
  Romantic comedy: 0.09

Cluster 9:
  Average Rank: 638.8095238095239
  Science Fiction: 0.30
  Thriller: 0.40
  Romantic comedy: 0.20
  Adventure: 0.10

Cluster 44:
  Average Rank: 813.3018867924528
  Thriller: 0.27
  Romantic comedy: 0.32
  Adventure: 0.23
  Crime Fiction: 0.05
  Action/Adventure: 0.05
  Holiday Film: 0.05
  Parody: 0.05

Cluster 34:
  Average Rank: 977.7045454545455
  Thriller: 0.78
  Drama: 0.04
  Romantic comedy: 0.04
  Cult: 0.02
  Crime Fiction: 0.07
  Ensemble Film: 0.02
  Adventure: 0.02

Cluster 10:
  Average Rank: 1046.7898550724638
  Romantic comedy: 0.25
  Thriller: 0.33
  Crime Fiction: 0.23
  Parody: 0.05
  Buddy film: 0.10
  LGBT: 0.03
  Roa

In [19]:
import dash
from dash import dcc, html, Input, Output, State
import plotly.graph_objs as go
import pandas as pd

# Assuming `cluster_info` is a dictionary with cluster ids as keys and dictionaries with 'average_rank' and 'genre_counts' as values

# Initialize the Dash app
app = dash.Dash(__name__)

# Sort clusters by average rank in ascending order and prepare data for plotting
sorted_cluster_ids = sorted(cluster_info, key=lambda x: cluster_info[x]['average_rank'])
cluster_ids = [f"Cluster {cluster_id}" for cluster_id in sorted_cluster_ids]
average_ranks = [round(cluster_info[cluster_id]['average_rank']) for cluster_id in sorted_cluster_ids]

# Create the bar chart
bar_chart = go.Figure(data=[
    go.Bar(x=cluster_ids, y=average_ranks, name='Average Rank')
])

# Set up the callback for clicking the bar, which will not have any output but will update the pie chart via clientside callback
bar_chart.update_layout(clickmode='event+select')

# App layout with the pie chart to the right of the bar chart
app.layout = html.Div([
    dcc.Graph(id='bar-chart', figure=bar_chart),
    dcc.Graph(id='pie-chart'),
], style={'display': 'flex'})

# Callback for updating pie-chart based on clicked bar
@app.callback(
    Output('pie-chart', 'figure'),
    [Input('bar-chart', 'clickData')],
    [State('bar-chart', 'figure')]
)
def display_click_data(clickData, figure):
    if clickData:
        # Extract the cluster id from the clicked point
        cluster_id = int(clickData['points'][0]['x'].split()[1])
        genres = list(cluster_info[cluster_id]['genre_counts'].keys())
        counts = list(cluster_info[cluster_id]['genre_counts'].values())
        # Create the pie chart based on the selected cluster
        pie_chart = go.Figure(data=[go.Pie(labels=genres, values=counts)])
        return pie_chart
    else:
        # If no bar is clicked yet, return an empty figure
        return go.Figure()
    
# Save the bar chart as an HTML file
bar_chart.write_html("../plots/plot_actors_q5.html")

# Run the Dash app
if __name__ == '__main__':
    app.run_server(debug=True)

In [20]:
import ipywidgets as widgets
import plotly.graph_objs as go
from IPython.display import display

# Assuming `cluster_info` is a dictionary with cluster ids as keys and dictionaries with 'average_rank' and 'genre_counts' as values

# Prepare data for plotting
sorted_cluster_ids = sorted(cluster_info, key=lambda x: cluster_info[x]['average_rank'])
cluster_ids = [f"Cluster {cluster_id}" for cluster_id in sorted_cluster_ids]
average_ranks = [round(cluster_info[cluster_id]['average_rank']) for cluster_id in sorted_cluster_ids]

# Create the bar chart
bar_chart = go.Figure(data=[
    go.Bar(x=cluster_ids, y=average_ranks, name='Average Rank')
])

# Initialize the pie chart
pie_chart = go.FigureWidget()

# Function to update pie chart based on selected bar
def update_pie_chart(change):
    cluster_id_str = change['new']
    if cluster_id_str:
        cluster_id = int(cluster_id_str.split()[1])
        genres = list(cluster_info[cluster_id]['genre_counts'].keys())
        counts = list(cluster_info[cluster_id]['genre_counts'].values())

        # Update the pie chart
        pie_chart.data = []
        pie_chart.add_trace(go.Pie(labels=genres, values=counts))

# Interactive widget for selecting bar
bar_select = widgets.Dropdown(
    options=cluster_ids,
    description='Select Cluster:',
    disabled=False,
)
bar_select.observe(update_pie_chart, names='value')

# Display widgets and initial charts
display(bar_select)
display(bar_chart)
display(pie_chart)

Dropdown(description='Select Cluster:', options=('Cluster 0', 'Cluster 2', 'Cluster 9', 'Cluster 44', 'Cluster…

FigureWidget({
    'data': [], 'layout': {'template': '...'}
})

In [24]:
import scipy.stats as stats
from statistics import median
import math

# Initialize a list to store the results
t_test_results = []

# Get a list of all unique genres across all clusters
all_genres = set()
for info in cluster_info.values():
    all_genres.update(info['genre_counts'].keys())

# Perform T-tests for each genre
for genre in all_genres:
    genre_ratios = []
    average_ranks = []

    # Extract genre ratio and average rank for each cluster
    for cluster_id, info in cluster_info.items():
        if info['total_genre_count'] > 0 and genre in info['genre_counts']:
            genre_ratio = info['genre_counts'][genre] / info['total_genre_count']
            genre_ratios.append(genre_ratio)
            average_ranks.append(info['average_rank'])

    # Split data based on median genre ratio
    median_ratio = median(genre_ratios)
    high_ratio_ranks = [rank for ratio, rank in zip(genre_ratios, average_ranks) if ratio > median_ratio]
    low_ratio_ranks = [rank for ratio, rank in zip(genre_ratios, average_ranks) if ratio <= median_ratio]

    # Perform a T-test
    t_stat, p_value = stats.ttest_ind(high_ratio_ranks, low_ratio_ranks, nan_policy='omit')

    # Check for NaN values
    if not math.isnan(t_stat) and not math.isnan(p_value):
        t_test_results.append((genre, t_stat, p_value))

# Print the non-NaN T-test results for each genre
for genre, t_stat, p_value in t_test_results:
    print(f"Genre: {genre}")
    print(f"  T-statistic: {t_stat}, P-value: {p_value}")
    print()

Genre: Romantic comedy
  T-statistic: 0.2217531564920889, P-value: 0.8279515046675615

Genre: Drama
  T-statistic: 1.1795004100519173, P-value: 0.3035602223696104

Genre: Crime Fiction
  T-statistic: 2.9085291356846388, P-value: 0.01220641492402976

Genre: Family Film
  T-statistic: 5.997377411521753, P-value: 0.10518205672064193

Genre: Sports
  T-statistic: 0.6759788580827808, P-value: 0.6215793115870516

Genre: Thriller
  T-statistic: -0.02184559668160145, P-value: 0.9828114700251053

Genre: Science Fiction
  T-statistic: 0.3512600443192874, P-value: 0.7486145630794654

Genre: Parody
  T-statistic: 0.9524259347402088, P-value: 0.372601818224183

Genre: Fantasy Adventure
  T-statistic: -0.5276994613705112, P-value: 0.6908823703478878

Genre: Action/Adventure
  T-statistic: 1.6727219592917053, P-value: 0.23635116609420023

Genre: Adventure
  T-statistic: -0.13261040932918045, P-value: 0.8974190393757402

Genre: LGBT
  T-statistic: 1.1129940881323561, P-value: 0.3815511974868291



# **ANSWER TO THE QUESTION**

**5. What role does genre play in shaping collaborative dynamics, and how do successful collaborations differ across genres?**

In our first analysis, we meticulously calculated the ratios of movie genres for each cluster, basing our calculations on the films in which actors within the cluster participated. This approach provided a collective genre profile for each cluster. Conversely, our second analysis adopted a different perspective, focusing on the main genres associated with the actors themselves. This method offered an individualistic genre representation of each cluster. Upon initial inspection of both datasets, there wasn't an immediately discernible correlation between specific genres and the clusters' ranks.

The analysis took a compelling turn when we applied T-tests to each genre across the clusters. A significant finding emerged from this statistical examination, particularly with the crime-fiction genre. When analyzed in the context of the actors' main genres, crime-fiction yielded a p-value below 0.05, indicating a statistically significant association. Delving deeper into this discovery, especially concentrating on the crime-fiction genre, revealed a notable pattern: an increased prevalence of this genre corresponded with lower cluster rankings, suggesting a potential inverse relationship.

Extending this statistical scrutiny to the movie genre ratios, we observed a parallel trend for crime-fiction, which again manifested a p-value under 0.05. This consistency in results underscores the genre's impact on cluster rankings. However, the study uncovered an intriguing contrast with genres such as "fantasy" and "fantasy adventure," where the p-values were less than 0.05. This intriguing outcome implies that unlike crime-fiction, which appears to negatively influence a cluster's average rank, the presence of fantasy or fantasy adventure genres could contribute positively to enhancing a cluster's rank.

The significant correlation between the prevalence of crime-fiction and lower cluster rankings suggests that clusters heavily skewed towards this genre might face certain challenges. This could be attributed to the demanding nature of crime-fiction narratives, which often require intense, dramatic performances and may not always appeal to a broad audience. Such a genre-specific focus could limit the versatility and appeal of the cluster's collective filmography.

On the other hand, the positive correlation observed with fantasy and fantasy adventure genres hints at a different dynamic. These genres, often characterized by imaginative storytelling and visual spectacle, might offer more opportunities for creative expression and broad audience appeal. This can lead to a more diverse and engaging portfolio of work, potentially contributing to higher cluster rankings.

These patterns underscore the idea that the collective genre profile of a cluster is not just a reflection of individual actor preferences, but a strategic element that can influence the cluster's overall success. The study suggests that a balanced and diverse genre portfolio within a cluster might be more conducive to achieving higher rankings, possibly due to wider audience appeal and greater opportunities for showcasing a range of acting skills and storylines. This insight could be valuable for actors and industry professionals in forming collaborative groups and choosing projects, emphasizing the importance of genre diversity as a strategic consideration in their career trajectories.