In [None]:
import collections
import math
import os
from itertools import count

import community
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import seaborn as sns

# Analysis of the attributes

In [None]:
# Load the aggregated actor adjacency matrix and the per-actor dataframe
actors_agg_adj = np.load("sparse_agg_actor_adj.npy")
# NOTE(review): unpickling can execute arbitrary code; only load pickles from a trusted source
actors_agg_df = pd.read_pickle("actors_agg_df.pkl")
# from_numpy_array is the supported constructor (from_numpy_matrix was removed in
# networkx 3.0); nodes are numbered by row position in the adjacency matrix
actors_graph = nx.from_numpy_array(actors_agg_adj)

In [None]:
# Attach each actor's name and gender to the graph nodes.
# Node ids are taken to be row positions in actors_agg_df
# (assumes the dataframe rows are in the same order as the adjacency matrix — TODO confirm).
attributes = actors_agg_df[['actors', 'gender']].values
name_dict = {node_id: row[0] for node_id, row in enumerate(attributes)}
gender_dict = {node_id: row[1] for node_id, row in enumerate(attributes)}

nx.set_node_attributes(actors_graph, values=name_dict, name='name')
nx.set_node_attributes(actors_graph, values=gender_dict, name='gender')

In [None]:
# Score every node by eigenvector centrality and store the score on the node
# under the 'eigenvector' attribute
eigenvector_dict = nx.eigenvector_centrality(actors_graph)
nx.set_node_attributes(actors_graph, values=eigenvector_dict, name='eigenvector')

In [None]:
# Create communities using the community detection library (Louvain).
# The algorithm is randomised, so the seed is fixed to keep community ids
# stable between runs of the notebook.
LOUVAIN_SEED = 42
communities_dict = community.best_partition(actors_graph, random_state=LOUVAIN_SEED)
# Add community as attribute
nx.set_node_attributes(actors_graph, communities_dict, 'community')

In [None]:
# Write the annotated graph in GEXF format
DATA_PATH = "data"
# Make sure the output directory exists before writing into it
os.makedirs(DATA_PATH, exist_ok=True)
nx.write_gexf(actors_graph, f"{DATA_PATH}/louvain_graph.gexf")

In [None]:
# Extract communities to a list of lists: communities[i] holds the attribute
# dict of every node assigned to community i, in node iteration order.
n_communities = max(communities_dict.values()) + 1
communities = [[] for _ in range(n_communities)]
# Single pass over the nodes instead of re-scanning the graph once per community.
# `Graph.nodes[n]` is the supported accessor (`Graph.node` was removed in networkx 2.4).
for node in actors_graph.nodes():
    node_attrs = actors_graph.nodes[node]
    communities[node_attrs['community']].append(node_attrs)

In [None]:
# Extract communities to a list of lists: communities[i] holds the attribute
# dict of every node assigned to community i, in node iteration order.
# NOTE(review): this cell repeats the extraction done in the cell right before it;
# one of the two can be deleted.
n_communities = max(communities_dict.values()) + 1
communities = [[] for _ in range(n_communities)]
# Single pass over the nodes instead of re-scanning the graph once per community.
# `Graph.nodes[n]` is the supported accessor (`Graph.node` was removed in networkx 2.4).
for node in actors_graph.nodes():
    node_attrs = actors_graph.nodes[node]
    communities[node_attrs['community']].append(node_attrs)

In [None]:
# Build one dataframe per community from its list of node-attribute dicts
# (the result is a list of dataframes, one entry per community id)
communities_df = list(map(pd.DataFrame, communities))

In [None]:
# Find representatives of each class by eigenvector centrality
# The loop variable is deliberately not called `community`: that name is the
# imported community-detection module, and rebinding it would break any later
# re-run of the partitioning cell.
for community_id, community_df in enumerate(communities_df):
    print(f'Representatives of community {community_id} with a size of {len(community_df.index)} actors\n{community_df.nlargest(10, "eigenvector")}\n')

We can observe that community 2 contains notable A-list actors, while the smallest one (community 4) contains celebrities who may have appeared in some movies but are not really actors. (The community numbers refer to the partition obtained in the run shown here.)

# Visualization

In [None]:
# Visualize with colors per community
LAYOUT_SEED = 42   # fixed so the force-directed layout (and saved figure) is reproducible
PLOTS_DIR = "plots"

# Map every community id to a consecutive integer used as its colour index
groups = set(nx.get_node_attributes(actors_graph, 'community').values())
mapping = dict(zip(sorted(groups), count()))
nodes = actors_graph.nodes()
# `Graph.nodes[n]` is the supported accessor (`Graph.node` was removed in networkx 2.4)
colors = [mapping[actors_graph.nodes[n]['community']] for n in nodes]

# Drawing nodes and edges separately to add colors
plt.figure(figsize=(28, 7))
pos = nx.spring_layout(actors_graph, seed=LAYOUT_SEED)
nx.draw_networkx_edges(actors_graph, pos, alpha=0.1, width=0.7)
# `with_labels` is not a parameter of draw_networkx_nodes (it only draws the
# node markers), so it is not passed; nodelist is a plain list in the same
# order as `colors`.
nc = nx.draw_networkx_nodes(actors_graph, pos, nodelist=list(nodes), node_color=colors,
                            node_size=10, cmap=plt.cm.jet)
plt.colorbar(nc)
plt.axis('off')
plt.title("Louvain Communities")
# Make sure the output directory exists before saving the figure
os.makedirs(PLOTS_DIR, exist_ok=True)
plt.savefig(f"{PLOTS_DIR}/louvain.pdf")
plt.show()

We can observe that the members of community 2 (the famous A-list actors) tend to work together more often, and also with a selection of actors from other communities. Some mid-tier actors are popular and mix with many other communities, while others are stuck in their own community. The celebrities who are not really actors (community 4) are clearly separated from the other actor communities.

# Explain communities with logistic regression on signals

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
# Keep only the actor name and expose the dataframe index as a regular column
# (presumably the index is named `actor_id`, since a later cell indexes on that column — verify)
actors_id_df = actors_agg_df.loc[:, ["actors"]].reset_index()
actors_id_df.head()

In [None]:
# Stack the node attributes of all communities into one frame and keep only the
# columns needed to label actors (name and community).
# The frame is rebuilt from `communities` rather than from the previous value of
# `communities_df`, so re-running this cell always gives the same result.
communities_df = pd.concat(
    [pd.DataFrame(members) for members in communities]
).drop(columns=["gender", "eigenvector"])
print(len(communities_df))
communities_df.head()

In [None]:
# Attach the actor id to every community label by matching on the actor name,
# then index the labels by that id.
# NOTE(review): the match is on names; if two actors share a name the merge
# produces extra rows — confirm names are unique.
labels_with_ids = communities_df.merge(actors_id_df, left_on="name", right_on="actors")
community_labels_df = labels_with_ids.drop(columns=["name"]).set_index("actor_id")
# Sanity check: list and count the distinct actor ids that received a label
unique_actor_ids = np.unique(community_labels_df.index)
print(unique_actor_ids)
print(len(unique_actor_ids))
community_labels_df.head()


In [None]:
# Drop the redundant actor-name column, leaving only the community label.
# errors="ignore" keeps the cell re-runnable: on a second execution the column
# is already gone and a plain drop would raise KeyError.
community_labels_df = community_labels_df.drop(columns=["actors"], errors="ignore")
community_labels_df.head()

In [None]:
# Add the community label to the per-actor signals (index-aligned join) and keep
# only the signal columns plus the label
dataset_columns = ["budget", "popularity", "revenue", "vote_average", "vote_count", "community"]
actors_with_labels = actors_agg_df.join(community_labels_df)
actors_dataset_df = actors_with_labels[dataset_columns]
actors_dataset_df.head()

### Save dataset with communities

In [None]:
# Persist the labelled dataset next to the notebook
dataset_pickle_path = "actors_dataset_df.pkl"
actors_dataset_df.to_pickle(dataset_pickle_path)

In [None]:
# Build the feature matrix and label vector, then hold out 20% for testing.
# Columns are selected by name: a positional slice such as [:, 1:-1] silently
# drops the first signal ("budget") and breaks if the column order changes.
FEATURE_COLUMNS = ["budget", "popularity", "revenue", "vote_average", "vote_count"]
LABEL_COLUMN = "community"
SPLIT_SEED = 42   # fixed so the split (and the reported accuracy) is reproducible

actors_dataset = actors_dataset_df.values
X = actors_dataset_df[FEATURE_COLUMNS].values
y = actors_dataset_df[LABEL_COLUMN].values
# NOTE(review): actors without a community label would carry NaN in y — confirm none exist
train_features, test_features, train_labels, test_labels = train_test_split(
    X, y, test_size=0.2, random_state=SPLIT_SEED
)

In [None]:
# Fit a logistic regression model on standardised signals
# NOTE(review): recent scikit-learn releases deprecate the `multi_class` argument
# and multiclass use of the liblinear solver — confirm against the pinned version.
max_iter = 10000000

# Learn the scaling on the training split only, then apply it to both splits
std_scaler = StandardScaler()
std_scaler.fit(train_features)
scaled_train_features = std_scaler.transform(train_features)
scaled_test_features = std_scaler.transform(test_features)

# Large C means very weak regularisation
model = LogisticRegression(solver="liblinear", multi_class="auto", max_iter=max_iter, C=10000)
model.fit(scaled_train_features, train_labels)

predict_train = model.predict(scaled_train_features)
predict_test = model.predict(scaled_test_features)

In [None]:
# Share of held-out actors whose community is predicted correctly
accuracy = accuracy_score(y_true=test_labels, y_pred=predict_test)
print("Accuracy:", accuracy)

# Boxplots of signals on communities

In [None]:
# One stacked panel per signal, each showing that signal's distribution per community
signals = ["budget", "popularity", "vote_average", "vote_count"]
fig, ax = plt.subplots(nrows=len(signals), ncols=1, figsize=(10, 15), sharex=True)
actors_dataset_df.boxplot(column=signals, by="community", ax=ax)