In [7]:
from itertools import combinations

import matplotlib.pyplot as plt
import networkx as nx
import pandas as pd
from pyvis.network import Network
%matplotlib inline

In [8]:
# Read the three input tables from the cleaned-data folder
DATA_PATH = 'clean_data/'
characters_df, movies_df, europe_movies_df = (
    pd.read_csv(DATA_PATH + file_name)
    for file_name in ('characters.csv', 'movies.csv', 'europe_movies.csv')
)

In [9]:
# Keep only the movies that are listed in europe_movies_df (inner join on the
# movie id), then drop the join key that the merge brought in from that table.
movies_df = pd.merge(movies_df, europe_movies_df, left_on='id', right_on='movie_id')
movies_df = movies_df.drop(columns=['movie_id'])

# Attach every character row to its movie. Both frames carry `id` and `name`
# columns, so the merge suffixes them with _x (characters) and _y (movies).
df = pd.merge(characters_df, movies_df, left_on='movie_id', right_on='id')
# id_y is the movies-side join key and duplicates the characters' movie_id.
df = df.drop(columns=['id_y'])
# id_y no longer exists at this point, so only the remaining suffixed columns
# are renamed.
df = df.rename(columns={'name_x': 'character_name', 'name_y': 'movie_name', 'id_x': 'character_id'})
display(df.head())
print(df.shape)

Unnamed: 0,movie_wiki_id,movie_id,movie_release_date,character_name,actor_birth_date,actor_gender,actor_height,actor_ethinicity_id,actor_name,actor_age,actor_map_id,character_id,actor_id,wiki_id,movie_name,release_date,revenue,runtime,summary
0,261236,/m/01mrr1,1983,,1950-06-20,F,,,Gudrun Landgrebe,32.0,/m/02vb3cv,,/m/09d6hv,261236,A Woman in Flames,1983,,106.0,"Eva, an upper class housewife, becomes frustra..."
1,261236,/m/01mrr1,1983,,1950-08-02,M,,,Mathieu Carrière,32.0,/m/02tbd9f,,/m/06prxs,261236,A Woman in Flames,1983,,106.0,"Eva, an upper class housewife, becomes frustra..."
2,261236,/m/01mrr1,1983,,1947-06-18,M,,,Hanns Zischler,35.0,/m/02vdfng,,/m/09k3x_,261236,A Woman in Flames,1983,,106.0,"Eva, an upper class housewife, becomes frustra..."
3,2238856,/m/06yc6v,2005-01,Michael,,M,,,Hector Elias,,/m/0bf1hh6,/m/0g98dth,/m/05pxr49,2238856,Me and You and Everyone We Know,2005-01,8012838.0,91.0,The structure of the film consists of several ...
4,2238856,/m/06yc6v,2005-01,Richard Swersey,1959-09-11,M,1.78,,John Hawkes,,/m/0k0jnm,/m/0g98dqk,/m/0785v8,2238856,Me and You and Everyone We Know,2005-01,8012838.0,91.0,The structure of the film consists of several ...


(66995, 19)


In [10]:
# FIXME - Development shortcut: only the first MAX_ROWS rows are kept.
# Set MAX_ROWS to None to run the rest of the notebook on the full dataset.
MAX_ROWS = 100
df = df.iloc[:MAX_ROWS]
print(df.shape)

(100, 19)


In [11]:
# Get the list of actors (rows without an actor name are ignored)
actors = df['actor_name'].dropna().unique()
print(f"There are {len(actors)} actors in our dataset")

# Get the list of connections between actors
# An actor is connected to another if they played in the same movie.
# Each connection is stored once, as an alphabetically ordered (a, b) tuple.
connections = set()
# A single groupby pass replaces re-filtering the whole frame for every movie;
# rows with a missing movie id or actor name are excluded up front.
cast_df = df.dropna(subset=['movie_id', 'actor_name'])
for _, movie_cast in cast_df.groupby('movie_id')['actor_name']:
    # combinations() over the sorted unique names yields every unordered pair
    # exactly once, already in sorted order, and never pairs a name with itself.
    connections.update(combinations(sorted(movie_cast.unique()), 2))
print(f"There are {len(connections)} connections between actors")

There are 98 actors in our dataset
There are 500 connections between actors


In [12]:
# Build the undirected co-appearance graph: every actor is added as a node
# first, then each connection pair is added as an edge
graph = nx.Graph()
graph.add_nodes_from(actors)
graph.add_edges_from(connections)

# Render the graph as an interactive pyvis page written to an HTML file
network = Network(height='1000px', width='100%', notebook=True)
network.repulsion()
network.from_nx(graph)
network.show('actors_connections.html')

actors_connections.html
