In [3]:
import pandas as pd
import holoviews as hv
import numpy as np
import itertools
from holoviews import opts, dim

# Render HoloViews objects with the Bokeh plotting backend.
hv.extension('bokeh')
# Scale rendered output; `size` is a percentage, so 200 doubles the default size.
hv.output(size=200)

In [4]:
# Load the raw dataset. The path is relative to the notebook's working
# directory. The cells below rely on the 'title', 'country' and 'cast' columns.
dataframe = pd.read_csv('data.csv')


From our dataset we will extract the productions from the United States that have cast information.

In [5]:
# Keep only the columns needed for the cast analysis, restricted to rows that
# have cast information and whose country is exactly 'United States'
# (co-productions listing several countries do not match this equality test).
#
# The copy is taken *after* filtering so that `by_cast` is an independent
# frame: copying first and filtering afterwards leaves `by_cast` as a filtered
# slice, and assigning to one of its columns later can raise pandas'
# SettingWithCopyWarning.
by_cast = (
    dataframe.loc[
        dataframe['cast'].notnull() & (dataframe['country'] == 'United States'),
        ['title', 'country', 'cast'],
    ]
    .copy()
)

The cast column is made up of actor names separated by commas. We need to separate them and also remove whitespace.

In [6]:
def _split_cast(raw_cast):
    """Return the actor names contained in ``raw_cast`` as a list.

    ``raw_cast`` is normally the comma-separated string read from the dataset;
    it is split on "," and surrounding whitespace is removed from every name.
    A value that is already a list (the cell was run before) is passed through
    with its names stripped again, so re-running the cell does not turn the
    list's text representation into bogus actor names.
    """
    names = raw_cast if isinstance(raw_cast, list) else str(raw_cast).split(",")
    return [name.strip() for name in names]


by_cast['cast'] = by_cast['cast'].apply(_split_cast)

Let's split the cast arrays into separate rows by using explode.

In [7]:
# One row per (production, actor): explode repeats the index, title and
# country of a production for every name in its cast list.
all_actors = by_cast.explode("cast")
# Display the exploded frame (pandas truncates long frames when rendering).
all_actors

Unnamed: 0,title,country,cast
3,9,United States,Elijah Wood
3,9,United States,John C. Reilly
3,9,United States,Jennifer Connelly
3,9,United States,Christopher Plummer
3,9,United States,Crispin Glover
...,...,...,...
7781,Zoom,United States,Ryan Newman
7781,Zoom,United States,Michael Cassidy
7781,Zoom,United States,Spencer Breslin
7781,Zoom,United States,Rip Torn


Now we can count the number of productions each actor starred in

In [8]:
# Count the rows of the exploded frame per actor name, i.e. the number of
# cast-list entries each actor has. `as_index=False` keeps 'cast' as a regular
# column next to the new 'number_prods' column.
cast_counts = all_actors.groupby(by='cast', as_index=False).agg(number_prods=('cast', 'count'))
# Display the per-actor counts.
cast_counts

Unnamed: 0,cast,number_prods
0,2 Chainz,1
1,50 Cent,4
2,A Boogie Wit tha Hoodie,1
3,A.D. Miles,3
4,A.J. LoCascio,3
...,...,...
10664,Zydrunas Savickas,1
10665,k.d. lang,1
10666,vivienne Rutherford,1
10667,Álvaro Rodríguez,1


Using this, we can extract the actors who are part of more than 5 productions (in order to not have too much data).
We will also extract these actor names into an array, together with maps between each name and its index in that array.

In [9]:
# Actors credited in strictly more than 5 productions.
is_popular = cast_counts['number_prods'] > 5
popular_actors = cast_counts.loc[is_popular]

# Names of the popular actors; an actor's position in this array is the
# row/column index used for them in the co-occurrence matrix.
popular_actors_list = popular_actors['cast'].values

# name -> position and position -> name lookups over that array.
popular_map = {name: position for position, name in enumerate(popular_actors_list)}
popular_map_inv = dict(enumerate(popular_actors_list))



Now we need to compute a matrix of occurrences that, for each actor, says in how many productions he or she has played with each of the other actors. Its size will be `num_of_actors * num_of_actors`.

In [10]:
# Square co-occurrence matrix: matrix[a, b] is the number of productions in
# which the popular actors with indices a and b (see popular_map) both appear.
matrix = np.zeros((len(popular_actors_list), len(popular_actors_list)))

for current_group in by_cast['cast']:
    # Keep only the popular actors of this production, as matrix indices.
    # Membership is tested against the dict (hash lookup) rather than the
    # numpy array of names (linear scan per name). Collecting into a set
    # means a name repeated in one cast list is counted once per production
    # and never paired with itself.
    indices = {popular_map[name] for name in current_group if name in popular_map}

    # Every ordered pair of distinct actors, so the matrix stays symmetric.
    # Example: {x, y, z} => (x, y), (x, z), (y, x), (y, z), (z, x), (z, y)
    for first, second in itertools.permutations(indices, 2):
        matrix[first, second] += 1
matrix

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

We can also remove the actors who have fewer than 15 collaborations in total.

In [11]:
# Total number of collaborations per actor = sum of that actor's column.
collaborations_per_actor = matrix.sum(axis=0)

# Positions of the actors with fewer than 15 collaborations overall.
to_drop = np.flatnonzero(collaborations_per_actor < 15).tolist()

# Remove those actors' rows and columns, and the matching names, so matrix
# indices and popular_actors_list positions stay aligned.
# NOTE: popular_map / popular_map_inv were built before this filtering, so
# their indices no longer correspond to positions in the reduced array.
# NOTE: `matrix` and `popular_actors_list` are rebound here, so running this
# cell a second time filters the already-reduced data again.
matrix = np.delete(np.delete(matrix, to_drop, axis=0), to_drop, axis=1)
popular_actors_list = np.delete(popular_actors_list, to_drop, axis=0)

The Chord diagram implementation of holoviews library requires the input to be like: `[edge_1, edge_2, value]` so we will convert our matrix into this representation.

In [12]:
# Flatten the square matrix into [source, target, value] records, one per
# ordered pair of actor indices, in row-major order.
actor_count = len(matrix)
links = [
    [source, target, int(matrix[source, target])]
    for source in range(actor_count)
    for target in range(actor_count)
]

# Keep only the pairs that actually share at least one production.
links_dataframe = pd.DataFrame(
    links, columns=["source", "target", "value"]
).loc[lambda frame: frame['value'] > 0]

`.select(value=(x, None))` keeps only the connections whose weight is at least `x`; `None` leaves the upper bound open.

In [13]:
# Node table: one row per remaining actor; the frame's positional index is the
# node id referenced by the 'source' / 'target' columns of links_dataframe.
node_frame = pd.DataFrame(popular_actors_list, columns=["name"])
nodes = hv.Dataset(node_frame, 'index')

# Build the chord diagram and keep only the links with a value of 5 or more
# (the upper bound of the range is left open).
all_links_chord = hv.Chord((links_dataframe, nodes))
chord = all_links_chord.select(value=(5, None))

# Colour edges by their source node and nodes by their own index, and label
# each node with the actor's name.
chord_style = opts.Chord(
    labels='name',
    cmap='Category20c',
    node_color=dim('index').str(),
    edge_cmap='Category20c',
    edge_color=dim('source').str(),
)
chord.opts(chord_style)