In [1]:
# Library imports
import numpy as np
import pandas as pd
import json

In [2]:
# Load the movie table from disk
df = pd.read_csv('../data/primary_dataset.csv')

# Relabel the columns positionally with short lowercase names
# (assumes the CSV has exactly these twelve columns in this order - TODO confirm)
df.columns = [
    'rank', 'title', 'genres', 'description', 'director', 'actors',
    'year', 'runtime', 'rating', 'votes', 'revenue', 'metascore',
]

# Expand the comma-separated cast string into one row per (movie, actor),
# renumber the rows, then trim the whitespace left around each name
df = (
    df.assign(actors=df["actors"].str.split(","))
      .explode("actors")
      .reset_index(drop=True)
)
df["actors"] = df["actors"].str.strip()

# Number of rows for every (director, actor) combination
result = df.groupby(['director', 'actors']).size().reset_index(name='count')
# Keep only combinations that occur more than once; the bare reset_index()
# keeps the pre-filter row positions as an extra 'index' column
result = result.loc[result['count'] > 1].reset_index()

# Distinct people on each side of the remaining pairs
directors = result['director'].drop_duplicates()
actors = result['actors'].drop_duplicates()

# Build the node list: directors are tagged group 1, actors group 2.
# NOTE(review): a person present in both `directors` and `actors` gets two
# nodes with the same 'id' (one per group) - confirm the consumer of
# result.json tolerates duplicate ids, or decide which group should win.
nodes = [{'id': person, 'group': 1} for person in directors]
nodes += [{'id': person, 'group': 2} for person in actors]

# Flat list of node ids, in the same order as `nodes`
names = [node['id'] for node in nodes]

# One link per repeated (director, actor) pair, weighted by the pair count.
# The columns are walked directly instead of using iterrows(), which builds a
# Series per row and does not preserve dtypes; int() guarantees a built-in
# integer so json.dump never receives a NumPy scalar.
links = [
    {"source": source, "target": target, "value": int(weight)}
    for source, target, weight in zip(
        result['director'], result['actors'], result['count']
    )
]

json_file = {
    'nodes': nodes,
    'links': links
}

# Write the graph next to the notebook (path is relative to the working directory)
with open('result.json', 'w') as fp:
    json.dump(json_file, fp)

In [3]:
# Show the director/actor pairs that occur more than once, with their counts
# (long frames are abbreviated with a "..." row in the rendered table)
result

Unnamed: 0,index,director,actors,count
0,17,Adam McKay,John C. Reilly,2
1,24,Adam McKay,Will Ferrell,3
2,245,Anthony Russo,Chris Evans,2
3,249,Anthony Russo,Scarlett Johansson,2
4,257,Antoine Fuqua,Denzel Washington,2
...,...,...,...,...
154,3701,Wes Ball,Kaya Scodelario,2
155,3702,Wes Ball,Thomas Brodie-Sangster,2
156,3742,Woody Allen,Jesse Eisenberg,2
157,3776,Zack Snyder,Amy Adams,2
