In this notebook we generate the datasets that are used directly in the visualization

In [1]:
!head -1 data/tmdb_100k.csv

id,budget,genres,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_year,revenue,vote_average,vote_count,cast,crew


In [2]:
import os
import json
import numpy as np
import pandas as pd
from collections import Counter

In [6]:
def save_json(fname, data):
    with open(os.path.join('data/', fname), mode='w') as fp:
        json.dump(data, fp, indent=2)

In [7]:
def my_eval(x):
    try:
        return eval(x)
    except TypeError:
        return np.nan

dtype = {
    'id': int,
    'budget': float,
    'genres': str,
    'keywords': str,
    'original_language': str,
    'original_title': str,
    'overview': str,
    'popularity': float,
    'production_companies': str,
    'production_countries': str,
    'release_year': int,
    'revenue': float,
    'vote_average': float,
    'vote_count': float,
    'cast': str,
    'crew': str
}

new_names = {
    'id': 'id',
    'budget': 'budget',
    'genres': 'genres',
    'keywords': 'keywords',
    'original_language': 'originalLanguage',
    'original_title': 'originalTitle',
    'overview': 'overview',
    'popularity': 'popularity',
    'production_companies': 'productionCompanies',
    'production_countries': 'productionCountries',
    'release_year': 'releaseYear',
    'revenue': 'revenue',
    'vote_average': 'voteAverage',
    'vote_count': 'voteCount',
    'cast': 'cast',
    'crew': 'crew'
}

In [8]:
df = pd.read_csv('data/tmdb_100k.csv', engine='python', dtype=dtype)

df = df[df.original_language == 'en']

df.genres = df.genres.apply(my_eval)
df.keywords = df.keywords.apply(my_eval)
df.production_companies = df.production_companies.apply(my_eval)
df.production_countries = df.production_countries.apply(my_eval)
df.cast = df.cast.apply(my_eval)
df.crew = df.crew.apply(my_eval)

df.rename(columns=new_names, inplace=True)

In [16]:
def known_gender_lead(cast):
    return cast[0][2] != 'Not specified'

def has_director(crew):
    directors = list(filter(lambda x: x[2] != 'Not specified' and x[3] == 'Director', crew))
    if not directors:
        return False
    return True

def gender_lead(cast):
    return cast[0][2]

def gender_director(crew):
    directors = filter(lambda x: x[2] != 'Not specified' and x[3] == 'Director', crew)
    cnt = Counter(map(lambda x: x[2], directors)).most_common()
    return cnt[0][0]

def get_main_actor_name(cast):
    actors = list(filter(lambda x: x[2] != 'Not specified', cast))
    return actors[0][1]

def get_director_name(crew):
    directors = filter(lambda x: x[2] != 'Not specified' and x[3] == 'Director', crew)
    cnt = Counter(directors).most_common()
    return cnt[0][0][:3][1]

In [10]:
df.dropna(subset=['budget', 'revenue', 'popularity', 'genres', 'releaseYear'], inplace=True)

df = df[(df.budget >= 0) & (df.revenue >= 0) & (df.popularity >= 0)]

df.dropna(subset=['cast', 'crew'], inplace=True)

df = df[df.cast.apply(known_gender_lead)]
df = df[df.crew.apply(has_director)]

df.reset_index(drop=True, inplace=True)

df['genderLead'] = df.cast.apply(gender_lead)
df['genderDirector'] = df.crew.apply(gender_director)

In [17]:
df['nameLead'] = df.cast.apply(get_main_actor_name)
df['nameDirector'] = df.crew.apply(get_director_name)

In [18]:
df.head()

Unnamed: 0,id,budget,genres,keywords,originalLanguage,originalTitle,overview,popularity,productionCompanies,productionCountries,releaseYear,revenue,voteAverage,voteCount,cast,crew,genderLead,genderDirector,nameLead,nameDirector
0,5,4000000.0,"[Crime, Comedy]","[hotel, new year's eve, witch, bet, hotel room...",en,Four Rooms,It's Ted the Bellhop's first night on the job....,14.252,"[Miramax, A Band Apart]",[United States of America],1995,4257354.0,5.9,1533.0,"[(3129, Tim Roth, Male), (3130, Jennifer Beals...","[(3110, Allison Anders, Female, Director), (31...",Male,Male,Tim Roth,Allison Anders
1,6,21000000.0,"[Action, Thriller, Crime]","[chicago, usa, drug dealer, boxing match, esca...",en,Judgment Night,"While racing to a boxing match, Frank, Mike, J...",9.373,"[Universal Pictures, Largo Entertainment, JVC]","[Japan, United States of America]",1993,12136938.0,6.5,143.0,"[(2880, Emilio Estevez, Male), (9777, Cuba Goo...","[(2042, Stephen Hopkins, Male, Director), (520...",Male,Male,Emilio Estevez,Stephen Hopkins
2,11,11000000.0,"[Adventure, Action, Science Fiction]","[android, galaxy, hermit, death star, lightsab...",en,Star Wars,Princess Leia is captured and held hostage by ...,59.177,"[Lucasfilm, 20th Century Fox]",[United States of America],1977,775398007.0,8.2,13374.0,"[(2, Mark Hamill, Male), (3, Harrison Ford, Ma...","[(1, George Lucas, Male, Director), (1, George...",Male,Male,Mark Hamill,George Lucas
3,12,94000000.0,"[Animation, Family]","[parent child relationship, harbor, anthropomo...",en,Finding Nemo,"Nemo, an adventurous young clownfish, is unexp...",33.802,[Pixar],[United States of America],2003,940335536.0,7.8,13300.0,"[(13, Albert Brooks, Male), (14, Ellen DeGener...","[(7, Andrew Stanton, Male, Screenplay), (7, An...",Male,Male,Albert Brooks,Andrew Stanton
4,13,55000000.0,"[Comedy, Drama, Romance]","[vietnam veteran, hippie, washington d.c., men...",en,Forrest Gump,A man with a low IQ has accomplished great thi...,33.955,[Paramount],[United States of America],1994,677387716.0,8.4,17591.0,"[(31, Tom Hanks, Male), (32, Robin Wright, Fem...","[(37, Alan Silvestri, Male, Original Music Com...",Male,Male,Tom Hanks,Robert Zemeckis


# Timeline

In [19]:
df_timeline = df[['releaseYear',
                 'id',
                 'budget',
                 'revenue',
                 'popularity',
                  'originalTitle',
                 'genderLead',
                 'genderDirector',
                 'nameLead',
                 'nameDirector']]

df_timeline.set_index('releaseYear', inplace=True)

df_timeline.head()

Unnamed: 0_level_0,id,budget,revenue,popularity,originalTitle,genderLead,genderDirector,nameLead,nameDirector
releaseYear,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1995,5,4000000.0,4257354.0,14.252,Four Rooms,Male,Male,Tim Roth,Allison Anders
1993,6,21000000.0,12136938.0,9.373,Judgment Night,Male,Male,Emilio Estevez,Stephen Hopkins
1977,11,11000000.0,775398007.0,59.177,Star Wars,Male,Male,Mark Hamill,George Lucas
2003,12,94000000.0,940335536.0,33.802,Finding Nemo,Male,Male,Albert Brooks,Andrew Stanton
1994,13,55000000.0,677387716.0,33.955,Forrest Gump,Male,Male,Tom Hanks,Robert Zemeckis


In [20]:
data = {}

for i, row in df_timeline.iterrows():
    if i not in data:
        data[i] = []
    data[i].append(row.to_dict())

# save the dataset used in the scatter plot
save_json('gender-representation.json', data)

In [21]:
del df_timeline

# Bar Chart

In [18]:
df_bar = df[['releaseYear', 'genres', 'genderLead', 'genderDirector', 'popularity', 'id']]

In [19]:
genres = df_bar.explode('genres')['genres'].unique()

In [20]:
years = df_bar['releaseYear'].unique()

data = {}
for year in years:
    year_data = {}
    df_tmp = df_bar[df_bar['releaseYear'] == year]
    df_tmp = df_tmp[['genres', 'genderLead', 'releaseYear', 'genderDirector']]
    df_tmp = df_tmp.explode('genres')
    counts = df_tmp.groupby(['genres', 'genderLead'])['releaseYear'].count().rename("count")
    normalized_counts = counts / counts.groupby(level=0).sum() * 100
    normalized_counts = normalized_counts.reset_index(level=1)
    normalized_counts = normalized_counts.sort_values(by='genderLead')
    year_data['actor'] = {}
    for i, row in normalized_counts.iterrows():
        if i not in year_data['actor']:
            year_data['actor'][i]  = {'Male': 0, 'Female': 0}
        year_data['actor'][i][row['genderLead']] = row['count']
    counts = df_tmp.groupby(['genres', 'genderDirector'])['releaseYear'].count().rename("count")
    normalized_counts = counts / counts.groupby(level=0).sum() * 100
    normalized_counts = normalized_counts.reset_index(level=1)
    normalized_counts = normalized_counts.sort_values(by='genderDirector')
    year_data['director'] = {}
    for i, row in normalized_counts.iterrows():
        if i not in year_data['director']:
            year_data['director'][i]  = {'Male': 0, 'Female': 0}
        year_data['director'][i][row['genderDirector']] = row['count']
    data[str(year)] = year_data

In [21]:
# save the main dataset used in the bar chart
save_json('bar-chart.json', data)

In [22]:
import collections


years = df_bar['releaseYear'].unique()

nested_dict = lambda: collections.defaultdict(nested_dict)
data = nested_dict()

for year in years:
    df_tmp = df_bar[df_bar['releaseYear'] == year]
    df_tmp = df_tmp[['genres', 'genderLead', 'releaseYear', 'popularity', 'id', 'genderDirector']]
    df_tmp = df_tmp.explode('genres')
    category_values = df_tmp['genres'].unique()
    for c_v in category_values:
        for gender in ['Male', 'Female']:
            top_ids = df_tmp[(df_tmp['genres']==c_v) & (df_tmp['genderLead']==gender)].sort_values('popularity', ascending=False).head(10)['id'].values
            data['genres'][str(year)][c_v]['actor'][gender] = top_ids.tolist()
            top_ids = df_tmp[(df_tmp['genres']==c_v) & (df_tmp['genderDirector']==gender)].sort_values('popularity', ascending=False).head(10)['id'].values
            data['genres'][str(year)][c_v]['director'][gender] = top_ids.tolist()

In [23]:
# save the additional dataset used in the bar chart
save_json('popular-movies.json', data)

In [24]:
del df_bar

# Collaborations

In [26]:
def get_main_actor(cast):
    actors = list(filter(lambda x: x[2] != 'Not specified', cast))
    return actors[0]

def get_director(crew):
    directors = filter(lambda x: x[2] != 'Not specified' and x[3] == 'Director', crew)
    cnt = Counter(directors).most_common()
    return cnt[0][0][:3]


df_collab = df[['releaseYear', 'cast', 'crew']]

df_collab.cast = df_collab.cast.apply(get_main_actor)
df_collab.crew = df_collab.crew.apply(get_director)

df_collab.reset_index(drop=True, inplace=True)

df_collab[['sourceId', 'source', 'sourceGender']] = pd.DataFrame(df_collab.crew.tolist())
df_collab[['targetId', 'target', 'targetGender']] = pd.DataFrame(df_collab.cast.tolist())

df_collab.drop(columns=['cast', 'crew'], inplace=True)

df_collab.reset_index(drop=True, inplace=True)

df_collab.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Unnamed: 0,releaseYear,sourceId,source,sourceGender,targetId,target,targetGender
0,1995,3110,Allison Anders,Female,3129,Tim Roth,Male
1,1993,2042,Stephen Hopkins,Male,2880,Emilio Estevez,Male
2,1977,1,George Lucas,Male,2,Mark Hamill,Male
3,2003,7,Andrew Stanton,Male,13,Albert Brooks,Male
4,1994,24,Robert Zemeckis,Male,31,Tom Hanks,Male


In [90]:
df_collab.shape

(5243, 5)

In [28]:
df_tmp_links = df_collab.groupby(['releaseYear',
                               'sourceId',
                               'source',
                               'sourceGender',
                               'targetId',
                               'target',
                               'targetGender']).size().reset_index(name='value')

df_tmp_links.source = df_tmp_links.source + ', Director'
df_tmp_links.target = df_tmp_links.target + ', Actor'

df_tmp_links.head()

Unnamed: 0,releaseYear,sourceId,source,sourceGender,targetId,target,targetGender,value
0,1915,8636,"Cecil B. DeMille, Director",Male,558287,"Fannie Ward, Actor",Female,1
1,1915,100036,"D. W. Griffith, Director",Male,8828,"Lillian Gish, Actor",Female,1
2,1916,100036,"D. W. Griffith, Director",Male,8828,"Lillian Gish, Actor",Female,1
3,1918,150176,"F. Richard Jones, Director",Male,89563,"Mabel Normand, Actor",Female,1
4,1921,13848,"Charlie Chaplin, Director",Male,13848,"Charlie Chaplin, Actor",Male,1


In [29]:
df_tmp_nodes_1 = df_tmp_links[['releaseYear', 'sourceId', 'source', 'sourceGender']].copy()
df_tmp_nodes_1.columns = ['releaseYear', 'tmdbId', 'id' ,'gender']
df_tmp_nodes_1['isDirector'] = True

df_tmp_nodes_2 = df_tmp_links[['releaseYear', 'targetId', 'target', 'targetGender']].copy()
df_tmp_nodes_2.columns = ['releaseYear', 'tmdbId', 'id', 'gender']
df_tmp_nodes_2['isDirector'] = False

df_tmp_links.drop(columns=['sourceGender', 'targetGender'], inplace=True)

df_tmp_nodes = pd.concat((df_tmp_nodes_1, df_tmp_nodes_2), ignore_index=True).drop_duplicates()

df_tmp_nodes.head()

Unnamed: 0,releaseYear,tmdbId,id,gender,isDirector
0,1915,8636,"Cecil B. DeMille, Director",Male,True
1,1915,100036,"D. W. Griffith, Director",Male,True
2,1916,100036,"D. W. Griffith, Director",Male,True
3,1918,150176,"F. Richard Jones, Director",Male,True
4,1921,13848,"Charlie Chaplin, Director",Male,True


In [30]:
df_tmp_links.set_index('releaseYear', inplace=True)
df_tmp_nodes.set_index('releaseYear', inplace=True)

In [31]:
data = {}

for i, row in df_tmp_links.iterrows():
    if i not in data:
        data[i] = { 'nodes': [], 'links': [] }
    data[i]['links'].append(row.to_dict())

for i, row in df_tmp_nodes.iterrows():
    if i not in data:
        data[i] = { 'nodes': [], 'links': [] }
    data[i]['nodes'].append(row.to_dict())

In [32]:
# save the final dataset used in the network visualization
save_json('collaborations.json', data)