In [1]:
import pandas as pd
import re
import itertools
import networkx as nx

In [2]:
df = pd.read_csv('./data/film_festivals.csv', keep_default_na=False)

In [3]:
df = df[~(df['Director'].isin(['N/A', '']) & df['Writer'].isin(['N/A', '']) & df['Actors'].isin(['N/A', '']))]
df['Director'] = df['Director'].apply(lambda row: re.sub('\(.*?\)', '', row))
df['Writer'] = df['Writer'].apply(lambda row: re.sub(' \(.*?\)', '', row))
df = df.reset_index(drop=True)
df.head()

Unnamed: 0,id,title_english,title_original,director,country,winner,festival,year_festival,link_film,link_director,...,BoxOffice,Production,Website,Response,InternetMovieDatabaseRating,RottenTomatoesRating,MetacriticRating,Error,totalSeasons,Ratings
0,0,Adieu Bonaparte,وداعا بونابرت,Youssef Chahine,Egypt,0,cannes,1985,https://en.wikipedia.org/wiki/Adieu_Bonaparte,https://en.wikipedia.org/wiki/Youssef_Chahine,...,,,,True,6.4/10,,,,,
1,1,Birdy,,Alan Parker,United States,0,cannes,1985,https://en.wikipedia.org/wiki/Birdy_(film),https://en.wikipedia.org/wiki/Alan_Parker,...,,Sony Pictures Home Entertainment,,True,7.3/10,85%,,,,
2,2,Bliss,,Ray Lawrence,Australia,0,cannes,1985,https://en.wikipedia.org/wiki/Bliss_(1985_film),https://en.wikipedia.org/wiki/Ray_Lawrence_(fi...,...,,Starmaker Entertainment,,True,6.9/10,,,,,
3,4,The Coca-Cola Kid,,Dušan Makavejev,Australia,0,cannes,1985,https://en.wikipedia.org/wiki/The_Coca-Cola_Kid,https://en.wikipedia.org/wiki/Du%C5%A1an_Makav...,...,,Cinecom Pictures,,True,6.0/10,44%,,,,
4,5,Colonel Redl,Oberst Redl,István Szabó,Hungary,0,cannes,1985,https://en.wikipedia.org/wiki/Colonel_Redl,https://en.wikipedia.org/wiki/Istv%C3%A1n_Szab...,...,,,,True,7.6/10,,,,,


In [4]:
df_cannes_2010_2019 = df[(df['festival'] == 'cannes') & (df['year_festival'] >= 2010)].reset_index(drop=True)
df_cannes_2010_2019[['Director', 'Writer', 'Actors']]

Unnamed: 0,Director,Writer,Actors
0,Mike Leigh,Mike Leigh,"Jim Broadbent, Ruth Sheen, Lesley Manville, Ol..."
1,Alejandro G. Iñárritu,"Alejandro G. Iñárritu, Alejandro G. Iñárritu, ...","Javier Bardem, Maricel Álvarez, Hanaa Bouchaib..."
2,Nikita Mikhalkov,"Nikita Mikhalkov, Vladimir Moiseenko, Aleksand...","Nikita Mikhalkov, Oleg Menshikov, Nadezhda Mik..."
3,Abbas Kiarostami,"Abbas Kiarostami, Caroline Eliacheff","Juliette Binoche, William Shimell, Jean-Claude..."
4,Xiaoshuai Wang,Yishu Yang,"Bingbing Fan, Feier Li, Hao Qin, Xueqi Wang"
5,Doug Liman,"Jez Butterworth, John-Henry Butterworth, Josep...","Naomi Watts, Sonya Davison, Vanessa Chong, Ana..."
6,Sang-soo Im,"Ki-young Kim, Sang-soo Im","Do-yeon Jeon, Jung-jae Lee, Yuh Jung Youn, Woo..."
7,Sergey Loznitsa,Sergey Loznitsa,"Viktor Nemets, Vladimir Golovin, Aleksey Vertk..."
8,Xavier Beauvois,"Xavier Beauvois, Etienne Comar","Lambert Wilson, Michael Lonsdale, Olivier Rabo..."
9,Mathieu Amalric,"Mathieu Amalric, Raphaëlle Desplechin, Philipp...","Miranda Colclasure, Suzanne Ramsey, Dirty Mart..."


In [5]:
def get_nodelist(directors_from_df, writers_from_df, actors_from_df):
    directors = directors_from_df.split(', ')
    directors = [(director, 'director') for director in directors]
    writers = writers_from_df.split(', ')
    writers = [(writer, 'writer') for writer in writers]
    actors = actors_from_df.split(', ')
    actors = [(actor, 'actor') for actor in actors]
    people = directors
    people.extend(actors) # the other of the extends determines the order of priority: director -> actor -> writer for people who have multiple roles
    people.extend(writers)
    
    name = [person[0] for person in people]
    role = [person[1] for person in people]
    nodelist = pd.DataFrame({'Name': name,
                             'Role': role})
    nodelist = nodelist.drop_duplicates(subset='Name').reset_index(drop=True)
    return nodelist

In [6]:
def get_edgelist(nodelist):
    edges = [combo + (1,) for combo in itertools.combinations(nodelist['Name'], 2)]
    edgelist = pd.DataFrame(edges, columns=['Source', 'Target', 'Weight'])
    return edges, edgelist

In [7]:
def get_film_edges(directors_from_df, writers_from_df, actors_from_df):
    nodelist_film = get_nodelist(directors_from_df,
                                 writers_from_df,
                                 actors_from_df)
    edges_film, edgelist_film = get_edgelist(nodelist_film)
    return edges_film    

In [8]:
df_cannes_2010_2019_directors = df_cannes_2010_2019['Director'].str.cat(sep=', ') # think about removing 'N/A'
df_cannes_2010_2019_writers = df_cannes_2010_2019['Writer'].str.cat(sep=', ')
df_cannes_2010_2019_actors = df_cannes_2010_2019['Actors'].str.cat(sep=', ')

df_cannes_2010_2019_nodelist = get_nodelist(df_cannes_2010_2019_directors,
                                            df_cannes_2010_2019_writers,
                                            df_cannes_2010_2019_actors)
df_cannes_2010_2019_node_names = list(df_cannes_2010_2019_nodelist['Name'])
df_cannes_2010_2019_node_names

['Mike Leigh',
 'Alejandro G. Iñárritu',
 'Nikita Mikhalkov',
 'Abbas Kiarostami',
 'Xiaoshuai Wang',
 'Doug Liman',
 'Sang-soo Im',
 'Sergey Loznitsa',
 'Xavier Beauvois',
 'Mathieu Amalric',
 'Daniele Luchetti',
 'Takeshi Kitano',
 'Julien Houillon',
 'Chang-dong Lee',
 'Bertrand Tavernier',
 'Ken Loach',
 'Mahamat-Saleh Haroun',
 'Kornél Mundruczó',
 'Apichatpong Weerasethakul',
 'Michel Hazanavicius',
 'Nicolas Winding Refn',
 'Joseph Cedar',
 'Naomi Kawase',
 'Takashi Miike',
 'Aki Kaurismäki',
 'Bertrand Bonello',
 'Jean-Pierre Dardenne',
 'Luc Dardenne',
 'Lars von Trier',
 'Markus Schleinzer',
 'Kathrin Resetarits',
 'Nuri Bilge Ceylan',
 'Alain Cavalier',
 'Maïwenn',
 'Pedro Almodóvar',
 'Julia Leigh',
 'Radu Mihaileanu',
 'Paolo Sorrentino',
 'Terrence Malick',
 'Nanni Moretti',
 'Lynne Ramsay',
 'Yusri Nasrullah',
 'Cristian Mungiu',
 'David Cronenberg',
 'Leos Carax',
 'Thomas Vinterberg',
 'Sang-soo Hong',
 'Andrew Dominik',
 'John Hillcoat',
 'Banafsheh Violet Modaressi',

In [9]:
df_cannes_2010_2019_edges = df_cannes_2010_2019.apply(lambda row: get_film_edges(row['Director'], row['Writer'], row['Actors']), axis=1)
df_cannes_2010_2019_edges = df_cannes_2010_2019_edges.tolist()
df_cannes_2010_2019_edges = [item for sublist in df_cannes_2010_2019_edges for item in sublist]
df_cannes_2010_2019_edges

[('Mike Leigh', 'Jim Broadbent', 1),
 ('Mike Leigh', 'Ruth Sheen', 1),
 ('Mike Leigh', 'Lesley Manville', 1),
 ('Mike Leigh', 'Oliver Maltman', 1),
 ('Jim Broadbent', 'Ruth Sheen', 1),
 ('Jim Broadbent', 'Lesley Manville', 1),
 ('Jim Broadbent', 'Oliver Maltman', 1),
 ('Ruth Sheen', 'Lesley Manville', 1),
 ('Ruth Sheen', 'Oliver Maltman', 1),
 ('Lesley Manville', 'Oliver Maltman', 1),
 ('Alejandro G. Iñárritu', 'Javier Bardem', 1),
 ('Alejandro G. Iñárritu', 'Maricel Álvarez', 1),
 ('Alejandro G. Iñárritu', 'Hanaa Bouchaib', 1),
 ('Alejandro G. Iñárritu', 'Guillermo Estrella', 1),
 ('Alejandro G. Iñárritu', 'Nicolás Giacobone', 1),
 ('Alejandro G. Iñárritu', 'Armando Bo', 1),
 ('Javier Bardem', 'Maricel Álvarez', 1),
 ('Javier Bardem', 'Hanaa Bouchaib', 1),
 ('Javier Bardem', 'Guillermo Estrella', 1),
 ('Javier Bardem', 'Nicolás Giacobone', 1),
 ('Javier Bardem', 'Armando Bo', 1),
 ('Maricel Álvarez', 'Hanaa Bouchaib', 1),
 ('Maricel Álvarez', 'Guillermo Estrella', 1),
 ('Maricel Álvar

In [10]:
M = nx.MultiGraph()
M.add_nodes_from(df_cannes_2010_2019_node_names)
M.add_weighted_edges_from(df_cannes_2010_2019_edges)

In [11]:
M.edges(data=True)

MultiEdgeDataView([('Mike Leigh', 'Jim Broadbent', {'weight': 1}), ('Mike Leigh', 'Ruth Sheen', {'weight': 1}), ('Mike Leigh', 'Lesley Manville', {'weight': 1}), ('Mike Leigh', 'Oliver Maltman', {'weight': 1}), ('Mike Leigh', 'Timothy Spall', {'weight': 1}), ('Mike Leigh', 'Paul Jesson', {'weight': 1}), ('Mike Leigh', 'Dorothy Atkinson', {'weight': 1}), ('Mike Leigh', 'Marion Bailey', {'weight': 1}), ('Alejandro G. Iñárritu', 'Javier Bardem', {'weight': 1}), ('Alejandro G. Iñárritu', 'Maricel Álvarez', {'weight': 1}), ('Alejandro G. Iñárritu', 'Hanaa Bouchaib', {'weight': 1}), ('Alejandro G. Iñárritu', 'Guillermo Estrella', {'weight': 1}), ('Alejandro G. Iñárritu', 'Nicolás Giacobone', {'weight': 1}), ('Alejandro G. Iñárritu', 'Armando Bo', {'weight': 1}), ('Nikita Mikhalkov', 'Oleg Menshikov', {'weight': 1}), ('Nikita Mikhalkov', 'Nadezhda Mikhalkova', {'weight': 1}), ('Nikita Mikhalkov', 'Sergey Makovetskiy', {'weight': 1}), ('Nikita Mikhalkov', 'Vladimir Moiseenko', {'weight': 1}), 

In [12]:
# https://stackoverflow.com/questions/15590812/networkx-convert-multigraph-into-simple-graph-with-weighted-edges
# create weighted graph from M
G = nx.Graph() 
for u,v,data in M.edges(data=True):
    w = data['weight'] if 'weight' in data else 1.0
    if G.has_edge(u,v):
        G[u][v]['weight'] += w
    else:
        G.add_edge(u, v, weight=w)

print(G.edges(data=True))

[('Mike Leigh', 'Jim Broadbent', {'weight': 1}), ('Mike Leigh', 'Ruth Sheen', {'weight': 1}), ('Mike Leigh', 'Lesley Manville', {'weight': 1}), ('Mike Leigh', 'Oliver Maltman', {'weight': 1}), ('Mike Leigh', 'Timothy Spall', {'weight': 1}), ('Mike Leigh', 'Paul Jesson', {'weight': 1}), ('Mike Leigh', 'Dorothy Atkinson', {'weight': 1}), ('Mike Leigh', 'Marion Bailey', {'weight': 1}), ('Jim Broadbent', 'Ruth Sheen', {'weight': 1}), ('Jim Broadbent', 'Lesley Manville', {'weight': 1}), ('Jim Broadbent', 'Oliver Maltman', {'weight': 1}), ('Ruth Sheen', 'Lesley Manville', {'weight': 1}), ('Ruth Sheen', 'Oliver Maltman', {'weight': 1}), ('Lesley Manville', 'Oliver Maltman', {'weight': 1}), ('Timothy Spall', 'Paul Jesson', {'weight': 1}), ('Timothy Spall', 'Dorothy Atkinson', {'weight': 1}), ('Timothy Spall', 'Marion Bailey', {'weight': 1}), ('Paul Jesson', 'Dorothy Atkinson', {'weight': 1}), ('Paul Jesson', 'Marion Bailey', {'weight': 1}), ('Dorothy Atkinson', 'Marion Bailey', {'weight': 1}),

In [13]:
nx.write_gexf(G, 'cannes_2010_2019.gexf')