In [1]:
import pandas as pd
import re
import itertools
import networkx as nx

In [2]:
df = pd.read_csv('./data/film_festivals.csv', keep_default_na=False)

In [3]:
df = df[~(df['Director'].isin(['N/A', '']) & df['Writer'].isin(['N/A', '']) & df['Actors'].isin(['N/A', '']))]
df['Director'] = df['Director'].apply(lambda row: re.sub('\(.*?\)', '', row))
df['Writer'] = df['Writer'].apply(lambda row: re.sub(' \(.*?\)', '', row))
df = df.reset_index(drop=True)
df.head()

Unnamed: 0,id,title_english,title_original,director,country,winner,festival,year_festival,link_film,link_director,...,BoxOffice,Production,Website,Response,InternetMovieDatabaseRating,RottenTomatoesRating,MetacriticRating,Error,totalSeasons,Ratings
0,0,Adieu Bonaparte,وداعا بونابرت,Youssef Chahine,Egypt,0,cannes,1985,https://en.wikipedia.org/wiki/Adieu_Bonaparte,https://en.wikipedia.org/wiki/Youssef_Chahine,...,,,,True,6.4/10,,,,,
1,1,Birdy,,Alan Parker,United States,0,cannes,1985,https://en.wikipedia.org/wiki/Birdy_(film),https://en.wikipedia.org/wiki/Alan_Parker,...,,Sony Pictures Home Entertainment,,True,7.3/10,85%,,,,
2,2,Bliss,,Ray Lawrence,Australia,0,cannes,1985,https://en.wikipedia.org/wiki/Bliss_(1985_film),https://en.wikipedia.org/wiki/Ray_Lawrence_(fi...,...,,Starmaker Entertainment,,True,6.9/10,,,,,
3,4,The Coca-Cola Kid,,Dušan Makavejev,Australia,0,cannes,1985,https://en.wikipedia.org/wiki/The_Coca-Cola_Kid,https://en.wikipedia.org/wiki/Du%C5%A1an_Makav...,...,,Cinecom Pictures,,True,6.0/10,44%,,,,
4,5,Colonel Redl,Oberst Redl,István Szabó,Hungary,0,cannes,1985,https://en.wikipedia.org/wiki/Colonel_Redl,https://en.wikipedia.org/wiki/Istv%C3%A1n_Szab...,...,,,,True,7.6/10,,,,,


In [4]:
df_cannes_2010_2019 = df[(df['festival'] == 'cannes') & (df['year_festival'] >= 2010)].reset_index(drop=True)
df_cannes_2010_2019[['Director', 'Writer', 'Actors']].head()

Unnamed: 0,Director,Writer,Actors
0,Mike Leigh,Mike Leigh,"Jim Broadbent, Ruth Sheen, Lesley Manville, Ol..."
1,Alejandro G. Iñárritu,"Alejandro G. Iñárritu, Alejandro G. Iñárritu, ...","Javier Bardem, Maricel Álvarez, Hanaa Bouchaib..."
2,Nikita Mikhalkov,"Nikita Mikhalkov, Vladimir Moiseenko, Aleksand...","Nikita Mikhalkov, Oleg Menshikov, Nadezhda Mik..."
3,Abbas Kiarostami,"Abbas Kiarostami, Caroline Eliacheff","Juliette Binoche, William Shimell, Jean-Claude..."
4,Xiaoshuai Wang,Yishu Yang,"Bingbing Fan, Feier Li, Hao Qin, Xueqi Wang"


In [5]:
def get_nodelist(directors_from_df, writers_from_df, actors_from_df):
    # directors
    directors = directors_from_df.split(', ')
    directors = [director for director in directors if director != 'N/A']
    directors = [(director, 'director') for director in directors]
    # writers
    writers = writers_from_df.split(', ')
    writers = [writer for writer in writers if writer != 'N/A']
    writers = [(writer, 'writer') for writer in writers]
    # actors
    actors = actors_from_df.split(', ')
    actors = [actor for actor in actors if actor != 'N/A']
    actors = [(actor, 'actor') for actor in actors]
    # people: directors, actors, and writers
    people = directors
    people.extend(actors) # the order of the extends determines the order of priority: director -> actor -> writer for people who have multiple roles
    people.extend(writers)
    
    name = [person[0] for person in people]
    role = [person[1] for person in people]
    nodelist = pd.DataFrame({'Name': name,
                             'Role': role})
    nodelist = nodelist.drop_duplicates(subset='Name').reset_index(drop=True)
    return nodelist

In [6]:
def get_edgelist(nodelist):
    edges = [combo + (1,) for combo in itertools.combinations(nodelist['Name'], 2)]
    edgelist = pd.DataFrame(edges, columns=['Source', 'Target', 'Weight'])
    return edges, edgelist

In [7]:
def get_film_edges(directors_from_df, writers_from_df, actors_from_df):
    nodelist_film = get_nodelist(directors_from_df,
                                 writers_from_df,
                                 actors_from_df)
    edges_film, edgelist_film = get_edgelist(nodelist_film)
    return edges_film    

In [8]:
def get_graph(df):
    df_directors = df['Director'].str.cat(sep=', ')
    df_writers = df['Writer'].str.cat(sep=', ')
    df_actors = df['Actors'].str.cat(sep=', ')
    
    df_nodelist = get_nodelist(df_directors,
                               df_writers,
                               df_actors)
    df_node_names = list(df_nodelist['Name'])
    
    df_edges = df.apply(lambda row: get_film_edges(row['Director'], row['Writer'], row['Actors']), axis=1)
    df_edges = df_edges.tolist()
    df_edges = [item for sublist in df_edges for item in sublist]
    
    M = nx.MultiGraph()
    M.add_nodes_from(df_node_names)
    M.add_weighted_edges_from(df_edges)
    
    # https://stackoverflow.com/questions/15590812/networkx-convert-multigraph-into-simple-graph-with-weighted-edges
    # create weighted graph from M
    G = nx.Graph() 
    for u,v,data in M.edges(data=True):
        w = data['weight'] if 'weight' in data else 1.0
        if G.has_edge(u,v):
            G[u][v]['weight'] += w
        else:
            G.add_edge(u, v, weight=w)
    
    return G

In [9]:
G = get_graph(df_cannes_2010_2019)
# print(G.nodes(data=True))
print(G.edges(data=True))

[('Mike Leigh', 'Jim Broadbent', {'weight': 1}), ('Mike Leigh', 'Ruth Sheen', {'weight': 1}), ('Mike Leigh', 'Lesley Manville', {'weight': 1}), ('Mike Leigh', 'Oliver Maltman', {'weight': 1}), ('Mike Leigh', 'Timothy Spall', {'weight': 1}), ('Mike Leigh', 'Paul Jesson', {'weight': 1}), ('Mike Leigh', 'Dorothy Atkinson', {'weight': 1}), ('Mike Leigh', 'Marion Bailey', {'weight': 1}), ('Jim Broadbent', 'Ruth Sheen', {'weight': 1}), ('Jim Broadbent', 'Lesley Manville', {'weight': 1}), ('Jim Broadbent', 'Oliver Maltman', {'weight': 1}), ('Ruth Sheen', 'Lesley Manville', {'weight': 1}), ('Ruth Sheen', 'Oliver Maltman', {'weight': 1}), ('Lesley Manville', 'Oliver Maltman', {'weight': 1}), ('Timothy Spall', 'Paul Jesson', {'weight': 1}), ('Timothy Spall', 'Dorothy Atkinson', {'weight': 1}), ('Timothy Spall', 'Marion Bailey', {'weight': 1}), ('Paul Jesson', 'Dorothy Atkinson', {'weight': 1}), ('Paul Jesson', 'Marion Bailey', {'weight': 1}), ('Dorothy Atkinson', 'Marion Bailey', {'weight': 1}),

In [10]:
nx.write_gexf(G, './data/network/cannes_2010_2019.gexf')