In [1]:
import os

In [None]:
# move working dir up to parent, allows us to import from utils without too many shenanigans
os.chdir(os.pardir)

In [1]:
import pandas as pd
import networkx as nx
import datetime
from math import sqrt, pow, log2
from matplotlib.pyplot import figure
from utils.data_generated import load_movie_metadata
from utils.data_initial import load_cmu_character_metadata, load_imdb_name_basics, load_imdb_title_crew

In [2]:
#https://towardsdatascience.com/from-dataframe-to-network-graph-bbb35c8ab675

In [3]:
START_DATE = pd.to_datetime(datetime.date(2000,1,1))
END_DATE = pd.to_datetime(datetime.date(2010,1,1))

In [4]:
movie_df = load_movie_metadata(legacy=True)
movie_df = movie_df.loc[(movie_df['release_date'] >= START_DATE) & (movie_df['release_date'] < END_DATE)]
movie_df.head()

Unnamed: 0,movie_id_wikipedia,movie_id_freebase,title_id_imdb,movie_name,release_date,box_office_revenue,runtime,languages,countries,is_adult,genres_cmu,genres_imdb
0,975900,/m/03vyhn,tt0228333,Ghosts of Mars,2001-08-24,14010832.0,98.0,[English Language],[United States of America],0,"[Thriller, Science Fiction, Horror, Adventure,...","[Action, Horror, Sci-Fi]"
126,21926710,/m/05p45cv,tt0892904,White on Rice,2009-01-01,,82.0,[],[United States of America],0,"[Romantic comedy, Romance Film, Comedy, Indie]",[Comedy]
135,20604092,/m/0523t_1,tt13667080,Anbu Thozhi,2007-08-17,,,[Tamil Language],[India],0,[Romance Film],[Drama]
137,156558,/m/014k4y,tt0255819,Baby Boy,2001-06-27,29381649.0,123.0,[English Language],[United States of America],0,"[Crime Fiction, Drama, Coming of age]","[Crime, Drama, Romance]"
204,25960460,/m/0b6kc_5,tt0166158,Daddy and Them,2001-01-01,,101.0,[English Language],"[United States of America, Netherlands]",0,"[Black comedy, Comedy]","[Comedy, Drama]"


In [5]:
char_df = load_cmu_character_metadata()
char_df = char_df.merge(movie_df, left_on='movie_id_wikipedia', right_on='movie_id_wikipedia', how='inner')[['title_id_imdb', 'actor_name']].dropna()
char_df.head()

Unnamed: 0,title_id_imdb,actor_name
0,tt0228333,Wanda De Jesus
1,tt0228333,Natasha Henstridge
2,tt0228333,Ice Cube
3,tt0228333,Jason Statham
4,tt0228333,Clea DuVall


In [6]:
crew_df = load_imdb_title_crew()
crew_df = crew_df.merge(movie_df, left_on='title_id', right_on='title_id_imdb', how='inner')[['title_id', 'directors', 'writers']]

#no need to filter because movie_df was already filtered and we did an inner join with it
#crew_df = crew_df.loc[(crew_df['release_date'] >= START_DATE) & (crew_df['release_date'] < END_DATE)]
#print(len(crew_df))

directors_df = crew_df[['title_id', 'directors']]
directors_df = directors_df.explode('directors').reset_index(drop=True)

writers_df = crew_df[['title_id', 'writers']]
writers_df = writers_df.explode('writers').reset_index(drop=True)

del crew_df

In [7]:
names_df = load_imdb_name_basics()[['person_name_id', 'person_name']]

directors_df = names_df.merge(directors_df, left_on='person_name_id', right_on='directors', how='inner')[['title_id', 'person_name_id', 'person_name']]

writers_df = names_df.merge(writers_df, left_on='person_name_id', right_on='writers')[['title_id', 'person_name_id', 'person_name']]

del names_df

In [8]:
BASE_SIZE = 1
BASE_ALPHA = 0.01
MUL_WEIGHT = 0.3
PARAMETER_WEIGHT = 0.9
TRESHOLD_SIZE = 24

In [9]:
#Create Empty Graph
G = nx.Graph()

#Add actors to the graph
for _, row in char_df.iterrows():
    actor = row['actor_name']

    movie_id = row['title_id_imdb']
        
    if not G.has_node(actor):
        G.add_node(actor, size=BASE_SIZE, alpha=BASE_ALPHA, movies={movie_id}, role='Actor', color='green')
    else:
        G.nodes[actor]['movies'].add(movie_id)
        G.nodes[actor]['size'] = G.nodes[actor]['size'] + 1

G.order()

41258

In [10]:
#Add directors to the graph
for _, row in directors_df.iterrows():
    director = row['person_name']

    movie_id = row['title_id']

    if not G.has_node(director):
        G.add_node(director, size=BASE_SIZE, alpha=BASE_ALPHA, movies={movie_id}, role='Director', color='red')
    else:
        G.nodes[director]['movies'].add(movie_id)
        G.nodes[director]['size'] = G.nodes[director]['size'] + 1

G.order()

48105

In [11]:
#Add writers to the graph
for _, row in writers_df.iterrows():
    writer = row['person_name']

    movie_id = row['title_id']

    if not G.has_node(writer):
        G.add_node(writer, size=BASE_SIZE, alpha=BASE_ALPHA, movies={movie_id}, role='Writer', color='blue')
    else:
        G.nodes[writer]['movies'].add(movie_id)
        G.nodes[writer]['size'] = G.nodes[writer]['size'] + 1

G.order()

55558

In [12]:
count = 0
for node1 in G.nodes(data='movies'):
    if count % 1000 == 0:
        print('Processed', count, '/', G.order(), 'nodes so far')
    count = count + 1
    name1 = node1[0]
    movies1 = node1[1]

    for node2 in G.nodes(data='movies'):
        name2 = node2[0]

        if not G.has_edge(name1, name2) and not name1 == name2:
            movies2 = node2[1]
            common_movies = movies1.intersection(movies2)
            
            if len(common_movies) > 0:
                G.add_edge(name1, name2, weight=len(common_movies) * MUL_WEIGHT)
                G.nodes[name1]['alpha'] = pow(G.nodes[name1]['alpha'], pow(PARAMETER_WEIGHT, len(common_movies)))
                G.nodes[name2]['alpha'] = pow(G.nodes[name2]['alpha'], pow(PARAMETER_WEIGHT, len(common_movies)))

print(G.order())
print(G.size())
print(G.size(weight='weight') / MUL_WEIGHT)

Processed 0 / 55558 nodes so far
Processed 1000 / 55558 nodes so far
Processed 2000 / 55558 nodes so far
Processed 3000 / 55558 nodes so far
Processed 4000 / 55558 nodes so far
Processed 5000 / 55558 nodes so far
Processed 6000 / 55558 nodes so far
Processed 7000 / 55558 nodes so far
Processed 8000 / 55558 nodes so far
Processed 9000 / 55558 nodes so far
Processed 10000 / 55558 nodes so far
Processed 11000 / 55558 nodes so far
Processed 12000 / 55558 nodes so far
Processed 13000 / 55558 nodes so far
Processed 14000 / 55558 nodes so far
Processed 15000 / 55558 nodes so far
Processed 16000 / 55558 nodes so far
Processed 17000 / 55558 nodes so far
Processed 18000 / 55558 nodes so far
Processed 19000 / 55558 nodes so far
Processed 20000 / 55558 nodes so far
Processed 21000 / 55558 nodes so far
Processed 22000 / 55558 nodes so far
Processed 23000 / 55558 nodes so far
Processed 24000 / 55558 nodes so far
Processed 25000 / 55558 nodes so far
Processed 26000 / 55558 nodes so far
Processed 2700

In [None]:
sizes = nx.get_node_attributes(G, 'size')
colors = nx.get_node_attributes(G, 'color')
alphas = nx.get_node_attributes(G, 'alpha')
weights = nx.get_edge_attributes(G, 'weight')
nodelist = G.nodes()
pos = nx.spring_layout(G, k=1.7/sqrt(G.order()))

figure(figsize=(32,32))

nx.draw_networkx_nodes(G,pos,
                       nodelist=sizes.keys(),
                       node_size=[x*log2(max(2,x)) for x in list(sizes.values())],
                       node_color=list(colors.values()),
                       alpha=list(alphas.values()))
nx.draw_networkx_edges(G,pos,
                       edgelist=weights.keys(),
                       width=list(weights.values()),
                       edge_color='lightblue',
                       alpha=0.5)
big_nodes = []
for node in G.nodes(data='size'):
    if node[1] >= TRESHOLD_SIZE:
        big_nodes.append(node)
nx.draw_networkx_labels(G.subgraph(big_nodes), pos=pos,
                        labels={node[0]:node[0] for node in big_nodes}
                        #font_color='white'
                        )

In [None]:
'''
from math import log2

figure(figsize=(256,256))

nx.draw_networkx_nodes(G,pos,
                       nodelist=sizes.keys(),
                       node_size=[log2(x)*log2(max(log2(x),2)) for x in list(sizes.values())],
                       node_color=list(colors.values()),
                       alpha=list(alphas.values()))
nx.draw_networkx_edges(G,pos,
                       edgelist=weights.keys(),
                       width=list(weights.values()),
                       edge_color='lightblue',
                       alpha=0.5)
big_nodes = []
for node in G.nodes(data='size'):
    if node[1] >= pow(2,24):
        big_nodes.append(node)
nx.draw_networkx_labels(G.subgraph(big_nodes), pos=pos,
                        labels={node[0]:node[0] for node in big_nodes}
                        #font_color='white'
                        )
'''

In [67]:
'''
import pickle

with open('../generated/graphs/adw_200001_200912_graph.pickle', 'wb') as handle:
    pickle.dump(G, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open('../generated/graphs/adw_200001_200912_pos.pickle', 'wb') as handle:
    pickle.dump(pos, handle, protocol=pickle.HIGHEST_PROTOCOL)

#with open('filename.pickle', 'rb') as handle:
#    b = pickle.load(handle)
'''