In [1]:
import pandas as pd
import numpy as np
import ast
import matplotlib.pyplot as plt
import networkx as nx

%load_ext autoreload
%autoreload 2

## Skip until *Create network ...* (everything before is pre-processing)

### Loading data

In [2]:
# reading the csv as data frames
movies = pd.read_csv('../data/movies_metadata.csv')
credits = pd.read_csv('../data/credits.csv')

# set movie id as index
movies.set_index('id', inplace=True)

  exec(code_obj, self.user_global_ns, self.user_ns)


### Extracting actors (from the exploration notebook)

In [3]:
# extract all actors
cast_in_movie = []
for _, r in credits.cast.iteritems():
    for g in ast.literal_eval(r):
        cast_in_movie.append([g["name"], g["id"]])

# convert actor movie to dataframe
cast_in_movie = pd.DataFrame(cast_in_movie)
cast_in_movie.columns = ["actor", "movie_id"]

In [4]:
# group actors and count number of movies + sort
sorted_actors = cast_in_movie.groupby('actor').count().rename(columns={"movie_id": "movie_count"}).sort_values(by="movie_count", ascending=False)

# keep only actors that were in 20 or more movies
top_actors = sorted_actors[sorted_actors.movie_count >= 20]

In [5]:
top_actors

Unnamed: 0_level_0,movie_count
actor,Unnamed: 1_level_1
Bess Flowers,241
Christopher Lee,148
John Wayne,125
Samuel L. Jackson,123
Gérard Depardieu,110
...,...
Tony Roberts,20
Vic Tayback,20
Andrew Duggan,20
Donnie Wahlberg,20


**Create actor nodes dataframe**

In [6]:
actor_nodes = pd.DataFrame(top_actors.index).rename(columns={'actor': 'id'})
# write to json
actor_nodes.to_json('small_data/actor_nodes.json', orient='records')

### Get list of actors per movie id

In [7]:
# extract the list of actors for a movie
def get_actors(cast):
    cast_actors = []
    for x in ast.literal_eval(cast):
        # filter for actors with more than 20 movies
        if x['name'] in top_actors.index:
            cast_actors.append(x['name'])

    return cast_actors

In [8]:
movie_cast = credits[['cast', 'id']]
# extract the list of actors for each movies 
movie_cast['actors'] = movie_cast.cast.apply(lambda x: get_actors(x))
movie_cast = movie_cast.drop('cast', axis=1)

In [9]:
movie_cast.rename(columns={'id': 'movie_id'})
# remove movies with an empty list of selected actors
movie_cast = movie_cast[movie_cast.actors.apply(lambda x: len(x) > 0)]

In [10]:
movie_cast

Unnamed: 0,id,actors
0,862,"[Tom Hanks, Tim Allen, Don Rickles, Jim Varney..."
1,8844,"[Robin Williams, Kirsten Dunst, Bonnie Hunt, B..."
2,15602,"[Walter Matthau, Jack Lemmon, Ann-Margret, Sop..."
3,31357,"[Angela Bassett, Loretta Devine, Dennis Haysbe..."
4,11862,"[Steve Martin, Diane Keaton, Martin Short, Geo..."
...,...,...
45467,390959,"[James Gleason, Chris Parnell]"
45468,289923,[Joshua Leonard]
45469,222848,[Ron Jeremy]
45470,30840,"[Patrick Bergin, Uma Thurman, David Morrissey,..."


### Create actor connections 

In [11]:
def get_connections(list):
    # creates list of all possible pairs in list 
    return [(a,b) for i, a in enumerate(list) for b in list[i + 1:]]

In [12]:
actor_pairs = movie_cast.actors.apply(lambda x: get_connections(x)).explode().tolist()
# cleaning
actor_pairs = [x for x in actor_pairs if str(x) != 'nan']

In [13]:
actor_pairs

[('Tom Hanks', 'Tim Allen'),
 ('Tom Hanks', 'Don Rickles'),
 ('Tom Hanks', 'Jim Varney'),
 ('Tom Hanks', 'Wallace Shawn'),
 ('Tom Hanks', 'John Ratzenberger'),
 ('Tom Hanks', 'Laurie Metcalf'),
 ('Tom Hanks', 'R. Lee Ermey'),
 ('Tim Allen', 'Don Rickles'),
 ('Tim Allen', 'Jim Varney'),
 ('Tim Allen', 'Wallace Shawn'),
 ('Tim Allen', 'John Ratzenberger'),
 ('Tim Allen', 'Laurie Metcalf'),
 ('Tim Allen', 'R. Lee Ermey'),
 ('Don Rickles', 'Jim Varney'),
 ('Don Rickles', 'Wallace Shawn'),
 ('Don Rickles', 'John Ratzenberger'),
 ('Don Rickles', 'Laurie Metcalf'),
 ('Don Rickles', 'R. Lee Ermey'),
 ('Jim Varney', 'Wallace Shawn'),
 ('Jim Varney', 'John Ratzenberger'),
 ('Jim Varney', 'Laurie Metcalf'),
 ('Jim Varney', 'R. Lee Ermey'),
 ('Wallace Shawn', 'John Ratzenberger'),
 ('Wallace Shawn', 'Laurie Metcalf'),
 ('Wallace Shawn', 'R. Lee Ermey'),
 ('John Ratzenberger', 'Laurie Metcalf'),
 ('John Ratzenberger', 'R. Lee Ermey'),
 ('Laurie Metcalf', 'R. Lee Ermey'),
 ('Robin Williams', 'Kirste

In [14]:
actor_connections_df = pd.DataFrame(actor_pairs, columns=['source', 'target'])
actor_connections_df['weigth'] = 1
actor_edges_df = actor_connections_df.groupby(['source', 'target']).weigth.count().reset_index()
actor_edges_df

Unnamed: 0,source,target,weigth
0,'Snub' Pollard,Anna Lee,1
1,'Snub' Pollard,Anne Shirley,1
2,'Snub' Pollard,Arthur Tovey,1
3,'Snub' Pollard,Bert Stevens,4
4,'Snub' Pollard,Bess Flowers,1
...,...,...,...
342242,Моррис Честнат,Taye Diggs,2
342243,Моррис Честнат,Terrence Howard,2
342244,Моррис Честнат,Tess Harper,1
342245,Моррис Честнат,Tom Lister Jr.,1


In [None]:
# write to csv
actor_connections_df.to_csv('small_data/actor_edges.csv', index=False)

In [24]:
co_actors_df = pd.DataFrame(actor_connections_df.groupby('target')['source'].apply(list)).reset_index()
co_actors_df = co_actors_df.rename(columns={'target': 'actor', 'source': 'co_actors'})
co_actors_df.to_csv('small_data/co_actors.csv', index=False)

### **Create network** of first degree connections for one actor

In [4]:
# read from json
actor_nodes_df = pd.read_json('actor_nodes.json')
actor_connections_df = pd.read_json('actor_edges.json')
actor_neighbours_df = pd.read_csv('small_data/co_actors.csv')

In [29]:
def get_actor_connections(actor, n):
    # get his/her first degree neighbours
    one_actor_from_source = actor_connections_df[actor_connections_df.source == actor]
    one_actor_from_target = actor_connections_df[actor_connections_df.target == actor]

    top20_connections = pd.concat([one_actor_from_source, one_actor_from_target], axis = 0)
    top20_connections = top20_connections.sort_values('weigth', ascending=False).head(n)

    return top20_connections

def get_neighbour_nodes(actor_edges):

    one_actor_nodes = pd.concat([actor_edges.source, actor_edges.target], axis=0).to_frame(name='id')

    return one_actor_nodes.drop_duplicates()

In [35]:
# choose the actor
actor_name = 'Tom Cruise'

one_actor_edges = get_actor_connections(actor_name, 50)
# to json
one_actor_edges.to_json('one_actor_edges.json', orient='records', force_ascii=False)

In [36]:
# get all the nodes for the actor connections
one_actor_nodes = get_neighbour_nodes(one_actor_edges)
# to json
one_actor_nodes.to_json('one_actor_nodes.json', orient='records', force_ascii=False)

### **Create network** examples for 10 actors with most connections

In [4]:
actors_sample = actor_nodes_df.sample(30)
actors_sample

Unnamed: 0,id
1802,Malin Åkerman
1208,William Benedict
640,Rob Lowe
3305,George Gaynes
2376,Egon Brecher
3433,Miranda Otto
3253,Cy Kendall
508,Steven Seagal
3548,Sheryl Lee
3478,Katharine Alexander


In [49]:
from_source = actor_connections_df[actor_connections_df.source.apply(lambda x: x in actors_sample.id.tolist())]
sample_connections = from_source[from_source.target.apply(lambda x: x in actors_sample.id.tolist())]

In [50]:
sample_connections

Unnamed: 0,source,target,weigth
8727,Stephen McHattie,George Lucas,1
36445,Peter Maloney,Seth Green,1
37133,Ian Holm,Luke Perry,1
49804,Ed Lauter,William Prince,1
54200,Jon Voight,Seth Green,1
56646,Jon Voight,Ali Larter,1
208497,Stephen McHattie,Ethan Suplee,1
215632,Toni Collette,Nick Searcy,1
234358,Kelsey Grammer,Jon Voight,1
243682,Ali Larter,Jerry O'Connell,1


In [51]:
in_source = actors_sample[actors_sample.id.apply(lambda x: x in sample_connections.source.tolist())]
in_target = actors_sample[actors_sample.id.apply(lambda x: x in sample_connections.target.tolist())]
not_in_source = in_target[in_target.id.apply(lambda x: x not in in_source.id.tolist())]

In [52]:
actors_sample_clean = pd.concat([in_source, not_in_source], axis=0)

In [55]:
# write to json
actors_sample_clean.to_json('small_data/sample_actor_nodes.json', orient='records')
sample_connections.to_json('small_data/sample_actor_edges.json', orient='records')