In [1]:
import pandas as pd
import numpy as np
import ast
import matplotlib.pyplot as pltx
import networkx as nx

%load_ext autoreload
%autoreload 2

## Skip until *Create network ...* (everything before is pre-processing)

### Loading data

In [2]:
# Load the raw movie metadata and the credits table from their CSV files.
movies = pd.read_csv('../data/movies_metadata.csv')
credits = pd.read_csv('../data/credits.csv')

# Index the movies table by its `id` column.
movies = movies.set_index('id')

  exec(code_obj, self.user_global_ns, self.user_ns)


### Extracting actors (from the exploration notebook)

In [3]:
# Flatten the per-movie cast lists into one [name, id] row per cast entry.
# Each `cast` cell is a string holding a Python-literal list of dicts.
cast_in_movie = []
# Iterate the values directly: `Series.iteritems()` no longer exists in
# pandas >= 2.0, and the index it yielded was never used here.
for r in credits.cast:
    for g in ast.literal_eval(r):
        # NOTE(review): g["id"] is the id stored on the cast entry itself,
        # presumably the person's id rather than the movie's - verify. The
        # column is still called "movie_id" below because the next cell renames
        # that column to "movie_count"; only the row count per actor is used.
        cast_in_movie.append([g["name"], g["id"]])

# Convert to a dataframe; passing `columns=` here (instead of assigning
# `.columns` afterwards) also works when no cast entry was found.
cast_in_movie = pd.DataFrame(cast_in_movie, columns=["actor", "movie_id"])

In [4]:
# Count the cast entries per actor name and order from most to least frequent.
sorted_actors = (
    cast_in_movie
    .groupby('actor')
    .count()
    .rename(columns={"movie_id": "movie_count"})
    .sort_values(by="movie_count", ascending=False)
)

# Retain the actors that appear in at least 20 movies.
top_actors = sorted_actors.loc[sorted_actors['movie_count'] >= 20]

In [5]:
# Preview the retained actors (movie_count >= 20), sorted by movie_count descending.
top_actors

Unnamed: 0_level_0,movie_count
actor,Unnamed: 1_level_1
Bess Flowers,241
Christopher Lee,148
John Wayne,125
Samuel L. Jackson,123
Gérard Depardieu,110
...,...
Tony Roberts,20
Vic Tayback,20
Andrew Duggan,20
Donnie Wahlberg,20


**Create actor nodes dataframe**

In [6]:
# One node per retained actor: the actor name (index of `top_actors`) is the node id.
actor_nodes = top_actors.index.to_frame(index=False, name='id')
# Persist the nodes as a JSON array of {"id": ...} records.
actor_nodes.to_json(path_or_buf='small_data/actor_nodes.json', orient='records')

### Get list of actors per movie id

In [6]:
# extract the list of selected actors for a movie
def get_actors(cast, allowed_actors=None):
    """Return the names in a movie's cast that belong to the selected actors.

    Parameters
    ----------
    cast : str
        Python-literal string of a list of dicts, each having a 'name' key.
    allowed_actors : container of str, optional
        Names to keep (anything supporting ``in``). When omitted, the index
        of the notebook-level ``top_actors`` frame is used, looked up at call
        time.

    Returns
    -------
    list of str
        The matching names, in the order they appear in ``cast``.
    """
    if allowed_actors is None:
        # default filter: actors with 20 or more movies
        allowed_actors = top_actors.index

    return [
        x['name']
        for x in ast.literal_eval(cast)
        if x['name'] in allowed_actors
    ]

In [7]:
# For each movie keep its `id` plus the list of selected actors in its cast.
# `assign` builds a new frame, so nothing is written into a slice of `credits`
# (the previous column assignment on `credits[['cast', 'id']]` triggers
# pandas' SettingWithCopyWarning).
movie_cast = credits[['id']].assign(actors=credits['cast'].apply(get_actors))

In [8]:
# The movie id column intentionally keeps the name `id`: a bare
# `movie_cast.rename(columns={'id': 'movie_id'})` returns a new frame that was
# never stored (so it had no effect), and the later groupby aggregates on `id`.

# remove movies with an empty list of selected actors
movie_cast = movie_cast[movie_cast['actors'].map(len) > 0]

In [9]:
# Preview: one row per movie `id` with its non-empty list of selected actors.
movie_cast

Unnamed: 0,id,actors
0,862,"[Tom Hanks, Tim Allen, Don Rickles, Jim Varney..."
1,8844,"[Robin Williams, Kirsten Dunst, Bonnie Hunt, B..."
2,15602,"[Walter Matthau, Jack Lemmon, Ann-Margret, Sop..."
3,31357,"[Angela Bassett, Loretta Devine, Dennis Haysbe..."
4,11862,"[Steve Martin, Diane Keaton, Martin Short, Geo..."
...,...,...
45467,390959,"[James Gleason, Chris Parnell]"
45468,289923,[Joshua Leonard]
45469,222848,[Ron Jeremy]
45470,30840,"[Patrick Bergin, Uma Thurman, David Morrissey,..."


### Create actor connections 

In [12]:
def get_connections(list):
    """Return every unordered pair of elements of ``list`` as 2-tuples.

    Pairs keep the input order: ``(a, b)`` is produced only when ``a`` comes
    before ``b`` in the input. Fewer than two elements gives ``[]``.

    The parameter is still called ``list`` so existing keyword callers keep
    working; because it shadows the builtin, the result is built by unpacking
    rather than by calling ``list(...)``.
    """
    from itertools import combinations

    # combinations(..., 2) yields exactly the (i, j) pairs with i < j, in the
    # same order as the nested slice loop, without copying a slice per element.
    return [*combinations(list, 2)]

In [13]:
# Same preview of `movie_cast` as shown earlier (movie `id` + selected actors).
movie_cast

Unnamed: 0,id,actors
0,862,"[Tom Hanks, Tim Allen, Don Rickles, Jim Varney..."
1,8844,"[Robin Williams, Kirsten Dunst, Bonnie Hunt, B..."
2,15602,"[Walter Matthau, Jack Lemmon, Ann-Margret, Sop..."
3,31357,"[Angela Bassett, Loretta Devine, Dennis Haysbe..."
4,11862,"[Steve Martin, Diane Keaton, Martin Short, Geo..."
...,...,...
45467,390959,"[James Gleason, Chris Parnell]"
45468,289923,[Joshua Leonard]
45469,222848,[Ron Jeremy]
45470,30840,"[Patrick Bergin, Uma Thurman, David Morrissey,..."


In [21]:
# Build the co-star pairs of every movie, then explode to one pair per row;
# the rows keep the index of the movie they came from.
actor_pairs_series = movie_cast['actors'].apply(get_connections).explode()

In [18]:
# Flat Python list of all co-star pairs. A movie with a single selected actor
# has an empty pair list, which explode() turns into NaN; those are dropped.
actor_pairs = (
    movie_cast['actors']
    .apply(get_connections)
    .explode()
    .dropna()
    .tolist()
)

In [23]:
# Drop the NaN placeholders that explode() leaves for empty pair lists.
actor_pairs_series = actor_pairs_series.dropna()

In [24]:
# Preview the cleaned pairs; the index is the originating row of `movie_cast`.
actor_pairs_series

0                  (Tom Hanks, Tim Allen)
0                (Tom Hanks, Don Rickles)
0                 (Tom Hanks, Jim Varney)
0              (Tom Hanks, Wallace Shawn)
0          (Tom Hanks, John Ratzenberger)
                       ...               
45470    (David Morrissey, Jeroen Krabbé)
45470    (Jürgen Prochnow, Jeroen Krabbé)
45473         (Adam Baldwin, James Remar)
45473          (Adam Baldwin, Tom Wright)
45473           (James Remar, Tom Wright)
Name: actors, Length: 414497, dtype: object

In [53]:
# Attach the movie `id` to every pair by joining on the shared row index
# (inner merge, so movies without any pair are left out).
movie_ids = movie_cast.drop('actors', axis=1)
pairs_frame = actor_pairs_series.to_frame()
pair_movies = movie_ids.merge(pairs_frame, left_index=True, right_index=True)

In [54]:
# Preview: one row per (movie id, actor pair).
pair_movies

Unnamed: 0,id,actors
0,862,"(Tom Hanks, Tim Allen)"
0,862,"(Tom Hanks, Don Rickles)"
0,862,"(Tom Hanks, Jim Varney)"
0,862,"(Tom Hanks, Wallace Shawn)"
0,862,"(Tom Hanks, John Ratzenberger)"
...,...,...
45470,30840,"(David Morrissey, Jeroen Krabbé)"
45470,30840,"(Jürgen Prochnow, Jeroen Krabbé)"
45473,67758,"(Adam Baldwin, James Remar)"
45473,67758,"(Adam Baldwin, Tom Wright)"


In [60]:
# Orient every pair the same way, whichever direction it was first seen in.
# `already` holds the reversed form of each pair that has been returned as-is.
already = set()


def reverse_reversed(actors):
    """Return ``actors`` in the orientation of its first occurrence.

    A pair not found in ``already`` is returned unchanged and its reverse is
    recorded there; a pair that is found (i.e. its reverse was returned
    before) is flipped so both directions map to the same tuple.
    Depends on, and updates, the module-level ``already`` set.
    """
    if actors not in already:
        already.add(actors[::-1])
        return actors
    return actors[::-1]
# Normalise the direction of every pair, row by row in frame order.
pair_movies['actors'] = pair_movies['actors'].apply(reverse_reversed)

In [61]:
# Split each (first, second) pair into separate `source` / `target` columns.
pair_movies[['source', 'target']] = list(pair_movies['actors'])

In [62]:
# The tuple column is redundant now that source/target exist.
pair_movies = pair_movies.drop(columns='actors')

In [63]:
# Every (movie, pair) row counts as one shared movie.
pair_movies = pair_movies.assign(weight=1)


In [64]:
# remove self-connection
pair_movies = pair_movies[pair_movies.source != pair_movies.target]

In [72]:
# One row per (source, target): sum the weights and collect the movie ids.
pair_aggregations = {'weight': 'sum', 'id': list}
pair_movies = pair_movies.groupby(['source', 'target']).agg(pair_aggregations)

In [79]:
# After aggregation the `id` column holds a list of ids, so name it accordingly.
pair_movies = pair_movies.rename(columns={'id': 'movie_ids'})

In [77]:
# Turn the (source, target) group keys back into ordinary columns.
pair_movies = pair_movies.reset_index()

In [81]:
# Write the edge list as a JSON array of records.
pair_movies.to_json(path_or_buf='improved_actor_edges.json', orient='records')