In [45]:
import pandas as pd
import numpy as np
import os
from ast import literal_eval
from itertools import combinations

pd.options.mode.chained_assignment = None

In [46]:
data_path = "./data"
if os.listdir(data_path) != ['tmdb_5000_credits.csv', 'tmdb_5000_movies.csv']:
    print("[ERROR] Please download and unzip the dataset in a subdirectory './data'.")
else:
    print("[INFO] The dataset is correctly placed.")

[ERROR] Please download and unzip the dataset in a subdirectory './data'.


### Loading the data

In [47]:
tmdb_credits = pd.read_csv(os.path.join(data_path, "tmdb_5000_credits.csv"))
tmdb_credits.head(2)

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."


In [48]:
tmdb_movies = pd.read_csv(os.path.join(data_path, "tmdb_5000_movies.csv"))
tmdb_movies.head(2)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500


In [49]:
def mapping_casts(list_casts, useful_keys = {"id", "gender", "name"}):
    """Mapping casts to keep useful information."""
    result = []
    for cast in list_casts:
        filtering = {k:v for k,v in cast.items() if k in useful_keys}
        if len(filtering) == len(useful_keys):
            result.append(filtering)
    return result
mapping_casts(literal_eval(tmdb_credits.cast[0]))[:5]

[{'gender': 2, 'id': 65731, 'name': 'Sam Worthington'},
 {'gender': 1, 'id': 8691, 'name': 'Zoe Saldana'},
 {'gender': 1, 'id': 10205, 'name': 'Sigourney Weaver'},
 {'gender': 2, 'id': 32747, 'name': 'Stephen Lang'},
 {'gender': 1, 'id': 17647, 'name': 'Michelle Rodriguez'}]

In [50]:
tmdb_credits_clean = tmdb_credits.copy()
tmdb_credits_clean['cast'] = tmdb_credits_clean['cast'].apply(lambda x: mapping_casts(literal_eval(x)))
tmdb_credits_clean.head()

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{'gender': 2, 'id': 65731, 'name': 'Sam Worth...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[{'gender': 2, 'id': 85, 'name': 'Johnny Depp'...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,"[{'gender': 2, 'id': 8784, 'name': 'Daniel Cra...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,"[{'gender': 2, 'id': 3894, 'name': 'Christian ...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"[{'gender': 2, 'id': 60900, 'name': 'Taylor Ki...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [51]:
tmdb_credits_clean_cast = tmdb_credits_clean[["movie_id", "title", "cast"]].explode("cast")


def get_gender(cast):
    splitted = str(cast).split()
    if len(splitted)>1:
        return splitted[1][0]
    else:
        -1
        
def get_id(cast):
    splitted = str(cast).split()
    if len(splitted)>1:
        return splitted[3].replace(",", "")
    else:
        -1
        
def get_name(cast):
    splitted = str(cast).split(", ")
    if len(splitted)>1:
        return splitted[2].replace("'", "").replace(",", "").replace("name: ", "").replace("}", "")
    else:
        -1

tmdb_credits_clean_cast['gender'] = tmdb_credits_clean_cast.cast.apply(get_gender)
tmdb_credits_clean_cast = tmdb_credits_clean_cast[tmdb_credits_clean_cast.gender != -1]

tmdb_credits_clean_cast['person_id'] = tmdb_credits_clean_cast.cast.apply(get_id)
tmdb_credits_clean_cast = tmdb_credits_clean_cast[tmdb_credits_clean_cast.person_id != -1]

tmdb_credits_clean_cast['name'] = tmdb_credits_clean_cast.cast.apply(get_name)
tmdb_credits_clean_cast = tmdb_credits_clean_cast[tmdb_credits_clean_cast.name != -1]

tmdb_credits_clean_cast.drop(columns=['cast'], inplace=True)
tmdb_credits_clean_cast

Unnamed: 0,movie_id,title,gender,person_id,name
0,19995,Avatar,2,65731,Sam Worthington
0,19995,Avatar,1,8691,Zoe Saldana
0,19995,Avatar,1,10205,Sigourney Weaver
0,19995,Avatar,2,32747,Stephen Lang
0,19995,Avatar,1,17647,Michelle Rodriguez
...,...,...,...,...,...
4802,25975,My Date with Drew,2,21315,Eric Roberts
4802,25975,My Date with Drew,0,2171,Griffin Dunne
4802,25975,My Date with Drew,2,2231,Samuel L. Jackson
4802,25975,My Date with Drew,2,14407,Matt LeBlanc


In [52]:
create_pair = lambda x : pd.DataFrame(list(combinations(x.values,2)), 
                            columns=['actor_1','actor_2'])

combo = (tmdb_credits_clean_cast.groupby('movie_id')['name'].apply(create_pair)
                               .reset_index(level=1, drop=True)
                               .reset_index())
combo['check_string'] = combo.apply(lambda row: ''.join(sorted([row['actor_1'], row['actor_2'], str(row['movie_id'])])), axis=1)
combo.drop_duplicates('check_string', inplace=True)
combo.drop(columns=['check_string'], inplace=True)
combo

Unnamed: 0,movie_id,actor_1,actor_2
0,5,Tim Roth,Antonio Banderas
1,5,Tim Roth,Jennifer Beals
2,5,Tim Roth,Madonna
3,5,Tim Roth,Marisa Tomei
4,5,Tim Roth,Bruce Willis
...,...,...,...
2042952,433715,Nicole Smolen,Ariana Stephens
2042953,433715,Nicole Smolen,Bryson Funk
2042954,433715,Kim Baldwin,Ariana Stephens
2042955,433715,Kim Baldwin,Bryson Funk


In [53]:
combo = combo[combo.actor_1 != combo.actor_2]
combo

Unnamed: 0,movie_id,actor_1,actor_2
0,5,Tim Roth,Antonio Banderas
1,5,Tim Roth,Jennifer Beals
2,5,Tim Roth,Madonna
3,5,Tim Roth,Marisa Tomei
4,5,Tim Roth,Bruce Willis
...,...,...,...
2042952,433715,Nicole Smolen,Ariana Stephens
2042953,433715,Nicole Smolen,Bryson Funk
2042954,433715,Kim Baldwin,Ariana Stephens
2042955,433715,Kim Baldwin,Bryson Funk


In [59]:
narrowed_movies = tmdb_movies[['id', 'genres', 'overview', 'original_title', 'release_date', 'popularity', 'vote_average', 'vote_count', 'production_companies']]
def parse_genres(genres):
    L=[]
    for genre in genres:
        L.append(genre['name'])
    if len(L) == 0:
        return np.nan
    return L

def parse_production_companies(production_companies):
    L=[]
    for company in production_companies:
        L.append(company['name'])
    if len(L) == 0:
        return np.nan
    return L

narrowed_movies["genres"] = narrowed_movies.genres.apply(lambda x: parse_genres(literal_eval(x)))
narrowed_movies["production_companies"] = narrowed_movies.production_companies.apply(lambda x: parse_production_companies(literal_eval(x)))
narrowed_movies.head()

Unnamed: 0,id,genres,overview,original_title,release_date,popularity,vote_average,vote_count,production_companies
0,19995,"[Action, Adventure, Fantasy, Science Fiction]","In the 22nd century, a paraplegic Marine is di...",Avatar,2009-12-10,150.437577,7.2,11800,"[Ingenious Film Partners, Twentieth Century Fo..."
1,285,"[Adventure, Fantasy, Action]","Captain Barbossa, long believed to be dead, ha...",Pirates of the Caribbean: At World's End,2007-05-19,139.082615,6.9,4500,"[Walt Disney Pictures, Jerry Bruckheimer Films..."
2,206647,"[Action, Adventure, Crime]",A cryptic message from Bond’s past sends him o...,Spectre,2015-10-26,107.376788,6.3,4466,"[Columbia Pictures, Danjaq, B24]"
3,49026,"[Action, Crime, Drama, Thriller]",Following the death of District Attorney Harve...,The Dark Knight Rises,2012-07-16,112.31295,7.6,9106,"[Legendary Pictures, Warner Bros., DC Entertai..."
4,49529,"[Action, Adventure, Science Fiction]","John Carter is a war-weary, former military ca...",John Carter,2012-03-07,43.926995,6.1,2124,[Walt Disney Pictures]


In [69]:
grouping = combo.groupby(by=['actor_1', 'actor_2'])['movie_id'].apply(list)
grouping = grouping.reset_index()
grouping

In [112]:
import json

numbers = range(3, 11)


def select_common_movies(n):
    """ Return the dataframe of actors with a list of common movies with length at least n. """
    temp = grouping[grouping['movie_id'].map(len) >= n]
    temp['actor_1'] = temp['actor_1'].apply(lambda name: name.replace('"', ''))
    temp['actor_2'] = temp['actor_2'].apply(lambda name: name.replace('"', ''))
    return temp

def get_unique_movie_ids(n):
    """ Return the unique movie_ids for n common movies. """
    slist = []
    for x in select_common_movies(n).movie_id:
        slist.extend(x)
    return sorted(list(dict.fromkeys(slist)))

def get_movie_info(n):
    """ Get movie info for the select movies with n common movies. """
    return narrowed_movies[narrowed_movies.id.isin(get_unique_movie_ids(n))].dropna()


def generate_dataset_common_movies(numbers):
    """ Return several datasets of common movies out of a list of numbers. """
    for n in numbers:
        common_movies_df = select_common_movies(n)
        unique_actors = list((common_movies_df.actor_1.append(common_movies_df.actor_2)).unique())
        nodes = list(map(lambda name: {"id":name}, unique_actors))

        def create_link(row):
            """Create a link from a pandas row from actor pairs with movie id."""
            d = {}
            d['source'] = row.actor_1
            d['target'] = row.actor_2
            d['movie_id'] = row.movie_id
            return d
        links = list(common_movies_df.apply(lambda row: create_link(row), axis=1))

        def create_movie_info(row):
            """Create movie info entries to further loading."""
            d = {}
            d['movie_id'] = row.id
            d['genres'] = row.genres
            d['overview'] = row.overview
            d['original_title'] = row.original_title
            d['release_date'] = row.release_date
            d['popularity'] = row.popularity
            d['vote_average'] = row.vote_average
            d['vote_count'] = row.vote_count
            d['production_companies'] = row.production_companies
            return d
        movie_infos = list(get_movie_info(n).apply(lambda row: create_movie_info(row), axis=1))

        data = {}
        data['nodes']=nodes
        data['links']=links
        data['movies_info'] = movie_infos
        
        print("[INFO]: Dataset with at least {} common movies has {} actors, {} links and {} different movies.".format(n,\
                                                                                                              len(nodes),\
                                                                                                              len(links),\
                                                                                                              len(movie_infos)))

        with open('network/data/dataset_{}_common_movies.json'.format(n), 'w') as outfile:
            json.dump(data, outfile)
            
generate_dataset_common_movies(numbers)

[INFO]: Dataset with at least 3 common movies has 1343 actors, 3620 links and 1376 different movies.
[INFO]: Dataset with at least 4 common movies has 446 actors, 955 links and 628 different movies.
[INFO]: Dataset with at least 5 common movies has 173 actors, 338 links and 305 different movies.
[INFO]: Dataset with at least 6 common movies has 91 actors, 184 links and 178 different movies.
[INFO]: Dataset with at least 7 common movies has 37 actors, 31 links and 95 different movies.
[INFO]: Dataset with at least 8 common movies has 14 actors, 11 links and 52 different movies.
[INFO]: Dataset with at least 9 common movies has 12 actors, 9 links and 43 different movies.
[INFO]: Dataset with at least 10 common movies has 8 actors, 5 links and 38 different movies.


In [114]:
data['movies_info']

KeyError: 'movies_info'

In [58]:
len(unique_actors)

91

### Trying with the 50 actors with most movie participation

In [93]:
most_50_actors = set(tmdb_credits_clean_cast['name'].dropna().apply(lambda name: name.replace('"', '')).value_counts().head(70).index)
mask3 = dataset_to_export['actor_1'].isin(most_50_actors)
mask4 = dataset_to_export['actor_2'].isin(most_50_actors)
dataset_to_export2 = dataset_to_export[mask3 & mask4]

In [94]:
movie_info2 = dataset_to_export2[['movie_id', 'genres', 'overview',
       'original_title', 'release_date', 'popularity', 'vote_average',
       'vote_count', 'production_companies']]
movie_info2 = movie_info2.drop_duplicates(subset='movie_id')

import json
unique_actors2 = list((dataset_to_export2.actor_1.append(dataset_to_export2.actor_2)).unique())
nodes = list(map(lambda name: {"id":name.replace('"', '')}, unique_actors2))

def create_link(row):
    """Create a link from a pandas row from actor pairs with movie id."""
    d = {}
    d['source'] = row.actor_1
    d['target'] = row.actor_2
    d['movie_id'] = row.movie_id
    return d
links = list(dataset_to_export2.apply(lambda row: create_link(row), axis=1))

def create_movie_info(row):
    """Create movie info entries to further loading."""
    d = {}
    d['movie_id'] = row.movie_id
    d['genres'] = row.genres
    d['overview'] = row.overview
    d['original_title'] = row.original_title
    d['release_date'] = row.release_date
    d['popularity'] = row.popularity
    d['vote_average'] = row.vote_average
    d['vote_count'] = row.vote_count
    d['production_companies'] = row.production_companies
    return d
movie_infos = list(dataset_to_export2.apply(lambda row: create_movie_info(row), axis=1))

data = {}
data['nodes']=nodes
data['links']=links
data['movie_info'] = movie_infos

with open('dataset_70_actors.json', 'w') as outfile:
    json.dump(data, outfile)
data

{'nodes': [{'id': 'Antonio Banderas'},
  {'id': 'Laurence Fishburne'},
  {'id': 'Robert De Niro'},
  {'id': 'Tom Cruise'},
  {'id': 'Harrison Ford'},
  {'id': 'Bruce Willis'},
  {'id': 'Steve Buscemi'},
  {'id': 'Julianne Moore'},
  {'id': 'Philip Seymour Hoffman'},
  {'id': 'George Clooney'},
  {'id': 'Brad Pitt'},
  {'id': 'Matt Damon'},
  {'id': 'Samuel L. Jackson'},
  {'id': 'Morgan Freeman'},
  {'id': 'Cate Blanchett'},
  {'id': 'Johnny Depp'},
  {'id': 'Bill Murray'},
  {'id': 'John Leguizamo'},
  {'id': 'Cameron Diaz'},
  {'id': 'Ben Stiller'},
  {'id': 'James Franco'},
  {'id': 'Tom Hanks'},
  {'id': 'John Travolta'},
  {'id': 'Will Ferrell'},
  {'id': 'Owen Wilson'},
  {'id': 'John Turturro'},
  {'id': 'Frank Welker'},
  {'id': 'Judi Dench'},
  {'id': 'Jack Black'},
  {'id': 'Donald Sutherland'},
  {'id': 'Sylvester Stallone'},
  {'id': 'Stanley Tucci'},
  {'id': 'Jim Broadbent'},
  {'id': 'J.K. Simmons'},
  {'id': 'Richard Jenkins'},
  {'id': 'Paul Rudd'},
  {'id': 'Bill Hade