In [1]:
# Library imports
import pandas as pd
import numpy as np
import json

In [2]:
# Load the two raw CSV files: df1 is the primary dataset, df2 the secondary one
df1, df2 = map(pd.read_csv, ('../data/primary_dataset.csv', '../data/secondary_dataset.csv'))

## Primary dataset processing

In [3]:
# List the column names of the primary dataset to see which fields are available
df1.columns

Index(['Rank', 'Title', 'Genre', 'Description', 'Director', 'Actors', 'Year',
       'Runtime (Minutes)', 'Rating', 'Votes', 'Revenue (Millions)',
       'Metascore'],
      dtype='object')

In [4]:
# Lower-case the movie titles (for convenience when matching titles later)
df1['Title'] = df1['Title'].apply(str.lower)
# Turn the comma-separated "Actors" string into a python list of trimmed names
df1['Actors List'] = df1['Actors'].apply(lambda cast: list(map(str.strip, cast.split(","))))

In [5]:
# Select the relevant columns and give them standardized names in one step
# (for now - add more fields for extra filter features)
df1_relevant = df1[['Title', 'Actors List']].rename(
    columns={'Title': 'title', 'Actors List': 'actors'}
)
df1_relevant.head()

Unnamed: 0,title,actors
0,guardians of the galaxy,"[Chris Pratt, Vin Diesel, Bradley Cooper, Zoe ..."
1,prometheus,"[Noomi Rapace, Logan Marshall-Green, Michael F..."
2,split,"[James McAvoy, Anya Taylor-Joy, Haley Lu Richa..."
3,sing,"[Matthew McConaughey, Reese Witherspoon, Seth ..."
4,suicide squad,"[Will Smith, Jared Leto, Margot Robbie, Viola ..."


## Secondary dataset processing

In [6]:
# List the column names of the secondary dataset to see which fields are available
df2.columns

Index(['color', 'director_name', 'num_critic_for_reviews', 'duration',
       'director_facebook_likes', 'actor_3_facebook_likes', 'actor_2_name',
       'actor_1_facebook_likes', 'gross', 'genres', 'actor_1_name',
       'movie_title', 'num_voted_users', 'cast_total_facebook_likes',
       'actor_3_name', 'facenumber_in_poster', 'plot_keywords',
       'movie_imdb_link', 'num_user_for_reviews', 'language', 'country',
       'content_rating', 'budget', 'title_year', 'actor_2_facebook_likes',
       'imdb_score', 'aspect_ratio', 'movie_facebook_likes'],
      dtype='object')

In [7]:
# The titles carry a "\xa0" (non-breaking space) character: remove it
df2['movie_title'] = df2['movie_title'].replace(u'\xa0', u'', regex=True)
# Collect the three actor columns into one python list per movie. The list is
# built directly from the row values (missing names are skipped) instead of
# joining on "," and splitting again, which would break apart any name that
# itself contains a comma.
df2['actors_list'] = df2[['actor_1_name', 'actor_2_name', 'actor_3_name']].apply(
    lambda row: [name.strip() for name in row.dropna()], axis=1
)
# Convert movie titles to lower case (for convenience). .str.lower() leaves a
# missing title as NaN (removed by the dropna below) instead of raising.
df2['movie_title'] = df2['movie_title'].str.lower()
# Drop rows with NaN values
# NOTE(review): this drops a row when ANY column of df2 is NaN, including
# columns that the cells shown here never use afterwards - confirm this is intended.
df2 = df2.dropna()

In [8]:
# Select the relevant columns and give them standardized names in one step
# (for now - add more fields for extra filter features)
df2_relevant = df2[
    ['movie_title', 'director_name', 'actors_list', 'imdb_score', 'gross', 'budget']
].rename(columns={
    'movie_title': 'title',
    'director_name': 'director',
    'actors_list': 'actors',
    'imdb_score': 'imdb_rating',
    'gross': 'revenue',
})
df2_relevant.head()

Unnamed: 0,title,director,actors,imdb_rating,revenue,budget
0,avatar,James Cameron,"[CCH Pounder, Joel David Moore, Wes Studi]",7.9,760505847.0,237000000.0
1,pirates of the caribbean: at world's end,Gore Verbinski,"[Johnny Depp, Orlando Bloom, Jack Davenport]",7.1,309404152.0,300000000.0
2,spectre,Sam Mendes,"[Christoph Waltz, Rory Kinnear, Stephanie Sigman]",6.8,200074175.0,245000000.0
3,the dark knight rises,Christopher Nolan,"[Tom Hardy, Christian Bale, Joseph Gordon-Levitt]",8.5,448130642.0,250000000.0
5,john carter,Andrew Stanton,"[Daryl Sabara, Samantha Morton, Polly Walker]",6.6,73058679.0,263700000.0


## Combine datasets

**Pointers about combining datasets:**
- Keep only the entries that appear in df2, because only df2 has budget values
- Where a title is in both datasets, take the actors from df1, as it lists more of them (df1: 3 or 4 actors, df2: 3 actors)

In [9]:
# Inner-join the two frames on title so only the common entries remain; the
# clashing "actors" columns come back as actors_x (df2) and actors_y (df1)
# NOTE(review): the join key is the title alone - different movies sharing a
# title would be matched to each other; confirm this is acceptable.
df_merge = pd.merge(df2_relevant, df1_relevant, how="inner", on="title")
df_merge.head()

Unnamed: 0,title,director,actors_x,imdb_rating,revenue,budget,actors_y
0,avatar,James Cameron,"[CCH Pounder, Joel David Moore, Wes Studi]",7.9,760505847.0,237000000.0,"[Sam Worthington, Zoe Saldana, Sigourney Weave..."
1,pirates of the caribbean: at world's end,Gore Verbinski,"[Johnny Depp, Orlando Bloom, Jack Davenport]",7.1,309404152.0,300000000.0,"[Johnny Depp, Orlando Bloom, Keira Knightley, ..."
2,spectre,Sam Mendes,"[Christoph Waltz, Rory Kinnear, Stephanie Sigman]",6.8,200074175.0,245000000.0,"[Daniel Craig, Christoph Waltz, Léa Seydoux, R..."
3,the dark knight rises,Christopher Nolan,"[Tom Hardy, Christian Bale, Joseph Gordon-Levitt]",8.5,448130642.0,250000000.0,"[Christian Bale, Tom Hardy, Anne Hathaway, Gar..."
4,john carter,Andrew Stanton,"[Daryl Sabara, Samantha Morton, Polly Walker]",6.6,73058679.0,263700000.0,"[Taylor Kitsch, Lynn Collins, Willem Dafoe, Sa..."


In [10]:
# Keep the actors list from the primary dataset (actors_y) and drop the one
# from the secondary dataset (actors_x), then restore the standardized column
# names and order. The guard makes the cell safe to re-run: once actors_y has
# been consumed there is nothing left to do, whereas the unguarded version
# raised a KeyError on a second execution.
if 'actors_y' in df_merge.columns:
    df_merge = df_merge.drop(columns=['actors_x']).rename(columns={'actors_y': 'actors'})
    df_merge = df_merge[['title', 'director', 'actors', 'imdb_rating', 'revenue', 'budget']]

In [11]:
# Secondary-dataset entries whose title did not make it into the merged frame
df2_exclusive = df2_relevant.loc[lambda frame: ~frame['title'].isin(df_merge['title'])]

In [12]:
# Finally, combine the two dataframes. pd.concat is used because
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0.
# As with append, each frame keeps its own row labels (no ignore_index).
df_comb = pd.concat([df_merge, df2_exclusive])
df_comb.head()

Unnamed: 0,title,director,actors,imdb_rating,revenue,budget
0,avatar,James Cameron,"[Sam Worthington, Zoe Saldana, Sigourney Weave...",7.9,760505847.0,237000000.0
1,pirates of the caribbean: at world's end,Gore Verbinski,"[Johnny Depp, Orlando Bloom, Keira Knightley, ...",7.1,309404152.0,300000000.0
2,spectre,Sam Mendes,"[Daniel Craig, Christoph Waltz, Léa Seydoux, R...",6.8,200074175.0,245000000.0
3,the dark knight rises,Christopher Nolan,"[Christian Bale, Tom Hardy, Anne Hathaway, Gar...",8.5,448130642.0,250000000.0
4,john carter,Andrew Stanton,"[Taylor Kitsch, Lynn Collins, Willem Dafoe, Sa...",6.6,73058679.0,263700000.0


In [13]:
# df_comb.to_csv(r'tmp1.csv', index = False)

## Create director-actor pairs

In [14]:
# Separate the actors list into one row per actor, then remove duplicate entries
df_comb = df_comb.explode('actors').drop_duplicates()
df_comb.head()

Unnamed: 0,title,director,actors,imdb_rating,revenue,budget
0,avatar,James Cameron,Sam Worthington,7.9,760505847.0,237000000.0
0,avatar,James Cameron,Zoe Saldana,7.9,760505847.0,237000000.0
0,avatar,James Cameron,Sigourney Weaver,7.9,760505847.0,237000000.0
0,avatar,James Cameron,Michelle Rodriguez,7.9,760505847.0,237000000.0
1,pirates of the caribbean: at world's end,Gore Verbinski,Johnny Depp,7.1,309404152.0,300000000.0


In [15]:
# One row per director-actor pair; every other column is collected into a
# list holding the values of all rows that belong to that pair
df_group = (
    df_comb
    .groupby(['director', 'actors'], as_index=False)
    [['title', 'imdb_rating', 'revenue', 'budget']]
    .agg(lambda values: list(values))
)
df_group.head()

Unnamed: 0,director,actors,title,imdb_rating,revenue,budget
0,Aaron Schneider,Bill Cobbs,[get low],[7.1],[9176553.0],[7500000.0]
1,Aaron Schneider,Bill Murray,[get low],[7.1],[9176553.0],[7500000.0]
2,Aaron Schneider,Robert Duvall,[get low],[7.1],[9176553.0],[7500000.0]
3,Aaron Seltzer,Alyson Hannigan,[date movie],[2.7],[48546578.0],[20000000.0]
4,Aaron Seltzer,Carmen Electra,[date movie],[2.7],[48546578.0],[20000000.0]


In [16]:
# Number of movies per pair = length of the pair's title list
df_group['movies_count'] = df_group['title'].map(len)
# Keep only the pairs with at least 2 movies
df_filter = df_group.loc[df_group['movies_count'].gt(1)]
df_filter.head()

Unnamed: 0,director,actors,title,imdb_rating,revenue,budget,movies_count
22,Adam McKay,John C. Reilly,"[talladega nights: the ballad of ricky bobby, ...","[6.6, 6.9]","[148213377.0, 100468793.0]","[73000000.0, 65000000.0]",2
28,Adam McKay,Steve Carell,"[the big short, anchorman 2: the legend contin...","[7.8, 6.3, 7.2]","[70235322.0, 2175312.0, 84136909.0]","[28000000.0, 50000000.0, 26000000.0]",3
29,Adam McKay,Will Ferrell,"[the other guys, talladega nights: the ballad ...","[6.7, 6.6, 6.9, 6.3, 7.2]","[119219978.0, 148213377.0, 100468793.0, 217531...","[100000000.0, 73000000.0, 65000000.0, 50000000...",5
193,Alex Kendrick,Alex Kendrick,"[courageous, facing the giants]","[7.0, 6.7]","[34522221.0, 10174663.0]","[2000000.0, 100000.0]",2
195,Alex Kendrick,Erin Bethea,"[fireproof, facing the giants]","[6.5, 6.7]","[33451479.0, 10174663.0]","[500000.0, 100000.0]",2


In [17]:
# df_filter.to_csv(r'tmp2.csv', index = False)

In [18]:
### Store the result into a json file
directors = df_filter['director'].drop_duplicates()
actors = df_filter['actors'].drop_duplicates()

# Nodes: every distinct director (group 1) followed by every distinct actor (group 2)
# NOTE(review): a person who is both a director and an actor gets two nodes
# with the same id - confirm the consumer of the json copes with that.
nodes = (
    [{'id': person, 'group' : 1} for person in directors]
    + [{'id': person, 'group' : 2} for person in actors]
)
# Node ids in the same order as the nodes list
names = [node['id'] for node in nodes]

# Links: one per director-actor pair, carrying the per-movie value lists
links = [
    {
        "source": pair['director'],
        "target": pair['actors'],
        "title": pair['title'],
        "imdb_rating": pair['imdb_rating'],
        "revenue": pair['revenue'],
        "budget": pair['budget'],
        "counts": pair['movies_count'],
    }
    for _, pair in df_filter.iterrows()
]

json_file = {'nodes': nodes, 'links': links}

with open('final-result.json', 'w') as fp:
    json.dump(json_file, fp)

In [19]:
# Show the director-actor pairs with at least 2 movies (the frame exported above)
df_filter

Unnamed: 0,director,actors,title,imdb_rating,revenue,budget,movies_count
22,Adam McKay,John C. Reilly,"[talladega nights: the ballad of ricky bobby, ...","[6.6, 6.9]","[148213377.0, 100468793.0]","[73000000.0, 65000000.0]",2
28,Adam McKay,Steve Carell,"[the big short, anchorman 2: the legend contin...","[7.8, 6.3, 7.2]","[70235322.0, 2175312.0, 84136909.0]","[28000000.0, 50000000.0, 26000000.0]",3
29,Adam McKay,Will Ferrell,"[the other guys, talladega nights: the ballad ...","[6.7, 6.6, 6.9, 6.3, 7.2]","[119219978.0, 148213377.0, 100468793.0, 217531...","[100000000.0, 73000000.0, 65000000.0, 50000000...",5
193,Alex Kendrick,Alex Kendrick,"[courageous, facing the giants]","[7.0, 6.7]","[34522221.0, 10174663.0]","[2000000.0, 100000.0]",2
195,Alex Kendrick,Erin Bethea,"[fireproof, facing the giants]","[6.5, 6.7]","[33451479.0, 10174663.0]","[500000.0, 100000.0]",2
...,...,...,...,...,...,...,...
10915,Woody Allen,Woody Allen,"[to rome with love, the curse of the jade scor...","[6.3, 6.8, 7.4, 6.7, 6.4, 6.6, 6.4, 8.1]","[16684352.0, 7496522.0, 10569071.0, 17071230.0...","[17000000.0, 26000000.0, 20000000.0, 18000000....",8
10951,Zach Braff,Michael Weston,"[wish i was here, garden state]","[6.7, 7.6]","[3588432.0, 26781723.0]","[6000000.0, 2500000.0]",2
10955,Zack Snyder,Abbie Cornish,"[sucker punch, legend of the guardians: the ow...","[6.1, 7.0]","[36381716.0, 55673333.0]","[82000000.0, 80000000.0]",2
10956,Zack Snyder,Amy Adams,"[batman v superman: dawn of justice, man of st...","[6.9, 7.2]","[330249062.0, 291021565.0]","[250000000.0, 225000000.0]",2
