## ADA - P2

### Load data

In [2]:
import pandas as pd

# Define data folder
DATA_FOLDER = 'Data/MovieSummaries/'

# Column names from ReadMe file
char_col = ['Wikipedia movie ID', 'Freebase movie ID', 'Movie release date', 'Character name', 'Actor date of birth', 'Actor gender', 
                'Actor height (in meters)', 'Actor ethnicity (Freebase ID)', 'Actor name', 'Actor age at movie release',
                'Freebase character/actor map ID', 'Freebase character ID', 'Freebase actor ID']

mov_col = ['Wikipedia movie ID', 'Freebase movie ID', 'Movie name', 'Movie release date', 'Movie box office revenue',
               'Movie runtime', 'Movie languages (Freebase ID:name tuples)', 'Movie countries (Freebase ID:name tuples)', 
               'Movie genres (Freebase ID:name tuples)']

# Import characters and movies data into pandas dataframe
characters = pd.read_table(DATA_FOLDER+'character.metadata.tsv', header=None, names=char_col)
movies = pd.read_table(DATA_FOLDER+'movie.metadata.tsv', header=None, names=mov_col)

### Characters data

In [21]:
characters.shape

(450669, 13)

In [23]:
# Percent of missing data
(characters.isna().sum() / len(characters) * 100).sort_values(ascending=False)

Actor ethnicity (Freebase ID)      76.466542
Actor height (in meters)           65.645740
Character name                     57.220488
Freebase character ID              57.218269
Actor age at movie release         35.084064
Actor date of birth                23.552763
Actor gender                       10.120288
Movie release date                  2.217814
Actor name                          0.272484
Freebase actor ID                   0.180842
Wikipedia movie ID                  0.000000
Freebase movie ID                   0.000000
Freebase character/actor map ID     0.000000
dtype: float64

### Movies data

In [24]:
movies.shape

(81741, 9)

In [26]:
# Percent of missing data
(movies.isna().sum() / len(movies) * 100).sort_values(ascending=False)

Movie box office revenue                     89.722416
Movie runtime                                25.018045
Movie release date                            8.443743
Wikipedia movie ID                            0.000000
Freebase movie ID                             0.000000
Movie name                                    0.000000
Movie languages (Freebase ID:name tuples)     0.000000
Movie countries (Freebase ID:name tuples)     0.000000
Movie genres (Freebase ID:name tuples)        0.000000
dtype: float64

### Merge dataframes

In [35]:
# Merge left on characters
df = pd.merge(characters, movies, on=['Wikipedia movie ID', 'Freebase movie ID'], how='left')
display(df)

Unnamed: 0,Wikipedia movie ID,Freebase movie ID,Movie release date_x,Character name,Actor date of birth,Actor gender,Actor height (in meters),Actor ethnicity (Freebase ID),Actor name,Actor age at movie release,Freebase character/actor map ID,Freebase character ID,Freebase actor ID,Movie name,Movie release date_y,Movie box office revenue,Movie runtime,Movie languages (Freebase ID:name tuples),Movie countries (Freebase ID:name tuples),Movie genres (Freebase ID:name tuples)
0,975900,/m/03vyhn,2001-08-24,Akooshay,1958-08-26,F,1.620,,Wanda De Jesus,42.0,/m/0bgchxw,/m/0bgcj3x,/m/03wcfv7,Ghosts of Mars,2001-08-24,14010832.0,98.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science..."
1,975900,/m/03vyhn,2001-08-24,Lieutenant Melanie Ballard,1974-08-15,F,1.780,/m/044038p,Natasha Henstridge,27.0,/m/0jys3m,/m/0bgchn4,/m/0346l4,Ghosts of Mars,2001-08-24,14010832.0,98.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science..."
2,975900,/m/03vyhn,2001-08-24,Desolation Williams,1969-06-15,M,1.727,/m/0x67,Ice Cube,32.0,/m/0jys3g,/m/0bgchn_,/m/01vw26l,Ghosts of Mars,2001-08-24,14010832.0,98.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science..."
3,975900,/m/03vyhn,2001-08-24,Sgt Jericho Butler,1967-09-12,M,1.750,,Jason Statham,33.0,/m/02vchl6,/m/0bgchnq,/m/034hyc,Ghosts of Mars,2001-08-24,14010832.0,98.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science..."
4,975900,/m/03vyhn,2001-08-24,Bashira Kincaid,1977-09-25,F,1.650,,Clea DuVall,23.0,/m/02vbb3r,/m/0bgchp9,/m/01y9xg,Ghosts of Mars,2001-08-24,14010832.0,98.0,"{""/m/02h40lc"": ""English Language""}","{""/m/09c7w0"": ""United States of America""}","{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
450664,913762,/m/03pcrp,1992-05-21,Elensh,1970-05,F,,,Dorothy Elias-Fahn,,/m/0kr406c,/m/0kr406h,/m/0b_vcv,The Super Dimension Fortress Macross II: Lover...,1992-05-21,,150.0,"{""/m/03_9r"": ""Japanese Language""}","{""/m/03_3d"": ""Japan""}","{""/m/06n90"": ""Science Fiction"", ""/m/0gw5n2f"": ..."
450665,913762,/m/03pcrp,1992-05-21,Hibiki,1965-04-12,M,,,Jonathan Fahn,27.0,/m/0kr405_,/m/0kr4090,/m/0bx7_j,The Super Dimension Fortress Macross II: Lover...,1992-05-21,,150.0,"{""/m/03_9r"": ""Japanese Language""}","{""/m/03_3d"": ""Japan""}","{""/m/06n90"": ""Science Fiction"", ""/m/0gw5n2f"": ..."
450666,28308153,/m/0cp05t9,1957,,1941-11-18,M,1.730,/m/02w7gg,David Hemmings,15.0,/m/0g8ngmc,,/m/022g44,Five Clues to Fortune,1957,,129.0,"{""/m/02h40lc"": ""English Language""}","{""/m/07ssc"": ""United Kingdom""}","{""/m/0lsxr"": ""Crime Fiction""}"
450667,28308153,/m/0cp05t9,1957,,,,,,Roberta Paterson,,/m/0g8ngmj,,/m/0g8ngmm,Five Clues to Fortune,1957,,129.0,"{""/m/02h40lc"": ""English Language""}","{""/m/07ssc"": ""United Kingdom""}","{""/m/0lsxr"": ""Crime Fiction""}"


### Movie sequels

Consider all character names that occur in at least two separate movies, generally as a consequence of remakes or sequels

In [41]:
# Total number of separate characters in dataset
char_sum = df['Freebase character ID']
char_sum.shape

(450669,)

In [42]:
# Number of separate characters without NaNs
char_sum = char_sum.dropna()
char_sum.shape

(192804,)

Define characters from movie sequel

In [87]:
# Count number of movies in which character appears
char_counts = char_sum.value_counts()

# Keep characters appearing in at least 2 movies
char_counts = char_counts[char_counts > 1]
print(char_counts)

Freebase character ID
/m/0h577m2    146
/m/0d9kl      123
/m/0dng4       80
/m/0dngm       61
/m/057ph       43
             ... 
/m/0gfn32z      2
/m/0gvdq4t      2
/m/0h6j93p      2
/m/0cgr3fm      2
/m/0d7r2n2      2
Name: count, Length: 4938, dtype: int64


Filtrate df by movie sequels: create a df sequels that keeps only df rows with Freebase character ID from char_counts

In [43]:
sequels = df

### Most successful movie sequels

In [45]:
sequels.sort_values(by='Movie box office revenue', ascending=False)

Unnamed: 0,Wikipedia movie ID,Freebase movie ID,Movie release date_x,Character name,Actor date of birth,Actor gender,Actor height (in meters),Actor ethnicity (Freebase ID),Actor name,Actor age at movie release,Freebase character/actor map ID,Freebase character ID,Freebase actor ID,Movie name,Movie release date_y,Movie box office revenue,Movie runtime,Movie languages (Freebase ID:name tuples),Movie countries (Freebase ID:name tuples),Movie genres (Freebase ID:name tuples)
262128,4273140,/m/0bth54,2009-12-10,Parker Selfridge,1974-12-17,M,1.7145,/m/0222qb,Giovanni Ribisi,34.0,/m/03jpc_1,/m/07ykg9d,/m/03fbb6,Avatar,2009-12-10,2.782275e+09,178.0,"{""/m/02h40lc"": ""English Language"", ""/m/06nm1"":...",[Unknown],"{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science..."
262133,4273140,/m/0bth54,2009-12-10,Corporal Lyle Wainfleet,,M,,,Matt Gerald,,/m/07ykgdp,/m/07ykgds,/m/04mxshb,Avatar,2009-12-10,2.782275e+09,178.0,"{""/m/02h40lc"": ""English Language"", ""/m/06nm1"":...",[Unknown],"{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science..."
262131,4273140,/m/0bth54,2009-12-10,Tsu'Tey,1974-03-25,M,1.8300,/m/01hwt,Laz Alonso,35.0,/m/06whdzy,/m/07ykg9n,/m/05_44t,Avatar,2009-12-10,2.782275e+09,178.0,"{""/m/02h40lc"": ""English Language"", ""/m/06nm1"":...",[Unknown],"{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science..."
262130,4273140,/m/0bth54,2009-12-10,Mo'at,1952-12-25,F,1.6890,/m/0bh91q8,CCH Pounder,56.0,/m/03jp848,/m/07ykgbf,/m/054v3p,Avatar,2009-12-10,2.782275e+09,178.0,"{""/m/02h40lc"": ""English Language"", ""/m/06nm1"":...",[Unknown],"{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science..."
262129,4273140,/m/0bth54,2009-12-10,Norm Spellman,1977-09-25,M,,,Joel Moore,32.0,/m/05t7856,/m/07ykgcp,/m/04zqmj,Avatar,2009-12-10,2.782275e+09,178.0,"{""/m/02h40lc"": ""English Language"", ""/m/06nm1"":...",[Unknown],"{""/m/01jfsb"": ""Thriller"", ""/m/06n90"": ""Science..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
450664,913762,/m/03pcrp,1992-05-21,Elensh,1970-05,F,,,Dorothy Elias-Fahn,,/m/0kr406c,/m/0kr406h,/m/0b_vcv,The Super Dimension Fortress Macross II: Lover...,1992-05-21,,150.0,"{""/m/03_9r"": ""Japanese Language""}",[Unknown],"{""/m/06n90"": ""Science Fiction"", ""/m/0gw5n2f"": ..."
450665,913762,/m/03pcrp,1992-05-21,Hibiki,1965-04-12,M,,,Jonathan Fahn,27.0,/m/0kr405_,/m/0kr4090,/m/0bx7_j,The Super Dimension Fortress Macross II: Lover...,1992-05-21,,150.0,"{""/m/03_9r"": ""Japanese Language""}",[Unknown],"{""/m/06n90"": ""Science Fiction"", ""/m/0gw5n2f"": ..."
450666,28308153,/m/0cp05t9,1957,,1941-11-18,M,1.7300,/m/02w7gg,David Hemmings,15.0,/m/0g8ngmc,,/m/022g44,Five Clues to Fortune,1957,,129.0,"{""/m/02h40lc"": ""English Language""}",[Unknown],"{""/m/0lsxr"": ""Crime Fiction""}"
450667,28308153,/m/0cp05t9,1957,,,,,,Roberta Paterson,,/m/0g8ngmj,,/m/0g8ngmm,Five Clues to Fortune,1957,,129.0,"{""/m/02h40lc"": ""English Language""}",[Unknown],"{""/m/0lsxr"": ""Crime Fiction""}"


-> take most 1000 (or other) successful movie sequels

### Evaluate actor's success in following movies

Consider actors playing in subsequent movies: 

Consider first subsequent movie and compare success

Define success:
- box office revenue if we are lazy
- IMDb if we are motivated

Variables that might affect success
- number of sequels
- character type difference (far from each other?)
- movie genre difference (far from each other?)

- define how to measure personas difference: how to evaluate how far a character type is from another?
- some characters may be remarkable without being part of a movie sequel

In [32]:
# Example with Harry Potter
hp = characters.loc[characters['Actor name'] == 'Daniel Radcliffe']
display(hp.sort_values(by='Actor age at movie release',ascending=False))

Unnamed: 0,Wikipedia movie ID,Freebase movie ID,Movie release date,Character name,Actor date of birth,Actor gender,Actor height (in meters),Actor ethnicity (Freebase ID),Actor name,Actor age at movie release,Freebase character/actor map ID,Freebase character ID,Freebase actor ID
256976,35326709,/m/0j7l95d,2013,,1989-07-23,M,1.66,/m/02w7gg,Daniel Radcliffe,23.0,/m/0mzn05p,,/m/013_vh
318085,28250083,/m/0dgnnp1,2012-02-03,Arthur Kipps,1989-07-23,M,1.66,/m/02w7gg,Daniel Radcliffe,22.0,/m/0gw3f7w,/m/0gw85zy,/m/013_vh
84212,31941988,/m/0gvsynb,2011-07-07,Harry Potter,1989-07-23,M,1.66,/m/02w7gg,Daniel Radcliffe,21.0,/m/09lybcb,/m/03647x,/m/013_vh
84781,9834441,/m/02pth35,2010-11-17,Harry Potter,1989-07-23,M,1.66,/m/02w7gg,Daniel Radcliffe,21.0,/m/02tbbh6,/m/03647x,/m/013_vh
20929,858575,/m/03hxsv,2009-07-06,Harry Potter,1989-07-23,M,1.66,/m/02w7gg,Daniel Radcliffe,19.0,/m/02tbf6n,/m/03647x,/m/013_vh
50142,670407,/m/031hcx,2007-06-28,Harry Potter,1989-07-23,M,1.66,/m/02w7gg,Daniel Radcliffe,17.0,/m/0jz6hs,/m/03647x,/m/013_vh
159951,24336780,/m/03hl70g,2007,John Kipling,1989-07-23,M,1.66,/m/02w7gg,Daniel Radcliffe,17.0,/m/04htr93,/m/0gwsqhq,/m/013_vh
30320,667372,/m/031786,2005-11-06,Harry Potter,1989-07-23,M,1.66,/m/02w7gg,Daniel Radcliffe,16.0,/m/0jz6dz,/m/03647x,/m/013_vh
114730,667371,/m/03177r,2004-06-04,Harry Potter,1989-07-23,M,1.66,/m/02w7gg,Daniel Radcliffe,14.0,/m/0jz6mq,/m/03647x,/m/013_vh
114619,667368,/m/031778,2002-11-03,Harry Potter,1989-07-23,M,1.66,/m/02w7gg,Daniel Radcliffe,13.0,/m/0jz6b0,/m/03647x,/m/013_vh
