In [1]:
import pandas as pd
import numpy as np
%matplotlib inline

# Folder (relative to the notebook) that holds every dataset file
DATA_FOLDER = 'Data/'

# Plot, character and movie files
PLOT_DATASET = f"{DATA_FOLDER}plot_summaries.txt"
CHARACTER_DATASET = f"{DATA_FOLDER}character.metadata.tsv"
MOVIE_DATASET = f"{DATA_FOLDER}movie.metadata.tsv"

# Test datasets used in the original study
NAME_DATASET = f"{DATA_FOLDER}name.clusters.txt"
TVTROPES_DATASET = f"{DATA_FOLDER}tvtropes.clusters.txt"

In [2]:
# Plot summaries are not loaded in this cell; the call is left commented out.
#plot = pd.read_csv(PLOT_DATASET)

##### Loading characters data #####
# The file is read as tab-separated with no header row, so pandas numbers the
# columns 0..n-1 and the real names are assigned right after.
character = pd.read_csv(CHARACTER_DATASET, sep='\t', header=None)
# Assign columns' names
character.columns = [
    'WikiMovie_ID', 'FMovie_ID', 'Release_date', 'Character_name',
    'Actor_dob', 'Actor_gender', 'Actor_height', 'Actor_ethnicity',
    'Actor_name', 'Actor_age', 'Map_ID',
    'FCharacter_ID', 'FActor_ID',
]


def _extract_freebase_id(column):
    """Strip the '/m/' prefix from a column of Freebase identifiers.

    Parameters
    ----------
    column : pd.Series
        String column whose values end with '/m/<identifier>'.

    Returns
    -------
    pd.DataFrame
        One-column frame (``expand=True`` is the ``str.extract`` default)
        holding the captured identifier. Rows that do not match the pattern,
        including missing values, hold the literal string 'NaN' rather than a
        real missing value.
    """
    # Raw string: '\w' inside a normal string literal is an invalid escape
    # sequence and triggers a SyntaxWarning on recent Python versions.
    return column.str.extract(r'/m/(\w+)$', expand=True).fillna('NaN')


# Extract freebase movie ID
fmovie_id = _extract_freebase_id(character['FMovie_ID'])
character['FMovie_ID'] = fmovie_id
# Extract freebase actor ethnicity ID
Actor_ethnicity = _extract_freebase_id(character['Actor_ethnicity'])
character['Actor_ethnicity'] = Actor_ethnicity
# Extract character/actor map ID
Map = _extract_freebase_id(character['Map_ID'])
character['Map_ID'] = Map
# Extract freebase character ID
FCharacter_ID = _extract_freebase_id(character['FCharacter_ID'])
character['FCharacter_ID'] = FCharacter_ID
# Extract freebase actor ID
FActor_ID = _extract_freebase_id(character['FActor_ID'])
character['FActor_ID'] = FActor_ID

# Display the dataframe
character.head()

Unnamed: 0,WikiMovie_ID,FMovie_ID,Release_date,Character_name,Actor_dob,Actor_gender,Actor_height,Actor_ethnicity,Actor_name,Actor_age,Map_ID,FCharacter_ID,FActor_ID
0,975900,03vyhn,2001-08-24,Akooshay,1958-08-26,F,1.62,,Wanda De Jesus,42.0,0bgchxw,0bgcj3x,03wcfv7
1,975900,03vyhn,2001-08-24,Lieutenant Melanie Ballard,1974-08-15,F,1.78,044038p,Natasha Henstridge,27.0,0jys3m,0bgchn4,0346l4
2,975900,03vyhn,2001-08-24,Desolation Williams,1969-06-15,M,1.727,0x67,Ice Cube,32.0,0jys3g,0bgchn_,01vw26l
3,975900,03vyhn,2001-08-24,Sgt Jericho Butler,1967-09-12,M,1.75,,Jason Statham,33.0,02vchl6,0bgchnq,034hyc
4,975900,03vyhn,2001-08-24,Bashira Kincaid,1977-09-25,F,1.65,,Clea DuVall,23.0,02vbb3r,0bgchp9,01y9xg


In [3]:
##### Loading movie data ######
# The file is read as tab-separated with no header row; column names are
# assigned right after.
movie = pd.read_csv(MOVIE_DATASET, sep='\t', header=None)

# Assign columns' names
movie.columns = [
    'WikiMovie_ID', 'FMovie_ID', 'Movie_name', 'Release_date',
    'Revenue', 'Runtime', 'Languages', 'Countries', 'Genres',
]

# All regex patterns below are raw strings: '\w' / '\s' inside a normal string
# literal is an invalid escape sequence and triggers a SyntaxWarning on recent
# Python versions.

# Extract freebase movie ID (the identifier after the trailing '/m/' prefix).
# Non-matching values become the literal string 'NaN', not a real missing value.
FMovie_ID = movie['FMovie_ID'].str.extract(r'/m/(\w+)$').fillna('NaN')
movie['FMovie_ID'] = FMovie_ID

# Extract movie language
# NOTE(review): the pattern contains both the opening and the closing brace and
# captures a single word before ' Language', so it appears to match only values
# holding exactly one one-word language; every other row ends up as 'NaN'.
# Confirm this is intended.
Language = movie['Languages'].str.extract(r'{"/m/\w+": "(\w+) Language"}').fillna('NaN')
movie['Languages'] = Language

# Extract movie country
# NOTE(review): same brace-to-brace pattern, so values listing several
# countries appear to end up as 'NaN' as well. Confirm this is intended.
Countries = movie['Countries'].str.extract(r'{"/m/\w+": "([^"]+)"}').fillna('NaN')
movie['Countries'] = Countries

# Create a pattern for genres: captures every quoted value that follows a colon
pattern = r'":\s*"([^"]+)"'
# Extract genres (one row per match, indexed by original row and match number)
genres = movie['Genres'].str.extractall(pattern)
# Group by the first level of the MultiIndex (level=0), i.e. by original row
grouped_result = genres.groupby(level=0)
# Join the genres of a film into one comma-separated string. Rows with no
# match are absent from the grouped result, so the index-aligned assignment
# leaves them as real missing values rather than the 'NaN' string used above.
movie['Genres'] = grouped_result[0].apply(', '.join)

movie.head()

Unnamed: 0,WikiMovie_ID,FMovie_ID,Movie_name,Release_date,Revenue,Runtime,Languages,Countries,Genres
0,975900,03vyhn,Ghosts of Mars,2001-08-24,14010832.0,98.0,English,United States of America,"Thriller, Science Fiction, Horror, Adventure, ..."
1,3196793,08yl5d,Getting Away with Murder: The JonBenét Ramsey ...,2000-02-16,,95.0,English,United States of America,"Mystery, Biographical film, Drama, Crime Drama"
2,28463795,0crgdbh,Brun bitter,1988,,83.0,Norwegian,Norway,"Crime Fiction, Drama"
3,9363483,0285_cd,White Of The Eye,1987,,110.0,English,United Kingdom,"Thriller, Erotic thriller, Psychological thriller"
4,261236,01mrr1,A Woman in Flames,1983,,106.0,German,Germany,Drama


In [4]:
# Share of movies whose revenue is missing (missing count over row count)
revenue = movie['Revenue']
Revenue_MR = revenue.isnull().sum() / revenue.size
Revenue_MR

0.8972241592346558

In [5]:
# Load character name dataset (read as tab-separated, no header row)
name = pd.read_csv(NAME_DATASET, sep='\t', header=None)
name.columns = ['Character_name', 'Map_ID']
# Extract character/actor map ID (the identifier after the trailing '/m/').
# The pattern is a raw string because '\w' in a normal string literal is an
# invalid escape sequence (SyntaxWarning on recent Python versions).
# Non-matching values become the literal string 'NaN', not a real missing value.
Map = name['Map_ID'].str.extract(r'/m/(\w+)$').fillna('NaN')
name['Map_ID'] = Map

name.head()

Unnamed: 0,Character_name,Map_ID
0,Stuart Little,0k3w9c
1,Stuart Little,0k3wcx
2,Stuart Little,0k3wbn
3,John Doe,0jyg35
4,John Doe,0k2_zn


In [6]:
# Load tvtropes.com dataset (read as tab-separated, no header row, so the
# columns are labelled 0 and 1)
tvtropes = pd.read_csv(TVTROPES_DATASET, sep='\t', header=None)

# Column 1 holds the text the fields below are extracted from; it is looked up
# once instead of once per field.
details = tvtropes.iloc[:, 1]

# Extract character name, movie name, character/actor map and actor name.
# Patterns are raw strings: '\s' in a normal string literal is an invalid
# escape sequence (SyntaxWarning on recent Python versions).
field_patterns = {
    'Character': r'"char":\s*"([^"]+)"',
    'movie': r'"movie":\s*"([^"]+)"',
    'Map': r'"id":\s*"/m/([^"]+)"',
    'Actor_name': r'"actor":\s*"([^"]+)"',
}
for column, field_pattern in field_patterns.items():
    # expand=False yields a Series, which is what a single-column assignment
    # expects; rows without a match get the literal string 'NaN'.
    tvtropes[column] = details.str.extract(field_pattern, expand=False).fillna('NaN')

# Rename character type column and drop the raw text column. This is done
# without inplace=True so the result is an explicit new binding.
tvtropes = tvtropes.rename(columns={0: 'Character_type'}).drop(columns=1)

tvtropes.head()

Unnamed: 0,Character_type,Character,movie,Map,Actor_name
0,absent_minded_professor,Professor Philip Brainard,Flubber,0jy9q0,Robin Williams
1,absent_minded_professor,Professor Keenbean,Richie Rich,02vchl3,Michael McShane
2,absent_minded_professor,Dr. Reinhardt Lane,The Shadow,0k6fkc,Ian McKellen
3,absent_minded_professor,Dr. Harold Medford,Them!,0k6_br,Edmund Gwenn
4,absent_minded_professor,Daniel Jackson,Stargate,0k3rhh,James Spader
