## Pipeline for `character.metadata.tsv` dataset

In [5]:
import os
import sys

import pandas as pd

sys.path.append('../data')

# Location of the tab-separated character metadata file. This cell only
# assembles the path; the file itself is read in a later cell.
file_name = 'character.metadata.tsv'
folder_path = '../data/MovieSummaries'

file_path = '/'.join((folder_path, file_name))

In [14]:
# Labels for the 13 columns of the TSV, listed in file order so that
# read_csv can assign them positionally.
column_names = [
    'wikipedia_movie_id',
    'freebase_movie_id',
    'movie_release_date',
    'character_name',
    'actor_dob',
    'actor_gender',
    'actor_height',
    'actor_ethnicity_id',
    'actor_name',
    'actor_age_at_release',
    'character_actor_map_id',
    'character_id',
    'actor_id',
]

# header=None: the file has no header row, so its first line is data and the
# labels come from `names`.
df = pd.read_csv(
    file_path,
    sep='\t',
    names=column_names,
    header=None,
)

# Preview the first five rows as a quick sanity check of the parse.
df.head(5)

Unnamed: 0,wikipedia_movie_id,freebase_movie_id,movie_release_date,character_name,actor_dob,actor_gender,actor_height,actor_ethnicity_id,actor_name,actor_age_at_release,character_actor_map_id,character_id,actor_id
0,975900,/m/03vyhn,2001-08-24,Akooshay,1958-08-26,F,1.62,,Wanda De Jesus,42.0,/m/0bgchxw,/m/0bgcj3x,/m/03wcfv7
1,975900,/m/03vyhn,2001-08-24,Lieutenant Melanie Ballard,1974-08-15,F,1.78,/m/044038p,Natasha Henstridge,27.0,/m/0jys3m,/m/0bgchn4,/m/0346l4
2,975900,/m/03vyhn,2001-08-24,Desolation Williams,1969-06-15,M,1.727,/m/0x67,Ice Cube,32.0,/m/0jys3g,/m/0bgchn_,/m/01vw26l
3,975900,/m/03vyhn,2001-08-24,Sgt Jericho Butler,1967-09-12,M,1.75,,Jason Statham,33.0,/m/02vchl6,/m/0bgchnq,/m/034hyc
4,975900,/m/03vyhn,2001-08-24,Bashira Kincaid,1977-09-25,F,1.65,,Clea DuVall,23.0,/m/02vbb3r,/m/0bgchp9,/m/01y9xg


In [7]:
# Tally the null entries in every column (isna() flags them, sum() counts
# the flags per column).
missing_values = df.isna().sum()

# Print the per-column totals.
print(missing_values)

wikipedia_movie_id             0
freebase_movie_id              0
movie_release_date          9995
character_name            257875
actor_dob                 106145
actor_gender               45609
actor_height              295845
actor_ethnicity_id        344611
actor_name                  1228
actor_age_at_release      158113
character_actor_map_id         0
character_id              257865
actor_id                     815
dtype: int64


In [10]:
# Print a structural summary of the DataFrame: index range, per-column
# non-null counts and dtypes, and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 450669 entries, 0 to 450668
Data columns (total 13 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   wikipedia_movie_id      450669 non-null  int64  
 1   freebase_movie_id       450669 non-null  object 
 2   movie_release_date      440674 non-null  object 
 3   character_name          192794 non-null  object 
 4   actor_dob               344524 non-null  object 
 5   actor_gender            405060 non-null  object 
 6   actor_height            154824 non-null  float64
 7   actor_ethnicity_id      106058 non-null  object 
 8   actor_name              449441 non-null  object 
 9   actor_age_at_release    292556 non-null  float64
 10  character_actor_map_id  450669 non-null  object 
 11  character_id            192804 non-null  object 
 12  actor_id                449854 non-null  object 
dtypes: float64(2), int64(1), object(10)
memory usage: 44.7+ MB


In [9]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns.
# NOTE(review): in the recorded run actor_age_at_release has a minimum of
# -7896 and actor_height a maximum of 510 -- implausible values that look
# like data errors and should be handled before analysis.
df.describe()

Unnamed: 0,wikipedia_movie_id,actor_height,actor_age_at_release
count,450669.0,154824.0,292556.0
mean,13969750.0,1.788893,37.788523
std,10796620.0,4.37994,20.58787
min,330.0,0.61,-7896.0
25%,3759292.0,1.6764,28.0
50%,11890650.0,1.75,36.0
75%,23665010.0,1.83,47.0
max,37501920.0,510.0,103.0


In [15]:
# Save the loaded DataFrame as a pickle file.
pickle_folder = '../pickles'
pickle_name = 'character_metadata.pkl'

# to_pickle does not create missing directories, so make sure the target
# folder exists first (a no-op when it is already there). Without this the
# cell fails on a fresh checkout where '../pickles' has not been created.
os.makedirs(pickle_folder, exist_ok=True)
df.to_pickle(f'{pickle_folder}/{pickle_name}')