## Pipeline for `character.metadata.tsv` dataset

In [5]:
import pandas as pd
import sys
sys.path.append('../data')

# Build the path to the tab-separated character metadata file.
# Nothing is read in this cell; it only assembles the path string.
folder_path = '../data/MovieSummaries'
file_name = 'character.metadata.tsv'

file_path = '/'.join((folder_path, file_name))

In [6]:
# Column labels handed to read_csv through `names`. The file is read with
# header=None, so no row of the file is interpreted as a header.
column_names = [
    # movie-level fields
    'wikipedia_movie_id',
    'freebase_movie_id',
    'movie_release_date',
    # character played
    'character_name',
    # actor attributes
    'actor_dob',
    'actor_gender',
    'actor_height',
    'actor_ethnicity_id',
    'actor_name',
    'actor_age_at_release',
    # identifier columns
    'character_actor_map_id',
    'character_id',
    'actor_id',
]

# Parse the tab-delimited file into a DataFrame labelled with the names above.
df = pd.read_csv(
    file_path,
    sep='\t',
    header=None,
    names=column_names,
)

# Quick sanity check: print the leading rows of the loaded frame.
print(df.head())

   wikipedia_movie_id freebase_movie_id movie_release_date  \
0              975900         /m/03vyhn         2001-08-24   
1              975900         /m/03vyhn         2001-08-24   
2              975900         /m/03vyhn         2001-08-24   
3              975900         /m/03vyhn         2001-08-24   
4              975900         /m/03vyhn         2001-08-24   

               character_name   actor_dob actor_gender  actor_height  \
0                    Akooshay  1958-08-26            F         1.620   
1  Lieutenant Melanie Ballard  1974-08-15            F         1.780   
2         Desolation Williams  1969-06-15            M         1.727   
3          Sgt Jericho Butler  1967-09-12            M         1.750   
4             Bashira Kincaid  1977-09-25            F         1.650   

  actor_ethnicity_id          actor_name  actor_age_at_release  \
0                NaN      Wanda De Jesus                  42.0   
1         /m/044038p  Natasha Henstridge                  27.0

In [7]:
# Tally the missing entries per column: isna() flags each missing cell,
# and sum() adds the flags up column by column.
missing_values = df.isna().sum()

# Print the per-column tallies.
print(missing_values)

wikipedia_movie_id             0
freebase_movie_id              0
movie_release_date          9995
character_name            257875
actor_dob                 106145
actor_gender               45609
actor_height              295845
actor_ethnicity_id        344611
actor_name                  1228
actor_age_at_release      158113
character_actor_map_id         0
character_id              257865
actor_id                     815
dtype: int64


In [8]:
# Bare last expression: the notebook renders the DataFrame as this cell's
# output (the stored output below shows the first and last rows with the
# middle elided).
df

Unnamed: 0,wikipedia_movie_id,freebase_movie_id,movie_release_date,character_name,actor_dob,actor_gender,actor_height,actor_ethnicity_id,actor_name,actor_age_at_release,character_actor_map_id,character_id,actor_id
0,975900,/m/03vyhn,2001-08-24,Akooshay,1958-08-26,F,1.620,,Wanda De Jesus,42.0,/m/0bgchxw,/m/0bgcj3x,/m/03wcfv7
1,975900,/m/03vyhn,2001-08-24,Lieutenant Melanie Ballard,1974-08-15,F,1.780,/m/044038p,Natasha Henstridge,27.0,/m/0jys3m,/m/0bgchn4,/m/0346l4
2,975900,/m/03vyhn,2001-08-24,Desolation Williams,1969-06-15,M,1.727,/m/0x67,Ice Cube,32.0,/m/0jys3g,/m/0bgchn_,/m/01vw26l
3,975900,/m/03vyhn,2001-08-24,Sgt Jericho Butler,1967-09-12,M,1.750,,Jason Statham,33.0,/m/02vchl6,/m/0bgchnq,/m/034hyc
4,975900,/m/03vyhn,2001-08-24,Bashira Kincaid,1977-09-25,F,1.650,,Clea DuVall,23.0,/m/02vbb3r,/m/0bgchp9,/m/01y9xg
...,...,...,...,...,...,...,...,...,...,...,...,...,...
450664,913762,/m/03pcrp,1992-05-21,Elensh,1970-05,F,,,Dorothy Elias-Fahn,,/m/0kr406c,/m/0kr406h,/m/0b_vcv
450665,913762,/m/03pcrp,1992-05-21,Hibiki,1965-04-12,M,,,Jonathan Fahn,27.0,/m/0kr405_,/m/0kr4090,/m/0bx7_j
450666,28308153,/m/0cp05t9,1957,,1941-11-18,M,1.730,/m/02w7gg,David Hemmings,15.0,/m/0g8ngmc,,/m/022g44
450667,28308153,/m/0cp05t9,1957,,,,,,Roberta Paterson,,/m/0g8ngmj,,/m/0g8ngmm
