In [39]:
import pandas as pd
import matplotlib.pyplot as plt

# Where the character metadata lives on disk.
DATA_FOLDER = 'Data/MovieSummaries/'
CHARACTER_DATASET = DATA_FOLDER + 'character.metadata.tsv'

# Labels for the 13 columns, in file order. The file is read with header=None,
# so the labels have to be attached by hand after loading.
CHARACTER_COLUMNS = [
    'Wikipedia movie ID',
    'Freebase movie ID',
    'Movie release date',
    'Character name',
    'Actor date of birth',
    'Actor gender',
    'Actor height (in meters)',
    'Actor ethnicity (Freebase ID)',
    'Actor name',
    'Actor age at movie release',
    'Freebase character/actor map ID',
    'Freebase character ID',
    'Freebase actor ID',
]

# Read the tab-separated file, then name its columns.
characters = pd.read_csv(CHARACTER_DATASET, header=None, sep='\t')
characters.columns = CHARACTER_COLUMNS

# Show (rows, columns) of the freshly loaded frame.
shape = characters.shape
print(shape)

(450669, 13)


### Remove characters with missing values

In [40]:
# Count the missing values in every column of the raw frame.
missing_per_column = characters.isna().sum()
print(missing_per_column)

Wikipedia movie ID                      0
Freebase movie ID                       0
Movie release date                   9995
Character name                     257875
Actor date of birth                106145
Actor gender                        45609
Actor height (in meters)           295845
Actor ethnicity (Freebase ID)      344611
Actor name                           1228
Actor age at movie release         158113
Freebase character/actor map ID         0
Freebase character ID              257865
Freebase actor ID                     815
dtype: int64


In [38]:

# Keep only rows that have a character name, an actor name, the actor's date
# of birth and the movie release date; a row missing any of them is dropped.
REQUIRED_COLUMNS = ['Character name', 'Actor date of birth', 'Actor name', 'Movie release date']
characters = characters.dropna(subset=REQUIRED_COLUMNS)

# Missing values still present per column after the drop.
remaining_nulls = characters.isna().sum()
print(remaining_nulls)

# Size of the frame that is left.
shape = characters.shape
print(shape)

Wikipedia movie ID                     0
Freebase movie ID                      0
Movie release date                     0
Character name                         0
Actor date of birth                    0
Actor gender                           0
Actor ethnicity (Freebase ID)      88701
Actor name                             0
Actor age at movie release             0
Freebase character/actor map ID        0
Freebase character ID                  0
Freebase actor ID                      0
Release year                           0
Actor year of birth                    0
dtype: int64
(148058, 14)


In [29]:
# Drop the 'Actor height (in meters)' column.
# errors='ignore' keeps this cell safe to re-run: without it, executing the
# cell a second time raises KeyError because the column is already gone.
characters = characters.drop(columns=['Actor height (in meters)'], errors='ignore')

In [30]:
RELEASE_DATE = 'Movie release date'
RELEASE_YEAR = 'Release year'

# Parse the release dates; anything that cannot be parsed becomes NaT
# because of errors='coerce'.
# NOTE(review): if this column mixes full dates with year-only or year-month
# strings, how many of them coerce to NaT may depend on the installed pandas
# version -- confirm the row count after this cell.
parsed_release = pd.to_datetime(characters[RELEASE_DATE], errors='coerce')
characters[RELEASE_DATE] = parsed_release

# Derive the year, discard the rows whose date did not parse, and only then
# cast to int (the cast would fail while NaN is still present).
characters[RELEASE_YEAR] = characters[RELEASE_DATE].dt.year
characters = characters.dropna(subset=[RELEASE_YEAR])
characters[RELEASE_YEAR] = characters[RELEASE_YEAR].astype(int)

print(characters[[RELEASE_DATE, RELEASE_YEAR]].head())

  Movie release date  Release year
0         2001-08-24          2001
1         2001-08-24          2001
2         2001-08-24          2001
3         2001-08-24          2001
4         2001-08-24          2001


In [31]:
BIRTH_DATE = 'Actor date of birth'
BIRTH_YEAR = 'Actor year of birth'

# Parse the birth dates; anything that cannot be parsed becomes NaT
# because of errors='coerce'.
parsed_birth = pd.to_datetime(characters[BIRTH_DATE], errors='coerce')
characters[BIRTH_DATE] = parsed_birth

# Derive the birth year, discard the rows whose date did not parse, and only
# then cast to int (the cast would fail while NaN is still present).
characters[BIRTH_YEAR] = characters[BIRTH_DATE].dt.year
characters = characters.dropna(subset=[BIRTH_YEAR])
characters[BIRTH_YEAR] = characters[BIRTH_YEAR].astype(int)

print(characters[[BIRTH_DATE, BIRTH_YEAR]].head())

  Actor date of birth  Actor year of birth
0          1958-08-26                 1958
1          1974-08-15                 1974
2          1969-06-15                 1969
3          1967-09-12                 1967
4          1977-09-25                 1977


In [32]:
def calculate_actor_age(row):
    """Return the release year minus the actor's year of birth.

    `row` is indexed with the labels 'Release year' and 'Actor year of birth';
    the result is the plain difference of those two values, so it ignores the
    month and day of both dates.
    """
    release_year = row['Release year']
    birth_year = row['Actor year of birth']
    return release_year - birth_year

# Fill the age only where the dataset does not already provide one.
# calculate_actor_age just indexes by column label and subtracts, so handing it
# the sub-frame of affected rows computes every age in a single vectorized
# subtraction instead of one Python-level call per row of the whole table
# (which is what apply(axis=1) did).
age_missing = characters['Actor age at movie release'].isna()
characters.loc[age_missing, 'Actor age at movie release'] = calculate_actor_age(characters.loc[age_missing])

print(characters[['Actor year of birth', 'Release year', 'Actor age at movie release']].head())

   Actor year of birth  Release year  Actor age at movie release
0                 1958          2001                        42.0
1                 1974          2001                        27.0
2                 1969          2001                        32.0
3                 1967          2001                        33.0
4                 1977          2001                        23.0


In [33]:
# Difference between release year and birth year, used as the fallback age.
year_gap = characters['Release year'] - characters['Actor year of birth']

# Only the entries of 'Actor age at movie release' that are still missing
# take the fallback; existing ages are left untouched.
known_age = characters['Actor age at movie release']
characters['Actor age at movie release'] = known_age.fillna(year_gap)

# Check the results
print(characters[['Actor year of birth', 'Release year', 'Actor age at movie release']].head())


   Actor year of birth  Release year  Actor age at movie release
0                 1958          2001                        42.0
1                 1974          2001                        27.0
2                 1969          2001                        32.0
3                 1967          2001                        33.0
4                 1977          2001                        23.0


In [34]:
# Check which columns still contain missing values at this point.
nulls_after_age_fill = characters.isna().sum()
print(nulls_after_age_fill)

Wikipedia movie ID                     0
Freebase movie ID                      0
Movie release date                     0
Character name                         0
Actor date of birth                    0
Actor gender                         358
Actor ethnicity (Freebase ID)      89055
Actor name                             0
Actor age at movie release             0
Freebase character/actor map ID        0
Freebase character ID                  0
Freebase actor ID                      0
Release year                           0
Actor year of birth                    0
dtype: int64


In [35]:
# Discard every row whose actor gender is missing.
GENDER_COLUMN = 'Actor gender'
characters = characters.dropna(axis='index', how='any', subset=[GENDER_COLUMN])

In [36]:
# Final check: missing values per column after all the cleaning steps.
final_nulls = characters.isna().sum()
print(final_nulls)

# Size of the cleaned frame as (rows, columns).
shape = characters.shape
print(shape)

Wikipedia movie ID                     0
Freebase movie ID                      0
Movie release date                     0
Character name                         0
Actor date of birth                    0
Actor gender                           0
Actor ethnicity (Freebase ID)      88701
Actor name                             0
Actor age at movie release             0
Freebase character/actor map ID        0
Freebase character ID                  0
Freebase actor ID                      0
Release year                           0
Actor year of birth                    0
dtype: int64
(148058, 14)
