In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from scipy import stats

In [None]:
# Column names are supplied explicitly through `names=` when reading the two TSV files.
# NOTE(review): 'what1' and 'what2' are placeholder names - confirm what these columns hold.
headers_character_metadata = [
    'Wikipedia Movie ID',
    'Freebase Movie ID',
    'Movie release date',
    'Character name',
    'Actor DOB',
    'Actor gender',
    'Actor height',
    'Actor ethnicity',
    'Actor name',
    'Actor age at movie release',
    'Freebase character map',
    'what1',
    'what2',
]
headers_movie_metadata = [
    "Wikipedia Movie ID",
    "Freebase Movie ID",
    "Movie name",
    "Movie release date",
    "Movie box office revenue",
    "Movie runtime",
    "Movie languages",
    "Movie countries",
    "Movie genres",
]

# Read both tab-separated files into DataFrames
character_metadata = pd.read_csv(
    '../data/character.metadata.tsv',
    sep='\t',
    names=headers_character_metadata,
)
movie_metadata = pd.read_csv(
    '../data/movie.metadata.tsv',
    sep='\t',
    names=headers_movie_metadata,
)

# Preview the first rows of the character table
character_metadata.head()

In [None]:
# Dates are stored as '-'-separated strings; the leading component is kept as the year
# in a nullable integer column so that release years are consistent across both tables.
character_metadata['Movie release year'] = (
    character_metadata['Movie release date'].str.split('-').str.get(0).astype('Int64')
)
movie_metadata['Movie release year'] = (
    movie_metadata['Movie release date'].str.split('-').str.get(0).astype('Int64')
)

# Same treatment for the actor's date of birth, for consistency
character_metadata['Actor birth year'] = (
    character_metadata['Actor DOB'].str.split('-').str.get(0).astype('Int64')
)


# Remove outliers

## Birth year

In [None]:
# List the distinct actor birth years in ascending order to spot implausible values.
# Missing years are replaced by 0 before sorting.
years = character_metadata['Actor birth year'].unique()
np.sort(years.fillna(0))

In [None]:
# Birth years before 1800 or after 2030 are treated as data errors
condition = (
    character_metadata['Actor birth year'].lt(1800)
    | character_metadata['Actor birth year'].gt(2030)
)

# Blank out only the birth year of the flagged rows; the rows themselves are kept
character_metadata.loc[condition, 'Actor birth year'] = np.nan

## Movie release year

In [None]:
# List the distinct movie release years in ascending order to spot implausible values.
# Missing years are replaced by 0 before sorting.
years = character_metadata['Movie release year'].unique()
np.sort(years.fillna(0))

In [None]:
# The movie recorded with release year 1010 was actually made in 2010.
# Only the 'Movie release year' column is corrected: assigning through a bare boolean row mask
# (df[mask] = 2010) would overwrite EVERY column of the matching rows (IDs, names, dates, ...)
# with 2010. Missing years compare as NA, so they are explicitly treated as "no match".
# NOTE: the raw 'Movie release date' string of these rows is left untouched and still starts with 1010.
character_wrong_year = character_metadata['Movie release year'].eq(1010).fillna(False)
character_metadata.loc[character_wrong_year, 'Movie release year'] = 2010

movie_wrong_year = movie_metadata['Movie release year'].eq(1010).fillna(False)
movie_metadata.loc[movie_wrong_year, 'Movie release year'] = 2010

## Height

In [None]:
# Inspect the smallest and largest recorded actor heights to spot implausible values.
n_extremes = 15  # how many values to show at each end of the sorted heights

# Distinct recorded heights. Missing values are dropped first: NaN sorts last and would
# otherwise occupy one of the "maximum" slots.
height = character_metadata['Actor height'].dropna().unique()

# Round to 2 decimal places, then sort ascending; np.unique also removes the duplicates
# that rounding can create.
sorted_height = np.unique(np.round(height, 2))

# Extremes at both ends (fewer than n_extremes if there are not enough distinct values)
min_values = sorted_height[:n_extremes]
max_values = sorted_height[-n_extremes:]

# The labels report the number of values actually shown
print(f"{len(min_values)} Minimum Values of Actor Height:")
print(min_values)

print(f"{len(max_values)} Maximum Values of Actor Height:")
print(max_values)


In [None]:
# Heights above 2.4 are treated as data errors
condition = character_metadata['Actor height'].gt(2.4)

# Blank out only the height of the flagged rows
character_metadata.loc[condition, 'Actor height'] = np.nan

# Open question: the small values are kids - maybe they should be removed as well?

## Age

In [None]:
# Inspect the smallest and largest recorded actor ages to spot implausible values.
n_extremes = 20  # how many values to show at each end of the sorted ages

# Distinct recorded ages. Missing values are dropped first: NaN sorts last and would
# otherwise occupy one of the "maximum" slots.
age = character_metadata['Actor age at movie release'].dropna().unique()

# Round to 2 decimal places, then sort ascending; np.unique also removes the duplicates
# that rounding can create.
sorted_age = np.unique(np.round(age, 2))

# Extremes at both ends (fewer than n_extremes if there are not enough distinct values)
min_values = sorted_age[:n_extremes]
max_values = sorted_age[-n_extremes:]

# The labels report the number of values actually shown
print(f"{len(min_values)} Minimum Values of Actor age:")
print(min_values)

print(f"{len(max_values)} Maximum Values of Actor age:")
print(max_values)

In [None]:
# Define the condition to identify rows where age needs to be set to NaN
condition = (character_metadata['Actor age at movie release'] < 0) | (character_metadata['Actor age at movie release'] > 110)
character_metadata.loc[condition, 'Actor age at movie release'] = np.nan

# Check dtypes and convert certain objects to floats for analysis

In [None]:
# Show the dtype of every column of the character table
character_metadata.dtypes

In [None]:
# Store release year and age as plain floats; values that cannot be parsed become NaN
character_metadata['Movie release year'] = (
    pd.to_numeric(character_metadata['Movie release year'], errors='coerce')
    .astype('float64')
)
character_metadata['Actor age at movie release'] = (
    pd.to_numeric(character_metadata['Actor age at movie release'], errors='coerce')
    .astype('float64')
)

# NaNs

Let's look at the amount of missing values in our data. For each attribute, the percentage of NaNs is calculated:

In [None]:
# Share of missing values per column, in percent, for each of the two tables
print('Percentage of NaNs for character dataset')
print(character_metadata.isna().mean().mul(100))
print('\n')
print('Percentage of NaNs for movie dataset')
print(movie_metadata.isna().mean().mul(100))

In [None]:
# Share of missing values per column, in percent
character_nan_percentages = character_metadata.isna().mean().mul(100)
movie_nan_percentages = movie_metadata.isna().mean().mul(100)

fig, ax = plt.subplots(figsize=(10, 5))

# Both tables are drawn on the same axis; the bars are semi-transparent
ax.bar(
    character_nan_percentages.index,
    character_nan_percentages,
    label='Character Dataset',
    alpha=0.5,
)
ax.bar(
    movie_nan_percentages.index,
    movie_nan_percentages,
    label='Movie Dataset',
    alpha=0.5,
)

# Fixed 0-100 scale, labels and title
ax.set(
    ylim=(0, 100),
    xlabel='Columns',
    ylabel='Percentage of NaN values',
    title='Percentage of NaN Values in Character and Movie Dataset',
)
ax.legend()

# Column names are long, so the tick labels are drawn vertically
plt.xticks(rotation=90)
plt.show()


## Male vs female NaN (not relevant)
Difference between male and female data. 
NOT IMPORTANT

In [None]:
# Independent copies of the rows whose 'Actor gender' is 'F' and 'M' respectively
female_characters = character_metadata.loc[character_metadata['Actor gender'].eq('F')].copy()
male_characters = character_metadata.loc[character_metadata['Actor gender'].eq('M')].copy()

# Share of missing values per column, in percent, for each subset
print('Percentage of NaNs for female dataset')
print(female_characters.isna().mean().mul(100))
print('\n')
print('Percentage of NaNs for male dataset')
print(male_characters.isna().mean().mul(100))

In [None]:
# Share of missing values per column, in percent, for each gender subset
female_nan_percentages = female_characters.isna().mean().mul(100)
male_nan_percentages = male_characters.isna().mean().mul(100)

# Only attributes present in both subsets are compared
common_attributes = female_nan_percentages.index.intersection(male_nan_percentages.index)

width = 0.35  # Width of the bars

fig, ax = plt.subplots(figsize=(12, 5))

# One x position per attribute; the two bars of a pair sit left and right of it
x = np.arange(len(common_attributes))

ax.bar(
    x - width/2,
    female_nan_percentages.loc[common_attributes],
    width,
    label='Female Dataset',
    color='red',
)
ax.bar(
    x + width/2,
    male_nan_percentages.loc[common_attributes],
    width,
    label='Male Dataset',
    color='blue',
)

# Axis labels and title
ax.set(
    xlabel='Attributes',
    ylabel='Percentage of NaN values',
    title='Percentage of NaN Values in Female and Male Character Datasets (Common Attributes)',
)

# Tick labels are the attribute names, drawn vertically
ax.set_xticks(x)
ax.set_xticklabels(common_attributes, rotation=90)
ax.legend()
plt.show()


## Remove NaNs

Here we could remove columns whose percentage of NaNs is above a threshold:

In [None]:
# Columns with more than this percentage of missing values are dropped
threshold = 70

# Names of the columns whose NaN percentage does not exceed the threshold
filtered_female_columns = female_characters.columns[
    female_characters.isna().mean().mul(100).le(threshold)
]
filtered_male_colums = male_characters.columns[
    male_characters.isna().mean().mul(100).le(threshold)
]

# Independent copies restricted to the retained columns
filtered_female_character = female_characters.loc[:, filtered_female_columns].copy()
filtered_male_character = male_characters.loc[:, filtered_male_colums].copy()

print('Percentage of NaNs for filtered female dataset')
print(filtered_female_character.isna().mean().mul(100))
print('\n')
print('Percentage of NaNs for filtered male dataset')
print(filtered_male_character.isna().mean().mul(100))

# Male vs female actors in movies over time

In [None]:
# Number of female / male character rows per movie
female_count = female_characters.groupby('Wikipedia Movie ID').size()
male_count = male_characters.groupby('Wikipedia Movie ID').size()

# Align both counts on the union of movie IDs, filling absent movies with 0.
# Aligning on the female index only would silently drop every movie without a female
# character, i.e. all 0 % movies, and bias any later average upward.
all_movie_ids = female_count.index.union(male_count.index)
female_count = female_count.reindex(all_movie_ids, fill_value=0)
male_count = male_count.reindex(all_movie_ids, fill_value=0)

# Percentage of female characters per movie; the denominator is never 0 because every
# movie ID in the union has at least one female or male row
percentage_female = (female_count / (female_count + male_count)) * 100

# Turn the movie ID index into a regular column
percentage_female = percentage_female.reset_index()

# Rename the columns for clarity
percentage_female.columns = ['Wikipedia Movie ID', 'Percentage Female Characters']


In [None]:
# Attach the movie metadata (including the release year) to the per-movie female percentage
percentage_female_with_year = pd.merge(percentage_female, movie_metadata, on='Wikipedia Movie ID', how='left')

# Mean female percentage per release year. The result is a Series indexed by year, so there
# are no columns to rename (assigning `.columns` on a Series does not change its data).
percentage_female_per_year = (
    percentage_female_with_year
    .groupby('Movie release year')['Percentage Female Characters']
    .mean()
)

plt.plot(percentage_female_per_year.index, percentage_female_per_year)
plt.title('Percentage of female actors in movies over time')
plt.ylabel('Female actor percentage [%]')
plt.xlabel('Year')
plt.show()

In [None]:
# Keep only the two columns needed for the revenue analysis and drop rows missing either one
percentage_female_with_year_nona = (
    percentage_female_with_year
    .loc[:, ['Movie box office revenue', 'Percentage Female Characters']]
    .dropna()
)

In [None]:
# `sm` (statsmodels.api) is imported in the notebook's top import cell.

# Exclude the extreme box-office values (revenue of 1.3e+09 or more) before fitting
percentage_female_with_year_nooutlier = percentage_female_with_year_nona[
    percentage_female_with_year_nona["Movie box office revenue"] < 1.3e+09
]

# Scatter plot with a fitted regression line
sns.lmplot(
    x="Percentage Female Characters",
    y="Movie box office revenue",
    data=percentage_female_with_year_nooutlier,
)

# Ordinary least squares: revenue explained by an intercept and the female percentage
X = sm.add_constant(percentage_female_with_year_nooutlier["Percentage Female Characters"])
y = percentage_female_with_year_nooutlier["Movie box office revenue"]

model = sm.OLS(y, X).fit()

# Print the regression summary
print(model.summary())

# Show the plot
plt.show()


# Number of movies over time

There are a lot of newer movies

In [None]:
# Calculate number of movies over time
no_movies = movie_metadata.groupby('Movie release year').size()
plt.plot(no_movies)
plt.show()

# Distribution of variables

In [None]:
# Side-by-side box plots (female vs. male) for each numeric variable of interest.
# `plt` comes from the notebook's top import cell; it is not re-imported here.

# Variables of interest
variables = ['Movie release year', 'Actor birth year', 'Actor height', 'Actor age at movie release']

# Non-missing values of each variable per gender (lists aligned with `variables`)
female_data = [female_characters[var].dropna() for var in variables]
male_data = [male_characters[var].dropna() for var in variables]

# Styling shared by every panel; none of it depends on the variable, so it is defined once
width = 0.35  # Width of the boxes
positions = [1, 2]  # x positions of the female and male box
boxprops = dict(linewidth=2, color='black')
medianprops = dict(linewidth=2, color='black')
capprops = dict(color='black')
box_colors = ['red', 'blue']  # female, male

# One subplot per variable
fig, axes = plt.subplots(1, len(variables), figsize=(15, 5))

for ax, variable, female_values, male_values in zip(axes, variables, female_data, male_data):
    # Each gender is drawn by its own boxplot call so the two boxes can be coloured separately
    bp_female = ax.boxplot([female_values], positions=[positions[0]], widths=width, patch_artist=True,
                           boxprops=boxprops, medianprops=medianprops, capprops=capprops)
    bp_male = ax.boxplot([male_values], positions=[positions[1]], widths=width, patch_artist=True,
                         boxprops=boxprops, medianprops=medianprops, capprops=capprops)

    # Recolour boxes, medians and caps with the gender colour
    for box, color in zip([bp_female, bp_male], box_colors):
        for element in ['boxes', 'medians', 'caps']:
            plt.setp(box[element], color=color)

    ax.set_ylabel('Values')
    ax.set_title(variable)
    ax.set_xticks(positions)
    ax.set_xticklabels(['Female', 'Male'])

# Adjust the layout to prevent overlapping
plt.tight_layout()

# Show the subplots
plt.show()


In [None]:
# Overlaid female / male histograms, one panel per variable
variables = ['Movie release year', 'Actor birth year', 'Actor height', 'Actor age at movie release']

# One row of four subplots (one per variable)
fig, axes = plt.subplots(1, 4, figsize=(15, 5))

for ax, variable in zip(axes, variables):
    # Non-missing values of the current variable for each gender
    female_data = female_characters[variable].dropna()
    male_data = male_characters[variable].dropna()

    # Semi-transparent bars so both distributions stay visible where they overlap
    ax.hist(female_data, bins=20, alpha=0.5, color='red', label='Female')
    ax.hist(male_data, bins=20, alpha=0.5, color='blue', label='Male')

    ax.set(xlabel=variable, ylabel='Frequency', title=variable)
    ax.legend()

# Adjust the layout to prevent overlapping
plt.tight_layout()

# Show the subplots
plt.show()


In [None]:
# Age
stats.ttest_ind(female_characters['Actor age at movie release'].dropna(), male_characters['Actor age at movie release'].dropna(), nan_policy='omit')

In [None]:
# Actor birth year: the column is converted to plain floats before the t-test
female_birth_year = (
    pd.to_numeric(female_characters['Actor birth year'], errors='coerce')
    .astype('float64')
)
male_birth_year = (
    pd.to_numeric(male_characters['Actor birth year'], errors='coerce')
    .astype('float64')
)

# Independent two-sample t-test on the non-missing birth years, female vs. male
stats.ttest_ind(
    female_birth_year.dropna(),
    male_birth_year.dropna(),
)

In [None]:
# Movie release year: convert to plain floats, then compare female vs. male
female_movie_release_years = pd.to_numeric(female_characters['Movie release year'], errors='coerce').astype('float64')
male_movie_release_years = pd.to_numeric(male_characters['Movie release year'], errors='coerce').astype('float64')

# Run the test on the converted series; previously they were computed but never used
# and the raw columns were passed instead.
stats.ttest_ind(female_movie_release_years.dropna(), male_movie_release_years.dropna())

In [None]:
# Keep the three columns used by the plot and drop rows missing any of them
character_metadata_nona = (
    character_metadata
    .loc[:, ['Movie release year', 'Actor age at movie release', 'Actor gender']]
    .dropna()
)

# One line per gender: actor age against release year, with a shaded error band
sns.lineplot(
    data=character_metadata_nona,
    x='Movie release year',
    y='Actor age at movie release',
    hue="Actor gender",
    err_style="band",
)

plt.title('Average age for actors at the movie release date')
plt.xlabel('Year')
plt.ylabel('Age')
plt.tight_layout()
plt.show()
