In [None]:
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

sns.set_theme()

# Visualizing the processed data
In this notebook, we visualize the core components of our project.
## 1. Timeline visualization
We'll start by creating a timeline for each name and displaying a certain number of movies (for example, the top 10) that came out over the years.

In [None]:
# Directory holding the pre-processed CSV files
folder_processed_data_path = './data/processed_data/'


def _read_indexed_csv(file_name, index_columns):
    """Read a CSV from the processed-data folder and index it by the given columns."""
    csv_path = os.path.join(folder_processed_data_path, file_name)
    return pd.read_csv(csv_path).set_index(index_columns)


# Movies, indexed by wiki_ID
movie_df = _read_indexed_csv('movie_df.csv', ['wiki_ID'])

# Character names per movie, indexed by (wiki_ID, char_words, gender)
name_by_movie_df = _read_indexed_csv('name_by_movie_ordered_df.csv', ['wiki_ID', 'char_words', 'gender'])

# Baby-name statistics, indexed by (name, year)
baby_name_df = _read_indexed_csv('baby_name_df.csv', ['name', 'year'])

**Visualisation for a specific name:** Let's choose a name and visualize the variation of the baby names throughout the years. We'll pick `Mia`, because it's a cool name.

In [None]:
# Name whose characters we want to track
chosen_name = "Mia"

# Rows whose char_words index level equals the chosen name, with the
# index levels turned back into regular columns
filt_name_by_movie_df = (
    name_by_movie_df
    .query("char_words == @chosen_name")
    .reset_index()
)

# Preview the matches (ordered by wiki_ID) and report how many rows matched
preview_df = filt_name_by_movie_df.sort_values(by="wiki_ID").head()
display(preview_df)
print(f"There are {len(filt_name_by_movie_df)} movies with a character named {chosen_name}.")

In [None]:
# Reduce the matches to their wiki_ID so the join adds no extra columns
filt_name_by_movie_df = filt_name_by_movie_df.loc[:, ['wiki_ID']].copy()

# Inner join: keep only the movies that appear in the filtered name table
filt_movie_df = movie_df.merge(filt_name_by_movie_df, on='wiki_ID', how='inner')

# Preview the result, ordered by wiki_ID
display(filt_movie_df.sort_values(by="wiki_ID").head())

There are too many movies. Let's keep only the 10 most famous ones for now. We base it on the `numVotes` column, representing the number of IMDB votes:

In [None]:
# How many of the most-voted movies to show in this visualization
top_movie_count = 10

# Order the movies from most to fewest votes
sorted_filt_movie_df = filt_movie_df.sort_values(by='numVotes', ascending=False)

# Retain the first `top_movie_count` rows of that ranking
filt_movie_df = sorted_filt_movie_df.iloc[:top_movie_count].copy()
display(filt_movie_df)

# Release years of the retained movies
release_serie = filt_movie_df['year']

We now plot the percentage of babies having the chosen name `Mia` for each year:

In [None]:
# Yearly percentage of births for the chosen name, in chronological order
percentage_df = (
    baby_name_df.loc[chosen_name]['percentage']
    .to_frame()
    .reset_index()
    .sort_values(by='year', ascending=True)
)
# The rows come from the baby-name table, so they are years with babies
# (not movie characters) carrying the name.
print(
    f"There are {len(percentage_df)} years with babies named {chosen_name}. \n"
    f"The earliest year is {percentage_df['year'].min()} and the latest is {percentage_df['year'].max()}."
)

x_values = percentage_df['year'].values
y_values = percentage_df['percentage'].values

# Distinct release years: several selected movies may share a year, and a
# missing year cannot be placed on the axis.
vertical_lines_series = release_serie.dropna().unique()

# Plotting
fig, ax = plt.subplots()
ax.plot(x_values, y_values)
ax.set_xlabel('Years')
ax.set_ylabel('Percentage of total births')
# Use the actual number of selected movies rather than a hard-coded "ten"
ax.set_title(
    f'Name "{chosen_name}" with the {len(release_serie)} most reviewed \n'
    ' movies with a character of the same name'
)

# One dashed line per release year; only the first carries the legend label,
# so the legend shows a single entry and no line is drawn twice.
for line_index, x_value in enumerate(vertical_lines_series):
    ax.axvline(
        x=x_value,
        color='r',
        linestyle='--',
        label='Movie releases' if line_index == 0 else '_nolegend_',
    )

# Without any release line there is nothing to put in the legend
if len(vertical_lines_series) > 0:
    ax.legend(loc='upper left', bbox_to_anchor=(1.05, 1))
plt.show()

We note that there are fewer than 10 marks on the graph: some movies share the same release year. We won't be able to distinguish them in our analysis, so they will be considered as having the same impact (which in reality would be their cumulative impact).

## 2. Parallel questions 
### a. Distribution of months in movies
Studies suggest that conception rates are highest in fall and winter, leading to more births in summer. Will movies released in summer therefore show the highest correlation with newborn naming?

First, we look at which data is missing, and how it is spread across the dataset.

In [None]:
plt.close('all')

# True where a release month is recorded; computed once and reused below
has_month = movie_df.month.notna()

# How many movies have months?
print(f"Proportion of movies with recorded release month: {has_month.mean()*100:.3f}%")

# Are there years where there is a significant proportion of missing data?
# Side-by-side histograms of release year, split by month availability.
plt.figure(figsize=(15, 7))
plt.hist(
    [movie_df.year[has_month], movie_df.year[~has_month]],
    label=['With month', 'No month'],
    bins='auto',
    stacked=False,
)
# Title and axis labels so the figure can be read on its own
plt.title('Movies with and without a recorded release month, per release year')
plt.xlabel('Release year')
plt.ylabel('Number of movies')
plt.legend(loc=2)
plt.show()

We would also like to know if there are significant differences in movie releases between each month:

In [None]:
plt.close('all')

# Plot the distribution of months for movies
plt.figure(figsize=(13, 6))
# NOTE: set_palette changes the default palette for every figure drawn after
# this cell; the countplot below overrides it with its own `palette` argument.
sns.set_palette('rocket')
sns.countplot(x='month', data=movie_df, palette="twilight_shifted")
plt.title('Distribution of movie releases per month')
plt.xlabel('Release month')
plt.ylabel('Number of movies')
plt.show()

### b. Movie genres
Does the genre of a movie play a role in the influence of character names on baby naming trends?

In [None]:
# Load the genre table from the processed-data folder
movie_genres_path = os.path.join(folder_processed_data_path, 'movie_genres_df.csv')
movie_genres_df = pd.read_csv(movie_genres_path)

In [None]:
# Horizontal bar chart of movie counts, limited to the 40 most frequent genres
most_frequent_genres = movie_genres_df['genre'].value_counts().index[:40]

plt.figure(figsize=(10, 12))
sns.countplot(data=movie_genres_df, y='genre', order=most_frequent_genres)

plt.title('Number of Movies per Genre')
plt.xlabel('Number of Movies')
plt.ylabel('Genre')
plt.show()

# For each movie, count its genre rows; then count how many movies have
# each number of genres (ordered by number of genres)
genre_counts = movie_genres_df.groupby('wiki_ID')['genre'].count()
genre_count_frequency = genre_counts.value_counts().sort_index()

# Bar chart of that frequency table, with a logarithmic y axis
genres_per_movie_ax = genre_count_frequency.plot.bar()
genres_per_movie_ax.set_title('Number of Genres per Movie')
genres_per_movie_ax.set_xlabel('Number of Genres')
genres_per_movie_ax.set_ylabel('Number of Movies')
genres_per_movie_ax.set_yscale('log')
plt.show()

In [None]:
# Names of the five most frequent genres, most frequent first
genre_frequencies = movie_genres_df['genre'].value_counts()
top_5_genres = genre_frequencies.index.values[:5]
print(top_5_genres)

We can see that the most popular genres are the ones printed above.

### c. Movie popularity
We measure each movie's popularity by its number of votes on IMDB, which we now have.

In [None]:
# Placeholder cell: it only closes all open figures; the figure setup below is disabled.
plt.close('all')

# plt.figure(figsize=(15, 8))


In [None]:
plt.close('all')

# Group by release year once and reuse the grouping for all three panels
movies_by_year = movie_df.groupby('year')
yearly_panels = [
    (movies_by_year.numVotes.sum(), "Total number of ratings on IMDB, per year"),
    # count() on mov_name counts the movies with a non-null name in each year
    (movies_by_year.mov_name.count(), "Total number of movies on IMDB, per year"),
    (movies_by_year.numVotes.mean(), "Mean number of ratings per film, per year"),
]

# Three stacked panels sharing the year axis, each on a logarithmic y scale
fig, axes = plt.subplots(len(yearly_panels), 1, figsize=(10, 10), sharex=True)
for ax, (yearly_values, panel_title) in zip(axes, yearly_panels):
    ax.plot(yearly_values)
    ax.set_title(panel_title)
    ax.set_yscale('log')
axes[-1].set_xlabel('Year')

# Leave room between panels so titles do not crowd the tick labels above them
fig.tight_layout()
plt.show()

In [None]:
# TODO: Which movie genres are the most popular? (sum of ratings)

### d. Importance of characters in movies
Importance is based on the TMDB dataset, represented by orders. The lower the number, the more important the character.

In [None]:
# Per movie (wiki_ID), the number of non-null entries in each column
non_null_counts_per_movie = name_by_movie_df.groupby('wiki_ID').count()
non_null_counts_per_movie

In [None]:
plt.close('all')
# Plot a histogram and a boxplot of number of characters in movies
fig, (ax_box, ax_hist) = plt.subplots(2, sharex=True, gridspec_kw={"height_ratios": (.15, .85)}, figsize=(15, 7))

# Number of rows per movie. size() counts every row, whereas count() returns
# one column of non-null counts per data column and so undercounts rows that
# contain missing values.
# NOTE(review): assumes one row of name_by_movie_df per character - confirm.
n_chars = name_by_movie_df.groupby('wiki_ID').size()

sns.boxplot(x=n_chars.values, ax=ax_box, flierprops={'alpha': 0.05})
# discrete=True gives one unit-wide bin per integer value, so the two largest
# counts are not merged into one bin and a constant count does not yield 0 bins.
sns.histplot(x=n_chars.values, discrete=True, ax=ax_hist)

ax_box.set(yticks=[])
sns.despine(ax=ax_hist)
sns.despine(ax=ax_box, left=True)

# A single-variable histogram normally has no legend; only remove one if present
hist_legend = ax_hist.get_legend()
if hist_legend is not None:
    hist_legend.remove()
ax_hist.set_yscale('log')
ax_hist.set_xlabel('number of characters')
ax_box.set_title('Distribution of number of characters per movie')
plt.show()


In [None]:
# TODO: How many movies have ordered characters? (all characters in the movie have NaN values for orders)

### e. Genders of movie characters