In [None]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import numpy as np

# Prepare data for interactive plot
In this notebook, three csv files are generated for the website interactive plot.
- **movie_impacts.csv** : contains the movies to add as scatter points on the baby-name curve for positive, negative and insignificant variations. <br>
        Columns: `['name', 'status', 'group_year', 'movie_id']` <br>
        `'status'` contains either `'t'`, `'b'` or `'i'` for positive, negative and insignificant variation respectively <br>
        `'group_year'` corresponds to the year at which the movie must be displayed

- **name_per_year.csv** : contains the baby names data for each name and year. The data is in fraction of the total newborns of the year (in percent).<br>
Columns: `['name', 'year', 'percentage']`

- **movies.csv** : contains the movie information and the fields needed to build the links to the poster and to the IMDb website. <br>
Columns: `['movie_id', 'mov_name', 'year', 'votes', 'rating', 'poster_url', 'imdb_id']`

All of them are simplified versions of the dataframes computed in [preprocessing.ipynb](./preprocessing.ipynb), in order to simplify the plot generation on the website and to avoid slowing it down with too much data.

In [None]:
# Locations of the data read and written by this notebook (relative paths).
tmp_data_folder = './tmp_data/'
folder_processed_data_path = './processed_data/'
# The website files live in a sub-folder of the processed data.
processed_website_data_folder = folder_processed_data_path + 'website/'

We import the dataframe computed in preprocessing.

In [None]:
# Character names per movie, indexed by (wiki_ID, char_words, gender)
name_by_movie_df = pd.read_csv(
    os.path.join(folder_processed_data_path, 'name_by_movie_ordered_pvalue_10_5_df.csv')
).set_index(['wiki_ID', 'char_words', 'gender'])
display(name_by_movie_df.sample(2))

# Movie information, indexed by wiki_ID
movie_df = pd.read_csv(
    os.path.join(folder_processed_data_path, 'movie_df.csv')
).set_index(['wiki_ID'])
display(movie_df.sample(2))

# Baby-name data, indexed by (name, year)
baby_name_df = pd.read_csv(
    os.path.join(tmp_data_folder, 'zero_padding_baby_name_df.csv')
).set_index(['name', 'year'])
display(baby_name_df.sample(2))

## Compute `movie_impact` dataframe

The dataframe `movie_impact` is a subset of the dataframe `name_by_movie` computed in [preprocessing.ipynb](./preprocessing.ipynb).

### Filtering of `name_by_movie`  : importance of the role

Let's remove the character names with a minor role in the movie. We keep only the characters whose `order` is lower than or equal to half the number of `order` values recorded for the movie. First, let's compute the number of `order` values for each movie and merge it into the `name_by_movie` dataframe.

In [None]:
# Count, for every movie, how many characters have an `order` value
name_by_movie_groupby_id = name_by_movie_df.groupby(['wiki_ID'])
order_counts = name_by_movie_groupby_id['order'].count()

# Turn the Series into a one-column dataframe whose column is called `nb_order`
order_counts_df = order_counts.to_frame(name="nb_order")
display(order_counts_df.head())

In [None]:
# Attach the per-movie order count to every character row (left join on wiki_ID)
name_by_movie_order_counts = pd.merge(
    name_by_movie_df.reset_index(),
    order_counts_df,
    on='wiki_ID',
    how='left',
)
display(name_by_movie_order_counts.sample(2))

In [None]:
# Sanity check on a known movie: rows of wiki_ID 52371 (Titanic, 1997), which include the Tommy character
name_by_movie_order_counts.loc[lambda frame: frame['wiki_ID'] == 52371]

In [None]:
# Keep the characters whose order is at most half of the movie's order count
name_by_movie_merged_important_role = name_by_movie_order_counts.loc[
    lambda frame: frame['order'] <= (frame['nb_order'] / 2)
].copy()

print(f"length of the dataframe : {len(name_by_movie_df)} -> {len(name_by_movie_merged_important_role)}")

In [None]:
# Same sanity check after filtering: rows of wiki_ID 52371 (Titanic) that were kept
name_by_movie_merged_important_role.query('wiki_ID == 52371')

In [None]:
# From here on, `name_by_movie_df` refers to the filtered dataframe (important roles only).
# NOTE: this rebinding makes the cells above non-idempotent; re-run the loading cell before re-running them.
name_by_movie_df = name_by_movie_merged_important_role.copy(deep=True)
print(f"Length of the name_by_movie_df : {len(name_by_movie_df)}")
# The label now names the dataframe that is actually checked (it previously said `name_by_movie_web`).
print(f"Is the indexing of name_by_movie_df unique? {name_by_movie_df.index.is_unique}")

### Preprocessing `name_by_movie`
In this section, we remove useless columns and NaN values, add the movie release year for each character and set the name in lowercase.

In [None]:
# Flatten the index and drop the columns that are not used afterwards:
# the old positional index, the t-statistic and the two role-order helper columns
name_by_movie_web = name_by_movie_df.reset_index().drop(
    columns=['index', 't_stat', 'order', 'nb_order']
)
display(name_by_movie_web.sample(2))

In [None]:
# Keep one row per (movie, name), drop the gender column, then index by (char_words, wiki_ID)
len_before = len(name_by_movie_web)

name_by_movie_web = (
    name_by_movie_web
    .reset_index(drop=True)
    .drop_duplicates(subset=['wiki_ID', 'char_words'], keep='first')
    .drop(columns=['gender'])
    .set_index(['char_words', 'wiki_ID'])
)

len_after = len(name_by_movie_web)
print(f"length of the dataframe : {len_before} -> {len_after}")
print(f"Is the indexing of name_by_movie_web unique? {name_by_movie_web.index.is_unique}")
display(name_by_movie_web.sample(2))

In [None]:
# Discard the rows that have no p-value
len_before = len(name_by_movie_web)
name_by_movie_web = name_by_movie_web.dropna(subset=['p_value'])
len_after = len(name_by_movie_web)
print(f"length : {len_before} -> {len_after}")

In [None]:
# Movie columns needed to pick the TOP/BOTTOM/INSIGN movies: release year, rating and number of votes
needed_movie_info = movie_df.reset_index().loc[:, ['wiki_ID', 'year', 'averageRating', 'numVotes']].copy()

# Left join: every (name, movie) row receives the information of its movie
len_before_merge = len(name_by_movie_web)
name_by_movie_with_info = pd.merge(
    name_by_movie_web.reset_index(),
    needed_movie_info,
    on='wiki_ID',
    how='left',
)
len_after_merge = len(name_by_movie_with_info)
print(f"length : {len_before_merge} -> {len_after_merge}")

name_by_movie_with_info = name_by_movie_with_info.set_index(['char_words', 'wiki_ID'])
display(name_by_movie_with_info.sample(2))

In [None]:
# Lowercase the character names (the index has to be flattened to edit one of its levels)
name_by_movie_with_info = (
    name_by_movie_with_info
    .reset_index()
    .assign(char_words=lambda frame: frame['char_words'].str.lower())
    .set_index(['char_words', 'wiki_ID'])
)
print(f"Length of name_by_movie_with_info : {len(name_by_movie_with_info)}")
display(name_by_movie_with_info.sample(2))

In [None]:
# Empty dataframe that the compute_* functions below fill row by row
columns = 'name status group_year movie_id'.split()
movie_impact_df = pd.DataFrame(columns=columns)
display(movie_impact_df)

### Three functions to compute TOP/BOTTOM/INSIGN
These three functions are used to compute the `movie_impact_df`

In [None]:
def compute_top_movies(name, chosen_name_movies_top_df, chosen_name_movies_df, movie_impact_df,
                       n_groups=5, n_movies=3, selection_window=3, exclusion_window=5):
    """Append to `movie_impact_df` the movies released around the strongest positive variations of `name`.

    Up to `n_groups` times, the function reads the year of the LAST row of
    `chosen_name_movies_top_df` (the callers sort it by ascending `slope_change`, so the
    last row holds the largest positive variation), selects in `chosen_name_movies_df` the
    movies released within `selection_window` years of it, keeps the `n_movies` with the
    most votes and appends one row per movie, with status 't', to `movie_impact_df`.

    The three dataframes are modified in place:
    - `movie_impact_df` receives the new rows;
    - `chosen_name_movies_top_df` loses the candidates within `exclusion_window` years of the
      selected year, so that the next group is taken from another period;
    - `chosen_name_movies_df` loses the selected movies, so that they cannot be picked again.

    Parameters
    ----------
    name : value written in the 'name' column of the appended rows.
    chosen_name_movies_top_df : candidate rows; needs a 'year' column.
    chosen_name_movies_df : movies of the name, indexed by movie id; needs 'year' and 'numVotes'.
    movie_impact_df : dataframe with columns 'name', 'status', 'group_year', 'movie_id'.
    n_groups : maximum number of years to select (default 5).
    n_movies : maximum number of movies kept per selected year (default 3).
    selection_window : half-width, in years, of the window used to select movies (default 3).
    exclusion_window : half-width, in years, of the window removed from the candidates (default 5).

    Returns
    -------
    None
    """
    for _ in range(n_groups):
        if chosen_name_movies_top_df.empty:
            # no candidate left: the remaining iterations would not do anything
            break

        # year with the highest remaining positive variation
        top_year = chosen_name_movies_top_df.iloc[-1].year.astype(int)

        # movies released in [top_year - selection_window, top_year + selection_window]
        movies_around_top_year = chosen_name_movies_df.query(
            f'year >= {top_year - selection_window} and year <= {top_year + selection_window}'
        )

        # keep only the most voted ones
        selected_movies = movies_around_top_year.sort_values(by=['numVotes'], ascending=False).iloc[:n_movies]

        # one row per selected movie; the index of the movie dataframe is the movie id
        for movie_id in selected_movies.index:
            movie_impact_df.loc[len(movie_impact_df)] = {'name': name, 'status': 't', 'group_year': top_year, 'movie_id': movie_id}

        # drop the candidates that are too close to the selected year
        chosen_name_movies_top_df.query(
            f'year < {top_year - exclusion_window} or year > {top_year + exclusion_window}', inplace=True
        )

        # the selected movies must not be picked again by a later iteration or a later function
        chosen_name_movies_df.drop(selected_movies.index, inplace=True)

In [None]:
def compute_bottom_movies(name, chosen_name_movies_bottom_df, chosen_name_movies_df, movie_impact_df,
                          n_groups=5, n_movies=3, selection_window=3, exclusion_window=5):
    """Append to `movie_impact_df` the movies released around the strongest negative variations of `name`.

    Up to `n_groups` times, the function reads the year of the LAST row of
    `chosen_name_movies_bottom_df` (the caller is expected to sort it so that the most
    negative `slope_change` comes last), selects in `chosen_name_movies_df` the movies
    released within `selection_window` years of it, keeps the `n_movies` with the most
    votes and appends one row per movie, with status 'b', to `movie_impact_df`.

    The three dataframes are modified in place:
    - `movie_impact_df` receives the new rows;
    - `chosen_name_movies_bottom_df` loses the candidates within `exclusion_window` years of
      the selected year, so that the next group is taken from another period;
    - `chosen_name_movies_df` loses the selected movies, so that they cannot be picked again.

    Parameters
    ----------
    name : value written in the 'name' column of the appended rows.
    chosen_name_movies_bottom_df : candidate rows; needs a 'year' column.
    chosen_name_movies_df : movies of the name, indexed by movie id; needs 'year' and 'numVotes'.
    movie_impact_df : dataframe with columns 'name', 'status', 'group_year', 'movie_id'.
    n_groups : maximum number of years to select (default 5).
    n_movies : maximum number of movies kept per selected year (default 3).
    selection_window : half-width, in years, of the window used to select movies (default 3).
    exclusion_window : half-width, in years, of the window removed from the candidates (default 5).

    Returns
    -------
    None
    """
    for _ in range(n_groups):
        if chosen_name_movies_bottom_df.empty:
            # no candidate left: the remaining iterations would not do anything
            break

        # year with the highest remaining negative variation
        bottom_year = chosen_name_movies_bottom_df.iloc[-1].year.astype(int)

        # movies released in [bottom_year - selection_window, bottom_year + selection_window]
        movies_around_bottom_year = chosen_name_movies_df.query(
            f'year >= {bottom_year - selection_window} and year <= {bottom_year + selection_window}'
        )

        # keep only the most voted ones
        selected_movies = movies_around_bottom_year.sort_values(by=['numVotes'], ascending=False).iloc[:n_movies]

        # one row per selected movie; the index of the movie dataframe is the movie id
        for movie_id in selected_movies.index:
            movie_impact_df.loc[len(movie_impact_df)] = {'name': name, 'status': 'b', 'group_year': bottom_year, 'movie_id': movie_id}

        # drop the candidates that are too close to the selected year
        chosen_name_movies_bottom_df.query(
            f'year < {bottom_year - exclusion_window} or year > {bottom_year + exclusion_window}', inplace=True
        )

        # the selected movies must not be picked again by a later iteration or a later function
        chosen_name_movies_df.drop(selected_movies.index, inplace=True)

In [None]:
def compute_insign_movies(name, chosen_name_movies_insign_df, movie_impact_df, n_groups=5):
    """Append to `movie_impact_df` the most popular movies of `name` that had no significant impact.

    `chosen_name_movies_insign_df` is expected to be sorted by ascending popularity, so that
    its LAST row is the most popular remaining movie. Up to `n_groups` times, the function
    takes that movie, appends it to `movie_impact_df` with status 'i' and its release year
    as `group_year`, then removes every candidate released that same year so that each
    year is represented by at most one movie.

    `movie_impact_df` and `chosen_name_movies_insign_df` are modified in place.

    Parameters
    ----------
    name : value written in the 'name' column of the appended rows.
    chosen_name_movies_insign_df : candidate movies, indexed by movie id; needs a 'year' column.
    movie_impact_df : dataframe with columns 'name', 'status', 'group_year', 'movie_id'.
    n_groups : maximum number of movies to add (default 5).

    Returns
    -------
    None
    """
    for _ in range(n_groups):
        if chosen_name_movies_insign_df.empty:
            # no candidate left: the remaining iterations would not do anything
            break

        # release year of the most popular remaining movie (last row of the sorted candidates)
        insign_year = chosen_name_movies_insign_df.iloc[-1].year.astype(int)
        same_year_movies = chosen_name_movies_insign_df.query(f'year == {insign_year}')

        # The candidates are sorted in ascending order, so the most popular movie of the year is
        # the LAST row of the selection (taking the first row would pick the least popular one).
        movie_impact_df.loc[len(movie_impact_df)] = {'name': name, 'status': 'i', 'group_year': insign_year, 'movie_id': same_year_movies.index[-1]}

        # remove every candidate of that year so that the next iteration moves on to another year
        chosen_name_movies_insign_df.drop(same_year_movies.index, inplace=True)

### (Optional) Special case for a single name

In [None]:
# Name used for the single-name walkthrough (lowercase, like the names in the dataframes)
chosen_name = "elizabeth"

In [None]:
# All the movies that have a character called `chosen_name` (the result is indexed by wiki_ID)
chosen_name_movies_df = name_by_movie_with_info.loc[chosen_name].copy()
display(chosen_name_movies_df)

In [None]:
# Rank the movies by slope_change: the largest variation around the release year comes last
chosen_name_movies_df = chosen_name_movies_df.sort_values(by=['slope_change'])

# Candidates for the TOP 5: significant positive variations only
chosen_name_movies_top_df = chosen_name_movies_df.query(
    '(slope_change > 0) and (p_value < 0.1)'
).copy()
display(chosen_name_movies_top_df)

In [None]:
# Fill movie_impact_df with the TOP 5 groups of the chosen name
compute_top_movies(
    chosen_name,
    chosen_name_movies_top_df,
    chosen_name_movies_df,
    movie_impact_df,
)
display(movie_impact_df)

In [None]:
# keep only the candidate movies for the BOTTOM 5 movies with negative impact
chosen_name_movies_bottom_df = chosen_name_movies_df.query('(slope_change <= 0) and (p_value < 0.1)').copy(deep=True)
# Sort by descending slope_change so that the most negative variation is the LAST row, which is
# the row compute_bottom_movies reads first (same ordering as in the loop over all the names).
# With ascending=True the least negative variation ended up last.
chosen_name_movies_bottom_df.sort_values(by=['slope_change'], ascending=False, inplace=True)
display(chosen_name_movies_bottom_df)

In [None]:
# Fill movie_impact_df with the BOTTOM 5 groups of the chosen name
compute_bottom_movies(
    chosen_name,
    chosen_name_movies_bottom_df,
    chosen_name_movies_df,
    movie_impact_df,
)
display(movie_impact_df)

In [None]:
# keep only the candidate movies for the top 5 most famous movies with no significant impact
chosen_name_movies_insign_df = chosen_name_movies_df.query('p_value > 0.1').copy(deep=True)

# Rank them by popularity (number of votes), the most popular last. This is the same criterion as
# in the loop over all the names; the walkthrough previously ranked by rating first.
chosen_name_movies_insign_df.sort_values(by=['numVotes'], inplace=True)
display(chosen_name_movies_insign_df)

In [None]:
# Fill movie_impact_df with the INSIGN 5 movies of the chosen name
compute_insign_movies(
    chosen_name,
    chosen_name_movies_insign_df,
    movie_impact_df,
)
display(movie_impact_df)

In [None]:
# Count the movies that appear more than once in movie_impact_df
print("number of duplicates in movie_impact_df :", movie_impact_df.duplicated(subset=['movie_id']).sum())

### Generalize for all the names to generate `movie_impact_df`

In [None]:
# Distinct names found in the first index level (char_words) of name_by_movie_with_info
names = list(name_by_movie_with_info.index.get_level_values(0).unique())
print(names)
print(f"Number of names : {len(names)}")

In [None]:
# Start from an empty dataframe: the compute_* functions append to `movie_impact_df`, so the rows
# left by the optional single-name section (or by a previous run of this cell) would otherwise
# be duplicated.
movie_impact_df = pd.DataFrame(columns=['name', 'status', 'group_year', 'movie_id'])

# `enumerate` provides the progress counter without shadowing the built-in `iter`
for n_treated, name in enumerate(names):
    print(f"Number of name treated: {n_treated}", end='\r', flush=True)

    # movies with a character called `name`, ranked by slope_change (largest variation last)
    chosen_name_movies_df = name_by_movie_with_info.loc[name, :].copy(deep=True)
    chosen_name_movies_df.sort_values(by=['slope_change'], inplace=True)

    # TOP: significant positive variations; the selected movies are removed from chosen_name_movies_df
    chosen_name_movies_top_df = chosen_name_movies_df.query('(slope_change > 0) and (p_value < 0.1)').copy(deep=True)
    compute_top_movies(name, chosen_name_movies_top_df, chosen_name_movies_df, movie_impact_df)

    # BOTTOM: significant negative variations among the remaining movies, the most negative one last
    chosen_name_movies_bottom_df = chosen_name_movies_df.query('(slope_change <= 0) and (p_value < 0.1)').copy(deep=True)
    chosen_name_movies_bottom_df.sort_values(by=['slope_change'], ascending=False, inplace=True)
    compute_bottom_movies(name, chosen_name_movies_bottom_df, chosen_name_movies_df, movie_impact_df)

    # INSIGN: remaining movies without significant variation, the most voted one last
    chosen_name_movies_insign_df = chosen_name_movies_df.query('p_value > 0.1').copy(deep=True)
    chosen_name_movies_insign_df.sort_values(by=['numVotes'], inplace=True)
    compute_insign_movies(name, chosen_name_movies_insign_df, movie_impact_df)

In [None]:
# Look at ten random rows of the result
display(movie_impact_df.sample(n=10))

### (Optional) Import `movie_impact` dataframe to avoid time-consuming computation

In [None]:
# import the movie_impact_df to avoid the time-consuming computation of movie_impact_df
# keep_default_na=False: by default read_csv parses strings such as 'nan', 'null' or 'NA' as
# missing values, which would turn a lowercase name spelled that way into NaN.
imported_movie_impact_df = pd.read_csv(
    os.path.join(processed_website_data_folder, 'movie_impacts.csv'),
    keep_default_na=False,
)
imported_movie_impact_df.set_index(['name', 'status'], inplace=True)
display(imported_movie_impact_df)

In [None]:
# Does any row of the imported dataframe have a missing name?
contains_nan = imported_movie_impact_df.reset_index()['name'].isna().any()

print(
    "The `name` column contains NaN values."
    if contains_nan
    else "The `name` column does not contain NaN values."
)

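The `name` column may contain NaN values: by default, `pd.read_csv` parses strings such as `nan` or `null` as missing values, so a name spelled that way is read back as NaN. Let's remove any such rows.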

In [None]:
# Remove the rows without a name, then use the imported dataframe as movie_impact_df
print(f"Length of imported_movie_impact_df before dropping NaN values : {len(imported_movie_impact_df)}")
imported_movie_impact_df = (
    imported_movie_impact_df
    .reset_index()
    .dropna(subset=['name'])
    .set_index(['name', 'status'])
)
print(f"Length of imported_movie_impact_df after dropping NaN values : {len(imported_movie_impact_df)}")

display(imported_movie_impact_df)
movie_impact_df = imported_movie_impact_df.copy()

In [None]:
# Same check again, after the clean-up
contains_nan = imported_movie_impact_df.reset_index()['name'].isna().any()

if not contains_nan:
    print("The `name` column does not contain NaN values.")
else:
    print("The `name` column contains NaN values.")

## Compute `name_per_year` dataframe

The dataframe `name_per_year` is a subset of the baby-name dataframe (`baby_name_df`) computed in [preprocessing.ipynb](./preprocessing.ipynb).

In [None]:
# Work on a copy of the baby-name data without the `number` column
name_per_year = baby_name_df.drop(columns='number')
display(name_per_year.sample(2))
print(f"Is the indexing of name_per_year unique? {name_per_year.index.is_unique}")
print(f"Length of the name_per_year_df : {len(name_per_year)}")

In [None]:
# Round the percentages to 6 decimals (smaller csv file for the web) and lowercase the names
name_per_year = (
    name_per_year
    .assign(percentage=lambda frame: frame['percentage'].round(6))
    .reset_index()
    .assign(name=lambda frame: frame['name'].str.lower())
    .set_index(['name', 'year'])
)

display(name_per_year.sample(2))
print(f"Length of the name_per_year_df : {len(name_per_year)}")

In [None]:
# Keep only the names that appear in movie_impact_df
name_per_year_filtered = name_per_year.loc[
    list(movie_impact_df.reset_index()['name'].unique())
].copy()
name_per_year = name_per_year_filtered.copy()

In [None]:
# Order the rows by name, then by year, ready for plotting
name_per_year = name_per_year.sort_values(by=['name', 'year'])
display(name_per_year.head(15))

Compute the number of names in the three dataframes in order to compare

In [None]:
# Distinct names in each of the three dataframes, for comparison
names_in_name_per_year = name_per_year.reset_index()['name'].unique().tolist()
names_in_movie_impact = movie_impact_df.reset_index()['name'].unique().tolist()
names_in_name_by_movie_with_info = name_by_movie_with_info.reset_index()['char_words'].unique().tolist()

print(f"Number of unique names in name_per_year = {len(names_in_name_per_year)}")
print(f"Number of unique names in movie_impact_df = {len(names_in_movie_impact)}")
print(f"Number of unique names in name_by_movie_with_info = {len(names_in_name_by_movie_with_info)}")

In [None]:
# Names of name_by_movie_with_info that have no entry in name_per_year
values_only_in_name_by_movie = (
    name_by_movie_with_info
    .reset_index()
    .loc[lambda frame: ~frame['char_words'].isin(name_per_year.reset_index()['name']), 'char_words']
    .unique()
)

print(f"Number of unique names in name_by_movie_with_info  = {len(name_by_movie_with_info.reset_index()['char_words'].unique())}")

# Show the missing names and how many there are
print("Values in name_by_movie_with_info but not present in name_per_year:")
print(values_only_in_name_by_movie)
print(f"Number of names missing  = {len(values_only_in_name_by_movie)}")

All good :)

## Compute `movies` dataframe
This dataset contains the information relative to the movie given its `wiki_ID`. It is a subset of the dataframe `movie_df` computed in [preprocessing.ipynb](./preprocessing.ipynb).

In [None]:
# Restrict movie_df to the movies referenced by movie_impact_df
simplified_movie_df = movie_df.loc[list(movie_impact_df['movie_id'].unique())].copy()

print(f"Number of movies in movie_impact_df : {len(movie_impact_df['movie_id'].unique())}")
print(f"Number of movies kept in movie_df: {len(simplified_movie_df)}")

# Drop the columns that are not exported and shorten the remaining column names
simplified_movie_df = (
    simplified_movie_df
    .drop(columns=['month', 'revenue'])
    .rename(columns={'averageRating': 'rating', 'numVotes': 'votes', 'IMDB_ID': 'imdb_id'})
)

display(simplified_movie_df.sample(2))

## Export the three datasets

In [None]:
# Make sure the output folder exists before writing into it
os.makedirs(processed_website_data_folder, exist_ok=True)

# Export movie_impacts.csv
# The four columns are selected explicitly: when movie_impact_df was computed in this notebook it
# has a plain integer index, and reset_index() would otherwise write it as an extra 'index' column.
movie_impact_export = movie_impact_df.reset_index()[['name', 'status', 'group_year', 'movie_id']]
display(movie_impact_export.sample(2))
movie_impact_export.to_csv(os.path.join(processed_website_data_folder, 'movie_impacts.csv'), index=False)

# Export name_per_year.csv
display(name_per_year.reset_index().sample(2))
name_per_year.reset_index().to_csv(os.path.join(processed_website_data_folder, 'name_per_year.csv'), index=False)

# Export movies.csv
display(simplified_movie_df.reset_index().sample(2))
simplified_movie_df.reset_index().to_csv(os.path.join(processed_website_data_folder, 'movies.csv'), index=False)