In [1]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import numpy as np

# Prepare data for interactive plot
In this notebook, three csv files are generated for the website interactive plot.
- **movie_impact.csv** : contains the movies to add as scatter points on the baby_name curve for positive, negative and insignificant variations. <br>
        Columns: `['name', 'status', 'group_year', 'movie_id']` <br>
        `'status'` contains either `'t'`, `'b'`, `'i'` for positive, negative and insignificant variation respectively <br>
        `'group_year'` correspond to the year where the movie must be displayed

- **name_per_year.csv** : contains the baby names data for each name and year. The data is in fraction of the total newborns of the year (in percent).<br>
Columns: `['name', 'year', 'percentage']`

- **movies.csv** : contains the movie informations and objects needed to construct the links to the poster and the IMDB wesite. <br>
Columns: `['movie_id', 'mov_name', 'year', 'vote', 'rating', 'poster_url' 'imdb_id]`

All of them are simplified version of the dataframe computed in [preprocessing.ipynb](./preprocessing.ipynb), in order to simplify the plot generation on the website and to avoid slowing down the website with too much data.

In [2]:
folder_processed_data_path = './processed_data/'
processed_website_data_folder = './processed_data/website/'

We import the dataframe computed in preprocessing.

In [3]:
# import the name by movie
name_by_movie_df = pd.read_csv(os.path.join(folder_processed_data_path, 'name_by_movie_ordered_pvalue_10_5_df.csv'))
name_by_movie_df.set_index(['wiki_ID', 'char_words', 'gender'], inplace=True)
display(name_by_movie_df.sample(2))

# import the movie dataframe
movie_df = pd.read_csv(os.path.join(folder_processed_data_path, 'movie_df.csv'))
movie_df.set_index(['wiki_ID'], inplace=True)
display(movie_df.sample(2))

# import the name_by_movie dataframe
baby_name_df = pd.read_csv(os.path.join(folder_processed_data_path, 'baby_name_df.csv'))
baby_name_df.set_index(['name', 'year'], inplace=True)
display(baby_name_df.sample(2))

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,order,t_stat,p_value,slope_change
wiki_ID,char_words,gender,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
24133239,Hortense,F,9.0,-0.230267,0.822111,6.2e-05
1871167,Stu,M,9.0,,,0.0


Unnamed: 0_level_0,mov_name,year,month,revenue,numVotes,averageRating,poster_url,IMDB_ID
wiki_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
6159203,Occupation 101,2006,3.0,,1839,8.2,,tt0807956
10206962,Wonder Boys,2000,2.0,33426588.0,65971,7.2,/2MdBltbu2e2sbKmoK1vXNBZOkdB.jpg,tt0185014


Unnamed: 0_level_0,Unnamed: 1_level_0,number,percentage
name,year,Unnamed: 2_level_1,Unnamed: 3_level_1
Lewis,1903,336,0.091102
Kinan,1997,1,2.8e-05


## Compute `movie_impact` dataframe

The dataframe `movie_impact` is a subset of the dataframe `name_by_movie` computed in [preprocessing.ipynb](./preprocessing.ipynb).

### Filtering of `name_by_movie`  : importance of the role

Lets remove the character names with an minor role in the movie. We will keep only the characters with an order higher or equal to the median of the set of order in the movie. First, let's compute the number of order for each movies and merge it the `name_by_movie` dataframe.

In [4]:
# compute the order count of each movie
name_by_movie_groupby_id = name_by_movie_df.groupby(['wiki_ID'])
order_counts = name_by_movie_groupby_id['order'].count()

# convert from serie to dataframe and rename the columns
order_counts_df = order_counts.to_frame()
order_counts_df.rename(columns={"order": "nb_order"}, inplace=True)
display(order_counts_df.head())

Unnamed: 0_level_0,nb_order
wiki_ID,Unnamed: 1_level_1
3217,11
3746,15
3837,18
3947,16
4227,2


In [5]:
# merge the order count with name_by_movie_df
name_by_movie_order_counts = name_by_movie_df.reset_index().merge(order_counts_df, on='wiki_ID', how='left')
display(name_by_movie_order_counts.sample(2))

Unnamed: 0,wiki_ID,char_words,gender,order,t_stat,p_value,slope_change,nb_order
35398,1640736,Kerry,F,10.0,-0.212025,0.835965,0.000137,17
165075,32163620,Claire,F,7.0,-0.424148,0.679634,0.00079,9


In [6]:
# check a special case with tommy character in Titanic (1997) (wiki_ID = 52371) (toy_story_1995_ID = 53085)
display(name_by_movie_order_counts[name_by_movie_order_counts['wiki_ID'] == 52371])
len(name_by_movie_order_counts)

Unnamed: 0,wiki_ID,char_words,gender,order,t_stat,p_value,slope_change,nb_order
1891,52371,Caledon,M,,,,0.0,28
1892,52371,Dewitt,F,,-0.414992,0.686126,5.9e-05,28
1893,52371,Rose,F,1.0,-1.552929,0.14872,0.002796,28
1894,52371,De,M,,0.785788,0.448587,-0.000182,28
1895,52371,Jack,M,0.0,-1.987322,0.072358,0.012744,28
1896,52371,Charles,M,,-3.093865,0.010215,0.008629,28
1897,52371,Bruce,M,10.0,-2.545008,0.027236,0.00157,28
1898,52371,Calvert,F,12.0,0.216687,0.832419,-1.7e-05,28
1899,52371,Ismay,M,10.0,,,0.0,28
1900,52371,Brown,F,3.0,0.0,1.0,0.0,28


172906

In [7]:
# add a boolean attribute to check if all the order are nan
all_nan_mask = name_by_movie_order_counts.groupby('wiki_ID')['order'].transform(lambda x: x.isna().all())
name_by_movie_order_counts['all_nan'] = all_nan_mask
display(name_by_movie_order_counts)
display(name_by_movie_order_counts[name_by_movie_order_counts['wiki_ID'] == 53085])

Unnamed: 0,wiki_ID,char_words,gender,order,t_stat,p_value,slope_change,nb_order,all_nan
0,3217,Gold,,6.0,,,0.000000,11,False
1,3217,Linda,F,7.0,-0.416786,0.684853,0.000673,11,False
2,3217,Henry,M,4.0,-2.031668,0.067058,0.002513,11,False
3,3217,Duke,M,4.0,0.579441,0.573967,-0.000113,11,False
4,3217,Warrior,M,9.0,,,0.000000,11,False
...,...,...,...,...,...,...,...,...,...
172901,37478048,Ajay,M,9.0,-0.819213,0.430057,0.000130,1,False
172902,37501922,Murphy,F,3.0,1.264175,0.232298,-0.000365,4,False
172903,37501922,Hunter,M,1.0,-7.083089,0.000020,0.036603,4,False
172904,37501922,John,M,1.0,-2.172964,0.052505,0.012557,4,False


Unnamed: 0,wiki_ID,char_words,gender,order,t_stat,p_value,slope_change,nb_order,all_nan
1998,53085,Sheriff,M,,0.229327,0.822823,-7e-06,0,True
1999,53085,Hannah,F,,0.42509,0.678968,-0.004957,0,True
2000,53085,Sid,M,,-0.191613,0.851537,1.1e-05,0,True
2001,53085,Woody,M,,-0.070205,0.945291,7e-06,0,True
2002,53085,Rex,M,,-0.685448,0.50725,0.000164,0,True
2003,53085,Buzz,M,,0.0,1.0,0.0,0,True
2004,53085,Bo,F,,0.064099,0.950041,-3.2e-05,0,True
2005,53085,Andy,M,,-1.964199,0.075273,0.002058,0,True


In [8]:
# filter to keep only half of the characters with the lowest order, or keep all of them if all the order of the movie are NaN
name_by_movie_merged_important_role = name_by_movie_order_counts[name_by_movie_order_counts['all_nan'] | (name_by_movie_order_counts['order'] <= (name_by_movie_order_counts['nb_order']/2))].copy(deep=True)
print(f"length of the dataframe : {len(name_by_movie_df)} -> {len(name_by_movie_merged_important_role)}")

length of the dataframe : 172906 -> 100988


In [9]:
# check case with tommy charcater in titanic
name_by_movie_merged_important_role[name_by_movie_merged_important_role['wiki_ID'] == 52371]

Unnamed: 0,wiki_ID,char_words,gender,order,t_stat,p_value,slope_change,nb_order,all_nan
1893,52371,Rose,F,1.0,-1.552929,0.14872,0.002796,28,False
1895,52371,Jack,M,0.0,-1.987322,0.072358,0.012744,28,False
1897,52371,Bruce,M,10.0,-2.545008,0.027236,0.00157,28,False
1898,52371,Calvert,F,12.0,0.216687,0.832419,-1.7e-05,28,False
1899,52371,Ismay,M,10.0,,,0.0,28,False
1900,52371,Brown,F,3.0,0.0,1.0,0.0,28,False
1901,52371,Bobby,M,13.0,-1.158331,0.271259,0.001117,28,False
1905,52371,Fabrizio,M,14.0,-0.994141,0.34152,7.1e-05,28,False
1907,52371,Lizzy,F,12.0,-3.342963,0.006559,0.0001,28,False
1909,52371,Andrews,M,6.0,-0.561223,0.585897,4.5e-05,28,False


In [10]:
# set this dataframe as name_by_movie_df
name_by_movie_df = name_by_movie_merged_important_role.copy(deep=True)
print(f"Length of the name_by_movie_df : {len(name_by_movie_df)}")
print(f"Is the indexing of name_by_movie_web unique? {name_by_movie_df.index.is_unique}")

Length of the name_by_movie_df : 100988
Is the indexing of name_by_movie_web unique? True


### Preprocessing `name_by_movie`
In this section, we remove useless columns and NaN values, add the movie release year for each character and set the name in lowercase.

In [11]:
# Remove useless 't_stat' column
name_by_movie_web = name_by_movie_df.reset_index().copy(deep=True)
name_by_movie_web.drop(columns=['index', 't_stat', 'order', 'nb_order'], inplace=True)
display(name_by_movie_web.sample(2))

Unnamed: 0,wiki_ID,char_words,gender,p_value,slope_change,all_nan
5960,240357,Eve,F,0.441925,-0.000356,False
42002,5566075,King,M,0.623983,0.000308,False


In [12]:
# drop gender column
len_before = len(name_by_movie_web)

name_by_movie_web.reset_index(inplace=True, drop=True)
name_by_movie_web.drop_duplicates(subset=['wiki_ID', 'char_words'], keep='first', inplace=True)
name_by_movie_web.drop(columns=['gender'], inplace=True)
name_by_movie_web.set_index(['char_words', 'wiki_ID'], inplace=True)

len_after = len(name_by_movie_web)
print(f"length of the dataframe : {len_before} -> {len_after}")
print(f"Is the indexing of name_by_movie_web unique? {name_by_movie_web.index.is_unique}")
display(name_by_movie_web.sample(2))

length of the dataframe : 100988 -> 97615
Is the indexing of name_by_movie_web unique? True


Unnamed: 0_level_0,Unnamed: 1_level_0,p_value,slope_change,all_nan
char_words,wiki_ID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Jason,16766524,0.51208,-0.00379,False
Marcie,1075228,0.554638,-0.001315,False


In [13]:
# drop rows with NaN values in 'p_value' column
len_before = len(name_by_movie_web)
name_by_movie_web.dropna(subset=['p_value'], inplace=True)
len_after = len(name_by_movie_web)
print(f"length : {len_before} -> {len_after}")

length : 97615 -> 86657


In [14]:
# Add needed info about movie for deducing the TOP/BOTTOM/INSIGN movies
needed_movie_info = movie_df.reset_index()[['wiki_ID', 'year', 'averageRating', 'numVotes']].copy(deep=True)

len_before_merge = len(name_by_movie_web)
name_by_movie_with_info = name_by_movie_web.reset_index().merge(needed_movie_info, on='wiki_ID', how='left').copy(deep=True) # merge the release year into the name_by_movie_web dataframe
len_after_merge = len(name_by_movie_with_info)
print(f"length : {len_before_merge} -> {len_after_merge}")

name_by_movie_with_info.set_index(['char_words', 'wiki_ID'], inplace=True)
display(name_by_movie_with_info.sample(2))

length : 86657 -> 86657


Unnamed: 0_level_0,Unnamed: 1_level_0,p_value,slope_change,all_nan,year,averageRating,numVotes
char_words,wiki_ID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Barbara,3828251,0.057605,0.001707,False,1999,5.5,35469
Richard,29840120,0.167792,0.002697,False,2012,5.3,1151


In [15]:
# set the char_words in lowercase
name_by_movie_with_info.reset_index(inplace=True)
name_by_movie_with_info['char_words'] = name_by_movie_with_info['char_words'].str.lower()
name_by_movie_with_info.set_index(['char_words', 'wiki_ID'], inplace=True)
print(f"Length of name_by_movie_with_info : {len(name_by_movie_with_info)}")
display(name_by_movie_with_info.sample(2))

Length of name_by_movie_with_info : 86657


Unnamed: 0_level_0,Unnamed: 1_level_0,p_value,slope_change,all_nan,year,averageRating,numVotes
char_words,wiki_ID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
anderson,16305665,0.400544,0.000197,True,1986,6.6,1235
money,11745640,0.811578,1.6e-05,True,2005,5.4,73


In [16]:
# create the movie_impact dataframe
columns = ['name', 'status', 'group_year', 'movie_id']
movie_impact_df = pd.DataFrame(columns=columns)
display(movie_impact_df)

Unnamed: 0,name,status,group_year,movie_id


### Three functions to compute TOP/BOTTOM/INSIGN
These three functions are used to compute the `movie_impact_df`

In [17]:
def compute_top_movies(name, chosen_name_movies_top_df, chosen_name_movies_df, movie_impact_df):
    for i in range(5): # from top1 to top5
        if(not chosen_name_movies_top_df.empty):
            # get the year with the highest positive variation
            top_i_year = chosen_name_movies_top_df.iloc[-1].year.astype(int)

            # get the movies released close to the year with the highest positive variation [top_i_year-3, top_i_year+3]
            top_i_year_chosen_name_movies = chosen_name_movies_df.query(f'year >= {top_i_year - 3} and year <= {top_i_year + 3}').copy(deep=True)

            # keep only the three most popular movies
            top_i_year_chosen_name_movies.sort_values(by=['numVotes'], ascending=False, inplace=True)
            top_i_3_chosen_name_movies = top_i_year_chosen_name_movies.iloc[:3].copy(deep=True)

            # add the the three popular movies to the movie_impact_df
            for index, row in top_i_3_chosen_name_movies.iterrows():
                movie_impact_df.loc[len(movie_impact_df)] = {'name': name, 'status': 't', 'group_year': top_i_year, 'movie_id': index}

            # remove also the movies release in the frame [-5 years, +5 years] of the top1_year
            chosen_name_movies_top_df.query(f'year < {top_i_year - 5} or year > {top_i_year + 5}', inplace=True)

            # remove the found movies from the chosen_name_movies_df to avoid picking them again for next iterations
            chosen_name_movies_df.drop(top_i_3_chosen_name_movies.index, inplace=True)

In [18]:
def compute_bottom_movies(name, chosen_name_movies_bottom_df, chosen_name_movies_df, movie_impact_df):
    for i in range(5): # from bottom1 to bottom5
        if(not chosen_name_movies_bottom_df.empty):
            # get the year with the highest negative variation
            bottom_i_year = chosen_name_movies_bottom_df.iloc[-1].year.astype(int)

            # get the movies released close to the year with the highest negative variation [bottom_i_year-3, bottom_i_year+3]
            bottom_i_year_chosen_name_movies = chosen_name_movies_df.query(f'year >= {bottom_i_year - 3} and year <= {bottom_i_year + 3}').copy(deep=True)

            # keep only the three most popular movies
            bottom_i_year_chosen_name_movies.sort_values(by=['numVotes'], ascending=False, inplace=True)
            bottom_i_3_chosen_name_movies = bottom_i_year_chosen_name_movies.iloc[:3].copy(deep=True)

            # add the the three popular movies to the movie_impact_df
            for index, row in bottom_i_3_chosen_name_movies.iterrows():
                movie_impact_df.loc[len(movie_impact_df)] = {'name': name, 'status': 'b', 'group_year': bottom_i_year, 'movie_id': index}

            # remove also the movies release in the frame [-5 years, +5 years] of the bottom_i_year
            chosen_name_movies_bottom_df.query(f'year < {bottom_i_year - 5} or year > {bottom_i_year + 5}', inplace=True)

            # remove the found movies from the chosen_name_movies_df to avoid picking them again for next iterations
            chosen_name_movies_df.drop(bottom_i_3_chosen_name_movies.index, inplace=True)

In [19]:
def compute_insign_movies(name, chosen_name_movies_insign_df, movie_impact_df):
    for i in range(5): # from insign1 to insign5
        if(not chosen_name_movies_insign_df.empty):
            # get the movies release the year with the highest rating and number of votes
            insign_i_year = chosen_name_movies_insign_df.iloc[-1].year.astype(int)
            insign_i_year_chosen_name_movies = chosen_name_movies_insign_df.query(f'year == {insign_i_year}').copy(deep=True)

            # add the most popular movie to the movie_impact_df
            movie_impact_df.loc[len(movie_impact_df)] = {'name': name, 'status': 'i', 'group_year': insign_i_year, 'movie_id': insign_i_year_chosen_name_movies.index[0]}

            # remove the found movies from the chosen_name_movies_top_df to avoid picking them again for next iterations
            chosen_name_movies_insign_df.drop(insign_i_year_chosen_name_movies.index, inplace=True)

### (Optional) Special case for a single name

In [20]:
# given name by user
chosen_name = 'elizabeth'

In [21]:
# get the movie containing the chosen name
chosen_name_movies_df = name_by_movie_with_info.loc[chosen_name, :].copy(deep=True)
display(chosen_name_movies_df)

Unnamed: 0_level_0,p_value,slope_change,all_nan,year,averageRating,numVotes
wiki_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
44366,0.839461,0.001306,False,2000,6.7,35871
45126,0.502248,-0.007198,False,1969,7.4,7973
57820,0.719843,-0.002252,False,2001,8.0,833937
62592,0.988123,0.000095,False,1940,7.9,72466
73375,0.016718,0.018199,False,1935,7.8,51978
...,...,...,...,...,...,...
34194778,0.949114,0.000425,False,1939,6.4,348
34377191,0.085567,0.011863,False,2012,6.9,116791
35610105,0.085567,0.011863,False,2012,5.9,12242
35641836,0.085567,0.011863,False,2012,6.0,14921


In [22]:
# sort by slope_change to have the movies ranked by how big the variation is in the movie release year
chosen_name_movies_df.sort_values(by=['slope_change'], inplace=True)

# keep only the candidate movies for the TOP 5 movies with positive impact
chosen_name_movies_top_df = chosen_name_movies_df.query('(slope_change > 0) and (p_value < 0.1)').copy(deep=True)
display(chosen_name_movies_top_df)

Unnamed: 0_level_0,p_value,slope_change,all_nan,year,averageRating,numVotes
wiki_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
34377191,0.085567,0.011863,False,2012,6.9,116791
25129766,0.085567,0.011863,False,2012,6.2,276685
31775043,0.085567,0.011863,False,2012,7.0,631028
31960682,0.085567,0.011863,False,2012,7.1,182886
30265620,0.085567,0.011863,False,2012,7.9,536962
35610105,0.085567,0.011863,False,2012,5.9,12242
35641836,0.085567,0.011863,False,2012,6.0,14921
15280057,0.073254,0.012947,False,1949,6.6,2129
14001162,0.073254,0.012947,False,1949,6.4,477
183744,0.019626,0.01487,False,1946,7.2,1237


In [23]:
# compute TOP 5 and add it to the movie_impact_df
compute_top_movies(chosen_name, chosen_name_movies_top_df, chosen_name_movies_df, movie_impact_df)
display(movie_impact_df)

Unnamed: 0,name,status,group_year,movie_id
0,elizabeth,t,1974,1358544
1,elizabeth,t,1974,3048950
2,elizabeth,t,1974,223904
3,elizabeth,t,1935,73376
4,elizabeth,t,1935,73375
5,elizabeth,t,1935,600132
6,elizabeth,t,1946,2279601
7,elizabeth,t,1946,7590753
8,elizabeth,t,1946,15280057
9,elizabeth,t,2012,25080984


In [24]:
# keep only the candidate movies for the BOTTOM 5 movies with negative impact
chosen_name_movies_bottom_df = chosen_name_movies_df.query('(slope_change <= 0) and (p_value < 0.1)').copy(deep=True)
# sort by slope_change in the opposite direction to have the highest negative variation at the end
chosen_name_movies_bottom_df.sort_values(by=['slope_change'], ascending=True, inplace=True)
display(chosen_name_movies_bottom_df)

Unnamed: 0_level_0,p_value,slope_change,all_nan,year,averageRating,numVotes
wiki_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1709616,0.010131,-0.059634,True,1910,6.4,4596
29836834,0.021689,-0.034561,False,1982,4.7,311
23924255,0.02703,-0.034334,False,1983,5.0,936
10821674,0.074134,-0.02962,True,1984,6.3,2244
15752401,0.08051,-0.026071,False,1912,4.6,152
11770563,0.011267,-0.024274,False,1967,5.8,1580
1096485,0.011267,-0.024274,False,1967,6.4,2375


In [25]:
# compute BOTTOM 5 and add it to the movie_impact_df
compute_bottom_movies(chosen_name, chosen_name_movies_bottom_df, chosen_name_movies_df, movie_impact_df)
display(movie_impact_df)

Unnamed: 0,name,status,group_year,movie_id
0,elizabeth,t,1974,1358544
1,elizabeth,t,1974,3048950
2,elizabeth,t,1974,223904
3,elizabeth,t,1935,73376
4,elizabeth,t,1935,73375
5,elizabeth,t,1935,600132
6,elizabeth,t,1946,2279601
7,elizabeth,t,1946,7590753
8,elizabeth,t,1946,15280057
9,elizabeth,t,2012,25080984


In [26]:
# keep only the candidate movies for the top 5 most famous movies with no significant impact
chosen_name_movies_insign_df = chosen_name_movies_df.query('p_value > 0.1').copy(deep=True)

# rank them by popularity
chosen_name_movies_insign_df.sort_values(by=['averageRating', 'numVotes'], inplace=True)
display(chosen_name_movies_insign_df)

Unnamed: 0_level_0,p_value,slope_change,all_nan,year,averageRating,numVotes
wiki_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
32393429,0.265850,0.007394,False,2010,2.8,406
7473588,0.675362,0.002837,False,2006,3.0,538
6393328,0.215221,-0.016166,False,1989,3.4,566
2918858,0.865804,-0.001095,False,2005,3.7,38767
600820,0.361306,-0.010645,False,1988,3.8,16313
...,...,...,...,...,...,...
57820,0.719843,-0.002252,False,2001,8.0,833937
381810,0.719843,-0.002252,False,2001,8.1,118584
29454281,0.150199,0.009833,False,2011,8.1,481920
321496,0.639223,-0.002984,False,2003,8.1,1180659


In [27]:
# compute INSIGN 5 and add it to the movie_impact_df
compute_insign_movies(chosen_name, chosen_name_movies_insign_df, movie_impact_df)
display(movie_impact_df)

Unnamed: 0,name,status,group_year,movie_id
0,elizabeth,t,1974,1358544
1,elizabeth,t,1974,3048950
2,elizabeth,t,1974,223904
3,elizabeth,t,1935,73376
4,elizabeth,t,1935,73375
5,elizabeth,t,1935,600132
6,elizabeth,t,1946,2279601
7,elizabeth,t,1946,7590753
8,elizabeth,t,1946,15280057
9,elizabeth,t,2012,25080984


In [28]:
# check if there movie duplicates in the movie_impact_df
print(f"number of duplicates in movie_impact_df : {movie_impact_df.duplicated(subset=['movie_id']).sum()}")

number of duplicates in movie_impact_df : 0


### Generalize for all the names to generate `movie_impact_df`

In [29]:
# get all the names in the name_by_movie_with_info dataframe
names = name_by_movie_with_info.index.get_level_values(0).unique().tolist()
print(names)
print(f"Number of names : {len(names)}")

['henry', 'duke', 'ash', 'williams', 'lord', 'sheila', 'arthur', 'leon', 'rick', 'rachael', 'roy', 'bryant', 'sebastian', 'lamarr', 'bart', 'lyle', 'olson', 'von', 'governor', 'johnson', 'taggart', 'lili', 'jim', 'dorothy', 'frank', 'jeffrey', 'beaumont', 'ben', 'barbara', 'sandy', 'booth', 'lyndon', 'barry', 'summers', 'kimberly', 'benny', 'buffy', 'jennifer', 'merrick', 'amilyn', 'edward', 'campbell', 'william', 'king', 'princess', 'robert', 'hamish', 'wallace', 'bruce', 'isabelle', 'knox', 'james', 'dent', 'alexander', 'gordon', 'vicki', 'the', 'harvey', 'vale', 'alfred', 'robin', 'max', 'ivy', 'wilson', 'grayson', 'dick', 'meridian', 'chase', 'eric', 'parks', 'jerry', 'charles', 'leland', 'mary', 'herbert', 'raymond', 'susan', 'foster', 'walter', 'carter', 'emily', 'norton', 'lien', 'bai', 'may', 'lo', 'li', 'fox', 'te', 'jade', 'jen', 'master', 'shu', 'sir', 'tsai', 'bo', 'yu', 'jennie', 'sybil', 'harold', 'lindsay', 'sam', 'montague', 'aubrey', 'andrew', 'elliott', 'lionel', 'col

In [30]:
iter = 0
for name in names:
    print(f"Number of name treated: {iter}", end='\r', flush=True)
    # get the movie containing the chosen name
    chosen_name_movies_df = name_by_movie_with_info.loc[name, :].copy(deep=True)

    # sort the movies by slope_change, get movies with significant p_value and positive slope_change and compute the top 5
    chosen_name_movies_df.sort_values(by=['slope_change'], inplace=True)
    chosen_name_movies_top_df = chosen_name_movies_df.query('(slope_change > 0) and (p_value < 0.1)').copy(deep=True)
    compute_top_movies(name, chosen_name_movies_top_df, chosen_name_movies_df, movie_impact_df)

    # sort the movies by slope_change, get movies with significant p_value and negative slope_change and compute the bottom 5
    chosen_name_movies_bottom_df = chosen_name_movies_df.query('(slope_change <= 0) and (p_value < 0.1)').copy(deep=True)
    chosen_name_movies_bottom_df.sort_values(by=['slope_change'], ascending=False, inplace=True)
    compute_bottom_movies(name, chosen_name_movies_bottom_df, chosen_name_movies_df, movie_impact_df)

    # get the remaining movies and filter to keep only the insignificant ones and compute the insign 5
    chosen_name_movies_insign_df = chosen_name_movies_df.query('p_value > 0.1').copy(deep=True)
    chosen_name_movies_insign_df.sort_values(by=['numVotes'], inplace=True)
    compute_insign_movies(name, chosen_name_movies_insign_df, movie_impact_df)
    
    iter = iter + 1

Number of name treated: 8912

In [31]:
# see the result
display(movie_impact_df.sample(10))

Unnamed: 0,name,status,group_year,movie_id
15079,wanda,t,1980,5579274
22812,nikhil,i,1977,11135832
28161,regen,i,2006,6133862
16592,bowman,i,1997,666923
467,edward,i,1944,2648856
20861,dorinda,i,1943,5566354
8586,louisa,i,1993,5836236
27907,lon,i,2006,6618673
2583,gale,i,1996,113549
13064,bond,i,2012,12252836


### (Optional) Import `movie_impact` dataframe to avoid timeconsuming computation

In [32]:
# import the movie_impact_df to avoid the time-consuming computation of movie_impact_df
imported_movie_impact_df = pd.read_csv(os.path.join(processed_website_data_folder, 'movie_impacts.csv'))
imported_movie_impact_df.set_index(['name', 'status'], inplace=True)
display(imported_movie_impact_df)

Unnamed: 0_level_0,Unnamed: 1_level_0,index,group_year,movie_id
name,status,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
henry,t,0,2009,3213691
henry,t,1,2009,5016250
henry,t,2,2009,29446866
henry,t,3,1933,73488
henry,t,4,1933,73375
...,...,...,...,...
calisto,i,32548,2011,36404162
licia,i,32549,2012,36424869
zelma,i,32550,1939,36598217
kirstie,i,32551,2007,36956792


In [33]:
# Check for NaN values in the name column
contains_nan = imported_movie_impact_df.reset_index()['name'].isna().any()

if contains_nan:
    print("The `name` column contains NaN values.")
else:
    print("The `name` column does not contain NaN values.")

The `name` column contains NaN values.


For some reason, the name column contains NaN values. Let's remove them.

In [34]:
# drop NaN values in the name column
print(f"Length of imported_movie_impact_df before dropping NaN values : {len(imported_movie_impact_df)}")
imported_movie_impact_df.reset_index(inplace=True)
imported_movie_impact_df.dropna(subset=['name'], inplace=True)
imported_movie_impact_df.set_index(['name', 'status'], inplace=True)
print(f"Length of imported_movie_impact_df after dropping NaN values : {len(imported_movie_impact_df)}")

display(imported_movie_impact_df)

# uncomment the next line to save the imported dataframe as movie_impact_df
# movie_impact_df = imported_movie_impact_df.copy(deep=True)

Length of imported_movie_impact_df before dropping NaN values : 32553
Length of imported_movie_impact_df after dropping NaN values : 32548


Unnamed: 0_level_0,Unnamed: 1_level_0,index,group_year,movie_id
name,status,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
henry,t,0,2009,3213691
henry,t,1,2009,5016250
henry,t,2,2009,29446866
henry,t,3,1933,73488
henry,t,4,1933,73375
...,...,...,...,...
calisto,i,32548,2011,36404162
licia,i,32549,2012,36424869
zelma,i,32550,1939,36598217
kirstie,i,32551,2007,36956792


In [35]:
# Check for NaN values in the name column
contains_nan = imported_movie_impact_df.reset_index()['name'].isna().any()

if contains_nan:
    print("The `name` column contains NaN values.")
else:
    print("The `name` column does not contain NaN values.")

The `name` column does not contain NaN values.


## Compute `name_per_year` dataframe

The dataframe `name_per_year` is a subset of the dataframe `name_by_movie` computed in [preprocessing.ipynb](./preprocessing.ipynb).

In [36]:
# remove number column
name_per_year = baby_name_df.copy(deep=True)
name_per_year.drop(columns='number', inplace=True)
display(name_per_year.sample(2))
print(f"Is the indexing of name_per_year unique? {name_per_year.index.is_unique}")
print(f"Length of the name_per_year_df : {len(name_per_year)}")

Unnamed: 0_level_0,Unnamed: 1_level_0,percentage
name,year,Unnamed: 2_level_1
Fay,1939,0.02482
Annalycia,2011,2.8e-05


Is the indexing of name_per_year unique? True
Length of the name_per_year_df : 1903290


In [37]:
# round the percentage values to reduce the size of the future csv file for the web
name_per_year['percentage'] = name_per_year['percentage'].round(6)

# set the names in lowercases
name_per_year.reset_index(inplace=True)
name_per_year['name'] = name_per_year['name'].str.lower()
name_per_year.set_index(['name', 'year'], inplace=True)

display(name_per_year.sample(2))
print(f"Length of the name_per_year_df : {len(name_per_year)}")

Unnamed: 0_level_0,Unnamed: 1_level_0,percentage
name,year,Unnamed: 2_level_1
mykenna,1996,2.8e-05
brigid,2005,0.00282


Length of the name_per_year_df : 1903290


In [38]:
# filter the name_per_year_df to keep only the names in the movie_impact_df
name_per_year_filtered = name_per_year.loc[movie_impact_df.reset_index()['name'].unique().tolist(), :].copy(deep=True)
name_per_year = name_per_year_filtered.copy(deep=True)

In [39]:
# Let's sort the dataframe name_per_year to anticipate the ploting
name_per_year.sort_values(by=['name', 'year'], inplace=True)
display(name_per_year.head(15))

Unnamed: 0_level_0,Unnamed: 1_level_0,percentage
name,year,Unnamed: 2_level_1
aadam,1987,2.8e-05
aadam,1988,2.8e-05
aadam,1993,8.2e-05
aadam,1994,5.5e-05
aadam,1995,5.6e-05
aadam,1996,2.8e-05
aadam,1997,2.8e-05
aadam,1998,0.000112
aadam,1999,2.8e-05
aadam,2000,5.5e-05


Compute the number of names in the three dataframes in order to compare

In [40]:
# compute how many unique names are there in the dataframes to compare
names_in_name_per_year = name_per_year.reset_index()['name'].unique().tolist()
print(f"Number of unique names in name_per_year = {len(names_in_name_per_year)}")
names_in_movie_impact = movie_impact_df.reset_index()['name'].unique().tolist()
print(f"Number of unique names in movie_impact_df = {len(names_in_movie_impact)}")
names_in_name_by_movie_with_info = name_by_movie_with_info.reset_index()['char_words'].unique().tolist()
print(f"Number of unique names in name_by_movie_with_info = {len(names_in_name_by_movie_with_info)}")

Number of unique names in name_per_year = 8913
Number of unique names in movie_impact_df = 8913
Number of unique names in name_by_movie_with_info = 8913


In [41]:
# Identify names in name_by_movie not present in name_per_year
values_only_in_name_by_movie = name_by_movie_with_info.reset_index()[~name_by_movie_with_info.reset_index()['char_words'].isin(name_per_year.reset_index()['name'])]['char_words'].unique()

print(f"Number of unique names in name_by_movie_with_info  = {len(name_by_movie_with_info.reset_index()['char_words'].unique())}")

# Display the result
print("Values in name_by_movie_with_info but not present in name_per_year:")
print(values_only_in_name_by_movie)
print(f"Number of names missing  = {len(values_only_in_name_by_movie)}")

Number of unique names in name_by_movie_with_info  = 8913
Values in name_by_movie_with_info but not present in name_per_year:
[]
Number of names missing  = 0


All good :)

## Compute `movies` dataframe
This dataset contains the information relative to the movie given its `wiki_ID`. It is a subset of the dataframe `movie_df` computed in [preprocessing.ipynb](./preprocessing.ipynb).

In [42]:
# keep only the movies in the movie_df that are in the movie_impact_df
simplified_movie_df = movie_df.loc[movie_impact_df['movie_id'].unique().tolist(), :].copy(deep=True)

print(f"Number of movies in movie_impact_df : {len(movie_impact_df['movie_id'].unique())}")
print(f"Number of movies kept in movie_df: {len(simplified_movie_df)}")

# remove useless columns
simplified_movie_df.drop(columns=['month', 'revenue'], inplace=True)
simplified_movie_df.rename(columns={'averageRating': 'rating', 'numVotes': 'votes', 'IMDB_ID': 'imdb_id'}, inplace=True)

display(simplified_movie_df.sample(2))

Number of movies in movie_impact_df : 14774
Number of movies kept in movie_df: 14774


Unnamed: 0_level_0,mov_name,year,votes,rating,poster_url,imdb_id
wiki_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
16913663,Constantine's Sword,2007,866,7.2,/4eJiXgbL14DIyYhYPD7WdVNlzzf.jpg,tt0902270
13869704,Donga Sachinollu,2008,8,4.4,,tt1583239


## Export the three datasets

In [43]:
# Export movie_impact.csv
display(movie_impact_df.reset_index().sample(2))
movie_impact_df.reset_index().to_csv(os.path.join(processed_website_data_folder, 'movie_impacts.csv'), index=False)

# Export name_per_year.csv
display(name_per_year.reset_index().sample(2))
name_per_year.reset_index().to_csv(os.path.join(processed_website_data_folder, 'name_per_year.csv'), index=False)

# Export movie.csv
display(simplified_movie_df.reset_index().sample(2))
simplified_movie_df.reset_index().to_csv(os.path.join(processed_website_data_folder, 'movies.csv'), index=False)

Unnamed: 0,index,name,status,group_year,movie_id
5069,5069,jenny,i,1994,21883163
19793,19793,forest,t,2006,2877925


Unnamed: 0,name,year,percentage
519152,summer,1995,0.05945
353301,majestic,2012,0.000198


Unnamed: 0,wiki_ID,mov_name,year,votes,rating,poster_url,imdb_id
5645,29375144,The High Cost of Living,2011,2165,6.7,/umY4Q0nSC1YTYf4psQi54C03RzB.jpg,tt1479388
9566,644845,EdTV,1999,44585,6.1,/wCOgmBCmfNUYbx10XuFfhvdzc7W.jpg,tt0131369
