In [1]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import numpy as np

# Prepare data for interactive part of the website

In [2]:
folder_processed_data_path = './processed_data/'
processed_website_data_folder = './processed_data/website/'

In [3]:
# import the name by movie
name_by_movie_df = pd.read_csv(os.path.join(folder_processed_data_path, 'name_by_movie_ordered_pvalue_10_5_df.csv'))
name_by_movie_df.set_index(['wiki_ID', 'char_words', 'gender'], inplace=True)
display(name_by_movie_df.sample(2))

# import the movie dataframe
movie_df = pd.read_csv(os.path.join(folder_processed_data_path, 'movie_df.csv'))
movie_df.set_index(['wiki_ID'], inplace=True)
display(movie_df.sample(2))

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,order,t_stat,p_value,slope_change
wiki_ID,char_words,gender,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
8619888,Barton,M,,-0.277361,0.786646,1.8e-05
31567587,Dominic,M,0.0,2.116171,0.057954,-0.011738


Unnamed: 0_level_0,mov_name,year,month,revenue,numVotes,averageRating,poster_url,IMDB_ID
wiki_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
33002544,Shehzaade,1989,,,52,4.5,,tt0359965
12289584,Skeleton Man,2004,,,2075,2.1,/73dDpqzg7hUlnSFJJe95ISnxA1R.jpg,tt0372832


## Compute `movie_impact` dataframe

### Filtering of `name_by_movie`  : importance of the role

Now, lets remove the character names with an minor role in the movie. We will keep only the characters with a order higher or equal to the median of the set of order in the movie. First, let's compute the number of order for each movies.

In [4]:
name_by_movie_merged_groupby = name_by_movie_df.groupby(['wiki_ID'])
name_by_movie_merged_nunique = name_by_movie_merged_groupby['order'].count()

name_by_movie_merged_nunique = name_by_movie_merged_nunique.to_frame()
name_by_movie_merged_nunique.rename(columns={"order": "nb_order"}, inplace=True)
display(name_by_movie_merged_nunique.head())

Unnamed: 0_level_0,nb_order
wiki_ID,Unnamed: 1_level_1
3217,11
3746,15
3837,18
3947,16
4227,2


In [5]:
print(f"length of name_by_movie_df : {len(name_by_movie_df)}")

length of name_by_movie_df : 172906


In [6]:
print(f"length of name_by_movie_df (before): {len(name_by_movie_df)}")
name_by_movie_merged_with_nunique = name_by_movie_df.reset_index().merge(name_by_movie_merged_nunique, on='wiki_ID', how='left')
display(name_by_movie_merged_with_nunique.sample(2))
print(f"length of name_by_movie_merged_with_nunique (after): {len(name_by_movie_merged_with_nunique)}")

length of name_by_movie_df (before): 172906


Unnamed: 0,wiki_ID,char_words,gender,order,t_stat,p_value,slope_change,nb_order
165091,32171599,Ravi,M,0.0,0.231803,0.820947,-6e-05,2
94777,11223100,Heart,M,0.0,,,,2


length of name_by_movie_merged_with_nunique (after): 172906


In [7]:
display(name_by_movie_merged_with_nunique.sample(2))
print(f"length of name_by_movie_merged_with_nunique : {len(name_by_movie_merged_with_nunique)}")

Unnamed: 0,wiki_ID,char_words,gender,order,t_stat,p_value,slope_change,nb_order
145035,25367068,Laura,F,3.0,-3.22691,0.008059,0.015288,4
140661,24190433,Hogan,M,2.0,,,,14


length of name_by_movie_merged_with_nunique : 172906


In [8]:
# check case with tommy charcater in titanic
name_by_movie_merged_with_nunique[name_by_movie_merged_with_nunique['wiki_ID'] == 52371]

Unnamed: 0,wiki_ID,char_words,gender,order,t_stat,p_value,slope_change,nb_order
1891,52371,Caledon,M,,,,,28
1892,52371,Dewitt,F,,,,,28
1893,52371,Rose,F,1.0,-1.552929,0.14872,0.002796,28
1894,52371,De,M,,,,,28
1895,52371,Jack,M,0.0,-1.987322,0.072358,0.012744,28
1896,52371,Charles,M,,-3.093865,0.010215,0.008629,28
1897,52371,Bruce,M,10.0,-2.545008,0.027236,0.00157,28
1898,52371,Calvert,F,12.0,0.216687,0.832419,-1.7e-05,28
1899,52371,Ismay,M,10.0,,,,28
1900,52371,Brown,F,3.0,,,,28


In [9]:
# filtering
name_by_movie_merged_important_role = name_by_movie_merged_with_nunique[name_by_movie_merged_with_nunique['order'] <= (name_by_movie_merged_with_nunique['nb_order']/2)].copy(deep=True)
# name_by_movie_merged_important_role.drop(columns=['nb_order'], inplace=True)
print(f"length of the initial dataframe : {len(name_by_movie_df)}")
print(f"length of the filtered dataframe : {len(name_by_movie_merged_important_role)}")
display(name_by_movie_merged_important_role.sample(2))

length of the initial dataframe : 172906
length of the filtered dataframe : 86642


Unnamed: 0,wiki_ID,char_words,gender,order,t_stat,p_value,slope_change,nb_order
15117,473268,Ed,M,1.0,0.355571,0.728886,-4.5e-05,13
164136,31893898,Freddie,M,1.0,-0.288221,0.778536,9.9e-05,16


In [10]:
# check case with tommy charcater in titanic
name_by_movie_merged_important_role[name_by_movie_merged_important_role['wiki_ID'] == 52371]

Unnamed: 0,wiki_ID,char_words,gender,order,t_stat,p_value,slope_change,nb_order
1893,52371,Rose,F,1.0,-1.552929,0.14872,0.002796,28
1895,52371,Jack,M,0.0,-1.987322,0.072358,0.012744,28
1897,52371,Bruce,M,10.0,-2.545008,0.027236,0.00157,28
1898,52371,Calvert,F,12.0,0.216687,0.832419,-1.7e-05,28
1899,52371,Ismay,M,10.0,,,,28
1900,52371,Brown,F,3.0,,,,28
1901,52371,Bobby,M,13.0,-1.158331,0.271259,0.001117,28
1905,52371,Fabrizio,M,14.0,,,,28
1907,52371,Lizzy,F,12.0,,,,28
1909,52371,Andrews,M,6.0,-0.561223,0.585897,4.5e-05,28


In [11]:
name_by_movie_df = name_by_movie_merged_important_role.copy(deep=True)
print(f"Length of the name_by_movie_df : {len(name_by_movie_df)}")

Length of the name_by_movie_df : 86642


### Preprocessing `name_by_movie`

In [12]:
# Remove useless columns
name_by_movie_web = name_by_movie_df.reset_index().copy(deep=True)
name_by_movie_web.drop(columns=['t_stat'], inplace=True)
display(name_by_movie_web.sample(2))

Unnamed: 0,index,wiki_ID,char_words,gender,order,p_value,slope_change,nb_order
36128,66704,5079733,Beecher,F,0.0,,,7
54515,104375,14261811,Musette,F,2.0,,,10


In [13]:
# drop gender column
len_before = len(name_by_movie_web)

name_by_movie_web.reset_index(inplace=True, drop=True)
name_by_movie_web.drop_duplicates(subset=['wiki_ID', 'char_words'], keep='first', inplace=True)
name_by_movie_web.drop(columns=['gender'], inplace=True)
name_by_movie_web.set_index(['char_words', 'wiki_ID'], inplace=True)

len_after = len(name_by_movie_web)
print(f"length : {len_before} -> {len_after}")
print(f"Is the indexing of name_by_movie_web unique? {name_by_movie_web.index.is_unique}")
display(name_by_movie_web.sample(2))

length : 86642 -> 83633
Is the indexing of name_by_movie_web unique? True


Unnamed: 0_level_0,Unnamed: 1_level_0,index,order,p_value,slope_change,nb_order
char_words,wiki_ID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Carlos,24207129,140930,4.0,0.185631,-0.007314,10
Oren,2444093,45222,2.0,0.149138,0.000211,7


In [14]:
# drop NaN values
len_before = len(name_by_movie_web)
name_by_movie_web.dropna(subset=['p_value'], inplace=True)
len_after = len(name_by_movie_web)
print(f"length : {len_before} -> {len_after}")

length : 83633 -> 61957


In [15]:
# Add the year of release of the movie to the name_by_movie_web dataframe

needed_movie_info = movie_df.reset_index()[['wiki_ID', 'year', 'averageRating', 'numVotes']].copy(deep=True)

len_before_merge = len(name_by_movie_web)
name_by_movie_with_info = name_by_movie_web.reset_index().merge(needed_movie_info, on='wiki_ID', how='left').copy(deep=True) # merge the release year into the name_by_movie_web dataframe
len_after_merge = len(name_by_movie_with_info)
print(f"length : {len_before_merge} -> {len_after_merge}")

name_by_movie_with_info.set_index(['char_words', 'wiki_ID'], inplace=True)
display(name_by_movie_with_info.sample(2))

length : 61957 -> 61957


Unnamed: 0_level_0,Unnamed: 1_level_0,index,order,p_value,slope_change,nb_order,year,averageRating,numVotes
char_words,wiki_ID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Charlie,3733842,58019,6.0,0.530251,0.000538,13,1966,7.6,21555
Willie,21575683,128330,1.0,0.670954,0.000502,4,1990,5.3,253


In [16]:
# set the char_names in lowercase
name_by_movie_with_info.reset_index(inplace=True)
name_by_movie_with_info['char_words'] = name_by_movie_with_info['char_words'].str.lower()
name_by_movie_with_info.set_index(['char_words', 'wiki_ID'], inplace=True)
print(f"Length of name_by_movie_with_info : {len(name_by_movie_with_info)}")
display(name_by_movie_with_info.sample(2))

Length of name_by_movie_with_info : 61957


Unnamed: 0_level_0,Unnamed: 1_level_0,index,order,p_value,slope_change,nb_order,year,averageRating,numVotes
char_words,wiki_ID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
andrea,28149365,154101,7.0,0.967168,-0.000357,15,1969,5.1,292
julie,1011468,25841,2.0,0.162156,0.002235,36,2002,7.2,148700


In [17]:
# create the movie_impact dataframe
columns = ['name', 'status', 'group_year', 'movie_id']
movie_impact_df = pd.DataFrame(columns=columns)
display(movie_impact_df)

Unnamed: 0,name,status,group_year,movie_id


### Functions to compute TOP/BOTTOM/INSIGN

In [18]:
def compute_top_movies(name, chosen_name_movies_top_df, chosen_name_movies_df, movie_impact_df):
    for i in range(5): # from top1 to top5
        if(not chosen_name_movies_top_df.empty):
            # get the year with the highest positive variation
            top_i_year = chosen_name_movies_top_df.iloc[-1].year.astype(int)

            # get the movies released close to the year with the highest positive variation [top_i_year-3, top_i_year+3]
            top_i_year_chosen_name_movies = chosen_name_movies_df.query(f'year >= {top_i_year - 3} and year <= {top_i_year + 3}').copy(deep=True)

            # keep only the three most popular movies
            top_i_year_chosen_name_movies.sort_values(by=['numVotes'], ascending=False, inplace=True)
            top_i_3_chosen_name_movies = top_i_year_chosen_name_movies.iloc[:3].copy(deep=True)

            # add the the three popular movies to the movie_impact_df
            for index, row in top_i_3_chosen_name_movies.iterrows():
                movie_impact_df.loc[len(movie_impact_df)] = {'name': name, 'status': 't', 'group_year': top_i_year, 'movie_id': index}

            # # remove the found movies from the chosen_name_movies_top_df to avoid picking them again for next iterations
            # chosen_name_movies_top_df.drop(top_i_year_chosen_name_movies.index, inplace=True, errors='ignore')

            # remove also the movies release in the frame [-5 years, +5 years] of the top1_year
            chosen_name_movies_top_df.query(f'year < {top_i_year - 5} or year > {top_i_year + 5}', inplace=True)

            # remove the found movies from the chosen_name_movies_df to avoid picking them again for next iterations
            chosen_name_movies_df.drop(top_i_3_chosen_name_movies.index, inplace=True)

In [19]:
def compute_bottom_movies(name, chosen_name_movies_bottom_df, chosen_name_movies_df, movie_impact_df):
    for i in range(5): # from bottom1 to bottom5
        if(not chosen_name_movies_bottom_df.empty):
            # get the year with the hgighest negative variation
            bottom_i_year = chosen_name_movies_bottom_df.iloc[-1].year.astype(int)

            # get the movies released close to the year with the highest negative variation [bottom_i_year-3, bottom_i_year+3]
            bottom_i_year_chosen_name_movies = chosen_name_movies_df.query(f'year >= {bottom_i_year - 3} and year <= {bottom_i_year + 3}').copy(deep=True)

            # keep only the three most popular movies
            bottom_i_year_chosen_name_movies.sort_values(by=['numVotes'], ascending=False, inplace=True)
            bottom_i_3_chosen_name_movies = bottom_i_year_chosen_name_movies.iloc[:3].copy(deep=True)

            # add the the three popular movies to the movie_impact_df
            for index, row in bottom_i_3_chosen_name_movies.iterrows():
                movie_impact_df.loc[len(movie_impact_df)] = {'name': name, 'status': 'b', 'group_year': bottom_i_year, 'movie_id': index}

            # # remove the found movies from the chosen_name_movies_bottom_df to avoid picking them again for next iterations
            # chosen_name_movies_bottom_df.drop(bottom_i_year_chosen_name_movies.index, inplace=True, errors='ignore')

            # remove also the movies release in the frame [-5 years, +5 years] of the bottom_i_year
            chosen_name_movies_bottom_df.query(f'year < {bottom_i_year - 5} or year > {bottom_i_year + 5}', inplace=True)

            # remove the found movies from the chosen_name_movies_df to avoid picking them again for next iterations
            chosen_name_movies_df.drop(bottom_i_3_chosen_name_movies.index, inplace=True)

In [20]:
def compute_insign_movies(name, chosen_name_movies_insign_df, movie_impact_df):
    for i in range(5): # from insign1 to insign5
        if(not chosen_name_movies_insign_df.empty):
            # get the movies release the year with the highest rating and number of votes
            insign_i_year = chosen_name_movies_insign_df.iloc[-1].year.astype(int)
            insign_i_year_chosen_name_movies = chosen_name_movies_insign_df.query(f'year == {insign_i_year}').copy(deep=True)

            # # keep only the three most popular movies
            # insign_i_3_chosen_name_movies = insign_i_year_chosen_name_movies.iloc[:3].copy(deep=True)

            # add the most popular movie to the movie_impact_df
            # for index, row in [insign_i_year_chosen_name_movies.iloc[0]]:
            movie_impact_df.loc[len(movie_impact_df)] = {'name': name, 'status': 'i', 'group_year': insign_i_year, 'movie_id': insign_i_year_chosen_name_movies.index[0]}

            # remove the found movies from the chosen_name_movies_top_df to avoid picking them again for next iterations
            chosen_name_movies_insign_df.drop(insign_i_year_chosen_name_movies.index, inplace=True)

            # remove also the movies release in the frame [-5 years, +5 years] of the top1_year
            chosen_name_movies_insign_df.query(f'year < {insign_i_year - 5} or year > {insign_i_year + 5}', inplace=True)

### Special case for a single name

In [None]:
# given name by user
chosen_name = 'elizabeth'

In [None]:
# get the movie containing the chosen name
chosen_name_movies_df = name_by_movie_with_info.loc[chosen_name, :].copy(deep=True)
display(chosen_name_movies_df)

In [None]:
chosen_name_movies_df.sort_values(by=['slope_change'], inplace=True)
chosen_name_movies_top_df = chosen_name_movies_df.query('(slope_change > 0) and (p_value < 0.1)').copy(deep=True)
display(chosen_name_movies_top_df)

Do the loop to do the computation for top1 to top5

In [None]:
compute_top_movies(chosen_name, chosen_name_movies_top_df, movie_impact_df)

display(movie_impact_df)

In [None]:
chosen_name_movies_bottom_df = chosen_name_movies_df.query('(slope_change <= 0) and (p_value < 0.1)').copy(deep=True)
chosen_name_movies_bottom_df.sort_values(by=['slope_change'], ascending=False, inplace=True)
display(chosen_name_movies_bottom_df)

In [None]:
compute_bottom_movies(chosen_name, chosen_name_movies_bottom_df, movie_impact_df)

display(movie_impact_df)

In [None]:
remaining_chosen_name_movies_df = pd.concat([chosen_name_movies_top_df, chosen_name_movies_bottom_df])
chosen_name_movies_insign_df = remaining_chosen_name_movies_df.query('p_value > 0.1').copy(deep=True)
chosen_name_movies_insign_df.sort_values(by=['averageRating', 'numVotes'], inplace=True)
display(chosen_name_movies_insign_df)

In [None]:
compute_insign_movies(chosen_name, chosen_name_movies_insign_df, movie_impact_df)
display(movie_impact_df)

In [None]:
# check if there movie duplicates in the movie_impact_df
print(f"number of duplicates in movie_impact_df : {movie_impact_df.duplicated(subset=['movie_id']).sum()}")

### Generalize for all the names

In [21]:
# get all the names in the name_by_movie_with_info dataframe
names = name_by_movie_with_info.index.get_level_values(0).unique().tolist()
print(names)
print(f"Number of names : {len(names)}")

['henry', 'duke', 'williams', 'sheila', 'arthur', 'leon', 'rick', 'rachael', 'roy', 'bryant', 'sebastian', 'lamarr', 'bart', 'lyle', 'von', 'johnson', 'lili', 'jim', 'dorothy', 'frank', 'jeffrey', 'ben', 'barbara', 'sandy', 'lyndon', 'barry', 'kimberly', 'benny', 'jennifer', 'merrick', 'edward', 'campbell', 'william', 'king', 'princess', 'robert', 'wallace', 'bruce', 'isabelle', 'james', 'alexander', 'gordon', 'vicki', 'harvey', 'alfred', 'robin', 'max', 'ivy', 'wilson', 'grayson', 'chase', 'eric', 'jerry', 'charles', 'leland', 'mary', 'herbert', 'raymond', 'susan', 'foster', 'walter', 'carter', 'emily', 'norton', 'lien', 'may', 'jade', 'master', 'sir', 'bo', 'yu', 'jennie', 'sybil', 'harold', 'lindsay', 'sam', 'aubrey', 'andrew', 'elliott', 'lionel', 'jack', 'general', 'major', 'johann', 'scott', 'cheryl', 'marion', 'alice', 'victor', 'nick', 'lee', 'tania', 'ed', 'jake', 'joe', 'bobby', 'annie', 'laura', 'fiona', 'tom', 'scarlett', 'david', 'matthew', 'henrietta', 'gerald', 'carrie',

In [22]:
iter = 0
for name in names:
    print(f"Number of name treated: {iter}", end='\r', flush=True)
    # get the movie containing the chosen name
    chosen_name_movies_df = name_by_movie_with_info.loc[name, :].copy(deep=True)

    # sort the movies by slope_change, get movies with significant p_value and positive slope_change and compute the top 5
    chosen_name_movies_df.sort_values(by=['slope_change'], inplace=True)
    chosen_name_movies_top_df = chosen_name_movies_df.query('(slope_change > 0) and (p_value < 0.1)').copy(deep=True)
    # display(chosen_name_movies_top_df)
    compute_top_movies(name, chosen_name_movies_top_df, chosen_name_movies_df, movie_impact_df)

    # sort the movies by slope_change, get movies with significant p_value and negative slope_change and compute the bottom 5
    chosen_name_movies_bottom_df = chosen_name_movies_df.query('(slope_change <= 0) and (p_value < 0.1)').copy(deep=True)
    chosen_name_movies_bottom_df.sort_values(by=['slope_change'], ascending=False, inplace=True)
    compute_bottom_movies(name, chosen_name_movies_bottom_df, chosen_name_movies_df, movie_impact_df)

    # get the remaining movies and filter to keep only the insignificant ones and compute the insign 5
    # remaining_chosen_name_movies_df = pd.concat([chosen_name_movies_top_df, chosen_name_movies_bottom_df])
    chosen_name_movies_insign_df = chosen_name_movies_df.query('p_value > 0.1').copy(deep=True)
    chosen_name_movies_insign_df.sort_values(by=['averageRating', 'numVotes'], inplace=True)
    compute_insign_movies(name, chosen_name_movies_insign_df, movie_impact_df)
    
    iter = iter + 1

Number of name treated: 4619

In [23]:
# movie_impact_df.set_index(['name', 'status'], inplace=True)
display(movie_impact_df.sample(10))

# Export DataFrame to a CSV file in the processed data folder
# movie_impact_df.reset_index(drop=True).sort_values(by=['name', 'status']).to_csv(os.path.join(processed_website_data_folder, 'movie_impact.csv'), index=False)

Unnamed: 0,name,status,group_year,movie_id
6138,gideon,i,1993,11471094
13401,nicolas,i,2002,2563837
15885,conway,i,2005,2891096
14420,charly,i,1998,27894922
18114,landis,i,2003,23470870
14282,carlyle,i,1983,19250997
18921,keefe,i,1982,20917102
9885,laurel,i,2006,23961694
1729,michael,t,1938,62111
1583,kurt,t,2009,28386410


### Import `movie_impact` dataframe to avoid timeconsuming computation

In [35]:
# import the movie_impact_df to avoid the time-consuming computation of movie_impact_df
imported_movie_impact_df = pd.read_csv(os.path.join(processed_website_data_folder, 'movie_impacts.csv'))
imported_movie_impact_df.set_index(['name', 'status'], inplace=True)
display(imported_movie_impact_df)

Unnamed: 0_level_0,Unnamed: 1_level_0,group_year,movie_id
name,status,Unnamed: 2_level_1,Unnamed: 3_level_1
henry,t,2009,3213691
henry,t,2009,5016250
henry,t,2009,29446866
henry,t,1933,73488
henry,t,1933,73375
...,...,...,...
doria,i,1955,36348682
charmaine,i,2010,36478252
malek,i,2011,36563324
zelma,i,1939,36598217


In [36]:
# Check for NaN values in the name column
contains_nan = imported_movie_impact_df.reset_index()['name'].isna().any()

if contains_nan:
    print("The `name` column contains NaN values.")
else:
    print("The `name` column does not contain NaN values.")

The `name` column contains NaN values.


In [37]:
# drop NaN values in the name column
print(f"Length of imported_movie_impact_df before dropping NaN values : {len(imported_movie_impact_df)}")
imported_movie_impact_df.reset_index(inplace=True)
imported_movie_impact_df.dropna(subset=['name'], inplace=True)
imported_movie_impact_df.set_index(['name', 'status'], inplace=True)
print(f"Length of imported_movie_impact_df after dropping NaN values : {len(imported_movie_impact_df)}")

display(imported_movie_impact_df)
movie_impact_df = imported_movie_impact_df.copy(deep=True)

Length of imported_movie_impact_df before dropping NaN values : 19542
Length of imported_movie_impact_df after dropping NaN values : 19538


Unnamed: 0_level_0,Unnamed: 1_level_0,group_year,movie_id
name,status,Unnamed: 2_level_1,Unnamed: 3_level_1
henry,t,2009,3213691
henry,t,2009,5016250
henry,t,2009,29446866
henry,t,1933,73488
henry,t,1933,73375
...,...,...,...
doria,i,1955,36348682
charmaine,i,2010,36478252
malek,i,2011,36563324
zelma,i,1939,36598217


## Compute `name_per_year` dataframe

In [25]:
# import the babynames dataframe
name_per_year = pd.read_csv(os.path.join(folder_processed_data_path, 'baby_name_df.csv'))
name_per_year.drop(columns='number', inplace=True)
name_per_year.set_index(['name', 'year'], inplace=True)
print(f"Is the indexing of name_per_year unique? {name_per_year.index.is_unique}")
display(name_per_year.sample(2))
print(f"Length of the name_per_year_df : {len(name_per_year)}")

Is the indexing of name_per_year unique? True


Unnamed: 0_level_0,Unnamed: 1_level_0,percentage
name,year,Unnamed: 2_level_1
Jimmi,1979,0.000184
Charleigh,2003,0.001438


Length of the name_per_year_df : 1903290


In [26]:
# round the percentage values to reduce the size of the future csv file for the web
name_per_year['percentage'] = name_per_year['percentage'].round(6)

# set the names in lowercases
name_per_year.reset_index(inplace=True)
name_per_year['name'] = name_per_year['name'].str.lower()
name_per_year.set_index(['name', 'year'], inplace=True)

display(name_per_year.sample(2))
print(f"Length of the name_per_year_df : {len(name_per_year)}")

Unnamed: 0_level_0,Unnamed: 1_level_0,percentage
name,year,Unnamed: 2_level_1
noah,1917,0.010907
cooper,1957,0.000144


Length of the name_per_year_df : 1903290


In [27]:
# filter the name_per_year_df to keep only the names in the movie_impact_df
name_per_year_filtered = name_per_year.loc[movie_impact_df.reset_index()['name'].unique().tolist(), :].copy(deep=True)
name_per_year = name_per_year_filtered.copy(deep=True)

In [28]:
# Let's sort the dataframe name_per_year to anticipate the ploting
name_per_year.sort_values(by=['name', 'year'], inplace=True)
display(name_per_year.head(15))

Unnamed: 0_level_0,Unnamed: 1_level_0,percentage
name,year,Unnamed: 2_level_1
aadam,1987,2.8e-05
aadam,1988,2.8e-05
aadam,1993,8.2e-05
aadam,1994,5.5e-05
aadam,1995,5.6e-05
aadam,1996,2.8e-05
aadam,1997,2.8e-05
aadam,1998,0.000112
aadam,1999,2.8e-05
aadam,2000,5.5e-05


Let's check if the names present in name_per_year dataframe are also present in the name_by_movie dataframe

In [29]:
# Identify names in name_per_year not present in name_by_movie
values_only_in_name_per_year = name_per_year.reset_index()[~name_per_year.reset_index()['name'].isin(name_by_movie_with_info.reset_index()['char_words'])]['name'].unique()

print(f"Number of unique names in name_per_year  = {len(name_per_year.reset_index()['name'].unique())}")

# Display the result
print("Values in name_per_year but not present in name_by_movie:")
print(values_only_in_name_per_year)
print(f"Number of names missing  = {len(values_only_in_name_per_year)}")

Number of unique names in name_per_year  = 4620
Values in name_per_year but not present in name_by_movie:
[]
Number of names missing  = 0


In [30]:
# Identify names in name_by_movie not present in name_per_year
values_only_in_name_by_movie = name_by_movie_with_info.reset_index()[~name_by_movie_with_info.reset_index()['char_words'].isin(name_per_year.reset_index()['name'])]['char_words'].unique()

print(f"Number of unique names in name_by_movie_with_info  = {len(name_by_movie_with_info.reset_index()['char_words'].unique())}")

# Display the result
print("Values in name_by_movie_with_info but not present in name_per_year:")
print(values_only_in_name_by_movie)
print(f"Number of names missing  = {len(values_only_in_name_by_movie)}")

Number of unique names in name_by_movie_with_info  = 4620
Values in name_by_movie_with_info but not present in name_per_year:
[]
Number of names missing  = 0


Compute the number of names in the three dataframes in order to compare

In [31]:
# compute how many unique names are there in the dataframes to compare
names_in_name_per_year = name_per_year.reset_index()['name'].unique().tolist()
print(f"Number of unique names in name_per_year = {len(names_in_name_per_year)}")
names_in_movie_impact = movie_impact_df.reset_index()['name'].unique().tolist()
print(f"Number of unique names in movie_impact_df = {len(names_in_movie_impact)}")
names_in_name_by_movie_with_info = name_by_movie_with_info.reset_index()['char_words'].unique().tolist()
print(f"Number of unique names in name_by_movie_with_info = {len(names_in_name_by_movie_with_info)}")

Number of unique names in name_per_year = 4620
Number of unique names in movie_impact_df = 4620
Number of unique names in name_by_movie_with_info = 4620


## Compute `movies` dataframe
This dataset contains the information relative to the movie given its `wiki_ID`.

In [32]:
print(f"Number of movies in movie_impact_df : {len(movie_impact_df['movie_id'].unique())}")
# keep only the movies in the movie_df that are in the movie_impact_df
simplified_movie_df = movie_df.loc[movie_impact_df['movie_id'].unique().tolist(), :].copy(deep=True)
print(f"Number of movies kept in movie_df: {len(simplified_movie_df)}")

# remove useless columns
simplified_movie_df.drop(columns=['month', 'revenue'], inplace=True)
simplified_movie_df.rename(columns={'averageRating': 'rating', 'numVotes': 'votes', 'IMDB_ID': 'imdb_id'}, inplace=True)

display(simplified_movie_df.sample(2))

Number of movies in movie_impact_df : 10232
Number of movies kept in movie_df: 10232


Unnamed: 0_level_0,mov_name,year,votes,rating,poster_url,imdb_id
wiki_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
715207,Sleepaway Camp,1983,36853,6.2,/2XW01n41n0Fi8jSn0ShTRtL0HwM.jpg,tt0086320
8454631,China Moon,1994,7466,6.3,/qt2cjzyRUNuKEliK7p7LCpYAgRW.jpg,tt0109417


## Export the three datasets

In [33]:
# Export movie_impact.csv
display(movie_impact_df.reset_index().sample(2))
movie_impact_df.reset_index(drop=True).to_csv(os.path.join(processed_website_data_folder, 'movie_impacts.csv'), index=False)

# Export name_per_year.csv
display(name_per_year.reset_index().sample(2))
name_per_year.reset_index().to_csv(os.path.join(processed_website_data_folder, 'name_per_year.csv'), index=False)

# Export movie.csv
display(simplified_movie_df.reset_index().sample(2))
simplified_movie_df.reset_index().to_csv(os.path.join(processed_website_data_folder, 'movies.csv'), index=False)

Unnamed: 0,index,name,status,group_year,movie_id
14886,14886,tara,i,2012,14413223
18999,18999,myrna,i,1969,22288927


Unnamed: 0,name,year,percentage
78780,chris,1958,0.141645
92900,cory,1952,0.001543


Unnamed: 0,wiki_ID,mov_name,year,votes,rating,poster_url,imdb_id
4669,18968628,The Cartier Affair,1984,252,4.9,/vscv4j9kebeYed55nBarhjyPCXf.jpg,tt0087036
4188,229047,Enough,2002,46622,5.7,/w3GZUYczsU9Mlfij9MCEo0OqoQ6.jpg,tt0278435
