In [2]:
import pandas as pd

# prenoms inventés
mean_diff = pd.read_csv("data/clean/influenced_names_means_diff.csv")
influenced_meandiff = mean_diff[mean_diff["Influence"] > 0]
# removing common identification mistakes such as "the", "a" or "Mr"
influenced_meandiff = influenced_meandiff[~influenced_meandiff["Character Name"].isin(["the", "a", "Mr"])]
print("Number of influenced names with mean diff: ", len(influenced_meandiff))
influenced_meandiff.head()



Number of influenced names with mean diff:  1585


Unnamed: 0,Wikipedia ID,Movie Name,Year,Character Name,Count,Full name,Normalized_name,Influence
0,31186339,the hunger games,2012,Katniss,4,Katniss,KATNISS,inf
1,22144721,iron man 2,2010,Stark,3,Howard Stark,STARK,inf
2,146947,spider-man,2002,Osborn,6,Harry Osborn,OSBORN,inf
4,443972,hook,1991,Banning,16,Peter Banning,BANNING,inf
5,537416,ace ventura: when nature calls,1995,Abbot,2,Grand Abbot,ABBOT,inf


In [3]:
# tous les prénoms influencés
prophet = pd.read_csv("data/clean/influenced_names_prophet.csv")
influenced_prophet = prophet[prophet["Influenced"] > 0]
# removing common identification mistakes such as "the", "a" or "Mr"
influenced_prophet = influenced_prophet[~influenced_prophet["Character Name"].isin(["the", "a", "Mr"])]
print("Number of influenced names with mean diff: ", len(influenced_prophet))
influenced_prophet.head()

Number of influenced names with mean diff:  432


Unnamed: 0,Wikipedia ID,Movie Name,Year,Character Name,Count,Full name,Normalized_name,Mean Difference,Influenced
0,451866,mission: impossible ii,2000,Ethan,18,Ethan Hunt,ETHAN,15725.466667,1
1,633411,the avengers,1998,Emma,15,Emma Peel,EMMA,14985.966667,1
2,3727473,man on fire,1987,Samantha,4,"Samantha ""Sam"" Balletto",SAMANTHA,14453.5,1
3,347000,suspiria,1977,Sarah,15,Sarah,SARAH,14372.466667,1
5,320401,barton fink,1991,Taylor,3,Audrey Taylor,TAYLOR,13892.1,1


In [4]:
#remove every non numeric value in mean diff
mean_diff = mean_diff[mean_diff["Influence"].apply(lambda x: str(x).replace(".", "").isdigit())]
threshold = mean_diff['Influence'].quantile(0.75)

significant_names = mean_diff[mean_diff['Influence'] > threshold]
print("Number of significant names:", len(significant_names))
significant_names.head()

Number of significant names: 371


Unnamed: 0,Wikipedia ID,Movie Name,Year,Character Name,Count,Full name,Normalized_name,Influence
151,451866,mission: impossible ii,2000,Ethan,18,Ethan Hunt,ETHAN,15725.466667
152,633411,the avengers,1998,Emma,15,Emma Peel,EMMA,14985.966667
153,3727473,man on fire,1987,Samantha,4,"Samantha ""Sam"" Balletto",SAMANTHA,14453.5
154,347000,suspiria,1977,Sarah,15,Sarah,SARAH,14372.466667
155,483274,point break,1991,Tyler,3,Tyler Endicott,TYLER,14176.666667


In [7]:
cmu_imdb_merged = pd.read_csv("data/clean/cmu_imdb_merged.csv")
cmu_imdb_merged.head(1)

Unnamed: 0,Wikipedia_movie_ID,Movie_name,Release_date,Revenue,Runtime,Languages,Countries,Genres,weightedAverageRating,totalVotes,is_blockbuster
0,29988427.0,!women art revolution,2010-01-01,,0 days 01:23:00,English,"United States of America, Canada","LGBT, History, Documentary",6.9,262.0,False


In [54]:
# Perform a merge based on 'Wikipedia ID' (from influenced_prophet) and 'Wikipedia_movie_ID' (from cmu_imdb_merged)
merged_df = influenced_prophet.merge(
    cmu_imdb_merged[['Wikipedia_movie_ID', 'Genres']],  # Keep only relevant columns
    left_on="Wikipedia ID", 
    right_on="Wikipedia_movie_ID", 
    how="left"
)
# Drop redundant column after the merge
#merged_df.drop(columns=['Wikipedia_movie_ID'], inplace=True)

# Display the result
print("Merged Dataset:")
display(merged_df.head(1))

# Save the new dataset with Genres
#merged_df.to_csv("data/clean/influenced_prophet_with_genres.csv", index=False)
#print("Dataset saved successfully with Genres added!")


Merged Dataset:


Unnamed: 0,Wikipedia ID,Movie Name,Year,Character Name,Count,Full name,Normalized_name,Mean Difference,Influenced,Wikipedia_movie_ID,Genres
0,451866,mission: impossible ii,2000,Ethan,18,Ethan Hunt,ETHAN,15725.466667,1,451866.0,"Thriller, Action Thrillers, Action/Adventure, ..."


In [95]:
merged_df['Genres'] = merged_df['Genres'].str.split(', ')
merged_df.head(1)


Unnamed: 0,Wikipedia ID,Movie Name,Year,Character Name,Count,Full name,Normalized_name,Mean Difference,Influenced,Wikipedia_movie_ID,Genres
0,451866,mission: impossible ii,2000,Ethan,18,Ethan Hunt,ETHAN,15725.466667,1,451866.0,


## Influenced prophet with genres


In [1]:
from src.models.trend_by_genres import *

In [2]:
exploded_df = load("data/clean/influenced_prophet_with_genres.csv")
exploded_df.head()

Unnamed: 0,Wikipedia ID,Movie_name,Year,Count,Full name,Normalized_name,Mean Difference,Genres
0,451866,mission: impossible ii,2000,18,Ethan Hunt,ETHAN,15725.466667,Thriller
0,451866,mission: impossible ii,2000,18,Ethan Hunt,ETHAN,15725.466667,Action Thrillers
0,451866,mission: impossible ii,2000,18,Ethan Hunt,ETHAN,15725.466667,Action/Adventure
0,451866,mission: impossible ii,2000,18,Ethan Hunt,ETHAN,15725.466667,Glamorized Spy Film
0,451866,mission: impossible ii,2000,18,Ethan Hunt,ETHAN,15725.466667,Action


In [3]:
genre_influence = get_top_genre_influence(exploded_df, top_n=10)
genre_influence.head(10)

Unnamed: 0,Genres,Mean Difference
176,Thriller,264148.016667
64,Drama,221542.266667
1,Action,220826.433333
39,Comedy,143556.933333
54,Crime Fiction,122131.833333
5,Adventure,112813.8
144,Romance Film,108416.916667
152,Science Fiction,92833.933333
130,Period piece,83911.616667
102,Horror,83729.683333


In [4]:
plot_top_genres(genre_influence)

In [5]:
top_names_by_genre = get_top_names_by_genre(exploded_df)
# remove the genre 

top_names_by_genre.head(10)

Unnamed: 0,Wikipedia ID,Movie_name,Year,Count,Full name,Normalized_name,Mean Difference,Genres
0,451866,mission: impossible ii,2000,18,Ethan Hunt,ETHAN,15725.466667,Action
1,633411,the avengers,1998,15,Emma Peel,EMMA,14985.966667,Action
2,3727473,man on fire,1987,4,"Samantha ""Sam"" Balletto",SAMANTHA,14453.5,Action
1,633411,the avengers,1998,15,Emma Peel,EMMA,14985.966667,Adventure
6,142417,apollo 13,1995,8,Jack Swigert,JACK,12508.133333,Adventure
7,268833,goldeneye,1995,4,Jack Wade,JACK,12508.133333,Adventure
1,633411,the avengers,1998,15,Emma Peel,EMMA,14985.966667,Comedy
4,320401,barton fink,1991,3,Audrey Taylor,TAYLOR,13892.1,Comedy
11,685977,sixteen candles,1984,14,"Samantha ""Sam"" Baker",SAMANTHA,10643.533333,Comedy
8,167857,the usual suspects,1995,3,Jack Baer,JACK,12508.133333,Crime Fiction


In [6]:
plot_treemap(top_names_by_genre)
