In [2]:
import pandas as pd

# prenoms inventés
mean_diff = pd.read_csv("data/clean/influenced_names_means_diff.csv")
influenced_meandiff = mean_diff[mean_diff["Influence"] > 0]
# removing common identification mistakes such as "the", "a" or "Mr"
influenced_meandiff = influenced_meandiff[~influenced_meandiff["Character Name"].isin(["the", "a", "Mr"])]
print("Number of influenced names with mean diff: ", len(influenced_meandiff))
influenced_meandiff.head()



Number of influenced names with mean diff:  1585


Unnamed: 0,Wikipedia ID,Movie Name,Year,Character Name,Count,Full name,Normalized_name,Influence
0,31186339,the hunger games,2012,Katniss,4,Katniss,KATNISS,inf
1,22144721,iron man 2,2010,Stark,3,Howard Stark,STARK,inf
2,146947,spider-man,2002,Osborn,6,Harry Osborn,OSBORN,inf
4,443972,hook,1991,Banning,16,Peter Banning,BANNING,inf
5,537416,ace ventura: when nature calls,1995,Abbot,2,Grand Abbot,ABBOT,inf


In [3]:
# tous les prénoms influencés
prophet = pd.read_csv("data/clean/influenced_names_prophet.csv")
influenced_prophet = prophet[prophet["Influenced"] > 0]
# removing common identification mistakes such as "the", "a" or "Mr"
influenced_prophet = influenced_prophet[~influenced_prophet["Character Name"].isin(["the", "a", "Mr"])]
print("Number of influenced names with mean diff: ", len(influenced_prophet))
influenced_prophet.head()

Number of influenced names with mean diff:  432


Unnamed: 0,Wikipedia ID,Movie Name,Year,Character Name,Count,Full name,Normalized_name,Mean Difference,Influenced
0,451866,mission: impossible ii,2000,Ethan,18,Ethan Hunt,ETHAN,15725.466667,1
1,633411,the avengers,1998,Emma,15,Emma Peel,EMMA,14985.966667,1
2,3727473,man on fire,1987,Samantha,4,"Samantha ""Sam"" Balletto",SAMANTHA,14453.5,1
3,347000,suspiria,1977,Sarah,15,Sarah,SARAH,14372.466667,1
5,320401,barton fink,1991,Taylor,3,Audrey Taylor,TAYLOR,13892.1,1


In [4]:
#remove every non numeric value in mean diff
mean_diff = mean_diff[mean_diff["Influence"].apply(lambda x: str(x).replace(".", "").isdigit())]
threshold = mean_diff['Influence'].quantile(0.75)

significant_names = mean_diff[mean_diff['Influence'] > threshold]
print("Number of significant names:", len(significant_names))
significant_names.head()

Number of significant names: 371


Unnamed: 0,Wikipedia ID,Movie Name,Year,Character Name,Count,Full name,Normalized_name,Influence
151,451866,mission: impossible ii,2000,Ethan,18,Ethan Hunt,ETHAN,15725.466667
152,633411,the avengers,1998,Emma,15,Emma Peel,EMMA,14985.966667
153,3727473,man on fire,1987,Samantha,4,"Samantha ""Sam"" Balletto",SAMANTHA,14453.5
154,347000,suspiria,1977,Sarah,15,Sarah,SARAH,14372.466667
155,483274,point break,1991,Tyler,3,Tyler Endicott,TYLER,14176.666667


In [5]:
cmu_imdb_merged = pd.read_csv("data/clean/cmu_imdb_merged.csv")
cmu_imdb_merged.head(1)

Unnamed: 0,Wikipedia_movie_ID,Movie_name,Release_date,Revenue,Runtime,Languages,Countries,Genres,weightedAverageRating,totalVotes,is_blockbuster
0,29988427.0,!women art revolution,2010-01-01,,0 days 01:23:00,English,"United States of America, Canada","LGBT, History, Documentary",6.9,262.0,False


In [6]:
# Perform a merge based on 'Wikipedia ID' (from influenced_prophet) and 'Wikipedia_movie_ID' (from cmu_imdb_merged)
merged_df = influenced_prophet.merge(
    cmu_imdb_merged[['Wikipedia_movie_ID', 'Genres']],  # Keep only relevant columns
    left_on="Wikipedia ID", 
    right_on="Wikipedia_movie_ID", 
    how="left"
)
# Drop redundant column after the merge
#merged_df.drop(columns=['Wikipedia_movie_ID'], inplace=True)

# Display the result
print("Merged Dataset:")
display(merged_df.head(1))

# Save the new dataset with Genres
#merged_df.to_csv("data/clean/influenced_prophet_with_genres.csv", index=False)
#print("Dataset saved successfully with Genres added!")


Merged Dataset:


Unnamed: 0,Wikipedia ID,Movie Name,Year,Character Name,Count,Full name,Normalized_name,Mean Difference,Influenced,Wikipedia_movie_ID,Genres
0,451866,mission: impossible ii,2000,Ethan,18,Ethan Hunt,ETHAN,15725.466667,1,451866.0,"Thriller, Action Thrillers, Action/Adventure, ..."


In [7]:
merged_df['Genres'] = merged_df['Genres'].str.split(', ')
merged_df.head(1)


Unnamed: 0,Wikipedia ID,Movie Name,Year,Character Name,Count,Full name,Normalized_name,Mean Difference,Influenced,Wikipedia_movie_ID,Genres
0,451866,mission: impossible ii,2000,Ethan,18,Ethan Hunt,ETHAN,15725.466667,1,451866.0,"[Thriller, Action Thrillers, Action/Adventure,..."


## Study of movie genre influence on names


In [8]:
from src.models.trend_by_genres import *

### Amplitude of influence

In [9]:
exploded_df = load("data/clean/influenced_prophet_with_genres.csv")
exploded_df.head()

Unnamed: 0,Wikipedia ID,Movie_name,Year,Count,Full name,Normalized_name,Mean Difference,Genres
0,451866,mission: impossible ii,2000,18,Ethan Hunt,ETHAN,15725.466667,Thriller
0,451866,mission: impossible ii,2000,18,Ethan Hunt,ETHAN,15725.466667,Action Thrillers
0,451866,mission: impossible ii,2000,18,Ethan Hunt,ETHAN,15725.466667,Action/Adventure
0,451866,mission: impossible ii,2000,18,Ethan Hunt,ETHAN,15725.466667,Glamorized Spy Film
0,451866,mission: impossible ii,2000,18,Ethan Hunt,ETHAN,15725.466667,Action


In [10]:
genre_influence = get_top_genre_influence(exploded_df, top_n=10)
genre_influence.head(10)

Unnamed: 0,Genres,Mean Difference
176,Thriller,264148.016667
64,Drama,221542.266667
1,Action,220826.433333
39,Comedy,143556.933333
54,Crime Fiction,122131.833333
5,Adventure,112813.8
144,Romance Film,108416.916667
152,Science Fiction,92833.933333
130,Period piece,83911.616667
102,Horror,83729.683333


In [11]:
plot_top_genres(genre_influence)

In [12]:
top_names_by_genre = get_top_names_by_genre(exploded_df)
# remove the genre 

top_names_by_genre.head(10)

Unnamed: 0,Wikipedia ID,Movie_name,Year,Count,Full name,Normalized_name,Mean Difference,Genres
0,451866,mission: impossible ii,2000,18,Ethan Hunt,ETHAN,15725.466667,Action
1,633411,the avengers,1998,15,Emma Peel,EMMA,14985.966667,Action
2,3727473,man on fire,1987,4,"Samantha ""Sam"" Balletto",SAMANTHA,14453.5,Action
1,633411,the avengers,1998,15,Emma Peel,EMMA,14985.966667,Adventure
6,142417,apollo 13,1995,8,Jack Swigert,JACK,12508.133333,Adventure
7,268833,goldeneye,1995,4,Jack Wade,JACK,12508.133333,Adventure
1,633411,the avengers,1998,15,Emma Peel,EMMA,14985.966667,Comedy
4,320401,barton fink,1991,3,Audrey Taylor,TAYLOR,13892.1,Comedy
11,685977,sixteen candles,1984,14,"Samantha ""Sam"" Baker",SAMANTHA,10643.533333,Comedy
8,167857,the usual suspects,1995,3,Jack Baer,JACK,12508.133333,Crime Fiction


In [13]:
plot_treemap(top_names_by_genre)


### Proportion of influence

In [71]:
def load(filepath):
    """
    Load the dataset, clean it, and explode the Genres column.
    """
    # Load the data
    df = pd.read_csv(filepath)
    
    # Rename the column and drop unnecessary ones
    df = df.drop(columns=['Influenced', 'Wikipedia_movie_ID'])
    # Make a capital letter for the title of the movie name
    df['Movie Name'] = df['Movie Name'].str.title()
    df['Label'] = df['Character Name'] + " from " + df['Movie Name']


    
    # Explode Genres
    df['Genres'] = df['Genres'].str.split(', ')
    exploded_df = df.explode('Genres')

    # remove the genre 'Action/Adventure'  
    exploded_df = exploded_df[exploded_df['Genres'] != 'Action/Adventure']

    
    return exploded_df



In [72]:
names_influenced = load("data/clean/influenced_prophet_with_genres.csv")
names_influenced.head()

Unnamed: 0,Wikipedia ID,Movie Name,Year,Character Name,Count,Full name,Normalized_name,Mean Difference,Genres,Label
0,451866,Mission: Impossible Ii,2000,Ethan,18,Ethan Hunt,ETHAN,15725.466667,Thriller,Ethan from Mission: Impossible Ii
0,451866,Mission: Impossible Ii,2000,Ethan,18,Ethan Hunt,ETHAN,15725.466667,Action Thrillers,Ethan from Mission: Impossible Ii
0,451866,Mission: Impossible Ii,2000,Ethan,18,Ethan Hunt,ETHAN,15725.466667,Glamorized Spy Film,Ethan from Mission: Impossible Ii
0,451866,Mission: Impossible Ii,2000,Ethan,18,Ethan Hunt,ETHAN,15725.466667,Action,Ethan from Mission: Impossible Ii
0,451866,Mission: Impossible Ii,2000,Ethan,18,Ethan Hunt,ETHAN,15725.466667,Spy,Ethan from Mission: Impossible Ii


In [73]:
def count_top_genres(df, top_n=10):
    """
   Determine the top N genres by count
    """
   # Count occurrences of each genre
    genre_counts = df['Genres'].value_counts()
    
    # Get the top N genres
    top_genres = genre_counts.head(top_n)
    
    # Convert to a DataFrame for output
    top_genres_df = top_genres.reset_index()
    top_genres_df.columns = ['Genres', 'Count']  # Rename columns for clarity
    
    return top_genres_df
        

# Example usage
top_genres = count_top_genres(names_influenced, top_n=10)
print("Top Genres:")
top_genres.head()



Top Genres:


Unnamed: 0,Genres,Count
0,Drama,210
1,Thriller,159
2,Action,159
3,Comedy,121
4,Crime Fiction,100


In [74]:
# viauslization 
def plot_top_genres(genre_influence, metric):
    """
    Create a bar chart for the top N genres and save it as an HTML file.
    """
    fig = px.bar(
        genre_influence,
        x='Genres',
        y=metric,
        title='Top 10 Most Influential Movie Genres on Names',
        labels={'Mean Difference': 'Total Influence Score'},
        template='plotly_white'
    )

    fig.show()
plot_top_genres(top_genres, metric = 'Count')

In [75]:
# Percentage
def proportion_of_influence(df, top_n=10):
    """
    Determine the proportion of influence for the top N genres
    """
    # Count occurrences of each genre
    genre_counts = df['Genres'].value_counts()
    
    # Get the top N genres
    top_genres = genre_counts.head(top_n)
    print(top_genres)

    # Calculate the total number of names
    total_names = len(df)
    print(total_names)

    # Calculate the proportion of influence for each genre in percentage
    top_genres_proportion = top_genres / total_names * 100
    
    # Convert to a DataFrame for output
    top_genres_proportion_df = top_genres_proportion.reset_index()
    top_genres_proportion_df.columns = ['Genres', 'Proportion (%)']  # Rename columns for clarity

    return top_genres_proportion_df

# Example usage
top_genres_proportion = proportion_of_influence(names_influenced, top_n=20)
print("Top Genres by Proportion:")
top_genres_proportion.head()


Genres
Drama              210
Thriller           159
Action             159
Comedy             121
Crime Fiction      100
Romance Film        93
Adventure           88
Science Fiction     86
Horror              58
Period piece        54
Fantasy             53
Film adaptation     49
Cult                46
Indie               43
Coming of age       42
War film            42
Mystery             42
Romantic drama      40
Crime Thriller      37
Teen                35
Name: count, dtype: int64
2578
Top Genres by Proportion:


Unnamed: 0,Genres,Proportion (%)
0,Drama,8.145849
1,Thriller,6.167572
2,Action,6.167572
3,Comedy,4.693561
4,Crime Fiction,3.878976


In [76]:
# visusalisation

# viauslization 
def plot_top_genres(genre_influence, metric):
    """
    Create a bar chart for the top N genres and save it as an HTML file.
    """
    fig = px.bar(
        genre_influence,
        x='Genres',
        y=metric,
        title='Top 10 Most Influential Movie Genres on Names',
        labels={'Mean Difference': 'Total Influence Score'},
        template='plotly_white'
    )

    fig.show()

    
plot_top_genres(top_genres_proportion, metric = 'Proportion (%)')   

In [89]:
def find_top_names_for_top_genres(df, top_genres, top_n_names=3):
    """
    Determine the top N normalized names for each of the top genres.

    Parameters:
        df (DataFrame): The dataset with 'Genres' and 'Normalized_name'.
        top_genres (DataFrame): The DataFrame containing top genres with their counts.
        top_n_names (int): Number of top names to retrieve per genre.

    Returns:
        DataFrame: Top N names for each genre.
    """
    # Filter the dataset to include only rows with the top genres
    filtered_df = df[df['Genres'].isin(top_genres['Genres'])]

    # Group by genre and name, and count occurrences
    name_counts = (
        filtered_df.groupby(['Genres', 'Normalized_name'])
        .size()
        .reset_index(name='Count')
    )

    # Find the top N names for each genre
    top_names = (
        name_counts.groupby('Genres')
        .apply(lambda x: x.nlargest(top_n_names, 'Count'))
        .reset_index(drop=True)
    )

    return top_names

# Example usage
top_names = find_top_names_for_top_genres(names_influenced, top_genres, top_n_names=3)
print("Top Names for Top Genres:")
display(top_names)


Top Names for Top Genres:






Unnamed: 0,Genres,Normalized_name,Count
0,Action,JACK,6
1,Action,MAX,5
2,Action,CARTER,4
3,Adventure,CLARK,4
4,Adventure,JACK,4
5,Adventure,ISLA,3
6,Comedy,MAX,5
7,Comedy,SAM,4
8,Comedy,CLARK,3
9,Crime Fiction,JACK,4


In [90]:
def find_top_names_with_movies(df, top_genres, top_n_names=3):
    """
    Determine the top N normalized names for each genre, with a list of movie names.

    Parameters:
        df (DataFrame): The dataset with 'Genres', 'Normalized_name', and 'Movie Name'.
        top_genres (DataFrame): The DataFrame containing top genres with their counts.
        top_n_names (int): Number of top names to retrieve per genre.

    Returns:
        DataFrame: Top N names for each genre with a list of associated movie names.
    """
    # Filter the dataset to include only rows with the top genres
    filtered_df = df[df['Genres'].isin(top_genres['Genres'])]

    # Group by genre and normalized name, aggregating movie names into a list
    grouped = (
        filtered_df.groupby(['Genres', 'Normalized_name'])
        .agg(
            Count=('Normalized_name', 'size'),
            Movie_Names=('Movie Name', lambda x: list(x.unique()))
        )
        .reset_index()
    )

    # Find the top N names for each genre
    top_names = (
        grouped.groupby('Genres')
        .apply(lambda x: x.nlargest(top_n_names, 'Count'))
        .reset_index(drop=True)
    )

    return top_names

# Example usage
top_names_with_movies = find_top_names_with_movies(names_influenced, top_genres, top_n_names=3)
print("Top Names with Associated Movies:")
display(top_names_with_movies)


Top Names with Associated Movies:






Unnamed: 0,Genres,Normalized_name,Count,Movie_Names
0,Action,JACK,6,"[Goldeneye, Speed, Lethal Weapon 3, The Hunt F..."
1,Action,MAX,5,"[Jackie Brown, Mission: Impossible, Mad Max Be..."
2,Action,CARTER,4,"[2 Fast 2 Furious, Rush Hour, The Lost World: ..."
3,Adventure,CLARK,4,"[Superman Ii, Superman Iii, Vacation, Superman]"
4,Adventure,JACK,4,"[Apollo 13, Goldeneye, Speed, Big Trouble In L..."
5,Adventure,ISLA,3,"[Jurassic Park Iii, The Lost World: Jurassic P..."
6,Comedy,MAX,5,"[Jackie Brown, Liar Liar, Notting Hill, The Lo..."
7,Comedy,SAM,4,"[Sleepless In Seattle, The Lost Boys, Ghost, B..."
8,Comedy,CLARK,3,"[Superman Iii, Vacation, The Breakfast Club]"
9,Crime Fiction,JACK,4,"[The Usual Suspects, Speed, Lethal Weapon 3, T..."


In [93]:
import plotly.express as px

def plot_treemap_with_movies(top_names_by_genre):
    """
    Plot a treemap of the top 3 influential names per genre with movie names displayed on hover.
    """
    # Ensure the movie names are in a single string for hover display
    top_names_by_genre['Movies'] = top_names_by_genre['Movie_Names'].apply(lambda x: ", ".join(x))

    # Create the treemap
    fig = px.treemap(
        top_names_by_genre,
        path=['Genres', 'Normalized_name'],
        values='Count',  # Use the count of names as the value
        title="Top 3 Influential Names per Genre",
        template="plotly_white",
        color='Genres',
        custom_data=['Movies']  # Include aggregated movie names for hover
    )
    
    # Update hover template to display the movie list
    fig.update_traces(
        hovertemplate="<b>%{label}</b><br>Movies: %{customdata[0]}<extra></extra>"
    )
    
    # Show and save the figure
    fig.show()
    fig.write_html("docs/_includes/treemap_top3_by_genre_by count.html")


# Plot the treemap
plot_treemap_with_movies(top_names_with_movies)
