In [1]:
import pandas as pd

mean_diff = pd.read_csv("data/clean/influenced_names_means_diff.csv")
influenced_meandiff = mean_diff[mean_diff["Influence"] > 0]
# removing common identification mistakes such as "the", "a" or "Mr"
influenced_meandiff = influenced_meandiff[~influenced_meandiff["Character Name"].isin(["the", "a", "Mr"])]
print("Number of influenced names with mean diff: ", len(influenced_meandiff))
influenced_meandiff.sample()



Number of influenced names with mean diff:  1585


Unnamed: 0,Wikipedia ID,Movie Name,Year,Character Name,Count,Full name,Normalized_name,Influence
52,55720,mulan,1998,Mulan,7,Fa Mulan,MULAN,inf


In [2]:
prophet = pd.read_csv("data/clean/influenced_names_prophet.csv")
influenced_prophet = prophet[prophet["Influenced"] > 0]
# removing common identification mistakes such as "the", "a" or "Mr"
influenced_prophet = influenced_prophet[~influenced_prophet["Character Name"].isin(["the", "a", "Mr"])]
print("Number of influenced names with prophet: ", len(influenced_prophet))
influenced_prophet.head()

Number of influenced names with prophet:  432


Unnamed: 0,Wikipedia ID,Movie Name,Year,Character Name,Count,Full name,Normalized_name,Mean Difference,Influenced
0,451866,mission: impossible ii,2000,Ethan,18,Ethan Hunt,ETHAN,15725.466667,1
1,633411,the avengers,1998,Emma,15,Emma Peel,EMMA,14985.966667,1
2,3727473,man on fire,1987,Samantha,4,"Samantha ""Sam"" Balletto",SAMANTHA,14453.5,1
3,347000,suspiria,1977,Sarah,15,Sarah,SARAH,14372.466667,1
5,320401,barton fink,1991,Taylor,3,Audrey Taylor,TAYLOR,13892.1,1


In [3]:
import plotly.express as px
import plotly.graph_objects as go
names_per_year = influenced_prophet.groupby("Year").size().reset_index(name="Count")

# Create the figure
fig = go.Figure()

    # Add line for baby name count over time
fig.add_trace(
    go.Scatter(
        x=names_per_year['Year'], 
        y=names_per_year['Count'], 
        mode='lines',
        name='Baby Name Count',
        line=dict(color='blue')
    )
)

fig.update_layout(
        title="Number of Names influenced by movies per Year",
        xaxis_title='Year',
        yaxis_title='Count',
        legend_title="Legend",
        template="plotly_white",
        width=800
        )   

fig.show()
fig.write_html("docs/_includes/influenced_names_per_year.html")





In [4]:
# bar chart for eras from 60s to 2020s
influenced_prophet["Era"] = pd.cut(influenced_prophet["Year"], bins=[1960, 1970, 1980, 1990, 2000, 2020], labels=["60s", "70s", "80s", "90s", "00s"])
eras = influenced_prophet.groupby("Era").size().reset_index(name="Count")
fig = px.bar(eras, x="Era", y="Count", title="Number of influenced names per Era")
fig.update_layout(
        xaxis_title='Era',
        yaxis_title='Count',
        template="plotly_white",
        width=800
        )
fig.show()
fig.write_html("docs/_includes/influenced_names_per_era.html")

influenced_prophet_90s = influenced_prophet[influenced_prophet["Era"] == "90s"]
influenced_prophet_90s





Unnamed: 0,Wikipedia ID,Movie Name,Year,Character Name,Count,Full name,Normalized_name,Mean Difference,Influenced,Era
0,451866,mission: impossible ii,2000,Ethan,18,Ethan Hunt,ETHAN,15725.466667,1,90s
1,633411,the avengers,1998,Emma,15,Emma Peel,EMMA,14985.966667,1,90s
5,320401,barton fink,1991,Taylor,3,Audrey Taylor,TAYLOR,13892.100000,1,90s
9,142417,apollo 13,1995,Jack,8,Jack Swigert,JACK,12508.133333,1,90s
10,268833,goldeneye,1995,Jack,4,Jack Wade,JACK,12508.133333,1,90s
...,...,...,...,...,...,...,...,...,...,...
1207,34344124,terminator 2: judgment day,1991,Dyson,4,Miles Dyson,DYSON,3.833333,1,90s
1225,3700174,jumanji,1995,Shepherd,8,Peter Shepherd,SHEPHERD,3.500000,1,90s
1300,481649,basic instinct,1992,Gus,11,Gus Moran,GUS,1.933333,1,90s
1348,133648,scent of a woman,1992,Ranger,8,Army Ranger Lieutenant Colonel Frank Slade,RANGER,1.250000,1,90s


In [5]:
import plotly.express as px
import plotly.graph_objects as go

sort_significant = influenced_prophet.sort_values(by='Year')


sort_significant['Label'] = sort_significant['Character Name'] + " from " + sort_significant['Movie Name']

# 2. Prepare list of names for each year
names_by_year = (
    sort_significant.groupby('Year')['Label']
    .apply(list)
    .to_dict()
)

# 3. Create a list of frames for each year
frames = []
for year, names in names_by_year.items():
    text = f"Influenced Names in {year}:<br>" + ",<br>".join(names)
    frame = go.Frame(
        data=[],
        name=str(year),
        layout=go.Layout(
            annotations=[
                dict(
                    text=text,
                    x=0.5, y=0.5,
                    xref="paper", yref="paper",
                    showarrow=False,
                    font=dict(size=16),
                    align="center",
                )
            ]
        )
    )
    frames.append(frame)

# 4. Create the initial figure
initial_year = min(names_by_year.keys())
initial_text = f"Influenced Names in {initial_year}:<br>" + ",<br>".join(names_by_year[initial_year])

fig = go.Figure(
    data=[],
    layout=go.Layout(
        annotations=[
            dict(
                text=initial_text,
                x=0.5, y=0.5,
                xref="paper", yref="paper",
                showarrow=False,
                font=dict(size=16),
                align="center",
            )
        ],
        sliders=[dict(
            active=0,
            currentvalue={"prefix": "Year: "},
            pad={"t": 50},
            steps=[
                dict(
                    method="animate",
                    args=[
                        [str(year)],  # Name of the frame
                        {"mode": "immediate", "frame": {"duration": 0, "redraw": True}},
                    ],
                    label=str(year),
                )
                for year in names_by_year.keys()
            ],
        )],
    ),
    frames=frames
)

# 5. Show the figure
fig.show()



In [6]:
import pandas as pd
import pmdarima as pm
import plotly.graph_objects as go

def predict_naming_ARIMA_bis(data, movname:str, name: str, stop_year: int, nb_years: int, plot=False):
    input_data = data.copy()

    # Filter for the specified name and aggregate counts if needed
    name_data = input_data[input_data['Name'] == name]
    name_data = name_data.groupby(['Year']).sum().reset_index()
    name_data = name_data.drop(columns=['Name'])

    # Split the dataset at the stop_year
    train_data = name_data[name_data['Year'] <= stop_year]
    true_data = name_data[name_data['Year'] >= stop_year]
    true_data = true_data[true_data['Year'] <= stop_year + nb_years]

    # Check if there is data to train the model (since some odd names might not have enough data)
    # we want to have more than 10 years of data
    if len(train_data) < 5:
        return None

    # Handle missing years by filling with 0
    all_years = pd.DataFrame({'Year': range(train_data['Year'].min(), stop_year + 1)})
    train_data = pd.merge(all_years, train_data, on='Year', how='left').fillna(0)

    all_years = pd.DataFrame({'Year': range(stop_year, stop_year + nb_years + 1)})
    true_data = pd.merge(all_years, true_data, on='Year', how='left').fillna(0)

    # Split into x and y as an array
    x_train = train_data['Year'].values
    y_train = train_data['Count'].values

    x_true = true_data['Year'].values
    y_true = true_data['Count'].values

    try:
        # Fit the model
        model = pm.auto_arima(y_train, seasonal=True, m=1)
        forecast, conf_int = model.predict(n_periods=nb_years, return_conf_int=True)
    except:
        print(f"An error occurred while predicting the evolution of the name count for {name} using SARIMA.")
        return None

    if plot:
        # Prepare data for plotting
        x_forecast = list(range(stop_year, stop_year + nb_years + 1))

        lower_bound = conf_int[:, 0]
        upper_bound = conf_int[:, 1]

        # Create the plot
        fig = go.Figure()

        # Add training data trace
        fig.add_trace(go.Scatter(x=x_train[-30:], y=y_train[-30:], mode='lines+markers', name='Previous Years', line=dict(color='blue')))

        # Add forecast data trace
        fig.add_trace(go.Scatter(x=x_forecast, y=[y_train[-1]] + list(forecast), mode='lines+markers', name='Forecast', line=dict(color='orange')))

        # Add true data trace
        fig.add_trace(go.Scatter(x=x_true, y=y_true, mode='lines+markers', name='Following years', line=dict(color='green')))

        # Add confidence interval as shaded area
        fig.add_trace(go.Scatter(
            x=list(x_forecast) + list(x_forecast[::-1]),
            y=list([y_train[-1]] + list(lower_bound)) + list([y_train[-1]] + list(upper_bound))[::-1],
            fill='toself',
            fillcolor='rgba(128, 128, 128, 0.2)',
            line=dict(color='rgba(255,255,255,0)'),
            showlegend=True,
            name='Confidence Interval'
        ))

        # Add a vertical line for stop year
        fig.add_trace(go.Scatter(
            x=[stop_year, stop_year],
            y=[min(y_train[-30:]), max([y_train[-1]] + list(upper_bound))],
            mode='lines',
            line=dict(dash='dash', color='red'),
            name=f'Year prior {movname}'
        ))

        # Update layout
        fig.update_layout(
            title=f"Impact of {movname} on the evolution of the name count for {name}",
            xaxis_title='Year',
            yaxis_title='Count',
            legend_title='Legend',
        )

        # Show plot
        fig.show()
        return fig
    

In [7]:
influence_display = influenced_prophet.copy()
# keep only trose with Mean Difference greater or equal to the quantile 0.75 of that value
influence_display = influence_display[influence_display["Mean Difference"] >= influence_display["Mean Difference"].quantile(0.75)]
# generate a list of following format : { id: "1", name: "Movie A", year: "2001", graph: "movieA_graph.html" }
id = [i for i in range(1, len(influence_display)+1)]
influence_display["id"] = id
# first character in every word in movname is capitalized
influence_display["movname"] = influence_display["Movie Name"].str.title()
influence_display["year"] = influence_display["Year"]-1
influence_display["graph"] = influence_display["Movie Name"].str.replace(" ", "_").replace(":","")
influence_display["name"] = influence_display["Character Name"].str.upper()
influence_display = influence_display[["id", "movname", "year", "graph","name"]]
# print it in format [ { id: "1", name: "Movie A", year: "2001", graph: "movieA_graph.html" }, ...]
print(influence_display.to_dict(orient='records')) 
print(len(influence_display))





[{'id': 1, 'movname': 'Mission: Impossible Ii', 'year': 1999, 'graph': 'mission:_impossible_ii', 'name': 'ETHAN'}, {'id': 2, 'movname': 'The Avengers', 'year': 1997, 'graph': 'the_avengers', 'name': 'EMMA'}, {'id': 3, 'movname': 'Man On Fire', 'year': 1986, 'graph': 'man_on_fire', 'name': 'SAMANTHA'}, {'id': 4, 'movname': 'Suspiria', 'year': 1976, 'graph': 'suspiria', 'name': 'SARAH'}, {'id': 5, 'movname': 'Barton Fink', 'year': 1990, 'graph': 'barton_fink', 'name': 'TAYLOR'}, {'id': 6, 'movname': 'Dracula', 'year': 1978, 'graph': 'dracula', 'name': 'JONATHAN'}, {'id': 7, 'movname': 'Apollo 13', 'year': 1994, 'graph': 'apollo_13', 'name': 'JACK'}, {'id': 8, 'movname': 'Goldeneye', 'year': 1994, 'graph': 'goldeneye', 'name': 'JACK'}, {'id': 9, 'movname': 'The Usual Suspects', 'year': 1994, 'graph': 'the_usual_suspects', 'name': 'JACK'}, {'id': 10, 'movname': 'Titanic', 'year': 1996, 'graph': 'titanic', 'name': 'JACK'}, {'id': 11, 'movname': 'Superman', 'year': 1977, 'graph': 'superman',

In [23]:
movlist = [
    {'id': 1, 'movname': 'Mission: Impossible Ii', 'year': 2000, 'graph': 'mission_impossible_ii', 'name': 'ETHAN'},
    {"id": 2, "movname":"Pulp Fiction", "year": 1994, "graph": "pulp_fiction", "name":"MIA"},
    {"id": 3, "movname":"Star Wars Episode IV: A New Hope", "year": 1977, "graph": "star_wars_episode_iv_a_new_hope", "name":"LUKE"},
    {"id": 4, "movname":"Titanic", "year": 1997, "graph": "titanic", "name":"DAWSON"},
    {"id": 5, "movname":"The Little Mermaid", "year": 1989, "graph": "the_little_mermaid", "name":"ARIEL"},
    {"id": 6, "movname":"Doctor Zhivago", "year": 1965, "graph": "doctor_zhivago", "name":"TONYA"},
    {"id": 7, "movname":"Charlie and the Chocolate Factory", "year": 2005, "graph": "charlie_and_the_chocolate_factory", "name":"CHARLIE"},
    {"id": 8, "movname":"The Nightmare Before Christmas", "year": 1993, "graph": "the_nightmare_before_christmas", "name":"JACK"},
]

global_names = pd.read_csv("data/clean/names/global_names.csv")
for movie in movlist:
    plot = predict_naming_ARIMA_bis(global_names, movie["movname"], movie["name"], int(movie["year"])-1, 10, plot=True)
    plot.write_html(f"docs/assets/graphs/{movie['graph']}_graph.html", full_html=False)