In [1]:
import pandas as pd

In [2]:
# Load the cleaned first-name counts (columns shown below: Year, Name, Sex, Count).
# NOTE(review): the variable is called `danish_df` but the file is norway_merged.csv — confirm the country.
names_csv = 'data/clean/names/norway_merged.csv'
danish_df = pd.read_csv(names_csv)
# Preview the first five rows.
danish_df.head(5)

Unnamed: 0,Year,Name,Sex,Count
0,1960,AAGE,M,23
1,1960,AASE,F,41
2,1960,AASHILD,F,5
3,1960,AASMUND,M,8
4,1960,ABBAS,M,5


In [48]:
import pandas as pd
import plotly.graph_objects as go

# Animated line chart: for each decade, the yearly counts of that decade's 10 most given names.
# NOTE(review): the frame is called `danish_df` and the labels say "Danish", but the data is
# read from norway_merged.csv — confirm which country the labels should name.

# Step 1: Tag every row with its decade (e.g. 1960-1969 becomes 1960).
# The column is stored on danish_df because a later cell filters on danish_df['Period'].
danish_df['Period'] = (danish_df['Year'] // 10) * 10

# Step 2: Yearly total per name (rows sharing Period, Name and Year are summed together).
aggregated_data = (
    danish_df.groupby(['Period', 'Name', 'Year'], as_index=False)['Count']
    .sum()
)

# Step 3: Top 10 names of each decade, ranked by their total over the whole decade.
top_names_by_period = (
    aggregated_data.groupby(['Period', 'Name'], as_index=False)['Count']
    .sum()  # Total per name over the decade
    .sort_values(['Period', 'Count'], ascending=[True, False])  # Best first within each decade
    .groupby('Period', as_index=False)
    .head(10)  # Keep only the top 10 names per decade
)
periods = top_names_by_period['Period'].unique()


def _top_names_for(period):
    """Return the top names of `period`, most frequent first."""
    return top_names_by_period[top_names_by_period['Period'] == period]['Name'].tolist()


def _yearly_counts(period, names):
    """Return {name: Series of counts indexed by each of the decade's 10 years}.

    Years in which a name has no row are filled with 0 so that every line spans the full decade.
    """
    decade_rows = aggregated_data[aggregated_data['Period'] == period]
    decade_years = list(range(period, period + 10))
    return {
        name: (
            decade_rows.loc[decade_rows['Name'] == name]
            .set_index('Year')['Count']
            .reindex(decade_years, fill_value=0)
        )
        for name in names
    }


def _line_traces(counts_by_name):
    """Build one unlabeled-legend line trace per name from the output of `_yearly_counts`."""
    return [
        go.Scatter(
            x=counts.index.tolist(),
            y=counts.tolist(),
            mode='lines',
            name=name,
            line=dict(width=2),
            showlegend=False,  # Hide legend for simplicity
        )
        for name, counts in counts_by_name.items()
    ]


# Step 4: One animation frame per decade
frames = []
for period in periods:
    top_names = _top_names_for(period)
    counts_by_name = _yearly_counts(period, top_names)
    peak = max(int(counts.max()) for counts in counts_by_name.values())

    # Text annotation listing the decade's top names below the plot
    # NOTE(review): position and wording differ from the initial annotation further down
    # (x=-0.3, y=0.95, "<br>"-separated) — confirm which layout is intended.
    text = f"Top 10 Names in {period}s: " + ", ".join(top_names)
    frames.append(go.Frame(
        data=_line_traces(counts_by_name),
        name=str(period),
        layout=go.Layout(
            # Each frame carries its own axis ranges so the view follows the selected decade;
            # without them the axes stay on the first decade and the new lines fall off-screen.
            xaxis=dict(range=[int(period), int(period) + 9]),
            yaxis=dict(range=[0, peak * 1.05]),
            annotations=[
                dict(
                    text=text,
                    x=0.5,  # Center horizontally
                    y=-0.2,  # Position below the plot
                    xref="paper", yref="paper",
                    showarrow=False,
                    font=dict(size=14),
                    align="center",  # Align text to the center
                )
            ]
        )
    ))

# Step 5: Initial figure shows the earliest decade
initial_period = min(periods)
initial_top_names = _top_names_for(initial_period)
initial_data = _line_traces(_yearly_counts(initial_period, initial_top_names))
initial_text = f"Top 10 Danish Names in {initial_period}s:<br>" + ",<br>".join(initial_top_names)

fig = go.Figure(
    data=initial_data,
    layout=go.Layout(
        annotations=[
            dict(
                text=initial_text,
                x=-0.3, y=0.95,
                xref="paper", yref="paper",
                showarrow=False,
                font=dict(size=16),
                align="center",
            )
        ],
        sliders=[dict(
            active=0,
            currentvalue={"prefix": "Period: "},
            pad={"t": 50},
            steps=[
                dict(
                    method="animate",
                    args=[
                        [str(period)],  # Name of the frame to jump to
                        {"mode": "immediate", "frame": {"duration": 500, "redraw": True}}  # Animation settings
                    ],
                    label=f"{period}s",
                )
                for period in periods
            ],
        )],
    ),
    frames=frames
)

# Step 6: Show the figure
fig.show()

# Save the figure to a standalone HTML file
fig.write_html("danish_names_evolution_by_decades.html")

How to use this plot: when sliding from one decade to another, click "Autoscale" so that the line graphs become visible.

Prénoms qu'on peut étudier car ils présentent des hausses visuellement intéressantes :
- Thomas 1975 : not due to movies
- Martin 1993 : not due to movie
- Emma 2003: not
- Jonas 2006: not influenced
- Emile 2014: 
- Nora 2016
- William 2016





In [27]:
# Load the norway_mean table and report its (rows, columns) shape before previewing it.
norway_mean_csv = 'data/clean/norway_mean.csv'
norway_mean = pd.read_csv(norway_mean_csv)
norway_mean_shape = norway_mean.shape
print("Size of the data: ", norway_mean_shape)
norway_mean.head(5)


Size of the data:  (1647, 8)


Unnamed: 0,Wikipedia ID,Movie Name,Year,Character Name,Count,Full name,Normalized_name,Influence
0,557036,la dolce vita,1960,Emma,10,Emma,EMMA,inf
1,1880660,"the chronicles of narnia: the lion, the witch ...",2005,Edmund,2,Edmund,EDMUND,inf
2,31176,the parent trap,1961,Sharon,7,Sharon telephones Susan,SHARON,inf
3,18952889,"it's a mad, mad, mad, mad world",1963,Marcus,7,Marcus,MARCUS,inf
4,301574,the little mermaid,1989,Ariel,27,Ariel spends,ARIEL,inf


In [13]:
# Load the per-character prediction table (it carries the 'Mean Difference' and 'Influenced' columns).
norway_prediction_csv = 'data/clean/norway_prediction.csv'
norway_prediction = pd.read_csv(norway_prediction_csv)

# Report the (rows, columns) shape, then preview the first rows
norway_prediction_shape = norway_prediction.shape
print("Size of the dataset: ", norway_prediction_shape)
norway_prediction.head(5)


Size of the dataset:  (692, 9)


Unnamed: 0,Wikipedia ID,Movie Name,Year,Character Name,Count,Full name,Normalized_name,Mean Difference,Influenced
0,343408,the aristocats,1970,Thomas,4,Thomas O'Malley,THOMAS,462.5,0
1,1145761,where eagles dare,1968,Thomas,3,Thomas,THOMAS,347.433333,0
2,633411,the avengers,1998,Emma,15,Emma Peel,EMMA,316.233333,1
3,74537,brazil,1985,Ida,2,Ida,IDA,290.133333,0
4,695227,lethal weapon 2,1989,Martin,9,Martin Riggs,MARTIN,285.9,0


In [28]:
# Restrict the predictions to the rows flagged as influenced (Influenced == 1).
is_influenced = norway_prediction['Influenced'] == 1
norway_prediction_influenced = norway_prediction[is_influenced]
print("Size of the dataset: ", norway_prediction_influenced.shape)
norway_prediction_influenced.head(5)

# TODO: save the dataset (no save step is implemented in this cell)


Size of the dataset:  (107, 9)


Unnamed: 0,Wikipedia ID,Movie Name,Year,Character Name,Count,Full name,Normalized_name,Mean Difference,Influenced
2,633411,the avengers,1998,Emma,15,Emma Peel,EMMA,316.233333,1
6,235418,lethal weapon,1987,Martin,7,Martin Riggs,MARTIN,239.9,1
12,62668,the ring,2002,Noah,6,Noah,NOAH,202.633333,1
15,503573,dazed and confused,1993,Julie,2,Julie Simms,JULIE,190.333333,1
18,2236472,transformers,2007,William,2,William Lennox,WILLIAM,155.833333,1


In [73]:
import plotly.express as px
import plotly.graph_objects as go

# Text-only animated figure: a slider over years, each frame listing the
# "<character> from <movie>" labels of the influenced rows for that year.
# NOTE(review): labels say "Danish" while the data comes from the norway_* files — confirm.

# 1. Order the influenced rows by year and build a readable label per row
sort_significant = norway_prediction_influenced.sort_values(by='Year')
sort_significant['Movie Name'] = [movie.title() for movie in sort_significant['Movie Name']]
sort_significant['Label'] = (
    sort_significant['Character Name'] + " from " + sort_significant['Movie Name']
)

# 2. Map each year to the list of its labels
names_by_year = sort_significant.groupby('Year')['Label'].apply(list).to_dict()


def _year_text(year, labels):
    """Headline plus one label per line for the given year."""
    return f"Influenced Danish Names in {year}:<br>" + ",<br>".join(labels)


def _centered_annotation(message):
    """Annotation placed in the middle of the (empty) plotting area."""
    return dict(
        text=message,
        x=0.5, y=0.5,
        xref="paper", yref="paper",
        showarrow=False,
        font=dict(size=16),
        align="center",
    )


# 3. One frame per year; frames carry no traces, only the annotation
frames = [
    go.Frame(
        data=[],
        name=str(year),
        layout=go.Layout(annotations=[_centered_annotation(_year_text(year, names))]),
    )
    for year, names in names_by_year.items()
]

# 4. The initial view shows the earliest year
initial_year = min(names_by_year.keys())
initial_text = _year_text(initial_year, names_by_year[initial_year])

slider_steps = [
    dict(
        method="animate",
        args=[
            [str(year)],  # Name of the frame to jump to
            {"mode": "immediate", "frame": {"duration": 0, "redraw": True}},
        ],
        label=str(year),
    )
    for year in names_by_year.keys()
]

fig = go.Figure(
    data=[],
    layout=go.Layout(
        annotations=[_centered_annotation(initial_text)],
        sliders=[dict(
            active=0,
            currentvalue={"prefix": "Year: "},
            pad={"t": 50},
            steps=slider_steps,
        )],
    ),
    frames=frames
)

# 5. Show the figure
fig.show()



In [82]:
import plotly.express as px
import pandas as pd

# Bar chart of the total name count per decade, with the decade's top 3 names in the tooltip.
# NOTE(review): labels say "Danish" while the data is read from norway_merged.csv — confirm.

# Step 1: Add a 'Decade' column based on the 'Year' column
danish_df['Decade'] = (danish_df['Year'] // 10) * 10

# Step 2: Total count of each name over the whole decade.
# Ranking must use these totals: keeping a single row per (Decade, Name) via drop_duplicates
# would rank names by whichever single year/sex row happened to come first.
decade_name_totals = danish_df.groupby(['Decade', 'Name'], as_index=False)['Count'].sum()

# Step 3: Top 3 names of each decade as a list, most frequent first
top_names_by_decade = (
    decade_name_totals
    .sort_values(['Decade', 'Count'], ascending=[True, False])
    .groupby('Decade', as_index=False)
    .head(3)
    .groupby('Decade')['Name']
    .apply(list)
    .reset_index()
)

# Step 4: Merge the top names with the total counts for each decade
counts_by_decade = danish_df.groupby('Decade')['Count'].sum().reset_index()
counts_with_top_names = counts_by_decade.merge(top_names_by_decade, on='Decade', how='left')

# Step 5: Create the bar plot with Plotly
fig = px.bar(
    counts_with_top_names,
    x='Decade',
    y='Count',
    text='Count',  # Display the count value on the bars
    labels={'Decade': 'Decade', 'Count': 'Total Danish Names'},
    title='Evolution of Danish Names Counts by Decade',
    color='Count',  # Optional: Color bars based on the count
    color_continuous_scale='Blues',
)

# Customize the hover template so the tooltip shows the decade's top 3 names
fig.update_traces(
    hovertemplate="<b>Decade: %{x}</b><br>Total Danish Names: %{y}<br>Top 3 Names: %{customdata[0]}<extra></extra>",
    customdata=counts_with_top_names[['Name']]  # Pass top names for the tooltip
)

# Update layout for better visuals
fig.update_layout(
    xaxis=dict(
        tickvals=counts_by_decade['Decade'],  # Ensure only decades appear
        title='Decade'
    ),
    yaxis_title='Total Counts of Danish Names',
    title_x=0.5,
    template='plotly_white'
)

# Show the figure
fig.show()

# Optionally save the plot as an HTML file
# fig.write_html("danish_influenced_names_by_decades_top3.html")






In [91]:
# Bar chart of the number of distinct movie titles released per decade (CMU movies/characters table).
import plotly.express as px

# Read the CMU movies/characters table.
# Forward slashes are used on purpose: they work on every OS, whereas the backslash form
# only resolves on Windows and relies on invalid escape sequences such as "\c" and "\m".
cmu_movies_chars = pd.read_csv('data/clean/movies_char/CMU_movies_chars.csv')

# Keep only the movie-level columns and one row per movie title.
# NOTE(review): de-duplicating on 'Movie_name' alone collapses different movies that share a
# title — confirm this is intended.
# .copy() makes the result an independent frame so the column assignments below do not
# trigger SettingWithCopyWarning.
cmu_movies_chars = (
    cmu_movies_chars[['Movie_name', 'Release_date', 'Revenue', 'Genres', 'Countries']]
    .drop_duplicates(subset=['Movie_name'])
    .copy()
)

# Step 1: Add a 'Decade' column based on the 'Release_date' column
cmu_movies_chars['Release_date'] = pd.to_datetime(cmu_movies_chars['Release_date'])
cmu_movies_chars['Decade'] = (cmu_movies_chars['Release_date'].dt.year // 10) * 10

# Step 2: Count the movies of each decade, in chronological order
movies_count_by_decade = cmu_movies_chars['Decade'].value_counts().sort_index().reset_index()
movies_count_by_decade.columns = ['Decade', 'Movie Count']

# Step 3: Create the bar plot with Plotly
fig = px.bar(
    movies_count_by_decade,
    x='Decade',
    y='Movie Count',
    text='Movie Count',  # Display the count value on the bars
    labels={'Decade': 'Decade', 'Movie Count': 'Number of Movies'},
    title='Number of Movies Released per Decade',
    color='Movie Count',  # Optional: Color bars based on the count
    color_continuous_scale='Blues',
)

# Update layout for better visuals
fig.update_layout(
    xaxis=dict(
        tickvals=movies_count_by_decade['Decade'],  # Ensure only decades appear
        title='Decade'
    ),
    yaxis_title='Number of Movies Released',
    title_x=0.5,
    template='plotly_white'
)

# Show the figure
fig.show()



In [None]:
# TODO: create a dataframe with the count of movies per decade and the number of influenced Danish names.
# It takes the columns of movies_count_by_decade and the ... (TODO: this sentence was left unfinished)

# Step 1: Merge the movie count with the count of influenced Danish names


In [78]:
# Bar chart of the summed 'Count' of influenced rows per decade; the tooltip lists the three
# rows with the largest 'Count' in that decade.
# NOTE(review): 'Count' comes from the prediction table — confirm that its sum is really the
# "Total Influenced Names" the labels announce. Labels say "Danish"; data files are norway_*.

# Decade of each influenced row (e.g. 1998 -> 1990)
sort_significant['Decade'] = sort_significant['Year'].floordiv(10).mul(10)

# Three largest-'Count' rows of every decade ('Decade' is selected explicitly so it survives)
top3_rows = (
    sort_significant.groupby('Decade')
    .apply(lambda group: group.nlargest(3, 'Count')[['Decade', 'Label', 'Count']])
    .reset_index(drop=True)
)
# Collapse them to one list of labels per decade
top_names_by_decade = top3_rows.groupby('Decade')['Label'].apply(list).reset_index()

# Per-decade totals, joined with the label lists
counts_by_decade = sort_significant.groupby('Decade')['Count'].sum().reset_index()
counts_with_top_names = pd.merge(counts_by_decade, top_names_by_decade, on='Decade', how='left')

# Bar plot, colored by the total
fig = px.bar(
    counts_with_top_names,
    x='Decade',
    y='Count',
    text='Count',
    labels={'Decade': 'Decade', 'Count': 'Total Influenced Names'},
    title='Evolution of Danish Influenced Names Counts by Decade',
    color='Count',
    color_continuous_scale='Blues',
)

# Tooltip: decade, total, and the label list passed through customdata
hover_labels = counts_with_top_names[['Label']]
fig.update_traces(
    hovertemplate="<b>Decade: %{x}</b><br>Total Influenced Names: %{y}<br>Top 3 Names: %{customdata[0]}<extra></extra>",
    customdata=hover_labels
)

# Only decades as x ticks, centered title, white template
decade_axis = dict(tickvals=counts_by_decade['Decade'], title='Decade')
fig.update_layout(
    xaxis=decade_axis,
    yaxis_title='Total Counts of Danish Names Influenced',
    title_x=0.5,
    template='plotly_white'
)

fig.show()

# Export the chart as HTML
fig.write_html("docs/_includes/danish_influenced_names_by_decades_top3.html")








In [69]:
# Which name has the largest total count over the 1960s?
# Relies on the 'Period' column already being present on danish_df.

rows_1960s = danish_df[danish_df['Period'] == 1960]
totals_1960s = rows_1960s.groupby('Name', as_index=False)['Count'].sum()
ranked_1960s = totals_1960s.sort_values('Count', ascending=False)
most_popular_name_1960s = ranked_1960s.iloc[0]  # first row after the descending sort

print(f"The most popular name in the 1960s was '{most_popular_name_1960s['Name']}' with {most_popular_name_1960s['Count']} counts.")

The most popular name in the 1960s was 'ANNE' with 16393 counts.


In [68]:
import pandas as pd
import pmdarima as pm
import plotly.graph_objects as go

def predict_naming_ARIMA_bis(data, movname:str, name: str, stop_year: int, nb_years: int, plot=False, min_train_years: int = 5):
    """Forecast the yearly count of `name` after `stop_year` and optionally plot it.

    A pmdarima auto-ARIMA model is fitted on the yearly counts up to and including
    `stop_year`, then used to forecast the following `nb_years` years.

    Parameters
    ----------
    data : DataFrame
        Must provide the columns 'Name', 'Year' and 'Count'. It is not modified.
    movname : str
        Movie title; only used in the plot title and legend.
    name : str
        Value looked up in the 'Name' column.
    stop_year : int
        Last year included in the training data.
    nb_years : int
        Number of years forecast after `stop_year`.
    plot : bool
        When True, build, show and return the Plotly figure.
    min_train_years : int
        Minimum number of years with data up to `stop_year` required to fit (default 5).

    Returns
    -------
    The Plotly figure when `plot` is True and the forecast succeeded; None otherwise
    (not enough training data, model failure, or `plot` is False).
    """
    # Yearly totals for the requested name. Only 'Count' is summed: summing every column
    # would also "add up" text columns such as the name itself.
    name_data = (
        data.loc[data['Name'] == name]
        .groupby('Year', as_index=False)['Count']
        .sum()
    )

    # Split the dataset at the stop_year (stop_year belongs to both sides so the curves connect)
    train_data = name_data[name_data['Year'] <= stop_year]
    true_data = name_data[name_data['Year'] >= stop_year]
    true_data = true_data[true_data['Year'] <= stop_year + nb_years]

    # Some rare names do not have enough history to fit a model
    if len(train_data) < min_train_years:
        return None

    # Handle missing years by filling with 0
    all_years = pd.DataFrame({'Year': range(train_data['Year'].min(), stop_year + 1)})
    train_data = pd.merge(all_years, train_data, on='Year', how='left').fillna(0)

    all_years = pd.DataFrame({'Year': range(stop_year, stop_year + nb_years + 1)})
    true_data = pd.merge(all_years, true_data, on='Year', how='left').fillna(0)

    # Split into x and y as arrays
    x_train = train_data['Year'].values
    y_train = train_data['Count'].values

    x_true = true_data['Year'].values
    y_true = true_data['Count'].values

    try:
        # Fit the model
        # NOTE(review): seasonal=True is combined with m=1 while the message below says SARIMA —
        # confirm a seasonal model is really intended.
        model = pm.auto_arima(y_train, seasonal=True, m=1)
        forecast, conf_int = model.predict(n_periods=nb_years, return_conf_int=True)
    except Exception:
        # Best effort: a failed fit skips this name, but KeyboardInterrupt/SystemExit still propagate
        print(f"An error occurred while predicting the evolution of the name count for {name} using SARIMA.")
        return None

    if plot:
        # Forecast x axis starts at stop_year so the forecast line connects to the last training point
        x_forecast = list(range(stop_year, stop_year + nb_years + 1))

        lower_bound = conf_int[:, 0]
        upper_bound = conf_int[:, 1]

        # Create the plot
        fig = go.Figure()

        # Add training data trace (last 30 years only)
        fig.add_trace(go.Scatter(x=x_train[-30:], y=y_train[-30:], mode='lines+markers', name='Previous Years', line=dict(color='blue')))

        # Add forecast data trace, anchored on the last observed training value
        fig.add_trace(go.Scatter(x=x_forecast, y=[y_train[-1]] + list(forecast), mode='lines+markers', name='Forecast', line=dict(color='orange')))

        # Add true data trace
        fig.add_trace(go.Scatter(x=x_true, y=y_true, mode='lines+markers', name='Following years', line=dict(color='green')))

        # Add confidence interval as shaded area (lower bound forward, upper bound backward)
        fig.add_trace(go.Scatter(
            x=list(x_forecast) + list(x_forecast[::-1]),
            y=list([y_train[-1]] + list(lower_bound)) + list([y_train[-1]] + list(upper_bound))[::-1],
            fill='toself',
            fillcolor='rgba(128, 128, 128, 0.2)',
            line=dict(color='rgba(255,255,255,0)'),
            showlegend=True,
            name='Confidence Interval'
        ))

        # Add a vertical line for stop year
        fig.add_trace(go.Scatter(
            x=[stop_year, stop_year],
            y=[min(y_train[-30:]), max([y_train[-1]] + list(upper_bound))],
            mode='lines',
            line=dict(dash='dash', color='red'),
            name=f'Year prior {movname}'
        ))

        # Update layout
        fig.update_layout(
            title=f"Impact of {movname} on {name}",
            xaxis_title='Year',
            yaxis_title='Count',
            legend_title='Legend',
        )

        # Show plot
        fig.show()
        return fig

    return None
    

In [33]:
# Build the list of records used to display the most strongly influenced names, in the form
# [ { id: 1, movname: "Movie A", year: 2001, graph: "movie_a", name: "NAME" }, ... ]

# Keep only the rows whose Mean Difference is at or above its 0.75 quantile.
# .copy() gives an independent frame so the column assignments below are safe.
mean_diff_q75 = norway_prediction_influenced["Mean Difference"].quantile(0.75)
influence_display = norway_prediction_influenced[
    norway_prediction_influenced["Mean Difference"] >= mean_diff_q75
].copy()

# Sequential 1-based identifier (assigned directly so the builtin `id` is not shadowed)
influence_display["id"] = list(range(1, len(influence_display) + 1))
# First character of every word in the movie name is capitalized
influence_display["movname"] = influence_display["Movie Name"].str.title()
# Year before the movie's 'Year' value
influence_display["year"] = influence_display["Year"]-1
# File-name stem: spaces become underscores and colons are removed.
# Both steps go through the .str accessor; a plain Series.replace(":", "") only matches
# values that are exactly ":" and therefore leaves colons inside titles untouched.
# NOTE(review): a list copied by hand from an earlier run of this cell (such as `movlist`)
# still contains the old colon-bearing stem — regenerate it if the stems must match.
influence_display["graph"] = (
    influence_display["Movie Name"]
    .str.replace(" ", "_", regex=False)
    .str.replace(":", "", regex=False)
)
influence_display["name"] = influence_display["Character Name"].str.upper()
influence_display = influence_display[["id", "movname", "year", "graph","name"]]

# Print the records and how many there are
print(influence_display.to_dict(orient='records')) 
print(len(influence_display))





[{'id': 1, 'movname': 'The Avengers', 'year': 1997, 'graph': 'the_avengers', 'name': 'EMMA'}, {'id': 2, 'movname': 'Lethal Weapon', 'year': 1986, 'graph': 'lethal_weapon', 'name': 'MARTIN'}, {'id': 3, 'movname': 'The Ring', 'year': 2001, 'graph': 'the_ring', 'name': 'NOAH'}, {'id': 4, 'movname': 'Dazed And Confused', 'year': 1992, 'graph': 'dazed_and_confused', 'name': 'JULIE'}, {'id': 5, 'movname': 'Transformers', 'year': 2006, 'graph': 'transformers', 'name': 'WILLIAM'}, {'id': 6, 'movname': 'There Will Be Blood', 'year': 2006, 'graph': 'there_will_be_blood', 'name': 'WILLIAM'}, {'id': 7, 'movname': 'Batman Begins', 'year': 2004, 'graph': 'batman_begins', 'name': 'WILLIAM'}, {'id': 8, 'movname': 'Wedding Crashers', 'year': 2004, 'graph': 'wedding_crashers', 'name': 'WILLIAM'}, {'id': 9, 'movname': 'Tombstone', 'year': 1992, 'graph': 'tombstone', 'name': 'MARCUS'}, {'id': 10, 'movname': 'Gladiator', 'year': 1999, 'graph': 'gladiator', 'name': 'NOAH'}, {'id': 11, 'movname': 'Jaws 2', '

In [35]:
# Hard-coded list of movies to plot; each entry holds:
#   id      - sequential identifier
#   movname - movie title used in the plot title/legend
#   year    - year passed (minus one more) as the forecast stop year by the loop below
#   graph   - stem of the exported HTML file name
#   name    - upper-case first name looked up in the names data
# NOTE(review): this looks like a copy of the records printed by the previous cell — confirm and
# regenerate if that cell's output changes. Entry 12's 'graph' contains a ':' which is not a
# valid file-name character on Windows.
movlist = [
    {'id': 1, 'movname': 'The Avengers', 'year': 1997, 'graph': 'the_avengers', 'name': 'EMMA'},
    {'id': 2, 'movname': 'Lethal Weapon', 'year': 1986, 'graph': 'lethal_weapon', 'name': 'MARTIN'},
    {'id': 3, 'movname': 'The Ring', 'year': 2001, 'graph': 'the_ring', 'name': 'NOAH'},
    {'id': 4, 'movname': 'Dazed And Confused', 'year': 1992, 'graph': 'dazed_and_confused', 'name': 'JULIE'},
    {'id': 5, 'movname': 'Transformers', 'year': 2006, 'graph': 'transformers', 'name': 'WILLIAM'},
    {'id': 6, 'movname': 'There Will Be Blood', 'year': 2006, 'graph': 'there_will_be_blood', 'name': 'WILLIAM'},
    {'id': 7, 'movname': 'Batman Begins', 'year': 2004, 'graph': 'batman_begins', 'name': 'WILLIAM'},
    {'id': 8, 'movname': 'Wedding Crashers', 'year': 2004, 'graph': 'wedding_crashers', 'name': 'WILLIAM'},
    {'id': 9, 'movname': 'Tombstone', 'year': 1992, 'graph': 'tombstone', 'name': 'MARCUS'},
    {'id': 10, 'movname': 'Gladiator', 'year': 1999, 'graph': 'gladiator', 'name': 'NOAH'},
    {'id': 11, 'movname': 'Jaws 2', 'year': 1977, 'graph': 'jaws_2', 'name': 'MARTIN'},
    {'id': 12, 'movname': 'Pirates Of The Caribbean: The Curse Of The Black Pearl', 'year': 2002, 'graph': 'pirates_of_the_caribbean:_the_curse_of_the_black_pearl', 'name': 'WILLIAM'},
    {'id': 13, 'movname': 'Ghost', 'year': 1989, 'graph': 'ghost', 'name': 'ODA'},
    {'id': 14, 'movname': 'Forrest Gump', 'year': 1993, 'graph': 'forrest_gump', 'name': 'BENJAMIN'},
    {'id': 15, 'movname': 'The Fast And The Furious', 'year': 2000, 'graph': 'the_fast_and_the_furious', 'name': 'MIA'},
    {'id': 16, 'movname': 'Children Of Men', 'year': 2005, 'graph': 'children_of_men', 'name': 'THEO'},
    {'id': 17, 'movname': "Wayne'S World", 'year': 1991, 'graph': "wayne's_world", 'name': 'BENJAMIN'},
    {'id': 18, 'movname': 'Almost Famous', 'year': 1999, 'graph': 'almost_famous', 'name': 'WILLIAM'},
    {'id': 19, 'movname': 'Notting Hill', 'year': 1998, 'graph': 'notting_hill', 'name': 'WILLIAM'},
    {'id': 20, 'movname': 'Meet Joe Black', 'year': 1997, 'graph': 'meet_joe_black', 'name': 'WILLIAM'},
    {'id': 21, 'movname': 'Tootsie', 'year': 1981, 'graph': 'tootsie', 'name': 'JULIE'},
    {'id': 22, 'movname': 'Constantine', 'year': 2004, 'graph': 'constantine', 'name': 'GABRIEL'},
    {'id': 23, 'movname': 'Heat', 'year': 1994, 'graph': 'heat', 'name': 'HANNA'},
    {'id': 24, 'movname': 'Braveheart', 'year': 1994, 'graph': 'braveheart', 'name': 'WILLIAM'},
    {'id': 25, 'movname': 'Home Alone', 'year': 1989, 'graph': 'home_alone', 'name': 'KEVIN'},
    {'id': 26, 'movname': 'Edward Scissorhands', 'year': 1989, 'graph': 'edward_scissorhands', 'name': 'KEVIN'},
    {'id': 27, 'movname': 'The Poseidon Adventure', 'year': 1971, 'graph': 'the_poseidon_adventure', 'name': 'MARTIN'}
]


# Export one forecast graph per movie in `movlist` as an HTML fragment.
danish_names = pd.read_csv("data/clean/names/norway_merged.csv")
for movie in movlist:
    # NOTE(review): movlist['year'] values look already shifted one year before release, and one more
    # year is subtracted here — confirm the forecast is meant to stop two years before release.
    plot = predict_naming_ARIMA_bis(danish_names, movie["movname"], movie["name"], int(movie["year"])-1, 10, plot=True)
    if plot is None:
        # The forecast helper returns None when it has too little history or the model fails;
        # skip that movie instead of crashing on None.write_html.
        print(f"No graph generated for {movie['movname']} ({movie['name']}): forecast unavailable.")
        continue
    plot.write_html(f"docs/assets/graphs/{movie['graph']}_graph.html", full_html=False)

In [13]:
# Load the influenced-names table.
# Forward slashes are used on purpose: they work on every OS, whereas the backslash form
# only resolves on Windows and relies on invalid escape sequences such as "\c" and "\i".
influenced_names = pd.read_csv('data/clean/influenced_names_prophet.csv')

# Keep only Movie Name, Year, Full name, Normalized_name, Mean Difference
influenced_names = influenced_names[['Movie Name', 'Year', 'Full name', 'Normalized_name', 'Mean Difference']]
print("size of influenced_names: ", influenced_names.shape)
influenced_names.head(10)


size of influenced_names:  (1447, 5)


Unnamed: 0,Movie Name,Year,Full name,Normalized_name,Mean Difference
0,mission: impossible ii,2000,Ethan Hunt,ETHAN,15725.466667
1,the avengers,1998,Emma Peel,EMMA,14985.966667
2,man on fire,1987,"Samantha ""Sam"" Balletto",SAMANTHA,14453.5
3,suspiria,1977,Sarah,SARAH,14372.466667
4,point break,1991,Tyler Endicott,TYLER,14176.666667
5,barton fink,1991,Audrey Taylor,TAYLOR,13892.1
6,dracula,1979,Jonathan Harker,JONATHAN,13219.566667
7,i spit on your grave,1978,Matthew hides,MATTHEW,12837.433333
8,invasion of the body snatchers,1978,Matthew Bennell,MATTHEW,12837.433333
9,apollo 13,1995,Jack Swigert,JACK,12508.133333


In [55]:
# Keep only the influenced rows whose Normalized_name also appears in danish_df['Name']
name_is_known = influenced_names['Normalized_name'].isin(danish_df['Name'])
influenced_names = influenced_names[name_is_known]
filtered_shape = influenced_names.shape
print("size of influenced_names: ", filtered_shape)
influenced_names.head(10)


size of influenced_names:  (495, 5)


Unnamed: 0,Movie Name,Year,Full name,Normalized_name,Mean Difference
1,the avengers,1998,Emma Peel,EMMA,14985.966667
2,man on fire,1987,"Samantha ""Sam"" Balletto",SAMANTHA,14453.5
3,suspiria,1977,Sarah,SARAH,14372.466667
6,dracula,1979,Jonathan Harker,JONATHAN,13219.566667
7,i spit on your grave,1978,Matthew hides,MATTHEW,12837.433333
8,invasion of the body snatchers,1978,Matthew Bennell,MATTHEW,12837.433333
9,apollo 13,1995,Jack Swigert,JACK,12508.133333
10,goldeneye,1995,Jack Wade,JACK,12508.133333
11,the usual suspects,1995,Jack Baer,JACK,12508.133333
12,titanic,1997,Jack Dawson,JACK,12348.266667
