In [1]:
import pandas as pd

In [9]:
# Load the merged name-count table; the preview below shows the columns
# Year, Name, Sex and Count.
# NOTE(review): the variable is called `danish_df` but the file read is
# `norway_merged.csv` — confirm which country this data actually covers.
danish_df = pd.read_csv('data/clean/names/norway_merged.csv')
# Show the first five rows as a sanity check of the load.
danish_df.head(5)

Unnamed: 0,Year,Name,Sex,Count
0,1960,AAGE,M,23
1,1960,AASE,F,41
2,1960,AASHILD,F,5
3,1960,AASMUND,M,8
4,1960,ABBAS,M,5


In [53]:
import pandas as pd
import plotly.graph_objects as go

# Animated line chart: for every decade, plot the yearly counts of that
# decade's ten most frequent names, with a slider to move between decades.


def _top_names(top_table, period):
    """Return the names listed in `top_table` for `period`, in table order.

    `top_table` must have 'Period' and 'Name' columns.
    """
    return top_table[top_table['Period'] == period]['Name'].tolist()


def _yearly_counts(period_rows, name, years):
    """Return the rows of `name`, re-indexed so every year in `years` is present.

    Years with no row are filled with 0 (this applies to every column, not
    only 'Count'); the 'Name' column is then restored to `name`.
    Requires at most one row per year for `name` in `period_rows`.
    """
    filled = (
        period_rows[period_rows['Name'] == name]
        .set_index('Year')
        .reindex(years, fill_value=0)
        .reset_index()
    )
    filled['Name'] = name
    return filled


def _period_traces(yearly_table, names, period):
    """Build one line trace per name in `names`, spanning the ten years of `period`.

    `yearly_table` must have 'Period', 'Name', 'Year' and 'Count' columns with
    one row per (name, year). Traces are returned in the order of `names`.
    """
    years = list(range(period, period + 10))
    period_rows = yearly_table[
        (yearly_table['Period'] == period) & (yearly_table['Name'].isin(names))
    ]
    traces = []
    for name in names:
        counts = _yearly_counts(period_rows, name, years)
        traces.append(go.Scatter(
            x=counts['Year'],
            y=counts['Count'],
            mode='lines',
            name=name,
            line=dict(width=2),
            showlegend=False,  # Names are listed in the annotation instead
        ))
    return traces


# Step 1: Tag every row with its decade (e.g., 1960-1969 becomes 1960).
# This adds a 'Period' column to danish_df in place; it is always recomputed
# from 'Year', so re-running the cell gives the same result.
danish_df['Period'] = (danish_df['Year'] // 10) * 10

# Step 2: One row per (Period, Name, Year). 'Sex' is not a grouping key, so
# rows that share a name and year are summed together.
aggregated_data = (
    danish_df.groupby(['Period', 'Name', 'Year'], as_index=False)['Count']
    .sum()
)

# Step 3: Rank names by their total count within each period, keep the top 10.
top_names_by_period = (
    aggregated_data.groupby(['Period', 'Name'], as_index=False)['Count']
    .sum()  # Total count over the whole period
    .sort_values(['Period', 'Count'], ascending=[True, False])  # Best first within each period
    .groupby('Period', as_index=False)
    .head(10)  # Keep only the top 10 names per period
)

# Step 4: Build the names and traces of each period once; they are used for
# both the animation frames and the initial figure.
periods = top_names_by_period['Period'].unique()
names_by_period = {
    period: _top_names(top_names_by_period, period) for period in periods
}
traces_by_period = {
    period: _period_traces(aggregated_data, names_by_period[period], period)
    for period in periods
}

# NOTE(review): the frame annotation (below the plot, comma-separated) and the
# initial annotation further down (left of the plot, one name per line, with
# "Danish" in the title) use different text and positions. Both are kept
# exactly as they were — confirm which layout is intended.
frames = []
for period in periods:
    text = f"Top 10 Names in {period}s: " + ", ".join(names_by_period[period])
    frames.append(go.Frame(
        data=traces_by_period[period],
        name=str(period),  # Referenced by the matching slider step
        layout=go.Layout(
            annotations=[
                dict(
                    text=text,
                    x=0.5,  # Center horizontally
                    y=-0.2,  # Position below the plot
                    xref="paper", yref="paper",
                    showarrow=False,
                    font=dict(size=14),
                    align="center",
                )
            ]
        )
    ))

# Step 5: Create the initial figure from the earliest period.
initial_period = min(periods)
initial_top_names = names_by_period[initial_period]
initial_data = traces_by_period[initial_period]
initial_text = f"Top 10 Danish Names in {initial_period}s:<br>" + ",<br>".join(initial_top_names)

fig = go.Figure(
    data=initial_data,
    layout=go.Layout(
        annotations=[
            dict(
                text=initial_text,
                x=-0.3, y=0.95,
                xref="paper", yref="paper",
                showarrow=False,
                font=dict(size=16),
                align="center",
            )
        ],
        sliders=[dict(
            active=0,
            currentvalue={"prefix": "Period: "},
            pad={"t": 50},
            steps=[
                dict(
                    method="animate",
                    args=[
                        [str(period)],  # Name of the frame to show
                        {"mode": "immediate", "frame": {"duration": 500, "redraw": True}}  # Animation settings
                    ],
                    label=f"{period}s",
                )
                for period in periods
            ],
        )],
    ),
    frames=frames
)

# Step 6: Show the figure
fig.show()


In [13]:
# Load the table of film-influenced names.
# Forward slashes are used so the path resolves on every OS; backslashes in a
# normal string literal are treated as (invalid) escape sequences and only
# work as path separators on Windows.
influenced_names = pd.read_csv('data/clean/influenced_names_prophet.csv')

# keep only Movie Name, Year, Full name, Normalized_name, Mean Difference
influenced_names = influenced_names[['Movie Name', 'Year', 'Full name', 'Normalized_name', 'Mean Difference']]
print("size of influenced_names: ", influenced_names.shape)
influenced_names.head(10)


size of influenced_names:  (1447, 5)


Unnamed: 0,Movie Name,Year,Full name,Normalized_name,Mean Difference
0,mission: impossible ii,2000,Ethan Hunt,ETHAN,15725.466667
1,the avengers,1998,Emma Peel,EMMA,14985.966667
2,man on fire,1987,"Samantha ""Sam"" Balletto",SAMANTHA,14453.5
3,suspiria,1977,Sarah,SARAH,14372.466667
4,point break,1991,Tyler Endicott,TYLER,14176.666667
5,barton fink,1991,Audrey Taylor,TAYLOR,13892.1
6,dracula,1979,Jonathan Harker,JONATHAN,13219.566667
7,i spit on your grave,1978,Matthew hides,MATTHEW,12837.433333
8,invasion of the body snatchers,1978,Matthew Bennell,MATTHEW,12837.433333
9,apollo 13,1995,Jack Swigert,JACK,12508.133333


In [55]:
# Keep only the influenced-name rows whose Normalized_name also occurs in
# danish_df['Name']. The explicit .copy() makes the result an independent
# frame, so adding columns to it later does not trigger pandas'
# SettingWithCopyWarning.
influenced_names = influenced_names[influenced_names['Normalized_name'].isin(danish_df['Name'])].copy()
print("size of influenced_names: ", influenced_names.shape)
influenced_names.head(10)


size of influenced_names:  (495, 5)


Unnamed: 0,Movie Name,Year,Full name,Normalized_name,Mean Difference
1,the avengers,1998,Emma Peel,EMMA,14985.966667
2,man on fire,1987,"Samantha ""Sam"" Balletto",SAMANTHA,14453.5
3,suspiria,1977,Sarah,SARAH,14372.466667
6,dracula,1979,Jonathan Harker,JONATHAN,13219.566667
7,i spit on your grave,1978,Matthew hides,MATTHEW,12837.433333
8,invasion of the body snatchers,1978,Matthew Bennell,MATTHEW,12837.433333
9,apollo 13,1995,Jack Swigert,JACK,12508.133333
10,goldeneye,1995,Jack Wade,JACK,12508.133333
11,the usual suspects,1995,Jack Baer,JACK,12508.133333
12,titanic,1997,Jack Dawson,JACK,12348.266667


In [None]:
import pandas as pd
import plotly.graph_objects as go

# Animated line chart of the film-influenced names: for every decade, plot the
# yearly counts of the ten most frequent influenced names, with a slider to
# move between decades.
#
# `influenced_names` only holds 'Movie Name', 'Year', 'Full name',
# 'Normalized_name' and 'Mean Difference'; it has no 'Count', 'Name',
# 'Normalized Name' or 'Normalized' column, so grouping it on those raises a
# KeyError. The yearly counts are therefore taken from danish_df, restricted
# to the names that appear in influenced_names['Normalized_name'].
# NOTE(review): assumes the goal is to chart name counts for the influenced
# names (not 'Mean Difference') — confirm.


def _influenced_name_traces(yearly_table, names, period):
    """Build one line trace per name in `names`, spanning the ten years of `period`.

    `yearly_table` must have 'Period', 'Name', 'Year' and 'Count' columns with
    one row per (name, year). Years without a row are plotted as 0.
    Traces are returned in the order of `names`.
    """
    years = list(range(period, period + 10))
    in_period = yearly_table[
        (yearly_table['Period'] == period) & (yearly_table['Name'].isin(names))
    ]
    traces = []
    for name in names:
        series = (
            in_period[in_period['Name'] == name]
            .set_index('Year')
            .reindex(years, fill_value=0)  # Fill missing years with 0 counts
            .reset_index()
        )
        traces.append(go.Scatter(
            x=series['Year'],
            y=series['Count'],
            mode='lines',
            name=name,
            line=dict(width=2),
            showlegend=False,  # Names are listed in the annotation instead
        ))
    return traces


# Step 1: Tag rows with their decade (e.g., 1960-1969 becomes 1960).
# The 'Period' column on influenced_names is kept for any later cell that
# expects it; .assign returns a new frame, so no slice is modified in place.
influenced_names = influenced_names.assign(Period=(influenced_names['Year'] // 10) * 10)
influenced_counts = (
    danish_df[danish_df['Name'].isin(influenced_names['Normalized_name'])]
    .assign(Period=lambda rows: (rows['Year'] // 10) * 10)
)

# Step 2: One row per (Period, Name, Year); counts sharing those keys are summed.
aggregated_data = (
    influenced_counts.groupby(['Period', 'Name', 'Year'], as_index=False)['Count']
    .sum()
)

# Step 3: Rank names by their total count within each period, keep the top 10.
top_names_by_period = (
    aggregated_data.groupby(['Period', 'Name'], as_index=False)['Count']
    .sum()  # Total count over the whole period
    .sort_values(['Period', 'Count'], ascending=[True, False])  # Best first within each period
    .groupby('Period', as_index=False)
    .head(10)  # Keep only the top 10 names per period
)

# Step 4: Prepare one animation frame per period.
periods = top_names_by_period['Period'].unique()
frames = []
for period in periods:
    # Top names of the current period, most frequent first
    top_names = top_names_by_period[top_names_by_period['Period'] == period]['Name'].tolist()

    # Text annotation listing the top names below the plot
    text = f"Top 10 Names in {period}s: " + ", ".join(top_names)
    frames.append(go.Frame(
        data=_influenced_name_traces(aggregated_data, top_names, period),
        name=str(period),  # Referenced by the matching slider step
        layout=go.Layout(
            annotations=[
                dict(
                    text=text,
                    x=0.5,  # Center horizontally
                    y=-0.2,  # Position below the plot
                    xref="paper", yref="paper",
                    showarrow=False,
                    font=dict(size=14),
                    align="center",  # Align text to the center
                )
            ]
        )
    ))

# Step 5: Create the initial figure from the earliest period.
initial_period = min(periods)
initial_top_names = top_names_by_period[top_names_by_period['Period'] == initial_period]['Name'].tolist()
initial_data = _influenced_name_traces(aggregated_data, initial_top_names, initial_period)
initial_text = f"Top 10 Danish Names in {initial_period}s:<br>" + ",<br>".join(initial_top_names)

fig = go.Figure(
    data=initial_data,
    layout=go.Layout(
        annotations=[
            dict(
                text=initial_text,
                x=-0.3, y=0.95,
                xref="paper", yref="paper",
                showarrow=False,
                font=dict(size=16),
                align="center",
            )
        ],
        sliders=[dict(
            active=0,
            currentvalue={"prefix": "Period: "},
            pad={"t": 50},
            steps=[
                dict(
                    method="animate",
                    args=[
                        [str(period)],  # Name of the frame to show
                        {"mode": "immediate", "frame": {"duration": 500, "redraw": True}}  # Animation settings
                    ],
                    label=f"{period}s",
                )
                for period in periods
            ],
        )],
    ),
    frames=frames
)

# Step 6: Show the figure
fig.show()
