In [133]:
import os
import numpy as np
import pandas as pd

# Matching
from sklearn.neighbors import NearestNeighbors
import networkx as nx

# Statical analysis
from sklearn.linear_model import LinearRegression
from scipy.stats import chi2_contingency
import scipy.stats as st

# Ploting
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px

# Parallel questions

In this notebook, we try answering the question called 'parallel questions' listed in the [README](./README.md) file.

In [252]:
folder_processed_data_path = './data/processed_data/'

# Import movie metadata
movie_df = pd.read_csv(os.path.join(folder_processed_data_path, 'movie_df.csv'))
movie_df.set_index(['wiki_ID'], inplace=True)
display(movie_df.sample(2))

# Import character metadata
name_by_movie_df = pd.read_csv(os.path.join(folder_processed_data_path, 'name_by_movie_ordered_pvalue_10_5_df.csv'))
name_by_movie_df.set_index(['wiki_ID'], inplace=True)
display(name_by_movie_df.sample(2))

# Dataset containing movie genre
movie_genres_df = pd.read_csv(os.path.join(folder_processed_data_path, 'movie_genres_df.csv'))
movie_genres_df.set_index(['wiki_ID'], inplace=True)
display(movie_genres_df.sample(2))

Unnamed: 0_level_0,mov_name,year,month,revenue,numVotes,averageRating
wiki_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
28634595,La provinciale,1981,1.0,,201,6.7
4173885,The Woman Chaser,1999,10.0,,697,7.1


Unnamed: 0_level_0,char_words,order,gender,t_stat,p_value,slope_change
wiki_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
6791799,Soldier,,M,,,0.0
15799422,Choya,0.0,M,-0.215389,0.833405,6e-06


Unnamed: 0_level_0,genre
wiki_ID,Unnamed: 1_level_1
16233994,Drama
156745,Comedy


The significance value `alpha` is 0.05

In [135]:
alpha = 0.05

Let's combine the `name_by_movie` and the `movie` dataframes and use it to answer to the paralleles questions.

In [136]:
# add movie info to the character dataframe
name_by_movie_aggregate_df = name_by_movie_df.merge(movie_df, how='left', left_on='wiki_ID', right_on='wiki_ID').copy(deep=True)
display(name_by_movie_aggregate_df.sample(2))

Unnamed: 0_level_0,char_words,order,gender,t_stat,p_value,slope_change,mov_name,year,month,revenue,numVotes,averageRating
wiki_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
179044,Johnson,0.0,M,-1.644511,0.128317,0.000339,The Jerk,1979,12.0,73691419.0,62707,7.1
8432450,Ambrose,16.0,M,0.305896,0.765395,-4.9e-05,The Mist,2007,11.0,57293715.0,331176,7.1


# Question 1 : Effect of movie's release period of the year

Looking at studies showing that baby conception rates are at the highest in fall or winter season leading to higher birth in the summer, will movies released in summer show the highest correlation with newborn naming?

In order to study this question, we will divide the movies by season of release and then look at the seasonly/monthly proportion of influenced names with respect to all the names considered. Then, we look at the average influence over all the movies for the 4 different seasons.

First, we separate the dataframe into four, one for each season.

In [185]:
summer = [6.0, 7.0, 8.0]
fall = [9.0,10.0,11.0]
winter = [12.0,1.0,2.0]
spring = [3.0,4.0,5.0]
summer_movies_df = name_by_movie_aggregate_df[(name_by_movie_aggregate_df['month'].isin(summer))]
fall_movies_df = name_by_movie_aggregate_df[(name_by_movie_aggregate_df['month'].isin(fall))]
winter_movies_df = name_by_movie_aggregate_df[(name_by_movie_aggregate_df['month'].isin(winter))]
spring_movies_df = name_by_movie_aggregate_df[(name_by_movie_aggregate_df['month'].isin(spring))]

### Proportion of influenced names during each season

We first compute the proportion of influenced names per season to perform statistical test about their difference.

In [193]:
# Function to compute proportion of significant values and corresponding standard error for confidence interval
def q1_prop_and_ci(data):
    proportion = (data['p_value']<alpha).mean()
    se = st.sem((data['p_value']<alpha).astype(int))

    ci_upper = proportion + 1.96*se
    ci_lower = proportion - 1.96*se
    return pd.Series({
        'proportion': proportion,
        'ci_lower': ci_lower,
        'ci_upper': ci_upper
    })

prop_summer = q1_prop_and_ci(summer_movies_df)
prop_fall = q1_prop_and_ci(fall_movies_df)
prop_winter = q1_prop_and_ci(winter_movies_df)
prop_spring = q1_prop_and_ci(spring_movies_df)

In [194]:
# Create a Plotly figure
fig = go.Figure()

seasons = ['Summer', 'Fall', 'Winter', 'Spring']
colors = ['#e31a1c', '#ff7f00', '#1f78b4', '#33a02c']

# Loop through seasons
for season, color in zip(seasons, colors):
    prop_data = locals()[f'prop_{season.lower()}']  # Access the variable dynamically
    
    # Define y values and error array
    y_values = [prop_data['proportion']]
    error_array = [prop_data['proportion'] - prop_data['ci_lower'], prop_data['ci_upper'] - prop_data['proportion']]
    
    # Add trace to the figure
    fig.add_trace(go.Scatter(
        x=[season],
        y=y_values,
        error_y=dict(type='data', array=error_array),
        mode='markers+lines',
        name=season,
        marker=dict(color=color, size=10)
    ))

# Update layout for better visualization
fig.update_layout(
    yaxis=dict(title='Proportion of Significant Values'),
    title='Proportion of Significant Values with 95% Confidence Intervals by Season'
)

# Update layout to adjust the range of the x-axis
fig.update_layout(
    xaxis=dict(range=[-0.5, len(prop_data.index) + 1 - 0.5]),  # Adjust the range based on your data
)

# Show the plot
fig.show()

Statistical test to assess whether proportion for different season are different or not.

H0 : The proportions are all equal i.e. no movie season release affects baby naming more than the other

In [188]:
# Organize the data into a contingency table
observed_data = [
    [len(summer_movies_df[summer_movies_df['p_value'] < alpha]), len(summer_movies_df['p_value'])],
    [len(fall_movies_df[fall_movies_df['p_value'] < alpha]), len(fall_movies_df['p_value'])],
    [len(winter_movies_df[winter_movies_df['p_value'] < alpha]), len(winter_movies_df['p_value'])],
    [len(spring_movies_df[spring_movies_df['p_value'] < alpha]), len(spring_movies_df['p_value'])]
]

# Perform the chi-squared test
chi2, p, _, _ = chi2_contingency(observed_data)

# Print the results
print("Chi-squared value:", chi2)
print("P-value:", p)

Chi-squared value: 10.22583914103114
P-value: 0.01674082055652168


Since the Chi-squared value is 10.23, we can reject the null hypothesis that the proportion of inlfuenced names are the same between the season at the 5% significance level.

### Proportion of influenced names during each season : timeline

We now try to visualize the variation of the percentage of significantly influence names per season of release per year. Also, we look at the mean magnitude influence per season over the years with the corresponding 5% confidence interval.

In [195]:
# Sort by year first, then apply seasonal filter
def seasonal_filter(season_df):
    season_df_sorted = season_df.groupby('year').apply(lambda x: pd.Series({
        'avg': x[x['p_value']<alpha]['slope_change'].dropna().abs().mean(),
        'se': x[x['p_value']<alpha]['slope_change'].dropna().abs().sem(),
        'nb_names':  x[x['p_value']<alpha]['p_value'].count(),
        'prop_influenced': len(x[x['p_value']<alpha])/len(x['p_value'])
    }))
    season_df_sorted.reset_index(inplace=True)
    return season_df_sorted

summer_movies_df_sorted = seasonal_filter(summer_movies_df)
fall_movies_df_sorted = seasonal_filter(fall_movies_df)
winter_movies_df_sorted = seasonal_filter(winter_movies_df)
spring_movies_df_sorted = seasonal_filter(spring_movies_df)

In [196]:
fig = go.Figure()

# Define the main line plot for each season
seasons = ['Summer', 'Fall', 'Winter', 'Spring']
colors = ['red', 'orange', 'blue', 'green']
data = [summer_movies_df_sorted, fall_movies_df_sorted, winter_movies_df_sorted, spring_movies_df_sorted]

for i, season in enumerate(seasons):
    main_trace = go.Scatter(
        x=data[i]['year'],  
        y=data[i]['prop_influenced'],  
        mode='lines+markers',
        line_shape='linear',
        name=season,
        line=dict(color=colors[i]),
        legendgroup=season,
        # visible=(season == 'Summer')
    )
    
    # Add the main line trace to the figure
    fig.add_trace(main_trace)

# Update the layout
fig.update_layout(
    title='Yearly evolution of proportion of influenced baby names per season of release',
    xaxis_title='Year',
    yaxis_title='Proportion of inlfuenced names'
)

# Show the figure
fig.show()

From the above plot, it doesn't seem to have big difference between season of movie release. We can now look at monthly variation in proportion to assess whether an eventual effect is more pronounced individually rather than seasonaly.

### Proportion of influenced names during each month : timeline

In [197]:
monthly_prop_df = name_by_movie_aggregate_df.groupby(['year','month']).apply(lambda x: pd.Series({
    'prop_significant': len(x[x['p_value']<alpha])/len(x['p_value'])
}))
monthly_prop_df_reset = monthly_prop_df.reset_index()
display(monthly_prop_df)

Unnamed: 0_level_0,Unnamed: 1_level_0,prop_significant
year,month,Unnamed: 2_level_1
1895,8.0,0.000000
1898,7.0,0.000000
1900,11.0,0.000000
1902,9.0,0.000000
1903,1.0,0.000000
...,...,...
2013,8.0,0.176471
2013,9.0,0.111111
2013,10.0,0.038462
2013,11.0,0.200000


In [198]:
# Convert 'month' column to numeric
monthly_prop_df_reset['month'] = pd.to_numeric(monthly_prop_df_reset['month'], errors='coerce')

# Create a line plot using Plotly
fig = px.line(
    monthly_prop_df_reset,
    x='year',
    y='prop_significant',
    color='month',
    markers=True,
    line_shape='linear', 
    labels={'prop_significant': 'Proportion of Significant Values'},
    title='Proportion of Significant influences Over Months for Each Year',
)

# Show the plot
fig.show()


### Number of influenced names during each season : timeline 

The yearly variation of the number of influenced names per season can be used as a first way to try to quantify the influence of movie release. We then show the qualitative effect of movie release season, rather than the quantitative effect, which will be study in the following cells.

In [199]:
# Define the main line plot for each season
seasons = ['Summer', 'Fall', 'Winter', 'Spring']
colors = ['red', 'orange', 'blue', 'green']
data = [summer_movies_df_sorted, fall_movies_df_sorted, winter_movies_df_sorted, spring_movies_df_sorted]

# Bar chart plot
fig = go.Figure()
for i, season in enumerate(seasons):
    fig.add_trace(go.Bar(
        x=data[i]['year'],
        y=data[i]['nb_names'],
        name=seasons[i],
        marker_color=colors[i],
        offsetgroup=1
    ))

# Mise en forme du tracé
fig.update_layout(
    title = "Evolution of number of influenced names per season of movie release",
    xaxis=dict(title='Year'),
    yaxis=dict(title='# influenced names'),
    barmode='stack'  # 'stack' empile les barres pour chaque order  
      
)
fig.update_xaxes(range=[1920, 2015])

### Average monthly influence magnitude : timeline
After looking at the yearly variation in proportion of significantly influenced names across season/month, it is time to try to quantify the quantitative influenced differentiation between months. First, we study the mean magnitude slope change for each month (January - Decembre) on the average of all significantly influenced names.

In [200]:
# compute the average and standard error of the slope change for each month
influence_per_month_df = name_by_movie_aggregate_df[name_by_movie_aggregate_df['p_value']<alpha].groupby('month').apply(lambda x: pd.Series ({
    'avg_slope_change_significant_per_month': x['slope_change'].mean(),
    'se_slope_change_significant_per_month': x['slope_change'].sem(),
    'avg_mag_slope_change_significant_per_month': x['slope_change'].abs().mean(),
    'se_mag_slope_change_significant_per_month': x['slope_change'].abs().sem()
}))

display(influence_per_month_df.sample(2))

Unnamed: 0_level_0,avg_slope_change_significant_per_month,se_slope_change_significant_per_month,avg_mag_slope_change_significant_per_month,se_mag_slope_change_significant_per_month
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2.0,0.0022,0.000877,0.014785,0.000755
11.0,0.000701,0.000809,0.014428,0.000677


In [201]:
fig = go.Figure([
    go.Scatter(
        name='All film infuence',
        x=influence_per_month_df.index,
        y=influence_per_month_df['avg_mag_slope_change_significant_per_month'],
        mode='lines',
        line=dict(color='rgb(31, 119, 180)'),
    ),
    go.Scatter(
        name='Upper Bound',
        x=influence_per_month_df.index,
        y=influence_per_month_df['avg_mag_slope_change_significant_per_month']+1.96*influence_per_month_df['se_mag_slope_change_significant_per_month'],
        mode='lines',
        marker=dict(color="#444"),
        line=dict(width=0),
        showlegend=False
    ),
    go.Scatter(
        name='Lower Bound',
        x=influence_per_month_df.index,
        y=influence_per_month_df['avg_mag_slope_change_significant_per_month']-1.96*influence_per_month_df['se_mag_slope_change_significant_per_month'],
        marker=dict(color="#444"),
        line=dict(width=0),
        mode='lines',
        fillcolor='rgba(68, 68, 68, 0.3)',
        fill='tonexty',
        showlegend=False
    )
])
fig.update_layout(
    yaxis_title='Average magnitude influence',
    title='Average general influence of films per month',
    hovermode="x"
)
fig.show()

Now we look at the variation of magnitude influence per year across season.

In [202]:
# Create the main figure
fig = go.Figure()

# Define the main line plot for each season
seasons = ['Summer', 'Fall', 'Winter', 'Spring']
colors = ['red', 'orange', 'blue', 'green']
data = [summer_movies_df_sorted, fall_movies_df_sorted, winter_movies_df_sorted, spring_movies_df_sorted]

for i, season in enumerate(seasons):
    main_trace = go.Scatter(
        x=data[i]['year'],  
        y=data[i]['avg'],  
        mode='lines+markers',
        line_shape='linear',
        name=season,
        line=dict(color=colors[i]),
        legendgroup=season,
        #visible=(season == 'Summer')
    )
    
    # Add the main line trace to the figure
    fig.add_trace(main_trace)
    
    # Calculate confidence interval data
    lower_ci = data[i]['avg'] - 1.96*data[i]['se'] 
    upper_ci = data[i]['avg'] + 1.96*data[i]['se']
    
    # Add the trace for confidence interval
    ci_trace = go.Scatter(
        x=data[i]['year'],
        y=upper_ci,
        mode='lines',
        line=dict(color=colors[i], width=0),
        name=f'{season} 95% CI',
        showlegend=False,
        legendgroup=season,
        #visible=(season == 'Summer')
    )
    
    fig.add_trace(ci_trace)
    
    # Add the filled area between the main line and confidence interval
    fig.add_trace(go.Scatter(
        x=data[i]['year'],
        y=lower_ci,
        mode='lines',
        line=dict(color=colors[i], width=0),
        name=f'{season} 95% CI',
        fill='tonexty',
        #fillcolor=f'rgba{((colors[i]), 0.2)}',
        #showlegend=False,
        #legendgroup=season
        #visible=(season == 'Summer')
    ))

# Update the layout
fig.update_layout(
    title='Evolution of movies influence baby on names per season of release',
    xaxis_title='Year',
    yaxis_title='Average magnitude influence',
    #yaxis=dict(type="log"),
    xaxis=dict(range=[1900, 2020])
)


# Show the figure
fig.show()


### Influence magnitude for each genre : timeline
A question we can further dive into might be the possible covariates of movie genre on the month of release concerning the influence of moving on baby naming.

Plot the mean magnitude slope change for each month (January - Decembre) for the 10 most represented movie genres in the dataset

In [149]:
# First, aggregate dataframe with p_value with dataframe containing movie genre
movie_genre_aggregate_df = name_by_movie_df.merge(movie_genres_df, how='left', left_on='wiki_ID', right_on='wiki_ID').copy(deep=True)
movie_genre_aggregate_df.reset_index(inplace=True)

In [175]:
# Merge dataframe so that we have the genre of a movie, 
# with its month of release and p_value/slope change
movie_genre_caracteristics_aggregate_df = movie_genre_aggregate_df.merge(movie_df, how='left', left_on='wiki_ID', right_on='wiki_ID')
display(movie_genre_caracteristics_aggregate_df.sample(2))

Unnamed: 0,wiki_ID,char_words,order,gender,t_stat,p_value,slope_change,genre,mov_name,year,month,revenue,numVotes,averageRating
316791,3524655,Peter,2.0,M,-0.990856,0.343052,0.002462,Supernatural,Scoop,2006,7.0,39215642.0,86681,6.6
173047,1164544,Ginny,5.0,F,-0.972006,0.351936,0.00011,Psychological thriller,Identity,2003,4.0,90259536.0,261937,7.3


In [176]:
# Select 5 most represented movie genre in ths dataset
most_representative_genre = movie_genres_df['genre'].value_counts().nlargest(5).index
display(most_representative_genre)


Index(['Drama', 'Comedy', 'Romance Film', 'Thriller', 'Action'], dtype='object', name='genre')

In [177]:
influence_per_month_per_genre_df = movie_genre_caracteristics_aggregate_df[(movie_genre_caracteristics_aggregate_df['p_value']<alpha) & (movie_genre_aggregate_df['genre'].isin(most_representative_genre))].groupby(['genre','month']).apply(lambda x: pd.Series ({
    'avg_slope_change_significant_per_month': x['slope_change'].mean(),
    'se_slope_change_significant_per_month': x['slope_change'].sem(),
    'avg_mag_slope_change_significant_per_month': x['slope_change'].abs().mean(),
    'se_mag_slope_change_significant_per_month': x['slope_change'].abs().sem()
}))

display(influence_per_month_per_genre_df.sample(2))

Unnamed: 0_level_0,Unnamed: 1_level_0,avg_slope_change_significant_per_month,se_slope_change_significant_per_month,avg_mag_slope_change_significant_per_month,se_mag_slope_change_significant_per_month
genre,month,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Action,4.0,0.001943,0.001682,0.012363,0.001469
Comedy,7.0,0.002748,0.001404,0.014329,0.00122


In [179]:
# Create the main figure
fig = go.Figure()

# Define the main line plot for each season
genres = most_representative_genre
colors = ['brown', 'lightgreen', 'pink', 'green', 'blue']
#data = [summer_movies_df_sorted, fall_movies_df_sorted, winter_movies_df_sorted, spring_movies_df_sorted]
influence_per_month_per_genre_df.reset_index(inplace=True)


for i, season in enumerate(genres):
    data = influence_per_month_per_genre_df[influence_per_month_per_genre_df['genre'] == season]
    main_trace = go.Scatter(
        x=data['month'],  # Replace with the actual data
        y=data['avg_mag_slope_change_significant_per_month'],  # Replace with the actual data
        mode='lines+markers',
        line_shape='linear',
        name=season,
        line=dict(color=colors[i]),
        legendgroup=season,
        #visible=(season == 'Summer')
    )
    
    # Add the main line trace to the figure
    fig.add_trace(main_trace)
    
    # Calculate confidence interval data
    lower_ci = data['avg_mag_slope_change_significant_per_month'] - 1.96*data['se_mag_slope_change_significant_per_month']  # Replace with the actual data
    upper_ci = data['avg_mag_slope_change_significant_per_month'] + 1.96*data['se_mag_slope_change_significant_per_month']  # Replace with the actual data
    
    # Add the trace for confidence interval
    ci_trace = go.Scatter(
        x=data['month'],  # Replace with the actual data
        y=upper_ci,
        mode='lines',
        line=dict(color=colors[i], width=0),
        name=f'{season} 95% CI',
        showlegend=False,
        legendgroup=season,
        #visible=(season == 'Summer')
    )
    
    fig.add_trace(ci_trace)
    
    # Add the filled area between the main line and confidence interval
    fig.add_trace(go.Scatter(
        x=data['month'],  # Replace with the actual data
        y=lower_ci,
        mode='lines',
        line=dict(color=colors[i], width=0),
        name=f'{season} 95% CI',
        fill='tonexty',
        # fillcolor=f'rgba{((colors[i]), 0.2)}',  # Adjust the transparency as needed
        #showlegend=False,
        legendgroup=season
        #visible=(season == 'Summer')
    ))

# Update the layout
fig.update_layout(
    title='Monthly evolution of movie genre influence',
    xaxis_title='Months',
    yaxis_title='Average magnitude influence'
)

# Show the figure
fig.show()


# Question 2 :

# Question 3 : Effect of movie's popularity on baby naming

We have p_value and slope_change <br>
p_value : represent how much we are certain about whether there is a variation or not <br>
slope_change : represent the intensity of the variation <br>
t_stat : we don't care, there it the slope change for that <br>

problem 1 : older movies have less ratings and there are less old movies than recent movies. Therefore we have to take into account the period. <br>
solution : proximity matching on the release year. Don't match characters of the same movie!!

Step 1 : aggregate the two dataframe to create a dataframe containing all the movie character with the relative information about their movie

In [None]:
# add movie genre to the character dataframe
name_by_movie_aggregate_genre_df = name_by_movie_aggregate_df.merge(movie_genres_df, how='left', on='wiki_ID').copy(deep=True)
display(name_by_movie_aggregate_genre_df.sample(2))

In [None]:
# make a new unique index for each character
name_by_movie_aggregate_genre_df.set_index(pd.Index(list(range(0,len(name_by_movie_aggregate_genre_df)))), inplace=True)
display(name_by_movie_aggregate_genre_df.sample(2))

In [None]:
# keep only the rows with a p_value < 0.05
name_by_movie_aggregate_genre_df = name_by_movie_aggregate_genre_df[name_by_movie_aggregate_genre_df['p_value'] < 0.05].copy(deep=True)

In [None]:
name_by_movie_aggregate_genre_df.dropna(subset=['order', 'numVotes', 'p_value', 'gender'], inplace=True)
name_by_movie_aggregate_genre_df['order'] = name_by_movie_aggregate_genre_df['order'].astype(int)

We need to encode the columns `gender` and `genre` because the KNN algorithm doesn't accept string values.

In [None]:
# add a new column in name_by_movie_aggregate_genre_df that encode the movie genre
name_by_movie_aggregate_genre_df['genre'] = name_by_movie_aggregate_genre_df['genre'].astype('category')
name_by_movie_aggregate_genre_df['genre_code'] = name_by_movie_aggregate_genre_df['genre'].cat.codes

# add a new column in name_by_movie_aggregate_genre_df that encode the gender column
name_by_movie_aggregate_genre_df['gender'] = name_by_movie_aggregate_genre_df['gender'].astype('category')
name_by_movie_aggregate_genre_df['gender_code'] = name_by_movie_aggregate_genre_df['gender'].cat.codes

display(name_by_movie_aggregate_genre_df.sample(2))

Step 2 : create treatment and control group with a separation of the character based on the median of the `numVotes` values

In [None]:
# separate the name_by_movie_aggregate_genre_df into two dataframes, one control and one treatment based on whether the attribute 'numVotes' is higher or lower than the median.
q3_control_pop_df = name_by_movie_aggregate_genre_df[name_by_movie_aggregate_genre_df['numVotes'] < 500]
q3_treatment_pop_df = name_by_movie_aggregate_genre_df[name_by_movie_aggregate_genre_df['numVotes'] >= 100000]

# add a column to each dataframe that contains whether the character is in the control or treatment population
q3_control_pop_df['is_treated'] = 0
q3_treatment_pop_df['is_treated'] = 1

display(q3_control_pop_df.sample(2))
print(f"Length of control population: {len(q3_control_pop_df)}")
display(q3_treatment_pop_df.sample(2))
print(f"Length of treatment population: {len(q3_treatment_pop_df)}")


Step 3 : matching

In [None]:
# Columns to use for matching
matching_columns = ['order', 'year', 'genre_code', 'gender_code']
# matching_columns = ['order', 'year', 'gender_code']

# Initialize a list to store the matched pairs
matched_pairs = []

# Iterate through each row in the control dataframe
for control_index, control_row in q3_control_pop_df.iterrows():
    # Filter the treatment dataframe based on the matching columns
    matching_rows = q3_treatment_pop_df[
        (q3_treatment_pop_df[matching_columns] == control_row[matching_columns]).all(axis=1)
    ]

    # Check if there is a match
    if not matching_rows.empty:
        # Store the index of the matched pair
        treatment_index = matching_rows.index[0]
        matched_pairs.append((control_index, treatment_index))

# Display the matched pairs
print("Matched Pairs:")
print(matched_pairs) # control_index, treatment_index


In [None]:
# create a graph in order to avoid having the same index in two different pairs
G = nx.Graph()

for pair in matched_pairs:
    G.add_edge(pair[0], pair[1])

# Compute the best matching
matching = nx.maximal_matching(G)

print(f"number of matched pairs: {len(matching)}")

In [None]:
index_1 = [i[0] for i in list(matching)]
index_2 = [i[1] for i in list(matching)]
print(index_1)
print(index_2)

In [None]:
# check the matching for a specific pair
check_index = 200

display(q3_control_pop_df.loc[index_1[check_index]])
display(q3_treatment_pop_df.loc[index_2[check_index]])

In [None]:
# create the matched dataframe
matched_control_df = q3_control_pop_df.loc[index_1].copy(deep=True)
matched_treatment_df = q3_treatment_pop_df.loc[index_2].copy(deep=True)

# concatenate the two dataframes
matched_df = pd.concat([matched_control_df, matched_treatment_df], axis=0)
display(matched_df.sample(2))

In [None]:
# Sample data
# treatment_control = np.array([0, 1, 0 ,1])  # 0 represents control, 1 represents treatment
# y_values = np.array([10, 20, 5, 15])  # Replace with your actual continuous y-values
treatment_control = matched_df['is_treated'].values
y_values = abs(matched_df['slope_change'].values)
print(f"")

# Reshape the data to meet the requirements of scikit-learn
X = treatment_control.reshape(-1, 1)
y = y_values.reshape(-1, 1)

# Create a linear regression model
model = LinearRegression()

# Fit the model to the data
model.fit(X, y)

# Predict y-values based on the model
y_pred = model.predict(X)

# Plot the original data points
plt.scatter(treatment_control, y_values, color='blue', label='Actual Data')

# Plot the regression line
plt.plot(treatment_control, y_pred, color='red', linewidth=2, label='Regression Line')

# Add labels and legend
plt.xlabel('Treatment or Control')
plt.ylabel('Continuous Y Values')
plt.legend()
plt.yscale('log')

# Show the plot
plt.show()

slope = model.coef_[0, 0]
print(f"slope = {slope}")

In [None]:
name_by_movie_df

# Question 4 : Character importance in the film
In this section, we try to answer the question whether the importance of the character in the movie plays a role on the influence on baby names.

First, let's plot the number of character played in a year with significant impact with respect to the importance in movie.

In [203]:
# count the number of significant characters per movie
signi_count_per_order = name_by_movie_df[name_by_movie_df['p_value'] <= 0.05].reset_index().groupby(['order'])['wiki_ID'].count().copy(deep=True)

In [204]:
# Create a line plot using Plotly
fig = go.Figure()

fig.add_trace(go.Scatter(
    x=signi_count_per_order.index,
    y=signi_count_per_order.values,
    mode='lines',
    name='Significant Impact',
    line=dict(color='blue')  # You can customize the line color
))

# Add labels and title
fig.update_layout(
    title='Significant impact relative to character importance',
    xaxis=dict(title='Order'),
    yaxis=dict(title='Count', type='log')
)

# Show the plot
fig.show()

With this first inspection, we can see that the greater the importance of characters in the film (i.e. small order), the greater the number of characters having a significant impact on the names given to newborns. 
This result is in line with what we might intuitively expect, that characters make more of an impression on the viewer than characters who appear only infrequently in the film.

We will consider only the first 15 order of importance.

In [205]:
# remove the rows with order higher than 15
name_by_movie_15_df = name_by_movie_df[name_by_movie_df['order'] <= 15].copy(deep=True)

Let's see the proportion of significant character to avoid being biased by the number of movies in each categories.

In [206]:
# compute the percentage of significant characters per order
signi_count_per_order = name_by_movie_15_df[name_by_movie_15_df['p_value'] <= 0.05].reset_index().groupby(['order'])['wiki_ID'].count().copy(deep=True)
count_per_order = name_by_movie_15_df.reset_index().groupby(['order'])['wiki_ID'].count().copy(deep=True)
percentage_per_order = (signi_count_per_order / count_per_order)*100
percentage_per_order = percentage_per_order.fillna(0)

# keep only the 15 first orders
percentage_per_order = percentage_per_order[:15]

In [207]:
# Plotting with Plotly
fig = px.bar(x=percentage_per_order.index, y=percentage_per_order.values,
             labels={'x': 'Order', 'y': 'Percentage of significant impact'},
             title='Character name impact per order',
             category_orders={'x': percentage_per_order.index})

# Set Y-axis range to highlight small differences
fig.update_layout(yaxis=dict(range=[9, 11.8]))

# Show the plot
fig.show()

Looking at the percentage of characters with a significant impact within each character order allows us to identify which role type has the greatest number of character names influencing the names of newborns.

Intesting, but let's see the proportion confidence intervals

In [208]:
# add a column in name_by_movie_15_df that contains a one if the variation is significant and zero otherwise
name_by_movie_15_df['is_significant'] = np.where(name_by_movie_15_df['p_value'] <= 0.05, 1, 0)

display(name_by_movie_15_df.sample(2))

Unnamed: 0_level_0,char_words,order,gender,t_stat,p_value,slope_change,is_significant
wiki_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
31012555,Tokie,2.0,F,,,0.0,0
20700027,Willy,4.0,M,0.798738,0.441347,-0.0001,0


In [209]:
def significant_proportion_and_ci(data):

    p = data['is_significant'].mean()
    se = np.sqrt(p * (1 - p) / len(data))

    # Compute confidence interval of 95%, using the mean and standard error
    ci_lower = p - 1.96 * se
    ci_upper = p + 1.96 * se

    # ************************ Alternative way to compute the standard error bis ************************
    # se = st.sem(data['is_significant'])
    # ci_lower = p - 1.96 * se
    # ci_upper = p + 1.96 * se
    # ************************ Alternative way to compute the standard error bis ************************

    # ************************ Alternative way to compute the standard error bis ************************
    # nb_significant = data['is_significant'].sum()
    # total = len(data)
    # (ci_lower, ci_upper) = sm.stats.proportion_confint(nb_significant, total, alpha=0.05)
    # ************************ Alternative way to compute the standard error bis ************************

    # Useful because we will put this in the dataframe
    return pd.Series({
        'mean': p,
        'ci_lower': ci_lower,
        'ci_upper': ci_upper
    })

ci_df = name_by_movie_15_df.groupby(['order']).apply(significant_proportion_and_ci).reset_index()

In [210]:
# Plotting with Plotly
fig = px.bar(ci_df, x='order', y='mean',
             labels={'x': 'Order', 'y': 'Percentage of significant impact'},
             title='Character name impact per order',
             error_y='ci_upper',
             error_y_minus='ci_lower',)

# Show the plot
fig.show()

The intervals are far too large to deduce anything

## Study positive and negative variation

Among the character who played in a year with significant variation on the baby names, let's see the positive and negative variation's mean to compare.

In [211]:
name_by_movie_significant = name_by_movie_15_df[name_by_movie_15_df['is_significant'] == 1].copy(deep=True)

In [239]:
# Create a new column for positive and negative scores
name_by_movie_significant['slope_change_sign'] = name_by_movie_significant['slope_change'].apply(lambda x: 'negative' if x < 0 else 'positive')

# Group by 'order' and 'slope_change_sign', then calculate the mean
grouped_df = name_by_movie_significant.groupby(['order', 'slope_change_sign'])['slope_change'].mean().reset_index()

# Pivot the table for better plotting
pivot_df = grouped_df.pivot(index='order', columns='slope_change_sign', values='slope_change').reset_index()

# Group by 'order' and 'score_type', then calculate the se
se_df = name_by_movie_significant.groupby(['order', 'slope_change_sign'])['slope_change'].sem().reset_index()

# Add the se to the pivot table
pivot_df['neg_se'] = se_df[se_df['slope_change_sign'] == 'negative']['slope_change'].values
pivot_df['pos_se'] = se_df[se_df['slope_change_sign'] == 'positive']['slope_change'].values
display(pivot_df.sample(2))

slope_change_sign,order,negative,positive,neg_se,pos_se
2,2.0,-0.026332,0.010672,0.001708,0.000526
8,8.0,-0.023375,0.009195,0.003118,0.000515


In [240]:
# Plotting with Plotly
fig = px.bar(pivot_df, x='order', y='negative',
             labels={'negative': 'Mean Slope Change', 'order': 'Order'},
             title='Mean Negative and Positive Slope Change by Order',
             height=600,
             error_y='neg_se')

# Update existing traces to show the legend for the color blue
fig.update_traces(marker=dict(color='blue'), selector=dict(type='bar'), showlegend=True, name='Negative')

# Add positive bars with their error values
fig.add_bar(x=pivot_df['order'], y=pivot_df['positive'], 
            error_y=dict(type='data', array=pivot_df['pos_se']),
            name='Positive',
            marker_color='red')

# Show the plot
fig.show()

We note that the average slope change per order for negative impacts has a greater magnitude than for positive impacts. However, the proportion of negatively impacting feature names represents less than a quarter of all significant slope change.
For positive slope changes, we note that for the first 5 orders, the influence tends to diminish the less important the character's role. 
This could be intuitively explained by the fact that the less important a character is, the less he or she will influence the audience.

## Magnitude of the variation

We can express the magnitude of the variation by computing the difference of slop before and after the movie release. These differences are already available in the `name_by_movie` dataframe on the column `slope_change`. Let's plot the mean slope change for each order.

In [214]:
alpha = 0.05 # Significance level

name_by_movie_magnitude = name_by_movie_df.copy(deep=True)

# Keep only the significant characters
name_by_movie_magnitude = name_by_movie_magnitude[name_by_movie_magnitude['p_value'] <= alpha]

# Keep only the first 30 orders
name_by_movie_magnitude = name_by_movie_magnitude[name_by_movie_magnitude['order'] <= 20]

# Compute the absolute mean slope change
name_by_movie_magnitude['abs_slope_change'] = name_by_movie_magnitude['slope_change'].abs()
abs_mean_slope_change = name_by_movie_magnitude.groupby("order")['abs_slope_change'].mean()

# Compute the standard error of absolute slope change
abs_slope_change_se = name_by_movie_magnitude.groupby("order")['abs_slope_change'].sem()

# Create a dataframe with the mean and standard error
slope_change_mean_se_df = pd.DataFrame({'abs_mean_slope_change': abs_mean_slope_change, 'abs_slope_change_se': abs_slope_change_se})
display(slope_change_mean_se_df.sample(2))

Unnamed: 0_level_0,abs_mean_slope_change,abs_slope_change_se
order,Unnamed: 1_level_1,Unnamed: 2_level_1
9.0,0.012948,0.000972
16.0,0.012571,0.002503


In [215]:
# Création du bar chart interactif avec sous-graphiques
fig = go.Figure()

# Tracé pour avg magnitude slope change avec erreur
fig.add_trace(go.Bar(
    x=slope_change_mean_se_df.index,
    y=slope_change_mean_se_df['abs_mean_slope_change'],
    name='Avg Magnitude Slope Change',
    marker_color='orange',
    error_y=dict(
        type='data',
        array=slope_change_mean_se_df['abs_slope_change_se'],
        visible=True
    )
))

# Set the y-axis range
y_range = [0, 0.02]

# Mise en forme du tracé
fig.update_layout(
    xaxis=dict(title='Character Order'),
    yaxis=dict(title='Slope Change Magnitude', range=y_range),
    barmode='stack'
)

fig.show()

The magnitude of the slope change tends to decrease for the first 5 orders. This reflects the trend of positive slope changes, which are proportionately in the majority for each order.

## Subquestion : Does the order of a name influence differently according to gender ?

In [258]:
name_by_order_by_gender_prop_df = name_by_movie_df.groupby(['order','gender']).apply(lambda x: pd.Series({
        'prop_signif_per_order_per_genre': (x['p_value'] < alpha).sum()/len(x['p_value']),
        'avg_slope_change_significant': x[x['p_value'] < alpha]['slope_change'].mean(),
        'avg_slope_change_global': x['slope_change'].mean(),
        'avg_magnitude_slope_change_significant': x[x['p_value'] < alpha]['slope_change'].abs().mean(),
        'avg_magnitude_slope_change_global': x['slope_change'].abs().mean(),
        'total_number_signif_per_order_per_genre': (x['p_value'] < alpha).sum(),
        'se_slope_change_magnitude_significant': x[x['p_value'] < alpha]['slope_change'].abs().sem()
    }))
display(name_by_order_by_gender_prop_df.sample(2))

Unnamed: 0_level_0,Unnamed: 1_level_0,prop_signif_per_order_per_genre,avg_slope_change_significant,avg_slope_change_global,avg_magnitude_slope_change_significant,avg_magnitude_slope_change_global,total_number_signif_per_order_per_genre,se_slope_change_magnitude_significant
order,gender,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
4.0,F,0.1142,0.002783,0.000328,0.011912,0.003007,419.0,0.000875
16.0,F,0.112219,0.00299,0.000389,0.006913,0.001968,45.0,0.001063


In [259]:
# Keep only the first 15 orders
name_by_order_by_gender_prop_df.reset_index(inplace=True)
name_by_order_by_gender_prop_df = name_by_order_by_gender_prop_df[name_by_order_by_gender_prop_df['order'] <= 15].copy(deep=True)
name_by_order_by_gender_prop_df.set_index(['order', 'gender'], inplace=True)
display(name_by_order_by_gender_prop_df.sample(2))

Unnamed: 0_level_0,Unnamed: 1_level_0,prop_signif_per_order_per_genre,avg_slope_change_significant,avg_slope_change_global,avg_magnitude_slope_change_significant,avg_magnitude_slope_change_global,total_number_signif_per_order_per_genre,se_slope_change_magnitude_significant
order,gender,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2.0,M,0.101472,0.003103,0.000413,0.012551,0.002687,827.0,0.000684
12.0,F,0.108051,0.007333,0.000854,0.013617,0.002825,102.0,0.002237


In [260]:
order_values = name_by_order_by_gender_prop_df.index.get_level_values('order').unique()

df_reset = name_by_order_by_gender_prop_df.reset_index()

df_reset['gender'] = df_reset['gender'].replace({'M': 'Male', 'F': 'Female'})

# Separate Male and female for the plot
genders_to_plot = ['Female', 'Male']

# Set the scatter marker size and separation
marker_size = 10  # Adjust as needed
marker_separation = 0.2  # Adjust as needed

# Create traces for each gender
traces = []
for i, gender in enumerate(genders_to_plot):
    # Extract data for the current gender
    gender_data = df_reset[df_reset['gender'] == gender]
    
    # Calculate the offset for side-by-side placement
    offset = (i - 0.5) * marker_separation
    # offset = 0
    
    # Create a scatter trace with error bars
    trace = go.Scatter(
        name=gender,
        x=order_values + offset,
        y=gender_data['avg_magnitude_slope_change_significant'],
        mode='markers',
        marker=dict(size=marker_size),
        error_y=dict(
            type='data',
            array=gender_data['se_slope_change_magnitude_significant'],
            visible=True
        ),
        hoverinfo='y+text',
        text=gender_data['prop_signif_per_order_per_genre'].apply(lambda x: f'Proportion of significant: {x:.2%}'),
        opacity=0.75
    )
    
    traces.append(trace)

fig = go.Figure(data=traces)

# Add layout for better visualization
fig.update_layout(
    xaxis=dict(title='Character Order', tickmode='array', tickvals=order_values, ticktext=order_values),
    yaxis=dict(title='Slope Change'),
    title='Average Magnitude of Significant Slope Change for each order and each Gender'
)

fig.show()

# Question 5 : Is it possible to differentiate character influence between its gender?

In [261]:
name_by_order_by_gender_prop_df_year = name_by_movie_aggregate_df.groupby(['year','gender']).apply(lambda x: pd.Series({
        'prop_signif_per_order_per_gender': (x['p_value'] < alpha).sum()/len(x['p_value']),
        'avg_slope_change_significant': x[x['p_value'] < alpha]['slope_change'].mean(),
        'avg_slope_change_global': x['slope_change'].mean(),
        'avg_magnitude_slope_change_significant': x[x['p_value'] < alpha]['slope_change'].abs().mean(),
        'avg_magnitude_slope_change_global': x['slope_change'].abs().mean(),
        'total_number_signif_per_order_per_genre': (x['p_value'] < alpha).sum(),
    }))
display(name_by_order_by_gender_prop_df_year.sample(2))

Unnamed: 0_level_0,Unnamed: 1_level_0,prop_signif_per_order_per_gender,avg_slope_change_significant,avg_slope_change_global,avg_magnitude_slope_change_significant,avg_magnitude_slope_change_global,total_number_signif_per_order_per_genre
year,gender,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1948,M,0.08,-0.013181,-0.00166,0.02523,0.003416,36.0
1927,M,0.027273,0.00785,-0.000725,0.023061,0.002647,3.0


In [262]:
#Plot the average magnitude for significant slope change for Male and Female over years
df_reset = name_by_order_by_gender_prop_df_year.reset_index()

df_reset['gender'] = df_reset['gender'].replace({'M': 'Male', 'F': 'Female'})

# Filter the data for years between 1960 and 2009
df_filtered = df_reset[(df_reset['year'] >= 1960) & (df_reset['year'] <= 2009)]

# Get the unique years from the filtered DataFrame
order_values = df_filtered['year'].unique()

# Select the genders you want to plot
genders_to_plot = ['Female', 'Male']


fig = go.Figure()

# Define the main line plot for each gender using the filtered data
colors = ['red', 'blue']

for i, gender in enumerate(genders_to_plot):
    y_values = df_filtered[df_filtered['gender'] == gender]['avg_magnitude_slope_change_significant']
    error_y_values = y_values.sem()

    main_trace = go.Scatter(
        x=order_values,
        y=y_values,
        mode='lines+markers',
        name=gender,
        line=dict(color=colors[i]),
        legendgroup=gender,
    )

    # Add the main line trace to the figure
    fig.add_trace(main_trace)

    # Calculate confidence interval data
    lower_ci = y_values - 1.96 * error_y_values
    upper_ci = y_values + 1.96 * error_y_values

    # Add the trace for confidence interval
    ci_trace = go.Scatter(
        x=order_values,
        y=upper_ci,
        mode='lines',
        line=dict(color=colors[i], width=0),
        name=f'{gender} 95% CI',
        showlegend=False,
        legendgroup=gender,
    )

    fig.add_trace(ci_trace)

    # Add the filled area between the main line and confidence interval
    fig.add_trace(go.Scatter(
        x=order_values,
        y=lower_ci,
        mode='lines',
        line=dict(color=colors[i], width=0),
        name=f'{gender} 95% CI',
        fill='tonexty',
    ))

# Update the layout
fig.update_layout(
    title='Average Magnitude of Significant Slope Change for year order and each Gender',
    xaxis_title='Year',
    yaxis_title='Slope Change',
)


fig.show()

In [263]:
name_by_movie_df['slope_change_abs'] = name_by_movie_df['slope_change'].abs()
male_name_by_movie_df = name_by_movie_df[(name_by_movie_df['gender'] == 'M') & (name_by_movie_df['p_value'] <= 0.05)].copy(deep=True)
female_name_by_movie_df = name_by_movie_df[(name_by_movie_df['gender'] == 'F') & (name_by_movie_df['p_value'] <= 0.05)].copy(deep=True)

In [264]:
# Function to compute proportion of significant values and corresponding standard error for confidence interval
def q5_prop_and_ci(data):
    proportion = data['slope_change_abs'].mean()
    se = st.sem(data['slope_change_abs'])

    ci_upper = proportion + 1.96*se
    ci_lower = proportion - 1.96*se
    return pd.Series({
        'proportion': proportion,
        'ci_lower': ci_lower,
        'ci_upper': ci_upper
    })

prop_male = q5_prop_and_ci(male_name_by_movie_df)
prop_female = q5_prop_and_ci(female_name_by_movie_df)

In [265]:
# Create a Plotly figure
fig = go.Figure()

genders = ['Male', 'Female']
colors = ['#4361ee', '#FC8EAC']

for gender, color in zip(genders, colors):
    prop_data = locals()[f'prop_{gender.lower()}']  # Access the variable dynamically
    fig.add_trace(go.Scatter(
        x=[gender],
        y=[prop_data['proportion']],
        error_y=dict(type='data', array=[prop_data['proportion'] - prop_data['ci_lower'], prop_data['ci_upper'] - prop_data['proportion']]),
        mode='markers+lines',
        name=gender,
        marker=dict(color=color, size=10)
    ))

# Update layout for better visualization
fig.update_layout(
    yaxis=dict(title='Slope Change Magnitude'),
    title='Variation magnitude per gender'
)
# Update layout to adjust the range of the x-axis
fig.update_layout(
    xaxis=dict(range=[-0.5, len(genders) - 0.5]),  # Adjust the range based on your data
)
# Show the plot
fig.show()