In [None]:
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt

## Import processed data

In [None]:
# Folder that holds every processed CSV used in this notebook.
folder_processed_data_path = './data/processed_data/'

# Movies, keyed by their wiki_ID.
movie_csv_path = os.path.join(folder_processed_data_path, 'movie_df.csv')
movie_df = pd.read_csv(movie_csv_path).set_index(['wiki_ID'])
# Report whether the index is unique
print(f"Is the indexing unique in movie_df ? {movie_df.index.is_unique}")

# Character names per movie; this frame keeps the default positional index.
name_by_movie_csv_path = os.path.join(folder_processed_data_path, 'name_by_movie_ordered_df.csv')
name_by_movie_ordered_df = pd.read_csv(name_by_movie_csv_path)
# Report whether the index is unique
print(f"Is the indexing unique in name_by_movie_df ? {name_by_movie_ordered_df.index.is_unique}")

# Baby-name statistics, keyed by the (name, year) pair.
baby_name_csv_path = os.path.join(folder_processed_data_path, 'baby_name_df.csv')
baby_name_df = pd.read_csv(baby_name_csv_path).set_index(['name', 'year'])
# Report whether the index is unique
print(f"Is the indexing unique in baby_name_df ? {baby_name_df.index.is_unique}")

# Preview the first rows of each frame.
print("movie_df :")
display(movie_df.head())
print("name_by_movie_ordered_df :")
display(name_by_movie_ordered_df.head())
print("baby_name_df :")
display(baby_name_df.head())

Choices :

In [None]:
# Rows of movie_df whose title is exactly 'The Godfather'.
movie_df[movie_df['mov_name'] == 'The Godfather']

In [None]:
# Character-name rows attached to wiki_ID 2466773.
name_by_movie_ordered_df.loc[name_by_movie_ordered_df['wiki_ID'].eq(2466773)]

In [None]:
# Number of years kept on each side of the release year.
nb_year_before = 10
nb_year_after = 5

# Movie / character-name pair studied in the following cells.
chosen_name = 'Vito'
chosen_movie_ID = 2466773

print(f"The chosen movie is {movie_df.loc[chosen_movie_ID, 'mov_name']} in {movie_df.loc[chosen_movie_ID, 'release']}")

In [None]:
# Select every row of the chosen name (leaving 'year' as the index), ordered by year.
chosen_baby_name_df = (
    baby_name_df
    .loc[chosen_name]
    .sort_values(by='year')
)
display(chosen_baby_name_df.head(5))

In [None]:
# get the release date
release_year = movie_df.loc[chosen_movie_ID]['release']
print(f"release year = {release_year}")

In [None]:
# The index of chosen_baby_name_df is compared against the release year to
# build one boolean mask per window (both bounds inclusive).
years = chosen_baby_name_df.index
first_before_year = release_year - nb_year_before
last_after_year = release_year + nb_year_after - 1

in_before_window = (years >= first_before_year) & (years <= release_year - 1)
in_after_window = (years >= release_year) & (years <= last_after_year)

# reset_index turns the year index back into a regular column.
before_df = chosen_baby_name_df[in_before_window].reset_index()
after_df = chosen_baby_name_df[in_after_window].reset_index()
display(before_df)
display(after_df)

In [None]:
# Stack both windows so the whole period is drawn as a single curve.
union_before_after_df = pd.concat([before_df, after_df], ignore_index=True)

x_values = union_before_after_df['year'].to_numpy()
y_values = union_before_after_df['percentage'].to_numpy()

# Line plot of the percentage column against the year column.
fig, ax = plt.subplots()
ax.plot(x_values, y_values)
ax.set_xlabel('Years')
ax.set_ylabel('Percentage of total births')
ax.set_title(f'Given name {chosen_name} with the release of the movie {movie_df.loc[chosen_movie_ID].mov_name}')

# Dashed vertical marker at the release year of the chosen movie.
ax.axvline(x=release_year, color='r', linestyle='--', label=f'x={release_year}')
ax.legend()

plt.show()

In [None]:
import statsmodels.formula.api as smf

# Linear trend of 'percentage' against 'year' on the pre-release window.
mod_before = smf.ols(formula='percentage ~ year', data=before_df)
np.random.seed(2)
res_before = mod_before.fit()
display(res_before.summary())

# Position 0 of the fitted parameters is read as the intercept, position 1 as the slope.
coefficients_before = res_before.params.values
intercept_before, beta_before = coefficients_before[0], coefficients_before[1]


# Same regression on the post-release window.
mod_after = smf.ols(formula='percentage ~ year', data=after_df)
res_after = mod_after.fit()
display(res_after.summary())

coefficients_after = res_after.params.values
intercept_after, beta_after = coefficients_after[0], coefficients_after[1]

# Slopes first, then intercepts.
print(f"beta_before = {beta_before}")
print(f"beta_after = {beta_after}")
print(f"intercept_before = {intercept_before}")
print(f"intercept_after = {intercept_after}")

In [None]:
# Stack both windows so the whole period is drawn as a single curve.
union_before_after_df = pd.concat([before_df, after_df], ignore_index=True)

x_values = union_before_after_df['year'].to_numpy()
y_values = union_before_after_df['percentage'].to_numpy()

# Line plot of the percentage column against the year column.
fig, ax = plt.subplots()
ax.plot(x_values, y_values)
ax.set_xlabel('Years')
ax.set_ylabel('Percentage of total births')
ax.set_title(f'Given name {chosen_name} with the release of the movie {movie_df.loc[chosen_movie_ID].mov_name}')

# Evaluate both fitted lines over the full year range (not only their own window).
reg_before = intercept_before + beta_before * x_values
reg_after = intercept_after + beta_after * x_values
ax.plot(x_values, reg_before, label=f'Before (slope={beta_before}, intercept={intercept_before})', color='red')
ax.plot(x_values, reg_after, label=f'After (slope={beta_after}, intercept={intercept_after})', color='blue')

# Dashed vertical marker at the release year of the chosen movie.
ax.axvline(x=release_year, color='r', linestyle='--', label=f'x={release_year}')
ax.legend()

plt.show()

In [None]:
### Testing
from scipy.stats import t

# Two-sided t-test on the difference between the pre-release slope
# (beta_before) and the post-release slope (beta_after).

# Standard errors of the two slopes. `bse` is a label-indexed Series, so the
# second entry is selected by position with .iloc; a bare `bse[1]` relies on
# pandas' deprecated integer-position fallback.
se_before = res_before.bse.iloc[1]  # Standard error for beta_before
se_after = res_after.bse.iloc[1]    # Standard error for beta_after

# t-statistic for the difference between the two independently estimated slopes
t_statistic = (beta_before - beta_after) / np.sqrt(se_before**2 + se_after**2)

# Degrees of freedom: total sample size minus the number of estimated
# parameters. Each regression estimates an intercept and a slope, so four
# parameters are removed in total (not two).
degree_freedom = len(before_df) + len(after_df) - 4

# Two-tailed p-value; the survival function avoids the precision loss of
# `1 - cdf` when the p-value is very small.
p_value = 2 * t.sf(np.abs(t_statistic), df=degree_freedom)

# Print the t-statistic and p-value
print("t-statistic:", t_statistic)
print("p-value:", p_value)

# Check if the difference is statistically significant at 5% level
if p_value < 0.05:
    print("The difference is statistically significant at 5% level.")
else:
    print("There is no significant difference between the coefficients.")

# Another way to do the t-test

In [None]:
import statsmodels.api as sm

# Same 'percentage' on 'year' regressions, this time through the array API:
# the constant column has to be added to the design matrix explicitly.
design_before = sm.add_constant(before_df['year'])
design_after = sm.add_constant(after_df['year'])

model_before = sm.OLS(before_df['percentage'], design_before).fit()
model_after = sm.OLS(after_df['percentage'], design_after).fit()

display(model_before.summary())
display(model_after.summary())

In [None]:
# Position 0 of the fitted parameters is read as the intercept, position 1 as the slope.
para_values_before = model_before.params.values
intercept_dany_before, beta_dany_before = para_values_before[0], para_values_before[1]

para_values_after = model_after.params.values
intercept_dany_after, beta_dany_after = para_values_after[0], para_values_after[1]

In [None]:
# Stack both windows so the whole period is drawn as a single curve.
union_before_after_df = pd.concat([before_df, after_df], ignore_index=True)

x_values = union_before_after_df['year'].to_numpy()
y_values = union_before_after_df['percentage'].to_numpy()

# Line plot of the percentage column against the year column.
fig, ax = plt.subplots()
ax.plot(x_values, y_values)
ax.set_xlabel('Years')
ax.set_ylabel('Percentage of total births')
ax.set_title(f'Given name {chosen_name} with the release of the movie {movie_df.loc[chosen_movie_ID].mov_name}')

# Evaluate both fitted lines over the full year range (not only their own window).
reg_before = intercept_dany_before + beta_dany_before * x_values
reg_after = intercept_dany_after + beta_dany_after * x_values
ax.plot(x_values, reg_before, label=f'Before (slope={beta_dany_before}, intercept={intercept_dany_before})', color='red')
ax.plot(x_values, reg_after, label=f'After (slope={beta_dany_after}, intercept={intercept_dany_after})', color='blue')

# Dashed vertical marker at the release year of the chosen movie.
ax.axvline(x=release_year, color='r', linestyle='--', label=f'x={release_year}')
ax.legend()

plt.show()

In [None]:
from scipy import stats

# Compare the slopes of the two regression models.
# Passing the two `params` vectors to ttest_ind would treat (intercept, slope)
# of each model as a two-point sample, which says nothing about slope
# equality. Instead, test the slope difference against its standard error.
slope_before = model_before.params.iloc[1]
slope_after = model_after.params.iloc[1]
slope_diff_se = np.sqrt(model_before.bse.iloc[1] ** 2 + model_after.bse.iloc[1] ** 2)

t_stat = (slope_before - slope_after) / slope_diff_se

# Two parameters are estimated per regression, hence the "- 4".
slope_test_df = len(before_df) + len(after_df) - 4
p_value = 2 * stats.t.sf(np.abs(t_stat), df=slope_test_df)

print("T-statistic:", t_stat)
print("P-value:",p_value)

# Linkai idea

In [None]:
from scipy import stats

# Differences between consecutive rows of 'percentage' (row i minus row i+1)
# inside each window.
pct_before = before_df['percentage']
pct_after = after_df['percentage']
before_diffs = pct_before.iloc[:-1] - pct_before.iloc[1:].to_numpy()
after_diffs = pct_after.iloc[:-1] - pct_after.iloc[1:].to_numpy()

# Independent two-sample t-test between the two sets of differences.
t_stat, p_value = stats.ttest_ind(before_diffs, after_diffs)

print("T-statistic:", t_stat)
print("P-value:",p_value)


It doesn't seem to work, so let's keep the other method.

# Apply function to compute the p-value

In [None]:
# First five rows of the character-name table.
name_by_movie_ordered_df.head(5)

In [None]:
# First five rows of the movie table.
movie_df.head(5)

Let's check whether there are wiki_IDs present in name_by_movie_df that are not in movie_df.

In [None]:
display(name_by_movie_ordered_df.head())

# One row per distinct wiki_ID referenced by name_by_movie_ordered_df.
name_by_movie_uniqueID = name_by_movie_ordered_df['wiki_ID'].unique()
uniqueID_in_name_by_movie_df = pd.DataFrame({'wiki_ID': name_by_movie_uniqueID})
display(uniqueID_in_name_by_movie_df.head())
display(movie_df.head())

print(f"len(uniqueID_in_name_by_movie_df) = {len(uniqueID_in_name_by_movie_df)}")
print(f"len(movie_df) = {len(movie_df)}")

# Outer merge on wiki_ID keeps the IDs found on either side; comparing its
# length with len(movie_df) shows whether the name table brings in extra IDs.
merged_df = movie_df.merge(uniqueID_in_name_by_movie_df, on='wiki_ID', how='outer')

print(f"len(merged_df) = {len(merged_df)}")

In [None]:
# import sys

# # Save the original sys.stdout
# original_stdout = sys.stdout

# # Specify the file path where you want to save the output
# output_file_path = 'output.txt'

# # Open the file in write mode (this will overwrite the file if it already exists)
# with open(output_file_path, 'w') as f:
#     # Redirect sys.stdout to the file
#     sys.stdout = f

#     # Your code goes here, including print statements
#     print("This will be written to the file.")
#     ###################################################################################################################################
#     def apply_regression(row):
#         chosen_movie_ID = row['wiki_ID']
#         chosen_name = row['char_words']

#         if (chosen_movie_ID == np.nan) or (chosen_name == np.nan):
#             return np.nan
        
#         chosen_baby_name_df = baby_name_df.loc[chosen_name].sort_values(by=['year'])

#         release_year = movie_df.loc[chosen_movie_ID]['release']

#         if (len(chosen_baby_name_df) == 0) or (release_year == np.nan):
#             return np.nan

        
#         before_df = chosen_baby_name_df[(chosen_baby_name_df.index >= release_year-nb_year_before) & (chosen_baby_name_df.index <= release_year-1)].reset_index()
#         after_df = chosen_baby_name_df[(chosen_baby_name_df.index >= release_year) & (chosen_baby_name_df.index <= release_year+nb_year_after-1)].reset_index()
        
#         np.random.seed(2)
        
#         if (len(before_df) != nb_year_before) & (len(after_df) != nb_year_after):
#             # print("not enough datapoints : return NaN")
#             return np.nan
        
#         res_before = smf.ols(formula = 'percentage ~ year' , data = before_df).fit()
#         res_after = smf.ols(formula = 'percentage ~ year' , data = after_df).fit()
        
#         beta_before = res_before.params.values[1]
#         beta_after = res_after.params.values[1]

#         # Calculate the standard errors of the coefficients
#         se_before = res_before.bse[1]  # Standard error for Beta_before
#         se_after = res_after.bse[1]    # Standard error for Beta_after
        
#         # Calculate the t-statistic for the difference between the coefficients
#         t_statistic = (beta_before - beta_after) / np.sqrt(se_before**2 + se_after**2)

#         # Degrees of freedom
#         degree_freedom = len(before_df) + len(after_df) - 2  # Total sample size - number of parameters

#         # Two-tailed t-test for the difference between coefficients
#         p_value = 2 * (1 - t.cdf(np.abs(t_statistic), df=degree_freedom))
        
#         print(f"Movie : {movie_df.loc[chosen_movie_ID]['mov_name']}, Name : {chosen_name}, P-VALUE = {p_value}")
#         return p_value

#     # Apply the function to create a new 'TotalIncome' column
#     name_by_movie_ordered_df['p_value'] = name_by_movie_ordered_df.apply(apply_regression, axis=1)

#     display(name_by_movie_ordered_df)
    
#     # Reset sys.stdout to the original value
#     sys.stdout = original_stdout

In [None]:
data = [(3,5,7), (2,4,6), (5,8,9)]
df = pd.DataFrame(data, columns = ['A','B','C'])

# Return multiple columns from apply()
def multiply(row):
    """Return a copy of `row` extended with the scaled entries A1, B1 and C1.

    Parameters
    ----------
    row : pandas.Series
        A row holding the labels 'A', 'B' and 'C'.

    Returns
    -------
    pandas.Series
        The original entries plus 'A1' (= A * 2), 'B1' (= B * 3) and
        'C1' (= C * 4).
    """
    # Work on a copy so the Series handed in by the caller is left untouched.
    row = row.copy()
    # Look the values up by label: integer keys on a string-labelled Series
    # only work through pandas' deprecated positional fallback.
    row['A1'] = row['A'] * 2
    row['B1'] = row['B'] * 3
    row['C1'] = row['C'] * 4
    return row

# Row-wise apply: each returned Series becomes one row of the new frame.
df = df.apply(multiply, axis=1)
print(df)

In [None]:
# Show the type of the object produced by the row-wise apply above.
type(df)

In [None]:
# iter = 0

# nb_yr_before = 10
# nb_yr_after = 5

# # Function to calculate total income (salary + bonus)
# def apply_regression(row):
#     global iter

#     chosen_movie_ID = row['wiki_ID']
#     chosen_name = row['char_words']

#     if (chosen_movie_ID == np.nan) or (chosen_name == np.nan):
#         return np.nan
    
#     chosen_baby_name_df = baby_name_df.loc[chosen_name].sort_values(by=['year'])

#     release_year = movie_df.loc[chosen_movie_ID]['release']

#     if (len(chosen_baby_name_df) == 0) or (release_year == np.nan):
#         return np.nan

    
#     before_df = chosen_baby_name_df[(chosen_baby_name_df.index >= release_year-nb_yr_before) & (chosen_baby_name_df.index <= release_year-1)].reset_index()
#     after_df = chosen_baby_name_df[(chosen_baby_name_df.index >= release_year) & (chosen_baby_name_df.index <= release_year+nb_yr_after-1)].reset_index()
    
#     np.random.seed(2)
    
#     if (len(before_df) != nb_yr_before) or (len(after_df) != nb_yr_after):
#         # print("not enough datapoints : return NaN")
#         return np.nan
    
#     res_before = smf.ols(formula = 'percentage ~ year' , data = before_df).fit()
#     res_after = smf.ols(formula = 'percentage ~ year' , data = after_df).fit()
    
#     beta_before = res_before.params.values[1]
#     beta_after = res_after.params.values[1]

#     # Calculate the standard errors of the coefficients
#     se_before = res_before.bse[1]  # Standard error for Beta_before
#     se_after = res_after.bse[1]    # Standard error for Beta_after

#     if np.isnan(se_before) or np.isnan(se_after):
#         return np.nan
    
#     # Calculate the t-statistic for the difference between the coefficients
#     t_statistic = (beta_before - beta_after) / np.sqrt(se_before**2 + se_after**2)

#     # Degrees of freedom
#     degree_freedom = len(before_df) + len(after_df) - 2  # Total sample size - number of parameters

#     if np.isnan(t_statistic) or np.isnan(degree_freedom):
#         return np.nan

#     # Two-tailed t-test for the difference between coefficients
#     p_value = 2 * (1 - t.cdf(np.abs(t_statistic), df=degree_freedom))
    
#     iter += 1
#     print("Batch {} of {}".format(iter, len(name_by_movie_ordered_df)), end="\r")

#     row['t_stat'] = t_statistic
#     row['p_value'] = p_value

#     # print(f"Movie ID : {chosen_movie_ID}, Name : {chosen_name}, P-VALUE = {p_value}")
#     return row

# # Apply the function to create a new 'TotalIncome' column
# name_by_movie_ordered_pvalue_df = name_by_movie_ordered_df.copy(deep=True)
# name_by_movie_ordered_pvalue_df = name_by_movie_ordered_pvalue_df.apply(apply_regression, axis=1)

# display(name_by_movie_ordered_df)

In [None]:
from scipy import stats

# Number of rows handled so far; only used for the progress message.
# (Deliberately not called `iter`, which would shadow the builtin.)
rows_processed = 0

# Number of years kept before / from the release year.
nb_yr_before = 10
nb_yr_after = 5


def apply_regression(row):
    """Add 't_stat' and 'p_value' entries to one (movie, character name) row.

    The differences between consecutive 'percentage' values of the name are
    computed over the `nb_yr_before` years preceding the release and over the
    `nb_yr_after` years starting at the release; the two sets of differences
    are then compared with an independent two-sample t-test.

    Parameters
    ----------
    row : pandas.Series
        Row carrying 'wiki_ID' (looked up in the index of `movie_df`) and
        'char_words' (looked up in the first index level of `baby_name_df`).

    Returns
    -------
    pandas.Series
        A copy of `row` with 't_stat' and 'p_value' added. Both are NaN when
        the name or the movie is unknown, or when either window does not
        contain exactly the expected number of years.
    """
    global rows_processed

    # Count every row (tested or not) against the frame actually being processed.
    rows_processed += 1
    print("Batch {} of {}".format(rows_processed, len(name_by_movie_ordered_pvalue_df)), end="\r")

    # Default to "no result"; overwritten below only when the test can be run.
    row = row.copy()
    row['t_stat'] = np.nan
    row['p_value'] = np.nan

    chosen_movie_ID = row['wiki_ID']
    chosen_name = row['char_words']

    try:
        chosen_baby_name_df = baby_name_df.loc[chosen_name].sort_values(by=['year'])
        release_year = movie_df.loc[chosen_movie_ID, 'release']
    except KeyError:
        # Unknown name or movie: keep NaN instead of aborting the whole apply.
        return row

    years = chosen_baby_name_df.index
    before_df = chosen_baby_name_df[(years >= release_year - nb_yr_before) & (years <= release_year - 1)].reset_index()
    after_df = chosen_baby_name_df[(years >= release_year) & (years <= release_year + nb_yr_after - 1)].reset_index()

    # Only test names that have one data point for every year of both windows.
    if (len(before_df) != nb_yr_before) or (len(after_df) != nb_yr_after):
        return row

    # Differences between consecutive rows (row i minus row i+1) in each window.
    before_diffs = before_df['percentage'][:-1] - before_df['percentage'][1:].values
    after_diffs = after_df['percentage'][:-1] - after_df['percentage'][1:].values

    # Independent two-sample t-test between the two sets of differences.
    t_stat, p_value = stats.ttest_ind(before_diffs, after_diffs)

    row['t_stat'] = t_stat
    row['p_value'] = p_value
    return row


# Run the test on the first 1000 rows only and collect the results in a new frame.
name_by_movie_ordered_pvalue_df = name_by_movie_ordered_df[:1000].copy(deep=True)
name_by_movie_ordered_pvalue_df = name_by_movie_ordered_pvalue_df.apply(apply_regression, axis=1)

display(name_by_movie_ordered_pvalue_df)

In [None]:
# The result of the row-wise apply is already a DataFrame, which has no
# .to_frame() method; show its first rows directly.
display(name_by_movie_ordered_pvalue_df.head())

In [None]:
# Write the p-value table next to the other processed files, without the index column.
processed_folder = './data/processed_data/'
pvalue_csv_path = os.path.join(processed_folder, 'name_by_movie_ordered_pvalue_10_5_df.csv')
name_by_movie_ordered_pvalue_df.to_csv(pvalue_csv_path, index=False)

In [None]:
# Share of processed rows whose p-value is at most 0.05.
# The 'p_value' column lives in name_by_movie_ordered_pvalue_df (the frame
# returned by apply); name_by_movie_ordered_df never receives it.
# NaN p-values compare as False and are therefore not counted here.
significant_names = (name_by_movie_ordered_pvalue_df['p_value'] <= 0.05).mean()
print(f"Proportion of significant character name : {significant_names}")

In [None]:
# Share of processed rows whose p-value is strictly above 0.05.
# Read from name_by_movie_ordered_pvalue_df, which holds the 'p_value' column.
# The comparison is strict so that a p-value of exactly 0.05 is not counted
# both here and in the "<= 0.05" group.
not_significant_names = (name_by_movie_ordered_pvalue_df['p_value'] > 0.05).mean()
print(f"Proportion of non significant character name : {not_significant_names}")

In [None]:
# Share of processed rows for which no p-value could be computed (NaN).
# Read from name_by_movie_ordered_pvalue_df, which holds the 'p_value' column.
nan_p_values = name_by_movie_ordered_pvalue_df['p_value'].isna().mean()
print(f"Proportion of character name without p-value (NaN) : {nan_p_values}")

In [None]:
# Sanity check: the significant, non-significant and NaN shares should add up
# to 1 as long as the three groups do not overlap.
significant_names + not_significant_names + nan_p_values