# Parallel questions study
In this notebook, we carry out the study of the parallel questions related to the influence of movies on baby names, therefore conduction a global analysis.

In [294]:
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf
from scipy import stats
import seaborn as sns
from statsmodels.stats.proportion import proportions_ztest
import plotly.graph_objects as go
import plotly.express as px

from dash import dcc, html, dash_table
import dash
from dash.dependencies import Input, Output

In [295]:
folder_processed_data_path = './data/processed_data/'

# Dataset containing month of release
movie_df = pd.read_csv(os.path.join(folder_processed_data_path, 'movie_df.csv'))
movie_df.set_index(['wiki_ID'], inplace=True)
display(movie_df)

# Dataset containing p_value
name_by_movie_df = pd.read_csv(os.path.join(folder_processed_data_path, 'name_by_movie_ordered_pvalue_10_5_df.csv'))
name_by_movie_df.set_index(['wiki_ID'], inplace=True)
display(name_by_movie_df)

# Dataset containing movie genre
movie_genres_df = pd.read_csv(os.path.join(folder_processed_data_path, 'movie_genres_df.csv'))
movie_genres_df.set_index(['wiki_ID'], inplace=True)
display(movie_genres_df)

# Selection of significance level
alpha = 0.05

Unnamed: 0_level_0,mov_name,year,month,revenue,numVotes,averageRating
wiki_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
975900,Ghosts of Mars,2001,8.0,14010832.0,56880,4.9
3196793,Getting Away with Murder: The JonBenét Ramsey ...,2000,2.0,,69,6.0
28463795,Brun bitter,1988,,,40,5.6
9363483,White Of The Eye,1987,,,2891,6.1
261236,A Woman in Flames,1983,,,623,5.9
...,...,...,...,...,...,...
35228177,Mermaids: The Body Found,2011,3.0,,1711,4.6
34980460,Knuckle,2011,1.0,,3192,6.8
9971909,Another Nice Mess,1972,9.0,,111,5.8
913762,The Super Dimension Fortress Macross II: Lover...,1992,5.0,,657,6.0


Unnamed: 0_level_0,char_words,order,gender,t_stat,p_value,slope_change
wiki_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
3217,Gold,6.0,,,,0.000000
3217,Linda,7.0,F,-0.416786,0.684853,0.000673
3217,Henry,4.0,M,-2.031668,0.067058,0.002513
3217,Duke,4.0,M,0.579441,0.573967,-0.000113
3217,Warrior,9.0,M,,,0.000000
...,...,...,...,...,...,...
37478048,Ajay,9.0,M,-0.819213,0.430057,0.000130
37501922,Murphy,3.0,F,1.264175,0.232298,-0.000365
37501922,Hunter,1.0,M,-7.083089,0.000020,0.036603
37501922,John,1.0,M,-2.172964,0.052505,0.012557


Unnamed: 0_level_0,genre
wiki_ID,Unnamed: 1_level_1
330,Comedy-drama
330,Drama
3217,Action
3217,Comedy
3217,Time travel
...,...
37476824,Crime Comedy
37476824,Caper story
37476824,Crime Fiction
37478048,Comedy film


How much movie genre ?


In [296]:
display(len(movie_genres_df['genre'].unique()))

363

**name_by_movie_df**: dataframe with names, p_value, slope_change

**movie_df**: dataframe with film caracteristics

**movie_genre_df**: dataframe with movie genre

**name_by_movie_aggregate_df**: **name_by_movie_df** + **movie_df**: dataframe with names, p_value, slope change + film caracteristics

**movie_genre_aggregate_df**: **name_by_movie_df** + **movie_genre_df**: dataframe with names, p_value, slope change + film genre

**movie_genre_aggregate_with_years_df**: dataframe with names, p_value, slope change + film genre + years

## Question 1: Month of release

In [297]:
# First, aggregate dataframe with p_value table with dataframe containing release month 
name_by_movie_aggregate_df = name_by_movie_df.merge(movie_df, how='left', left_on='wiki_ID', right_on='wiki_ID')
display(name_by_movie_aggregate_df)

Unnamed: 0_level_0,char_words,order,gender,t_stat,p_value,slope_change,mov_name,year,month,revenue,numVotes,averageRating
wiki_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
3217,Gold,6.0,,,,0.000000,Army of Darkness,1992,10.0,21502796.0,191068,7.4
3217,Linda,7.0,F,-0.416786,0.684853,0.000673,Army of Darkness,1992,10.0,21502796.0,191068,7.4
3217,Henry,4.0,M,-2.031668,0.067058,0.002513,Army of Darkness,1992,10.0,21502796.0,191068,7.4
3217,Duke,4.0,M,0.579441,0.573967,-0.000113,Army of Darkness,1992,10.0,21502796.0,191068,7.4
3217,Warrior,9.0,M,,,0.000000,Army of Darkness,1992,10.0,21502796.0,191068,7.4
...,...,...,...,...,...,...,...,...,...,...,...,...
37478048,Ajay,9.0,M,-0.819213,0.430057,0.000130,Mr. Bechara,1996,,,395,5.4
37501922,Murphy,3.0,F,1.264175,0.232298,-0.000365,Terminal Bliss,1992,,,245,4.4
37501922,Hunter,1.0,M,-7.083089,0.000020,0.036603,Terminal Bliss,1992,,,245,4.4
37501922,John,1.0,M,-2.172964,0.052505,0.012557,Terminal Bliss,1992,,,245,4.4


Divise year per season

In [298]:
summer = [6.0, 7.0, 8.0]
fall = [9.0,10.0,11.0]
winter = [12.0,1.0,2.0]
spring = [3.0,4.0,5.0]
summer_movies_df = name_by_movie_aggregate_df[name_by_movie_aggregate_df['month'].isin(summer)]
fall_movies_df = name_by_movie_aggregate_df[name_by_movie_aggregate_df['month'].isin(fall)]
winter_movies_df = name_by_movie_aggregate_df[name_by_movie_aggregate_df['month'].isin(winter)]
spring_movies_df = name_by_movie_aggregate_df[name_by_movie_aggregate_df['month'].isin(spring)]

display(summer_movies_df)
display(fall_movies_df)
display(winter_movies_df)
display(spring_movies_df)

Unnamed: 0_level_0,char_words,order,gender,t_stat,p_value,slope_change,mov_name,year,month,revenue,numVotes,averageRating
wiki_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
3746,Deckard,0.0,M,,,0.000000,Blade Runner,1982,6.0,33139618.0,804384,8.1
3746,Eldon,8.0,M,-0.454573,0.658256,0.000106,Blade Runner,1982,6.0,33139618.0,804384,8.1
3746,Lewis,12.0,M,-1.014454,0.332160,0.000707,Blade Runner,1982,6.0,33139618.0,804384,8.1
3746,Bear,11.0,M,0.181738,0.859094,-0.000003,Blade Runner,1982,6.0,33139618.0,804384,8.1
3746,Leon,7.0,M,0.758120,0.464312,-0.000544,Blade Runner,1982,6.0,33139618.0,804384,8.1
...,...,...,...,...,...,...,...,...,...,...,...,...
36699915,Luke,5.0,M,0.216557,0.832517,-0.001600,Percy Jackson & the Olympians: Sea of Monsters,2013,8.0,,123248,5.7
36699915,Underwood,1.0,M,,,0.000000,Percy Jackson & the Olympians: Sea of Monsters,2013,8.0,,123248,5.7
36699915,Chase,2.0,F,1.559383,0.147195,-0.011920,Percy Jackson & the Olympians: Sea of Monsters,2013,8.0,,123248,5.7
36699915,Circe,,F,0.394402,0.700823,-0.000015,Percy Jackson & the Olympians: Sea of Monsters,2013,8.0,,123248,5.7


Unnamed: 0_level_0,char_words,order,gender,t_stat,p_value,slope_change,mov_name,year,month,revenue,numVotes,averageRating
wiki_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
3217,Gold,6.0,,,,0.000000,Army of Darkness,1992,10.0,21502796.0,191068,7.4
3217,Linda,7.0,F,-0.416786,0.684853,0.000673,Army of Darkness,1992,10.0,21502796.0,191068,7.4
3217,Henry,4.0,M,-2.031668,0.067058,0.002513,Army of Darkness,1992,10.0,21502796.0,191068,7.4
3217,Duke,4.0,M,0.579441,0.573967,-0.000113,Army of Darkness,1992,10.0,21502796.0,191068,7.4
3217,Warrior,9.0,M,,,0.000000,Army of Darkness,1992,10.0,21502796.0,191068,7.4
...,...,...,...,...,...,...,...,...,...,...,...,...
37322106,Major,0.0,M,-1.922979,0.080743,0.002631,Jab Tak Hai Jaan,2012,11.0,,58012,6.7
37373877,Beth,5.0,F,-0.810731,0.434710,0.000273,Crazy Eights,2006,10.0,,3338,3.8
37373877,Patterson,5.0,F,-0.539253,0.600457,0.000041,Crazy Eights,2006,10.0,,3338,3.8
37373877,Jennifer,0.0,F,-0.395613,0.699955,0.003273,Crazy Eights,2006,10.0,,3338,3.8


Unnamed: 0_level_0,char_words,order,gender,t_stat,p_value,slope_change,mov_name,year,month,revenue,numVotes,averageRating
wiki_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
3837,Lamarr,3.0,M,0.272089,0.790593,-0.000041,Blazing Saddles,1974,2.0,119500000.0,147934,7.7
3837,Van,11.0,M,-1.222164,0.247188,0.000463,Blazing Saddles,1974,2.0,119500000.0,147934,7.7
3837,Bart,0.0,M,0.186272,0.855622,-0.000167,Blazing Saddles,1974,2.0,119500000.0,147934,7.7
3837,Lyle,6.0,M,-0.150477,0.883112,0.000103,Blazing Saddles,1974,2.0,119500000.0,147934,7.7
3837,Buddy,18.0,M,0.041667,0.967511,-0.000017,Blazing Saddles,1974,2.0,119500000.0,147934,7.7
...,...,...,...,...,...,...,...,...,...,...,...,...
36956792,Kid,18.0,M,,,0.000000,The Water Horse: Legend of the Deep,2007,12.0,103071443.0,42523,6.4
36956792,Charlie,5.0,M,-5.446114,0.000202,0.006446,The Water Horse: Legend of the Deep,2007,12.0,103071443.0,42523,6.4
36956792,Beach,18.0,M,,,0.000000,The Water Horse: Legend of the Deep,2007,12.0,103071443.0,42523,6.4
36956792,Walker,8.0,M,-0.597936,0.561991,0.000593,The Water Horse: Legend of the Deep,2007,12.0,103071443.0,42523,6.4


Unnamed: 0_level_0,char_words,order,gender,t_stat,p_value,slope_change,mov_name,year,month,revenue,numVotes,averageRating
wiki_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
4560,Morrison,19.0,M,-1.433674,0.179473,0.000053,Braveheart,1995,5.0,211409945.0,1072580,8.3
4560,Edward,3.0,M,-0.358692,0.726615,0.000825,Braveheart,1995,5.0,211409945.0,1072580,8.3
4560,Campbell,5.0,M,-1.732399,0.111109,0.000489,Braveheart,1995,5.0,211409945.0,1072580,8.3
4560,Murron,1.0,F,,,0.000000,Braveheart,1995,5.0,211409945.0,1072580,8.3
4560,William,0.0,M,-3.378640,0.006157,0.015610,Braveheart,1995,5.0,211409945.0,1072580,8.3
...,...,...,...,...,...,...,...,...,...,...,...,...
36814246,Girl,4.0,F,,,0.000000,Eraserhead,1977,3.0,7000000.0,124128,7.3
36814246,Mary,1.0,F,-2.783137,0.017804,0.041502,Eraserhead,1977,3.0,7000000.0,124128,7.3
36814246,Beautiful,4.0,F,,,0.000000,Eraserhead,1977,3.0,7000000.0,124128,7.3
36814246,Hall,4.0,F,-1.055993,0.313613,0.000021,Eraserhead,1977,3.0,7000000.0,124128,7.3


In [299]:
summer_movies_df.index.unique()

Index([    3746,     3947,     4231,     4726,     4727,     4728,     4729,
           4730,     8481,     9979,
       ...
       36306987, 36329343, 36354051, 36354224, 36422681, 36448415, 36478252,
       36566804, 36617100, 36699915],
      dtype='int64', name='wiki_ID', length=4096)

In [300]:
prop_summer = len(summer_movies_df[summer_movies_df['p_value']<alpha])/len(summer_movies_df['p_value'])
display(prop_summer)
prop_fall = len(fall_movies_df[fall_movies_df['p_value']<alpha])/len(fall_movies_df['p_value'])
display(prop_fall)
prop_winter = len(winter_movies_df[winter_movies_df['p_value']<alpha])/len(winter_movies_df['p_value'])
display(prop_winter)
prop_spring = len(spring_movies_df[spring_movies_df['p_value']<alpha])/len(spring_movies_df['p_value'])
display(prop_spring)

0.14160127633249764

0.14982267304878352

0.14396230805025592

0.14255289724796016

Statistical test to assess whether proportion for different season are different or not

In [301]:
from scipy.stats import chi2_contingency

# Organize the data into a contingency table
observed_data = [
    [len(summer_movies_df[summer_movies_df['p_value'] < alpha]), len(summer_movies_df['p_value'])],
    [len(fall_movies_df[fall_movies_df['p_value'] < alpha]), len(fall_movies_df['p_value'])],
    [len(winter_movies_df[winter_movies_df['p_value'] < alpha]), len(winter_movies_df['p_value'])],
    [len(spring_movies_df[spring_movies_df['p_value'] < alpha]), len(spring_movies_df['p_value'])]
]

# Perform the chi-squared test
chi2, p, _, _ = chi2_contingency(observed_data)

# Print the results
print("Chi-squared value:", chi2)
print("P-value:", p)

Chi-squared value: 9.260172541828045
P-value: 0.026024471557969192


H0 : The proportions are all equal 

We can reject the null hypothesis at the 5% significance level.

In [345]:
# ################################################ slope change 
# import plotly.express as px

# fig = px.line(summer_movies_df.sort_values('year', ascending=True), x="year", y="slope_change", title='Slope Change Over Years for Summer')
# fig.show()

######################################## Absolute value 
import plotly.graph_objects as go
import numpy as np
display(summer_movies_df)
# Tri par année
summer_movies_df_sorted = summer_movies_df.groupby('year').apply(lambda x : pd.Series({'avg': x['slope_change'].dropna().abs().mean()}))
summer_movies_df_sorted.reset_index(inplace=True)

fall_movies_df_sorted = fall_movies_df.groupby('year').apply(lambda x : pd.Series({'avg': x['slope_change'].dropna().abs().mean()}))
fall_movies_df_sorted.reset_index(inplace=True)

display(summer_movies_df_sorted)
fig = go.Figure()

# Ajouter la courbe avec des points de données
fig.add_trace(go.Scatter(
    x=summer_movies_df_sorted['year'],
    y=summer_movies_df_sorted['avg'].abs(),
    mode='lines+markers',
    line_shape='spline',
    name='Slope Change'
))

# Ajouter la courbe avec des points de données
fig.add_trace(go.Scatter(
    x=fall_movies_df_sorted['year'],
    y=fall_movies_df_sorted['avg'].abs(),
    mode='lines+markers',
    line_shape='spline',
    name='Slope Change'
))


# Mise à jour du layout
fig.update_layout(
    title='Absolute Slope Change Over Years for Summer',
    xaxis_title='Year',
    yaxis_title='Absolute Slope Change'
)

# Afficher la figure
fig.show()






Unnamed: 0_level_0,char_words,order,gender,t_stat,p_value,slope_change,mov_name,year,month,revenue,numVotes,averageRating
wiki_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
3746,Deckard,0.0,M,,,0.000000,Blade Runner,1982,6.0,33139618.0,804384,8.1
3746,Eldon,8.0,M,-0.454573,0.658256,0.000106,Blade Runner,1982,6.0,33139618.0,804384,8.1
3746,Lewis,12.0,M,-1.014454,0.332160,0.000707,Blade Runner,1982,6.0,33139618.0,804384,8.1
3746,Bear,11.0,M,0.181738,0.859094,-0.000003,Blade Runner,1982,6.0,33139618.0,804384,8.1
3746,Leon,7.0,M,0.758120,0.464312,-0.000544,Blade Runner,1982,6.0,33139618.0,804384,8.1
...,...,...,...,...,...,...,...,...,...,...,...,...
36699915,Luke,5.0,M,0.216557,0.832517,-0.001600,Percy Jackson & the Olympians: Sea of Monsters,2013,8.0,,123248,5.7
36699915,Underwood,1.0,M,,,0.000000,Percy Jackson & the Olympians: Sea of Monsters,2013,8.0,,123248,5.7
36699915,Chase,2.0,F,1.559383,0.147195,-0.011920,Percy Jackson & the Olympians: Sea of Monsters,2013,8.0,,123248,5.7
36699915,Circe,,F,0.394402,0.700823,-0.000015,Percy Jackson & the Olympians: Sea of Monsters,2013,8.0,,123248,5.7


Unnamed: 0,year,avg
0,1895,0.002970
1,1898,0.000176
2,1909,0.000630
3,1910,0.006645
4,1912,0.009433
...,...,...
98,2009,0.002162
99,2010,0.002038
100,2011,0.002036
101,2012,0.002303


## Question 2: Movie Genre has an impact ?

In [302]:
# First, aggregate dataframe with p_value with dataframe containing movie genre
# Outer merge required in order to obtain for each name of each film, all the possible genre it can be associated to 
movie_genre_aggregate_df = name_by_movie_df.merge(movie_genres_df, how='outer', left_on='wiki_ID', right_on='wiki_ID')
movie_genre_aggregate_df.head(25)

Unnamed: 0_level_0,char_words,order,gender,t_stat,p_value,slope_change,genre
wiki_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
3217,Gold,6.0,,,,0.0,Action
3217,Gold,6.0,,,,0.0,Comedy
3217,Gold,6.0,,,,0.0,Time travel
3217,Gold,6.0,,,,0.0,Black comedy
3217,Gold,6.0,,,,0.0,Zombie Film
3217,Gold,6.0,,,,0.0,Horror Comedy
3217,Gold,6.0,,,,0.0,Action/Adventure
3217,Gold,6.0,,,,0.0,Costume drama
3217,Gold,6.0,,,,0.0,Stop motion
3217,Gold,6.0,,,,0.0,Horror


In [303]:
# Need to drop the duplicates i.e. the instances that have the same wiki_ID for the same genre and same char words
movie_genre_aggregate_df.reset_index().drop_duplicates(subset=['genre', 'wiki_ID'], inplace=True)

First groupby test: can be removed when cleaning notebook

In [304]:
name_by_genre_significant_df = movie_genre_aggregate_df.groupby('genre').apply(lambda x: x[x['p_value'] < alpha])
display(name_by_genre_significant_df)

Unnamed: 0_level_0,Unnamed: 1_level_0,char_words,order,gender,t_stat,p_value,slope_change,genre
genre,wiki_ID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Absurdism,19701,Tim,1.0,M,-4.091383,0.001785,0.010195,Absurdism
Absurdism,46505,Ted,0.0,M,-2.225789,0.047878,0.001111,Absurdism
Absurdism,46505,Johnny,10.0,M,-2.226029,0.047858,0.002591,Absurdism
Absurdism,75261,Robert,9.0,M,-3.585998,0.004273,0.053382,Absurdism
Absurdism,75261,Dave,19.0,M,-2.481872,0.030472,0.001477,Absurdism
...,...,...,...,...,...,...,...,...
Zombie Film,28362996,Burke,,M,-2.804857,0.017125,0.000248,Zombie Film
Zombie Film,30430079,Holly,1.0,F,-2.884757,0.014844,0.003713,Zombie Film
Zombie Film,33432215,Sarah,7.0,F,-6.204518,0.000067,0.021320,Zombie Film
Zombie Film,33432215,Mack,4.0,M,-2.626380,0.023559,0.001174,Zombie Film


Trying to see why there are for some "movie genre" NaN value for sem computation but not for mean computation.

 ANSWER: due to the fact that there is only one data point in after the groupy and filtering in a given movie genre.

In [305]:
name_by_genre_significant_df.loc['Acid western']

Unnamed: 0_level_0,char_words,order,gender,t_stat,p_value,slope_change,genre
wiki_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
113651,William,0.0,M,-3.37864,0.006157,0.01561,Acid western
413426,Walker,0.0,M,-3.102857,0.010053,0.000597,Acid western
5579768,Jake,0.0,M,-2.400606,0.035195,0.001247,Acid western
6415208,Matthew,2.0,M,-2.213936,0.048881,0.030503,Acid western


In [306]:
# Try to compute number of film per genre
display(movie_genre_aggregate_df.reset_index().groupby('genre')['wiki_ID'].nunique())

# Sanity check for "Acid Western" ––> 9 movies
display(movie_genre_aggregate_df[movie_genre_aggregate_df['genre'] == 'Acid western'])
display(len(movie_genre_aggregate_df[movie_genre_aggregate_df['genre'] == 'Acid western']))

# Look at number of names/char_words per genre, here on "Acid Western"
display(movie_genre_aggregate_df[movie_genre_aggregate_df['genre'] == 'Acid western']['char_words'].nunique())

genre
Absurdism             91
Acid western           9
Action              7859
Action Comedy        162
Action Thrillers     497
                    ... 
World History         20
World cinema        7073
Wuxia                115
Z movie                3
Zombie Film          266
Name: wiki_ID, Length: 363, dtype: int64

Unnamed: 0_level_0,char_words,order,gender,t_stat,p_value,slope_change,genre
wiki_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
113651,Blake,0.0,M,2.090286,0.060611,-0.006569,Acid western
113651,Cole,3.0,M,-1.087006,0.300279,0.00858,Acid western
113651,William,0.0,M,-3.37864,0.006157,0.01561,Acid western
113651,Marvin,,,-0.113402,0.911755,8.2e-05,Acid western
113651,Thel,11.0,F,,,0.0,Acid western
113651,Charlie,9.0,M,-1.241759,0.240151,0.000651,Acid western
113651,Tench,10.0,M,,,0.0,Acid western
113651,Russell,11.0,F,-1.39867,0.189469,0.001816,Acid western
113651,Conway,4.0,M,-0.201327,0.844117,1.9e-05,Acid western
113651,John,6.0,M,-1.018001,0.330546,0.005455,Acid western


32

28

In [307]:
# Compute proportion of impacted names by genre
# Also computation of non significant and nan proportion for sanity check
name_by_genre_prop_df = movie_genre_aggregate_df.groupby('genre').apply(lambda x: pd.Series({
        # Number of film in a given movie genre 
        'nb_films_in_genre': x.reset_index()['wiki_ID'].count(),
        # Number of total different names that appear in a given movie genre
        'nb_names_in_genre': x['char_words'].count(),
        # Number of different names per genre that are significantly impacted by a movie release from that genre
        'nb_names_signi_in_genre': x[x['p_value'] < alpha]['char_words'].count(),
        # Proportion of names significantly impacted by a movie genre divided by total number of films in this movie genre
        'prop_names_signi_in_genre_per_total_film_in_genre': (x[x['p_value'] < alpha]['char_words'].count())/(x.reset_index()['wiki_ID'].count()),
        'is_na_sum': x['slope_change'].isna().sum(),
        'prop_signif_per_genre': (x['p_value'] < alpha).sum()/len(x['p_value']),
        'prop_non_signi': (x['p_value'] > alpha).sum()/len(x['p_value']),
        'prop_nan': (x['p_value'].isna()).sum()/len(x['p_value']),
        'avg_slope_change_significant': x[x['p_value'] < alpha]['slope_change'].mean(),
        'se_slope_change_significant': x[x['p_value'] < alpha]['slope_change'].sem(),
        'avg_mag_slope_change_significant': x[x['p_value'] < alpha]['slope_change'].abs().mean(),
        'se_mag_slope_change_significant': x[x['p_value'] < alpha]['slope_change'].abs().sem(),
        'avg_slope_change_global': x['slope_change'].mean()
    }))
display(name_by_genre_prop_df)
name_by_genre_prop_df.head(50)


Unnamed: 0_level_0,nb_films_in_genre,nb_names_in_genre,nb_names_signi_in_genre,prop_names_signi_in_genre_per_total_film_in_genre,is_na_sum,prop_signif_per_genre,prop_non_signi,prop_nan,avg_slope_change_significant,se_slope_change_significant,avg_mag_slope_change_significant,se_mag_slope_change_significant,avg_slope_change_global
genre,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Absurdism,740.0,721.0,64.0,0.086486,19.0,0.086486,0.778378,0.135135,0.003319,0.003524,0.014686,0.003029,0.000370
Acid western,32.0,30.0,4.0,0.125000,2.0,0.125000,0.687500,0.187500,0.011989,0.007077,0.011989,0.007077,0.002101
Action,34780.0,31575.0,2995.0,0.086113,3205.0,0.086113,0.692984,0.220903,0.001634,0.000441,0.013268,0.000370,0.000169
Action Comedy,1036.0,984.0,92.0,0.088803,52.0,0.088803,0.723938,0.187259,-0.000049,0.002079,0.012144,0.001644,0.000061
Action Thrillers,2911.0,2755.0,274.0,0.094126,156.0,0.094126,0.734112,0.171762,-0.000393,0.001593,0.014340,0.001336,0.000197
...,...,...,...,...,...,...,...,...,...,...,...,...,...
World History,20.0,0.0,0.0,0.000000,20.0,0.000000,0.000000,1.000000,,,,,
World cinema,19067.0,15344.0,945.0,0.049562,3723.0,0.049562,0.631353,0.319085,0.000796,0.000640,0.009445,0.000561,-0.000050
Wuxia,215.0,134.0,7.0,0.032558,81.0,0.032558,0.395349,0.572093,0.002028,0.000882,0.002028,0.000882,0.000107
Z movie,3.0,0.0,0.0,0.000000,3.0,0.000000,0.000000,1.000000,,,,,


Unnamed: 0_level_0,nb_films_in_genre,nb_names_in_genre,nb_names_signi_in_genre,prop_names_signi_in_genre_per_total_film_in_genre,is_na_sum,prop_signif_per_genre,prop_non_signi,prop_nan,avg_slope_change_significant,se_slope_change_significant,avg_mag_slope_change_significant,se_mag_slope_change_significant,avg_slope_change_global
genre,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Absurdism,740.0,721.0,64.0,0.086486,19.0,0.086486,0.778378,0.135135,0.003319,0.003524,0.014686,0.003029,0.00037
Acid western,32.0,30.0,4.0,0.125,2.0,0.125,0.6875,0.1875,0.011989,0.007077,0.011989,0.007077,0.002101
Action,34780.0,31575.0,2995.0,0.086113,3205.0,0.086113,0.692984,0.220903,0.001634,0.000441,0.013268,0.00037,0.000169
Action Comedy,1036.0,984.0,92.0,0.088803,52.0,0.088803,0.723938,0.187259,-4.9e-05,0.002079,0.012144,0.001644,6.1e-05
Action Thrillers,2911.0,2755.0,274.0,0.094126,156.0,0.094126,0.734112,0.171762,-0.000393,0.001593,0.01434,0.001336,0.000197
Action/Adventure,21112.0,19502.0,1942.0,0.091986,1610.0,0.091986,0.70666,0.201355,0.001518,0.000543,0.013323,0.000452,0.000156
Addiction Drama,245.0,228.0,31.0,0.126531,17.0,0.126531,0.738776,0.134694,0.000706,0.004193,0.013325,0.003418,0.000212
Adult,365.0,244.0,33.0,0.090411,121.0,0.090411,0.49863,0.410959,0.003339,0.008254,0.021607,0.007341,0.000553
Adventure,20830.0,18801.0,1852.0,0.08891,2029.0,0.08891,0.671675,0.239414,0.001156,0.000581,0.013427,0.000491,5.1e-05
Adventure Comedy,875.0,827.0,75.0,0.085714,48.0,0.085714,0.726857,0.187429,0.003231,0.002182,0.011944,0.001724,0.000446


In [308]:
name_by_genre_prop_df.isna().sum()
# Drop NaN values
name_by_genre_prop_df.dropna(inplace=True)
display(name_by_genre_prop_df)
# Sanity check
name_by_genre_prop_df.isna().sum()

Unnamed: 0_level_0,nb_films_in_genre,nb_names_in_genre,nb_names_signi_in_genre,prop_names_signi_in_genre_per_total_film_in_genre,is_na_sum,prop_signif_per_genre,prop_non_signi,prop_nan,avg_slope_change_significant,se_slope_change_significant,avg_mag_slope_change_significant,se_mag_slope_change_significant,avg_slope_change_global
genre,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Absurdism,740.0,721.0,64.0,0.086486,19.0,0.086486,0.778378,0.135135,0.003319,0.003524,0.014686,0.003029,0.000370
Acid western,32.0,30.0,4.0,0.125000,2.0,0.125000,0.687500,0.187500,0.011989,0.007077,0.011989,0.007077,0.002101
Action,34780.0,31575.0,2995.0,0.086113,3205.0,0.086113,0.692984,0.220903,0.001634,0.000441,0.013268,0.000370,0.000169
Action Comedy,1036.0,984.0,92.0,0.088803,52.0,0.088803,0.723938,0.187259,-0.000049,0.002079,0.012144,0.001644,0.000061
Action Thrillers,2911.0,2755.0,274.0,0.094126,156.0,0.094126,0.734112,0.171762,-0.000393,0.001593,0.014340,0.001336,0.000197
...,...,...,...,...,...,...,...,...,...,...,...,...,...
Women in prison films,62.0,52.0,7.0,0.112903,10.0,0.112903,0.645161,0.241935,0.017780,0.010870,0.020819,0.009930,0.002551
Workplace Comedy,677.0,654.0,76.0,0.112260,23.0,0.112260,0.776957,0.110783,0.002557,0.001460,0.008550,0.001116,0.000441
World cinema,19067.0,15344.0,945.0,0.049562,3723.0,0.049562,0.631353,0.319085,0.000796,0.000640,0.009445,0.000561,-0.000050
Wuxia,215.0,134.0,7.0,0.032558,81.0,0.032558,0.395349,0.572093,0.002028,0.000882,0.002028,0.000882,0.000107


nb_films_in_genre                                    0
nb_names_in_genre                                    0
nb_names_signi_in_genre                              0
prop_names_signi_in_genre_per_total_film_in_genre    0
is_na_sum                                            0
prop_signif_per_genre                                0
prop_non_signi                                       0
prop_nan                                             0
avg_slope_change_significant                         0
se_slope_change_significant                          0
avg_mag_slope_change_significant                     0
se_mag_slope_change_significant                      0
avg_slope_change_global                              0
dtype: int64

### Saving data

In [309]:
ready_for_web = './data/web_data/'
# Add the genre as a column of the dataframe and save as csv
name_by_genre_prop_df.to_csv(os.path.join(ready_for_web, 'movie_genre_significant.csv'), index=True)

### Analysis looking at time effects

In [310]:
# Need to merge datasets containing "p_value" (name_by_movie_df), "movie_genre" (movie_genres_df), "release_date" (movie_df)
# => aggregate "name_by_movie_aggregate_df" with "movie_genres_df"
movie_genre_aggregate_with_years_df = movie_genre_aggregate_df.merge(movie_df, how='left', left_on='wiki_ID', right_on='wiki_ID')
display(movie_genre_aggregate_with_years_df)

Unnamed: 0_level_0,char_words,order,gender,t_stat,p_value,slope_change,genre,mov_name,year,month,revenue,numVotes,averageRating
wiki_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
3217,Gold,6.0,,,,0.0,Action,Army of Darkness,1992,10.0,21502796.0,191068,7.4
3217,Gold,6.0,,,,0.0,Comedy,Army of Darkness,1992,10.0,21502796.0,191068,7.4
3217,Gold,6.0,,,,0.0,Time travel,Army of Darkness,1992,10.0,21502796.0,191068,7.4
3217,Gold,6.0,,,,0.0,Black comedy,Army of Darkness,1992,10.0,21502796.0,191068,7.4
3217,Gold,6.0,,,,0.0,Zombie Film,Army of Darkness,1992,10.0,21502796.0,191068,7.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...
37241569,,,,,,,Action,Cold War,2012,11.0,,5033,6.6
37476824,,,,,,,Comedy,I Love New Year,2011,,,876,3.4
37476824,,,,,,,Crime Comedy,I Love New Year,2011,,,876,3.4
37476824,,,,,,,Caper story,I Love New Year,2011,,,876,3.4


In [311]:
# name_by_genre_per_year_prop_df = movie_genre_aggregate_with_years_filled_df.groupby(['genre','year']).apply(lambda x: x[x['p_value'] < 0.1])
name_by_genre_per_year_prop_df = movie_genre_aggregate_with_years_df.groupby(['genre','year']).apply(lambda x: x[x['p_value'] < alpha])
name_by_genre_per_year_prop_df.head(30)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,char_words,order,gender,t_stat,p_value,slope_change,genre,mov_name,year,month,revenue,numVotes,averageRating
genre,year,wiki_ID,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Absurdism,1964,248601,George,2.0,M,-2.573227,0.025901,0.01219,Absurdism,A Hard Day's Night,1964,7.0,,47276,7.5
Absurdism,1974,19701,Tim,1.0,M,-4.091383,0.001785,0.010195,Absurdism,Monty Python and the Holy Grail,1974,4.0,,560662,8.2
Absurdism,1977,903082,Man,20.0,M,-4.220301,0.001436,3.7e-05,Absurdism,The Kentucky Fried Movie,1977,8.0,20000000.0,19727,6.4
Absurdism,1978,75261,Robert,9.0,M,-3.585998,0.004273,0.053382,Absurdism,National Lampoon's Animal House,1978,7.0,141600000.0,127114,7.4
Absurdism,1978,75261,Dave,19.0,M,-2.481872,0.030472,0.001477,Absurdism,National Lampoon's Animal House,1978,7.0,141600000.0,127114,7.4
Absurdism,1978,75261,Barbara,15.0,F,-3.618317,0.004038,0.018721,Absurdism,National Lampoon's Animal House,1978,7.0,141600000.0,127114,7.4
Absurdism,1978,75261,Kent,8.0,M,-2.642745,0.022881,0.002435,Absurdism,National Lampoon's Animal House,1978,7.0,141600000.0,127114,7.4
Absurdism,1978,75261,Donald,6.0,M,-2.96567,0.012844,0.010112,Absurdism,National Lampoon's Animal House,1978,7.0,141600000.0,127114,7.4
Absurdism,1978,75261,Dean,2.0,M,-3.168871,0.008937,0.009001,Absurdism,National Lampoon's Animal House,1978,7.0,141600000.0,127114,7.4
Absurdism,1978,75261,John,0.0,M,-3.482099,0.005129,0.059891,Absurdism,National Lampoon's Animal House,1978,7.0,141600000.0,127114,7.4


In [312]:
# Compute proportion of impacted names by genre by year
name_by_genre_per_year_prop_df = movie_genre_aggregate_with_years_df.groupby(['genre','year']).apply(lambda x: pd.Series({
        'prop_signif_per_genre_per_year': (x['p_value'] < alpha).sum()/len(x['p_value']),
        'avg_slope_change_significant': x[x['p_value'] < alpha]['slope_change'].mean(),
        'se_slope_change_significant': x[x['p_value'] < alpha]['slope_change'].sem(),
        'avg_mag_slope_change_significant': x[x['p_value'] < alpha]['slope_change'].abs().mean(),
        'se_slope_change_magnitude_significant': x[x['p_value'] < alpha]['slope_change'].abs().sem(),
        'avg_slope_change_global': x['slope_change'].mean()
    }))
display(name_by_genre_per_year_prop_df)
#name_by_genre_per_year_prop_df.head(30)

Unnamed: 0_level_0,Unnamed: 1_level_0,prop_signif_per_genre_per_year,avg_slope_change_significant,se_slope_change_significant,avg_mag_slope_change_significant,se_slope_change_magnitude_significant,avg_slope_change_global
genre,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Absurdism,1929,0.000000,,,,,
Absurdism,1930,0.000000,,,,,0.000005
Absurdism,1932,0.000000,,,,,-0.000088
Absurdism,1938,0.000000,,,,,
Absurdism,1940,0.000000,,,,,
...,...,...,...,...,...,...,...
Zombie Film,2008,0.108434,0.005349,0.002500,0.006609,0.002090,0.000667
Zombie Film,2009,0.046512,-0.013458,0.013578,0.013578,0.013458,-0.001215
Zombie Film,2010,0.137931,0.005537,0.005221,0.013275,0.002543,0.001270
Zombie Film,2011,0.230769,0.009126,0.006299,0.015096,0.003279,0.003075


#### Need to fill the missing year for each genre with 0

In [313]:
# Define a function to fill gaps and add corresponding values
all_years_df = pd.DataFrame({'year': range(movie_df['year'].min(), movie_df['year'].max() + 1)}).reset_index(drop=True)
all_years_df = all_years_df.set_index('year', drop=True)
#display(all_years_df)
def fill_gaps(group):
    filled_group = pd.merge(all_years_df, group, on='year', how='left').fillna(0)
    return filled_group

name_by_genre_per_year_prop_df.reset_index(inplace=True)
display(name_by_genre_per_year_prop_df)

name_by_genre_per_year_prop_filled_df = name_by_genre_per_year_prop_df.groupby('genre').apply(fill_gaps)
display(name_by_genre_per_year_prop_filled_df)

Unnamed: 0,genre,year,prop_signif_per_genre_per_year,avg_slope_change_significant,se_slope_change_significant,avg_mag_slope_change_significant,se_slope_change_magnitude_significant,avg_slope_change_global
0,Absurdism,1929,0.000000,,,,,
1,Absurdism,1930,0.000000,,,,,0.000005
2,Absurdism,1932,0.000000,,,,,-0.000088
3,Absurdism,1938,0.000000,,,,,
4,Absurdism,1940,0.000000,,,,,
...,...,...,...,...,...,...,...,...
13827,Zombie Film,2008,0.108434,0.005349,0.002500,0.006609,0.002090,0.000667
13828,Zombie Film,2009,0.046512,-0.013458,0.013578,0.013578,0.013458,-0.001215
13829,Zombie Film,2010,0.137931,0.005537,0.005221,0.013275,0.002543,0.001270
13830,Zombie Film,2011,0.230769,0.009126,0.006299,0.015096,0.003279,0.003075


Unnamed: 0_level_0,Unnamed: 1_level_0,year,genre,prop_signif_per_genre_per_year,avg_slope_change_significant,se_slope_change_significant,avg_mag_slope_change_significant,se_slope_change_magnitude_significant,avg_slope_change_global
genre,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Absurdism,0,1888,0,0.0,0.0,0.0,0.0,0.0,0.0
Absurdism,1,1889,0,0.0,0.0,0.0,0.0,0.0,0.0
Absurdism,2,1890,0,0.0,0.0,0.0,0.0,0.0,0.0
Absurdism,3,1891,0,0.0,0.0,0.0,0.0,0.0,0.0
Absurdism,4,1892,0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
Zombie Film,124,2012,Zombie Film,0.0,0.0,0.0,0.0,0.0,0.0
Zombie Film,125,2013,0,0.0,0.0,0.0,0.0,0.0,0.0
Zombie Film,126,2014,0,0.0,0.0,0.0,0.0,0.0,0.0
Zombie Film,127,2015,0,0.0,0.0,0.0,0.0,0.0,0.0


In [314]:
# Keep only movie genre for which there is at least 10 nonzero values
name_by_genre_per_year_prop_filled_df.drop(columns=['genre'], inplace=True)
name_by_genre_per_year_prop_filled_df.reset_index(inplace=True)
display(name_by_genre_per_year_prop_filled_df)
# Count the number of non-zero values for each genre
genre_counts = name_by_genre_per_year_prop_filled_df[name_by_genre_per_year_prop_filled_df['avg_slope_change_significant'] != 0].groupby('genre')['year'].nunique()
display(genre_counts)

# Filter out genres with fewer than 10 non-zero years
selected_genres = genre_counts[genre_counts >= 10].index
display(selected_genres)

# Filter the original DataFrame based on the selected genres
name_by_genre_per_year_prop_filled_filtered_df = name_by_genre_per_year_prop_filled_df[name_by_genre_per_year_prop_filled_df['genre'].isin(selected_genres)]
display(name_by_genre_per_year_prop_filled_filtered_df)



Unnamed: 0,genre,level_1,year,prop_signif_per_genre_per_year,avg_slope_change_significant,se_slope_change_significant,avg_mag_slope_change_significant,se_slope_change_magnitude_significant,avg_slope_change_global
0,Absurdism,0,1888,0.0,0.0,0.0,0.0,0.0,0.0
1,Absurdism,1,1889,0.0,0.0,0.0,0.0,0.0,0.0
2,Absurdism,2,1890,0.0,0.0,0.0,0.0,0.0,0.0
3,Absurdism,3,1891,0.0,0.0,0.0,0.0,0.0,0.0
4,Absurdism,4,1892,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
46822,Zombie Film,124,2012,0.0,0.0,0.0,0.0,0.0,0.0
46823,Zombie Film,125,2013,0.0,0.0,0.0,0.0,0.0,0.0
46824,Zombie Film,126,2014,0.0,0.0,0.0,0.0,0.0,0.0
46825,Zombie Film,127,2015,0.0,0.0,0.0,0.0,0.0,0.0


genre
Absurdism                25
Acid western              4
Action                   89
Action Comedy            23
Action Thrillers         42
                         ..
Women in prison films     3
Workplace Comedy         22
World cinema             64
Wuxia                     4
Zombie Film              19
Name: year, Length: 294, dtype: int64

Index(['Absurdism', 'Action', 'Action Comedy', 'Action Thrillers',
       'Action/Adventure', 'Addiction Drama', 'Adult', 'Adventure',
       'Adventure Comedy', 'Airplanes and airports',
       ...
       'Thriller', 'Time travel', 'Tragedy', 'Tragicomedy', 'War film',
       'Western', 'Whodunit', 'Workplace Comedy', 'World cinema',
       'Zombie Film'],
      dtype='object', name='genre', length=179)

Unnamed: 0,genre,level_1,year,prop_signif_per_genre_per_year,avg_slope_change_significant,se_slope_change_significant,avg_mag_slope_change_significant,se_slope_change_magnitude_significant,avg_slope_change_global
0,Absurdism,0,1888,0.0,0.0,0.0,0.0,0.0,0.0
1,Absurdism,1,1889,0.0,0.0,0.0,0.0,0.0,0.0
2,Absurdism,2,1890,0.0,0.0,0.0,0.0,0.0,0.0
3,Absurdism,3,1891,0.0,0.0,0.0,0.0,0.0,0.0
4,Absurdism,4,1892,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
46822,Zombie Film,124,2012,0.0,0.0,0.0,0.0,0.0,0.0
46823,Zombie Film,125,2013,0.0,0.0,0.0,0.0,0.0,0.0
46824,Zombie Film,126,2014,0.0,0.0,0.0,0.0,0.0,0.0
46825,Zombie Film,127,2015,0.0,0.0,0.0,0.0,0.0,0.0


In [315]:
# Dropping columns 
name_by_genre_per_year_prop_filled_filtered_df.drop(columns=['level_1'], inplace=True)
display(name_by_genre_per_year_prop_filled_filtered_df)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,genre,year,prop_signif_per_genre_per_year,avg_slope_change_significant,se_slope_change_significant,avg_mag_slope_change_significant,se_slope_change_magnitude_significant,avg_slope_change_global
0,Absurdism,1888,0.0,0.0,0.0,0.0,0.0,0.0
1,Absurdism,1889,0.0,0.0,0.0,0.0,0.0,0.0
2,Absurdism,1890,0.0,0.0,0.0,0.0,0.0,0.0
3,Absurdism,1891,0.0,0.0,0.0,0.0,0.0,0.0
4,Absurdism,1892,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...
46822,Zombie Film,2012,0.0,0.0,0.0,0.0,0.0,0.0
46823,Zombie Film,2013,0.0,0.0,0.0,0.0,0.0,0.0
46824,Zombie Film,2014,0.0,0.0,0.0,0.0,0.0,0.0
46825,Zombie Film,2015,0.0,0.0,0.0,0.0,0.0,0.0


In [316]:
# #reset index and reset index
# name_by_genre_per_year_prop_filled_df.reset_index(inplace=True).set_index(['genre'], inplace=True)
# display(name_by_genre_per_year_prop_filled_df)

In [317]:
display(name_by_genre_per_year_prop_filled_filtered_df.isna().sum())
# # Drop NaN values
# name_by_genre_per_year_prop_df.fillna(0, inplace=True)
# display(name_by_genre_per_year_prop_df)
# # Sanity check
# display(name_by_genre_per_year_prop_df.isna().sum())

genre                                    0
year                                     0
prop_signif_per_genre_per_year           0
avg_slope_change_significant             0
se_slope_change_significant              0
avg_mag_slope_change_significant         0
se_slope_change_magnitude_significant    0
avg_slope_change_global                  0
dtype: int64

### Saving the data

In [318]:
# Add the genre as a column of the dataframe and save as csv
name_by_genre_per_year_prop_filled_filtered_df.to_csv(os.path.join(ready_for_web, 'movie_genre_per_year_significant.csv'), index=False)

In [319]:
# Resaving data for Circle Packing with only movie genre kept in time analysis
# Add the genre as a column of the dataframe and save as csv
# Filter the original DataFrame based on the selected genres
name_by_genre_prop_df.reset_index(inplace=True)
name_by_genre_prop_filtered_df = name_by_genre_prop_df[name_by_genre_prop_df['genre'].isin(selected_genres)]
display(name_by_genre_prop_filtered_df)
name_by_genre_prop_filtered_df.to_csv(os.path.join(ready_for_web, 'movie_genre_significant_filtered.csv'), index=False)

Unnamed: 0,genre,nb_films_in_genre,nb_names_in_genre,nb_names_signi_in_genre,prop_names_signi_in_genre_per_total_film_in_genre,is_na_sum,prop_signif_per_genre,prop_non_signi,prop_nan,avg_slope_change_significant,se_slope_change_significant,avg_mag_slope_change_significant,se_mag_slope_change_significant,avg_slope_change_global
0,Absurdism,740.0,721.0,64.0,0.086486,19.0,0.086486,0.778378,0.135135,0.003319,0.003524,0.014686,0.003029,0.000370
2,Action,34780.0,31575.0,2995.0,0.086113,3205.0,0.086113,0.692984,0.220903,0.001634,0.000441,0.013268,0.000370,0.000169
3,Action Comedy,1036.0,984.0,92.0,0.088803,52.0,0.088803,0.723938,0.187259,-0.000049,0.002079,0.012144,0.001644,0.000061
4,Action Thrillers,2911.0,2755.0,274.0,0.094126,156.0,0.094126,0.734112,0.171762,-0.000393,0.001593,0.014340,0.001336,0.000197
5,Action/Adventure,21112.0,19502.0,1942.0,0.091986,1610.0,0.091986,0.706660,0.201355,0.001518,0.000543,0.013323,0.000452,0.000156
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
268,Western,6028.0,4999.0,501.0,0.083112,1029.0,0.083112,0.617618,0.299270,-0.000646,0.001662,0.020063,0.001399,-0.000286
269,Whodunit,390.0,360.0,43.0,0.110256,30.0,0.110256,0.705128,0.184615,-0.001600,0.007410,0.025495,0.006284,-0.000346
271,Workplace Comedy,677.0,654.0,76.0,0.112260,23.0,0.112260,0.776957,0.110783,0.002557,0.001460,0.008550,0.001116,0.000441
272,World cinema,19067.0,15344.0,945.0,0.049562,3723.0,0.049562,0.631353,0.319085,0.000796,0.000640,0.009445,0.000561,-0.000050


## Question 3: Attendence/popularity + ratings

In [320]:
# The dataframe "name_by_movie_aggregate_df" already contains the wanted caracteristics
display(name_by_movie_aggregate_df)
name_by_movie_aggregate_df['numVotes'].max()

#Proportion of the film that had an influence in data segmented by number of votes

prop_0_10k = len(name_by_movie_aggregate_df[(name_by_movie_aggregate_df['numVotes'] < 10000) & (name_by_movie_aggregate_df['p_value'] < 0.1)]) / len(name_by_movie_aggregate_df[name_by_movie_aggregate_df['numVotes'] < 10000])

print(f"Proportion of movies with numVotes < 10k and p_value < 0.1: {prop_0_10k :.3%}")

prop_10k_100k = len(name_by_movie_aggregate_df[(name_by_movie_aggregate_df['numVotes'] > 10000) & (name_by_movie_aggregate_df['numVotes'] < 100000) & (name_by_movie_aggregate_df['p_value'] < 0.1)]) / len(name_by_movie_aggregate_df[(name_by_movie_aggregate_df['numVotes'] > 10000) & (name_by_movie_aggregate_df['numVotes'] < 100000)])

print(f"Proportion of movies with numVotes in [10k-100k] and p_value < 0.1: {prop_10k_100k :.3%}")

prop_100k_1M = len(name_by_movie_aggregate_df[(name_by_movie_aggregate_df['numVotes'] > 100000) & (name_by_movie_aggregate_df['numVotes'] < 1000000) & (name_by_movie_aggregate_df['p_value'] < 0.1)]) / len(name_by_movie_aggregate_df[(name_by_movie_aggregate_df['numVotes'] > 100000) & (name_by_movie_aggregate_df['numVotes'] < 1000000)])

print(f"Proportion of movies with numVotes in [100k-1M] and p_value < 0.1: {prop_100k_1M :.3%}")

prop_greater_1M = len(name_by_movie_aggregate_df[(name_by_movie_aggregate_df['numVotes'] > 1000000) & (name_by_movie_aggregate_df['p_value'] < 0.1)]) / len(name_by_movie_aggregate_df[(name_by_movie_aggregate_df['numVotes'] > 1000000)])

print(f"Proportion of movies with numVotes > 1M and p_value < 0.1: {prop_greater_1M :.3%}")


len(name_by_movie_aggregate_df[(name_by_movie_aggregate_df['numVotes'] > 1000000) & (name_by_movie_aggregate_df['p_value'] < 0.1)]['numVotes'].unique())


Unnamed: 0_level_0,char_words,order,gender,t_stat,p_value,slope_change,mov_name,year,month,revenue,numVotes,averageRating
wiki_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
3217,Gold,6.0,,,,0.000000,Army of Darkness,1992,10.0,21502796.0,191068,7.4
3217,Linda,7.0,F,-0.416786,0.684853,0.000673,Army of Darkness,1992,10.0,21502796.0,191068,7.4
3217,Henry,4.0,M,-2.031668,0.067058,0.002513,Army of Darkness,1992,10.0,21502796.0,191068,7.4
3217,Duke,4.0,M,0.579441,0.573967,-0.000113,Army of Darkness,1992,10.0,21502796.0,191068,7.4
3217,Warrior,9.0,M,,,0.000000,Army of Darkness,1992,10.0,21502796.0,191068,7.4
...,...,...,...,...,...,...,...,...,...,...,...,...
37478048,Ajay,9.0,M,-0.819213,0.430057,0.000130,Mr. Bechara,1996,,,395,5.4
37501922,Murphy,3.0,F,1.264175,0.232298,-0.000365,Terminal Bliss,1992,,,245,4.4
37501922,Hunter,1.0,M,-7.083089,0.000020,0.036603,Terminal Bliss,1992,,,245,4.4
37501922,John,1.0,M,-2.172964,0.052505,0.012557,Terminal Bliss,1992,,,245,4.4


Proportion of movies with numVotes < 10k and p_value < 0.1: 13.350%
Proportion of movies with numVotes in [10k-100k] and p_value < 0.1: 15.580%
Proportion of movies with numVotes in [100k-1M] and p_value < 0.1: 15.919%
Proportion of movies with numVotes > 1M and p_value < 0.1: 14.890%


52

Assumption: 
-Attendence is estimated by the number of votes
-A threshold of # of votes anove wich we start to study the influence of rating 

Ideas: 

-separate data according to number of votes & then separate data accordimng to rating 

-separate first according to votes and then in the segments of votes separates bad and good reviews

Question 4 : Faire la moyenne

In [321]:
#We segment the data frame according to the number of votes

votes_seg_0_10k = name_by_movie_aggregate_df[(name_by_movie_aggregate_df['numVotes'] < 10000) & (name_by_movie_aggregate_df['p_value'] < 0.1)]
votes_seg_10k_100k = name_by_movie_aggregate_df[(name_by_movie_aggregate_df['numVotes'] > 10000) & (name_by_movie_aggregate_df['numVotes'] < 100000) & (name_by_movie_aggregate_df['p_value'] < 0.1)]
votes_seg_100k_1M = name_by_movie_aggregate_df[(name_by_movie_aggregate_df['numVotes'] > 100000) & (name_by_movie_aggregate_df['numVotes'] < 1000000) & (name_by_movie_aggregate_df['p_value'] < 0.1)]
votes_seg_1M_inf = name_by_movie_aggregate_df[(name_by_movie_aggregate_df['numVotes'] > 1000000) & (name_by_movie_aggregate_df['p_value'] < 0.1)]

a = [votes_seg_0_10k['slope_change'].mean(), votes_seg_10k_100k['slope_change'].mean(), votes_seg_100k_1M['slope_change'].mean(), votes_seg_1M_inf['slope_change'].mean()]
index_names = ['0-10k', '10k-100k', '100k-1M', '1M-inf']
results = pd.DataFrame(a, index=index_names,columns = ['avg_slope_change'])
results.index.name = 'Seg_numVotes'
display(results)

Unnamed: 0_level_0,avg_slope_change
Seg_numVotes,Unnamed: 1_level_1
0-10k,0.001185
10k-100k,0.001746
100k-1M,0.002241
1M-inf,0.00075


In [322]:
name_by_movie_aggregate_df_significant = name_by_movie_aggregate_df[name_by_movie_aggregate_df['p_value'] < 0.1]

#We segment the data frame according to the number of votes

##Calculate the average cahnge of slopes for the different number of vote segments 

numVotes_bins = [0,10000,100000,1000000,np.inf]
segments_numVotes_label = ['0-10000','10000-100000','100000-1000000','1000000+']
name_by_movie_aggregate_df_significant['numVotes_segmented']  = pd.cut(name_by_movie_aggregate_df_significant['numVotes'],numVotes_bins,labels=segments_numVotes_label,right=True)

avg_magnitude_slopes_change_numVotes = name_by_movie_aggregate_df_significant.groupby('numVotes_segmented').apply(lambda x: pd.Series({
    'avg_magnitude_slopes_change': x['slope_change'].abs().mean(), 
    'avg_slope_change': x['slope_change'].mean()
    }))
display(avg_magnitude_slopes_change_numVotes)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0_level_0,avg_magnitude_slopes_change,avg_slope_change
numVotes_segmented,Unnamed: 1_level_1,Unnamed: 2_level_1
0-10000,0.013599,0.001185
10000-100000,0.010688,0.001746
100000-1000000,0.00951,0.002241
1000000+,0.010704,0.00075


#### Segementing w.r.t. movie rating

In [323]:
#We segment the data frame according to the rating
#Calculate the average change of slopes for the different rating segements

rating_quantiles = np.quantile(name_by_movie_aggregate_df_significant['averageRating'],[0.25,0.5,0.75])
#display(rating_quantiles)

# display((name_by_movie_aggregate_df_significant['averageRating']<= 5.5).sum()/len(name_by_movie_aggregate_df_significant))

rating_bins = [0,rating_quantiles[0],rating_quantiles[1],rating_quantiles[2],10]
segments_rating_label = ['0-{}'.format(rating_quantiles[0]),'{}-{}'.format(rating_quantiles[0], rating_quantiles[1]),'{}-{}'.format(rating_quantiles[1], rating_quantiles[2]),'{}-10'.format(rating_quantiles[2])]
name_by_movie_aggregate_df_significant['rating_segmented']  = pd.cut(name_by_movie_aggregate_df_significant['averageRating'],rating_bins,labels=segments_rating_label,right=True)
avg_slopes_change_rating = name_by_movie_aggregate_df_significant.groupby('rating_segmented').apply(lambda x: pd.Series({
    'avg_slopes_change': x['slope_change'].mean(),
    'avg_magnitude_slopes_change': x['slope_change'].abs().mean()
}))

display(avg_slopes_change_rating)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0_level_0,avg_slopes_change,avg_magnitude_slopes_change
rating_segmented,Unnamed: 1_level_1,Unnamed: 2_level_1
0-5.5,0.001597,0.011542
5.5-6.3,0.001772,0.012487
6.3-6.9,0.001278,0.012434
6.9-10,0.001332,0.012003


## Question 4: Character Importance in film

In [324]:
# The dataframe "name_by_movie_df" already contains the wanted caracteristics ("order")
# display(name_by_order_df)
name_by_order_df = name_by_movie_df.groupby("order").apply(lambda x: x[x['p_value'] < 0.1])
display(name_by_order_df)

Unnamed: 0_level_0,Unnamed: 1_level_0,char_words,order,gender,t_stat,p_value,slope_change
order,wiki_ID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0.0,4560,William,0.0,M,-3.378640,0.006157,0.015610
0.0,5035,Eric,0.0,M,-6.765221,0.000031,0.025314
0.0,5729,Harold,0.0,M,-2.233082,0.047271,0.001985
0.0,13901,Sawyer,0.0,F,-2.173715,0.052437,0.001466
0.0,19715,Gracie,0.0,F,-2.941462,0.013413,0.008645
...,...,...,...,...,...,...,...
94.0,9834441,Lily,94.0,F,4.655481,0.000699,-0.024797
95.0,20777420,Thomas,95.0,M,-4.265520,0.001331,0.011104
98.0,370064,Anderson,98.0,F,-4.352241,0.001151,0.003382
98.0,25079197,Tyson,98.0,M,4.232343,0.001407,-0.003961


In [325]:
name_by_order_prop_df = name_by_movie_df.groupby("order").apply(lambda x: pd.Series({
        'prop_signif_per_order': (x['p_value'] < 0.1).sum()/len(x['p_value']),
        'avg_slope_change_significant': x[x['p_value'] < 0.1]['slope_change'].mean(),
        'avg_slope_change_global': x['slope_change'].mean(),
        'avg_magnitude_slope_change_significant': x[x['p_value'] < 0.1]['slope_change'].abs().mean(),
        'total_number_signif_per_order': (x['p_value'] < 0.1).sum(),
        'proportion_negative_SC' : (x[x['p_value'] < 0.1]['slope_change'] < 0).sum() / len(x[x['p_value'] < 0.1]['slope_change']),
        'proportion_positive_SC' : (x[x['p_value'] < 0.1]['slope_change'] > 0).sum() / len(x[x['p_value'] < 0.1]['slope_change']),
        'se_slope_change_magnitude_significant': x[x['p_value'] < 0.1]['slope_change'].abs().sem()
    }))
display(name_by_order_prop_df)



invalid value encountered in scalar divide


invalid value encountered in scalar divide



Unnamed: 0_level_0,prop_signif_per_order,avg_slope_change_significant,avg_slope_change_global,avg_magnitude_slope_change_significant,total_number_signif_per_order,proportion_negative_SC,proportion_positive_SC,se_slope_change_magnitude_significant
order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0.0,0.158931,0.001619,0.000161,0.014779,3883.0,0.265516,0.734484,0.000381
1.0,0.161459,0.000665,-0.000027,0.013909,3054.0,0.270792,0.729208,0.000432
2.0,0.155372,0.001877,0.000206,0.012083,2256.0,0.240248,0.759752,0.000436
3.0,0.151769,0.001556,0.000271,0.012001,1810.0,0.248066,0.751934,0.000445
4.0,0.149642,0.002004,0.000253,0.011517,1506.0,0.237052,0.762948,0.000475
...,...,...,...,...,...,...,...,...
151.0,0.000000,,-0.001630,,0.0,,,
152.0,0.000000,,0.000917,,0.0,,,
169.0,0.000000,,0.000035,,0.0,,,
300.0,0.000000,,0.000036,,0.0,,,


In [326]:
# Limiter les données jusqu'à l'ordre 100
filtered_df = name_by_order_prop_df[(name_by_order_prop_df.index <= 100) & (name_by_order_prop_df.index > 0)]

# Limiter la plage de la hauteur entre 0 et 0.1
y_range = [0, 0.03]

# Création du bar chart interactif avec sous-graphiques
fig = go.Figure()

# # Tracé pour avg magnitude slope change
# fig.add_trace(go.Bar(
#     x=filtered_df.index,
#     y=filtered_df['avg_magnitude_slope_change_significant'],
#     name='Avg Magnitude Slope Change',
#     marker_color='blue'
# ))

# Tracé pour la proportion de slope change négatif à l'intérieur de la barre de magnitude
fig.add_trace(go.Bar(
    x=filtered_df.index,
    y=filtered_df['proportion_negative_SC'] * filtered_df['avg_magnitude_slope_change_significant'],
    name='Proportion Slope Change Negatif',
    marker_color='red',
    offsetgroup=1
))

# Tracé pour la proportion de slope change positif à l'intérieur de la barre de magnitude
fig.add_trace(go.Bar(
    x=filtered_df.index,
    y=filtered_df['proportion_positive_SC'] * filtered_df['avg_magnitude_slope_change_significant'],
    name='Proportion Slope Change Positif',
    marker_color='green',
    offsetgroup=1
))

# Mise en forme du tracé
fig.update_layout(
    xaxis=dict(title='Order'),
    yaxis=dict(title='Magnitude / Proportion', range=y_range),
    barmode='stack'  # 'stack' empile les barres pour chaque order
)

In [327]:
fig.write_html("CaracterRole.html")

In [328]:
# Assumez que votre dataframe s'appelle name_by_order_prop_df

# Limiter les données jusqu'à l'ordre 20
filtered_df = name_by_order_prop_df[(name_by_order_prop_df.index <= 20) & (name_by_order_prop_df.index > 0)]

# Limiter la plage de la hauteur entre 0 et 0.03
y_range = [0, 0.03]

# Création du bar chart interactif avec sous-graphiques
fig = go.Figure()

# Tracé pour avg magnitude slope change avec erreur
fig.add_trace(go.Bar(
    x=filtered_df.index,
    y=filtered_df['avg_magnitude_slope_change_significant'],
    name='Avg Magnitude Slope Change',
    marker_color='orange',
    error_y=dict(
        type='data',
        array=filtered_df['se_slope_change_magnitude_significant'],
        visible=True
    )
))

# Mise en forme du tracé
fig.update_layout(
    xaxis=dict(title='Charater Order'),
    yaxis=dict(title='Slope Change Magnitude', range=y_range),
    barmode='stack'  # 'stack' empile les barres pour chaque order
)

# Affichage du graphique
fig.show()

#### Does movie genre and caracter role are linked ?

In [329]:
# Does the order influence is impacted by movie genre ? Study of impact due to role importance per movie genre
name_by_order_by_genre_prop_df = movie_genre_aggregate_df.groupby(['order','genre']).apply(lambda x: pd.Series({
        'prop_signif_per_order_per_genre': (x['p_value'] < 0.1).sum()/len(x['p_value']),
        'avg_slope_change_significant': x[x['p_value'] < 0.1]['slope_change'].mean(),
        'avg_magnitude_slope_change_significant': x[x['p_value'] < 0.1]['slope_change'].abs().mean(),
        'avg_slope_change_global': x['slope_change'].mean(),
        'total_number_signif_per_order_per_genre': (x['p_value'] < 0.1).sum(),
    }))
display(name_by_order_by_genre_prop_df)

Unnamed: 0_level_0,Unnamed: 1_level_0,prop_signif_per_order_per_genre,avg_slope_change_significant,avg_magnitude_slope_change_significant,avg_slope_change_global,total_number_signif_per_order_per_genre
order,genre,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0.0,Absurdism,0.202532,-0.014142,0.028102,-0.002293,16.0
0.0,Acid western,0.500000,0.002721,0.006006,0.000786,4.0
0.0,Action,0.160751,0.000318,0.013684,0.000086,702.0
0.0,Action Comedy,0.113402,-0.010151,0.012877,-0.000509,11.0
0.0,Action Thrillers,0.167102,-0.003755,0.016838,0.000162,64.0
...,...,...,...,...,...,...
302.0,Biographical film,0.000000,,,0.000000,0.0
302.0,Biography,0.000000,,,0.000000,0.0
302.0,Drama,0.000000,,,0.000000,0.0
302.0,Period piece,0.000000,,,0.000000,0.0


### Does the order of a name influence differently according to gender ?
<span style="color:red"> *Prendre seulement les valeur ou p less 0.1 pour faire l'etude des slopes ? Si on les gardes ça va influencer nos moyenne avec des truc pas significantes *</span>

<span style="color:red"> **Revoir **</span>

In [330]:
# Calculate the average magnitude of slope change on all the data
# Calculate the average magnitude of slope change on data having a slope change statistically significant
# Calculate the average of slope change on data having a slope change statistically significant
name_by_order_by_gender_prop_df = name_by_movie_df.groupby(['order','gender']).apply(lambda x: pd.Series({
        'prop_signif_per_order_per_genre': (x['p_value'] < 0.1).sum()/len(x['p_value']),
        'avg_slope_change_significant': x[x['p_value'] < 0.1]['slope_change'].mean(),
        'avg_slope_change_global': x['slope_change'].mean(),
        'avg_magnitude_slope_change_significant': x[x['p_value'] < 0.1]['slope_change'].abs().mean(),
        'avg_magnitude_slope_change_global': x['slope_change'].abs().mean(),
        'total_number_signif_per_order_per_genre': (x['p_value'] < 0.1).sum(),
    }))
display(name_by_order_by_gender_prop_df)


Unnamed: 0_level_0,Unnamed: 1_level_0,prop_signif_per_order_per_genre,avg_slope_change_significant,avg_slope_change_global,avg_magnitude_slope_change_significant,avg_magnitude_slope_change_global,total_number_signif_per_order_per_genre
order,gender,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0.0,F,0.177497,0.000784,-0.000065,0.014627,0.004073,1322.0
0.0,M,0.150572,0.002175,0.000282,0.014887,0.003522,2512.0
1.0,F,0.171285,-0.000144,-0.000253,0.014458,0.003878,1607.0
1.0,M,0.152623,0.001670,0.000217,0.013241,0.003267,1420.0
2.0,F,0.170154,0.001120,-0.000063,0.013023,0.003640,1049.0
...,...,...,...,...,...,...,...
151.0,M,0.000000,,-0.001630,,0.001630,0.0
152.0,F,0.000000,,0.000917,,0.000917,0.0
169.0,M,0.000000,,0.000035,,0.000035,0.0
300.0,M,0.000000,,0.000036,,0.000036,0.0


## Question 5: Caracter gender in film
<span style="color:green"> ok</span>

In [344]:
# The dataframe "name_by_movie" has everything we need
# Keep only significant value (5% level) an values higher than 10e-5
name_by_gender_df = name_by_movie_df.groupby('gender').apply(lambda x: x[(x['p_value'] <= alpha)])
display(name_by_gender_df)

Unnamed: 0_level_0,Unnamed: 1_level_0,char_words,order,gender,t_stat,p_value,slope_change
gender,wiki_ID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
F,4231,Jennifer,6.0,F,-2.455800,0.031916,0.088259
F,4560,Isabelle,2.0,F,-8.004577,0.000006,0.008832
F,5224,Susan,2.0,F,-4.547336,0.000834,0.063266
F,9835,Maggie,7.0,F,-2.247749,0.046071,0.002362
F,9979,Amanda,,F,-2.735891,0.019373,0.043621
...,...,...,...,...,...,...,...
M,36699915,Jackson,0.0,M,4.587186,0.000781,-0.026825
M,36814246,Man,6.0,M,-4.220301,0.001436,0.000037
M,36956792,Gunner,13.0,M,-4.245248,0.001377,0.002472
M,36956792,Charlie,5.0,M,-5.446114,0.000202,0.006446


In [332]:
# Assuming you have a Dash app set up
app = dash.Dash(__name__)

# Sample data
threshold = 10e-4

name_by_gender_df['abs_slope_change'] = name_by_gender_df['slope_change'].abs()
#name_by_gender_filtered_df = name_by_gender_df[name_by_gender_df['abs_slope_change'] > threshold]

# Create the initial figure
fig = go.Figure()

# Define color scale for both genders and signs
color_scale = {'M': {'Positive': 'orange', 'Negative': 'blue'},
               'F': {'Positive': 'orange', 'Negative': 'blue'}}

for gender in ['M', 'F']:
    for sign in ['Positive', 'Negative']:
        subset = name_by_gender_df[(name_by_gender_df['gender'] == gender) & (name_by_gender_df['slope_change'] * (-1) ** (sign == 'Positive') > 0)]
        fig.add_trace(go.Violin(x=subset['gender'], y=subset['abs_slope_change'],
                                name=f'{gender} ({sign})', side='positive' if sign == 'Positive' else 'negative',
                                line_color=color_scale[gender][sign]))

# Create the Dash layout
app.layout = html.Div([
    dcc.Slider(
        id='threshold-slider',
        min=10e-6,
        max=10e-2,
        step=10e-6,
        value=threshold,
        marks={i: f"{i:.0e}" for i in [10e-6, 10e-5, 10e-4, 10e-3]},
        tooltip={'placement': 'bottom', 'always_visible': True}
    ),
    dcc.Graph(id='gender-violin-plot', figure=fig),
])

# Define callback to update the plot based on the slider value
@app.callback(
    Output('gender-violin-plot', 'figure'),
    [Input('threshold-slider', 'value')]
)
def update_plot(threshold_value):
    updated_df = name_by_gender_df[name_by_gender_df['abs_slope_change'] > threshold_value]
    updated_fig = go.Figure()

    for gender in ['M', 'F']:
        for sign in ['Positive', 'Negative']:
            subset = updated_df[(updated_df['gender'] == gender) & (updated_df['slope_change'] * (-1) ** (sign == 'Positive') > 0)]
            updated_fig.add_trace(go.Violin(x=subset['gender'], y=subset['abs_slope_change'],
                                            name=f'{gender} ({sign})', side='positive' if sign == 'Positive' else 'negative',
                                            line_color=color_scale[gender][sign]))

    return updated_fig

# Run the app
if __name__ == '__main__':
    app.run_server(debug=True)



In [333]:
# Doesn't work as intended => can't use this plot
# fig = go.Figure()

# # Male (M) Violin Plot
# fig.add_trace(go.Violin(x=name_by_gender_filtered_df['gender'][(name_by_gender_filtered_df['gender'] == 'M') & (name_by_gender_filtered_df['slope_change'] < 0)],
#                         y=name_by_gender_filtered_df['abs_slope_change'][(name_by_gender_filtered_df['gender'] == 'M') & (name_by_gender_filtered_df['slope_change'] < 0)],
#                         legendgroup='Male', scalegroup='Male', name='Male (Negative)',
#                         side='negative',
#                         line_color='blue')
#              )
# fig.add_trace(go.Violin(x=name_by_gender_filtered_df['gender'][(name_by_gender_filtered_df['gender'] == 'M') & (name_by_gender_filtered_df['slope_change'] > 0)],
#                         y=name_by_gender_filtered_df['abs_slope_change'][(name_by_gender_filtered_df['gender'] == 'M') & (name_by_gender_filtered_df['slope_change'] > 0)],
#                         legendgroup='Male', scalegroup='Male', name='Male (Positive)',
#                         side='positive',
#                         line_color='orange')
#              )
# # Female (F) Violin Plot
# fig.add_trace(go.Violin(x=name_by_gender_filtered_df['gender'][(name_by_gender_filtered_df['gender'] == 'F') & (name_by_gender_filtered_df['slope_change'] < 0)],
#                         y=name_by_gender_filtered_df['abs_slope_change'][(name_by_gender_filtered_df['gender'] == 'F') & (name_by_gender_filtered_df['slope_change'] < 0)],
#                         legendgroup='Female', scalegroup='Female', name='Female (Negative)',
#                         side='negative',
#                         line_color='blue')
#              )
# fig.add_trace(go.Violin(x=name_by_gender_filtered_df['gender'][(name_by_gender_filtered_df['gender'] == 'F') & (name_by_gender_filtered_df['slope_change'] > 0)],
#                         y=name_by_gender_filtered_df['abs_slope_change'][(name_by_gender_filtered_df['gender'] == 'F') & (name_by_gender_filtered_df['slope_change'] > 0)],
#                         legendgroup='Female', scalegroup='Female', name='Female (Positive)',
#                         side='positive',
#                         line_color='orange')
#              )

# #fig.update_yaxes(type="log")  # Set y-axis to logarithmic scale
# fig.update_traces(meanline_visible=True)
# fig.update_layout(violingap=0, violinmode='overlay')
# fig.show()
# fig.write_html("Question5_1.html")


In [334]:
#Average slope change 
name_by_gender_prop_df = name_by_movie_df.groupby("gender").apply(lambda x: pd.Series({
        'prop_signif_per_gender': (x['p_value'] < alpha).sum()/len(x['p_value']),
        'avg_slope_change_per_gender_significant': x[x['p_value'] < alpha]['slope_change'].mean(),
        'se_slope_change_per_gender_significant': x[x['p_value'] < alpha]['slope_change'].sem(),
        'avg_slope_change_per_gender_global': x['slope_change'].mean(),
        'avg_mag_slope_change_per_gender_significant': x[x['p_value'] < alpha]['slope_change'].abs().mean(),
        'se_mag_slope_change_per_gender_significant': x[x['p_value'] < alpha]['slope_change'].abs().sem(),
        'avg_mag_slope_change_per_gender_global': x['slope_change'].abs().mean(),
        'total_number_signif_per_gender': (x['p_value'] < alpha).sum(),
    }))
display(name_by_gender_prop_df)

Unnamed: 0_level_0,prop_signif_per_gender,avg_slope_change_per_gender_significant,se_slope_change_per_gender_significant,avg_slope_change_per_gender_global,avg_mag_slope_change_per_gender_significant,se_mag_slope_change_per_gender_significant,avg_mag_slope_change_per_gender_global,total_number_signif_per_gender
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
F,0.113346,0.001698,0.000346,0.00011,0.014831,0.000299,0.0032,7107.0
M,0.090571,0.002384,0.000249,0.000249,0.01363,0.000208,0.002523,9487.0


In [335]:
# Compute the standard error - Sanity check - Validated
#se_F = name_by_gender_df[name_by_gender_df.index.isin(['F'])]['abs_slope_change'].sem()
#se_M = name_by_gender_df[name_by_gender_df.index.isin(['M'])]['abs_slope_change'].sem()

fig = go.Figure()
colors = {'M': 'blue', 'F': 'pink'}
fig.add_trace(go.Bar(
    #name='Control',
    x=name_by_gender_prop_df.index,
    y=name_by_gender_prop_df['avg_mag_slope_change_per_gender_significant'],
    error_y=dict(type='data', array=2*name_by_gender_prop_df['se_mag_slope_change_per_gender_significant']),
    marker_color=[colors[gender] for gender in name_by_gender_prop_df.index]
))
# fig.add_trace(go.Bar(
#     name='Experimental',
#     x=['Trial 1', 'Trial 2', 'Trial 3'], y=[4, 7, 3],
#     error_y=dict(type='data', array=[0.5, 1, 2])
# ))
fig.update_layout(barmode='group')
#fig.update_yaxes(type="log")
fig.show()

In [336]:
t_value, p_value = stats.ttest_ind(name_by_gender_df.loc[name_by_gender_df['gender'] == 'M']['abs_slope_change'], name_by_gender_df.loc[name_by_gender_df['gender'] == 'F']['abs_slope_change'])
display("p-value is {:.5f}".format(p_value))

'p-value is 0.00067'

#### Does caracter gender and movie genre are linked ?

##### Try to group by movie genre and see the distribution Men/Woment for the 5 movie genre with most data 

In [337]:
display(movie_genre_aggregate_df)
most_data_per_genre = movie_genre_aggregate_df[movie_genre_aggregate_df['p_value'] < alpha].groupby(['genre']).count().nlargest(5, columns="p_value").index
display(most_data_per_genre)
genre_with_most_data_df = movie_genre_aggregate_df[movie_genre_aggregate_df['genre'].isin(most_data_per_genre) & (movie_genre_aggregate_df['p_value'] < alpha)]
display(genre_with_most_data_df)

Unnamed: 0_level_0,char_words,order,gender,t_stat,p_value,slope_change,genre
wiki_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
3217,Gold,6.0,,,,0.0,Action
3217,Gold,6.0,,,,0.0,Comedy
3217,Gold,6.0,,,,0.0,Time travel
3217,Gold,6.0,,,,0.0,Black comedy
3217,Gold,6.0,,,,0.0,Zombie Film
...,...,...,...,...,...,...,...
37241569,,,,,,,Action
37476824,,,,,,,Comedy
37476824,,,,,,,Crime Comedy
37476824,,,,,,,Caper story


Index(['Drama', 'Comedy', 'Thriller', 'Romance Film', 'Action'], dtype='object', name='genre')

Unnamed: 0_level_0,char_words,order,gender,t_stat,p_value,slope_change,genre
wiki_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
3837,Jim,1.0,M,-2.715964,0.020076,0.006715,Comedy
3947,Hunter,13.0,M,-4.938567,0.000444,0.002903,Thriller
4231,Jennifer,6.0,F,-2.455800,0.031916,0.088259,Action
4231,Jennifer,6.0,F,-2.455800,0.031916,0.088259,Comedy
4560,William,0.0,M,-3.378640,0.006157,0.015610,Action
...,...,...,...,...,...,...,...
36699915,Jackson,0.0,M,4.587186,0.000781,-0.026825,Action
36699915,Jackson,0.0,M,4.587186,0.000781,-0.026825,Drama
36814246,Man,6.0,M,-4.220301,0.001436,0.000037,Drama
36814246,Mary,1.0,F,-2.783137,0.017804,0.041502,Drama


In [338]:
# Does the gender influence is impacted by movie genre ? Study of impact due to role importance per movie genre
name_by_gender_by_genre_prop_df = genre_with_most_data_df.groupby(['gender','genre']).apply(lambda x: pd.Series({
        'prop_signif_per_gender_per_genre': (x['p_value'] < alpha).sum()/len(x['p_value']),
        'avg_slope_change_per_gender_per_genre_significant': x[x['p_value'] < alpha]['slope_change'].mean(),
        'se_slope_change_per_gender_per_genre_significant': x[x['p_value'] < alpha]['slope_change'].sem(),
        'avg_slope_change_per_gender_per_genre_global': x['slope_change'].mean(),
        'avg_mag_slope_change_per_gender_per_genre_significant': x[x['p_value'] < alpha]['slope_change'].abs().mean(),
        'se_mag_slope_change_per_gender_per_genre_significant': x[x['p_value'] < alpha]['slope_change'].abs().sem(),
        'avg_mag_slope_change_per_gender_per_genre_global': x['slope_change'].abs().mean(),
        'total_number_signif_per_gender_per_genre': (x['p_value'] < alpha).sum(),
    }))
display(name_by_gender_by_genre_prop_df)


Unnamed: 0_level_0,Unnamed: 1_level_0,prop_signif_per_gender_per_genre,avg_slope_change_per_gender_per_genre_significant,se_slope_change_per_gender_per_genre_significant,avg_slope_change_per_gender_per_genre_global,avg_mag_slope_change_per_gender_per_genre_significant,se_mag_slope_change_per_gender_per_genre_significant,avg_mag_slope_change_per_gender_per_genre_global,total_number_signif_per_gender_per_genre
gender,genre,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
F,Action,1.0,0.000447,0.00087,0.000447,0.01398,0.00075,0.01398,1002.0
F,Comedy,1.0,0.002229,0.000589,0.002229,0.01473,0.000512,0.01473,2505.0
F,Drama,1.0,0.002089,0.000417,0.002089,0.013669,0.000357,0.013669,3958.0
F,Romance Film,1.0,0.001437,0.000738,0.001437,0.01464,0.000656,0.01464,1863.0
F,Thriller,1.0,0.002103,0.000609,0.002103,0.013739,0.000511,0.013739,1688.0
M,Action,1.0,0.002146,0.000512,0.002146,0.013006,0.00042,0.013006,1927.0
M,Comedy,1.0,0.002474,0.000433,0.002474,0.012532,0.000366,0.012532,2847.0
M,Drama,1.0,0.002329,0.000326,0.002329,0.013589,0.000269,0.013589,5325.0
M,Romance Film,1.0,0.002522,0.000539,0.002522,0.013558,0.000448,0.013558,1982.0
M,Thriller,1.0,0.002065,0.000469,0.002065,0.013301,0.000389,0.013301,2529.0


In [339]:
# Assuming you have a DataFrame named name_by_gender_df with 'gender' and 'genre' columns
# name_by_gender_df should contain the relevant columns such as 'avg_mag_slope_change_per_gender_significant' and 'se_mag_slope_change_per_gender_significant'

fig = go.Figure()

colors = {'M': 'blue', 'F': 'pink'}

genres = most_data_per_genre
display(genres)

for genre in genres:
    genre_data = name_by_gender_by_genre_prop_df.xs(genre, level='genre')
    display(genre_data)
    # Bar for men
    fig.add_trace(go.Bar(
        x=genre_data.index,
        y=genre_data[genre_data.index == 'M']['avg_mag_slope_change_per_gender_per_genre_significant'],
        error_y=dict(type='data', array=2 * genre_data[genre_data.index == 'M']['se_mag_slope_change_per_gender_per_genre_significant']),
        marker_color=colors['M'],
        name=f'{genre} - Men'
    ))

    # Bar for women
    fig.add_trace(go.Bar(
        x=genre_data.index,
        y=genre_data[genre_data.index == 'F']['avg_mag_slope_change_per_gender_per_genre_significant'],
        error_y=dict(type='data', array=2 * genre_data[genre_data.index == 'F']['se_mag_slope_change_per_gender_per_genre_significant']),
        marker_color=colors['F'],
        name=f'{genre} - Women'
    ))

fig.update_layout(barmode='group', xaxis={'categoryorder':'total ascending'})
fig.show()


Index(['Drama', 'Comedy', 'Thriller', 'Romance Film', 'Action'], dtype='object', name='genre')

Unnamed: 0_level_0,prop_signif_per_gender_per_genre,avg_slope_change_per_gender_per_genre_significant,se_slope_change_per_gender_per_genre_significant,avg_slope_change_per_gender_per_genre_global,avg_mag_slope_change_per_gender_per_genre_significant,se_mag_slope_change_per_gender_per_genre_significant,avg_mag_slope_change_per_gender_per_genre_global,total_number_signif_per_gender_per_genre
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
F,1.0,0.002089,0.000417,0.002089,0.013669,0.000357,0.013669,3958.0
M,1.0,0.002329,0.000326,0.002329,0.013589,0.000269,0.013589,5325.0


Unnamed: 0_level_0,prop_signif_per_gender_per_genre,avg_slope_change_per_gender_per_genre_significant,se_slope_change_per_gender_per_genre_significant,avg_slope_change_per_gender_per_genre_global,avg_mag_slope_change_per_gender_per_genre_significant,se_mag_slope_change_per_gender_per_genre_significant,avg_mag_slope_change_per_gender_per_genre_global,total_number_signif_per_gender_per_genre
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
F,1.0,0.002229,0.000589,0.002229,0.01473,0.000512,0.01473,2505.0
M,1.0,0.002474,0.000433,0.002474,0.012532,0.000366,0.012532,2847.0


Unnamed: 0_level_0,prop_signif_per_gender_per_genre,avg_slope_change_per_gender_per_genre_significant,se_slope_change_per_gender_per_genre_significant,avg_slope_change_per_gender_per_genre_global,avg_mag_slope_change_per_gender_per_genre_significant,se_mag_slope_change_per_gender_per_genre_significant,avg_mag_slope_change_per_gender_per_genre_global,total_number_signif_per_gender_per_genre
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
F,1.0,0.002103,0.000609,0.002103,0.013739,0.000511,0.013739,1688.0
M,1.0,0.002065,0.000469,0.002065,0.013301,0.000389,0.013301,2529.0


Unnamed: 0_level_0,prop_signif_per_gender_per_genre,avg_slope_change_per_gender_per_genre_significant,se_slope_change_per_gender_per_genre_significant,avg_slope_change_per_gender_per_genre_global,avg_mag_slope_change_per_gender_per_genre_significant,se_mag_slope_change_per_gender_per_genre_significant,avg_mag_slope_change_per_gender_per_genre_global,total_number_signif_per_gender_per_genre
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
F,1.0,0.001437,0.000738,0.001437,0.01464,0.000656,0.01464,1863.0
M,1.0,0.002522,0.000539,0.002522,0.013558,0.000448,0.013558,1982.0


Unnamed: 0_level_0,prop_signif_per_gender_per_genre,avg_slope_change_per_gender_per_genre_significant,se_slope_change_per_gender_per_genre_significant,avg_slope_change_per_gender_per_genre_global,avg_mag_slope_change_per_gender_per_genre_significant,se_mag_slope_change_per_gender_per_genre_significant,avg_mag_slope_change_per_gender_per_genre_global,total_number_signif_per_gender_per_genre
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
F,1.0,0.000447,0.00087,0.000447,0.01398,0.00075,0.01398,1002.0
M,1.0,0.002146,0.000512,0.002146,0.013006,0.00042,0.013006,1927.0


In [340]:
fig = px.box(name_by_gender_df, x='gender', y='abs_slope_change', color='gender')
fig.update_yaxes(type="log")
fig.show()

In [341]:


#fig.write_html("desktop.html")
