# Parallel questions study
In this notebook, we carry out the study of the parallel questions related to the influence of movies on baby names, therefore conduction a global analysis.

In [243]:
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf
from scipy import stats
import seaborn as sns
from statsmodels.stats.proportion import proportions_ztest
import plotly.graph_objects as go

In [244]:
folder_processed_data_path = './data/processed_data/'

# Dataset containing month of release
movie_df = pd.read_csv(os.path.join(folder_processed_data_path, 'movie_df.csv'))
movie_df.set_index(['wiki_ID'], inplace=True)
display(movie_df)

# Dataset containing p_value
name_by_movie_df = pd.read_csv(os.path.join(folder_processed_data_path, 'name_by_movie_ordered_pvalue_10_5_df.csv'))
name_by_movie_df.set_index(['wiki_ID'], inplace=True)
display(name_by_movie_df)

# Dataset containing movie genre
movie_genres_df = pd.read_csv(os.path.join(folder_processed_data_path, 'movie_genres_df.csv'))
movie_genres_df.set_index(['wiki_ID'], inplace=True)
display(movie_genres_df)

# Selection of significance level
alpha = 0.05

Unnamed: 0_level_0,mov_name,year,month,revenue,numVotes,averageRating
wiki_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
975900,Ghosts of Mars,2001,8.0,14010832.0,56880,4.9
3196793,Getting Away with Murder: The JonBenét Ramsey ...,2000,2.0,,69,6.0
28463795,Brun bitter,1988,,,40,5.6
9363483,White Of The Eye,1987,,,2891,6.1
261236,A Woman in Flames,1983,,,623,5.9
...,...,...,...,...,...,...
35228177,Mermaids: The Body Found,2011,3.0,,1711,4.6
34980460,Knuckle,2011,1.0,,3192,6.8
9971909,Another Nice Mess,1972,9.0,,111,5.8
913762,The Super Dimension Fortress Macross II: Lover...,1992,5.0,,657,6.0


Unnamed: 0_level_0,char_words,gender,order,p_value,slope_change,t_stat
wiki_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
3217,Gold,,6.0,,,
3217,Linda,F,7.0,0.676072,-0.000675,0.429187
3217,Henry,M,4.0,0.068422,-0.002435,2.019954
3217,Duke,M,4.0,0.582585,0.000108,-0.566260
3217,Warrior,M,9.0,,,
...,...,...,...,...,...,...
37478048,Ajay,M,9.0,0.436957,-0.000126,0.806658
37501922,Murphy,F,3.0,0.234444,0.000354,-1.257988
37501922,Hunter,M,1.0,0.000021,-0.035578,7.051709
37501922,John,M,1.0,0.052067,-0.012279,2.177768


Unnamed: 0_level_0,genre
wiki_ID,Unnamed: 1_level_1
330,Comedy-drama
330,Drama
3217,Action
3217,Comedy
3217,Time travel
...,...
37476824,Crime Comedy
37476824,Caper story
37476824,Crime Fiction
37478048,Comedy film


How much movie genre ?


In [245]:
display(len(movie_genres_df['genre'].unique()))

363

**name_by_movie_df**: dataframe with names, p_value, slope_change

**movie_df**: dataframe with film caracteristics

**movie_genre_df**: dataframe with movie genre

**name_by_movie_aggregate_df**: **name_by_movie_df** + **movie_df**: dataframe with names, p_value, slope change + film caracteristics

**movie_genre_aggregate_df**: **name_by_movie_df** + **movie_genre_df**: dataframe with names, p_value, slope change + film genre

**movie_genre_aggregate_with_years_df**: dataframe with names, p_value, slope change + film genre + years

### Question 1: Month of release

In [246]:
# First, aggregate dataframe with p_value table with dataframe containing release month 
name_by_movie_aggregate_df = name_by_movie_df.merge(movie_df, how='left', left_on='wiki_ID', right_on='wiki_ID')
display(name_by_movie_aggregate_df)

Unnamed: 0_level_0,char_words,gender,order,p_value,slope_change,t_stat,mov_name,year,month,revenue,numVotes,averageRating
wiki_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
3217,Gold,,6.0,,,,Army of Darkness,1992,10.0,21502796.0,191068,7.4
3217,Linda,F,7.0,0.676072,-0.000675,0.429187,Army of Darkness,1992,10.0,21502796.0,191068,7.4
3217,Henry,M,4.0,0.068422,-0.002435,2.019954,Army of Darkness,1992,10.0,21502796.0,191068,7.4
3217,Duke,M,4.0,0.582585,0.000108,-0.566260,Army of Darkness,1992,10.0,21502796.0,191068,7.4
3217,Warrior,M,9.0,,,,Army of Darkness,1992,10.0,21502796.0,191068,7.4
...,...,...,...,...,...,...,...,...,...,...,...,...
37478048,Ajay,M,9.0,0.436957,-0.000126,0.806658,Mr. Bechara,1996,,,395,5.4
37501922,Murphy,F,3.0,0.234444,0.000354,-1.257988,Terminal Bliss,1992,,,245,4.4
37501922,Hunter,M,1.0,0.000021,-0.035578,7.051709,Terminal Bliss,1992,,,245,4.4
37501922,John,M,1.0,0.052067,-0.012279,2.177768,Terminal Bliss,1992,,,245,4.4


Divise year per season

In [247]:
summer = [6.0, 7.0, 8.0]
fall = [9.0,10.0,11.0]
winter = [12.0,1.0,2.0]
spring = [3.0,4.0,5.0]
summer_movies_df = name_by_movie_aggregate_df[name_by_movie_aggregate_df['month'].isin(summer)]
fall_movies_df = name_by_movie_aggregate_df[name_by_movie_aggregate_df['month'].isin(fall)]
winter_movies_df = name_by_movie_aggregate_df[name_by_movie_aggregate_df['month'].isin(winter)]
spring_movies_df = name_by_movie_aggregate_df[name_by_movie_aggregate_df['month'].isin(spring)]

display(summer_movies_df)
display(fall_movies_df)
display(winter_movies_df)
display(spring_movies_df)

Unnamed: 0_level_0,char_words,gender,order,p_value,slope_change,t_stat,mov_name,year,month,revenue,numVotes,averageRating
wiki_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
3746,Deckard,M,0.0,,,,Blade Runner,1982,6.0,33139618.0,804384,8.1
3746,Eldon,M,8.0,0.653938,-0.000105,0.460773,Blade Runner,1982,6.0,33139618.0,804384,8.1
3746,Lewis,M,12.0,0.327638,-0.000698,1.024419,Blade Runner,1982,6.0,33139618.0,804384,8.1
3746,Bear,M,11.0,,,,Blade Runner,1982,6.0,33139618.0,804384,8.1
3746,Leon,M,7.0,0.469966,0.000525,-0.748319,Blade Runner,1982,6.0,33139618.0,804384,8.1
...,...,...,...,...,...,...,...,...,...,...,...,...
36699915,Luke,M,5.0,0.846639,0.001421,-0.198023,Percy Jackson & the Olympians: Sea of Monsters,2013,8.0,,123248,5.7
36699915,Underwood,M,1.0,,,,Percy Jackson & the Olympians: Sea of Monsters,2013,8.0,,123248,5.7
36699915,Chase,F,2.0,0.149005,0.011448,-1.551732,Percy Jackson & the Olympians: Sea of Monsters,2013,8.0,,123248,5.7
36699915,Circe,F,,,,,Percy Jackson & the Olympians: Sea of Monsters,2013,8.0,,123248,5.7


Unnamed: 0_level_0,char_words,gender,order,p_value,slope_change,t_stat,mov_name,year,month,revenue,numVotes,averageRating
wiki_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
3217,Gold,,6.0,,,,Army of Darkness,1992,10.0,21502796.0,191068,7.4
3217,Linda,F,7.0,0.676072,-0.000675,0.429187,Army of Darkness,1992,10.0,21502796.0,191068,7.4
3217,Henry,M,4.0,0.068422,-0.002435,2.019954,Army of Darkness,1992,10.0,21502796.0,191068,7.4
3217,Duke,M,4.0,0.582585,0.000108,-0.566260,Army of Darkness,1992,10.0,21502796.0,191068,7.4
3217,Warrior,M,9.0,,,,Army of Darkness,1992,10.0,21502796.0,191068,7.4
...,...,...,...,...,...,...,...,...,...,...,...,...
37322106,Major,M,0.0,0.080101,-0.002548,1.927686,Jab Tak Hai Jaan,2012,11.0,,58012,6.7
37373877,Beth,F,5.0,0.425923,-0.000270,0.826799,Crazy Eights,2006,10.0,,3338,3.8
37373877,Patterson,F,5.0,,,,Crazy Eights,2006,10.0,,3338,3.8
37373877,Jennifer,F,0.0,0.687248,-0.003315,0.413414,Crazy Eights,2006,10.0,,3338,3.8


Unnamed: 0_level_0,char_words,gender,order,p_value,slope_change,t_stat,mov_name,year,month,revenue,numVotes,averageRating
wiki_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
3837,Lamarr,M,3.0,0.765429,0.000046,-0.305851,Blazing Saddles,1974,2.0,119500000.0,147934,7.7
3837,Van,M,11.0,0.249160,-0.000456,1.216756,Blazing Saddles,1974,2.0,119500000.0,147934,7.7
3837,Bart,M,0.0,0.861123,0.000158,-0.179090,Blazing Saddles,1974,2.0,119500000.0,147934,7.7
3837,Lyle,M,6.0,0.879211,-0.000105,0.155542,Blazing Saddles,1974,2.0,119500000.0,147934,7.7
3837,Buddy,M,18.0,0.965905,0.000018,-0.043728,Blazing Saddles,1974,2.0,119500000.0,147934,7.7
...,...,...,...,...,...,...,...,...,...,...,...,...
36956792,Kid,M,18.0,,,,The Water Horse: Legend of the Deep,2007,12.0,103071443.0,42523,6.4
36956792,Charlie,M,5.0,0.000208,-0.006215,5.427450,The Water Horse: Legend of the Deep,2007,12.0,103071443.0,42523,6.4
36956792,Beach,M,18.0,,,,The Water Horse: Legend of the Deep,2007,12.0,103071443.0,42523,6.4
36956792,Walker,M,8.0,0.561595,-0.000575,0.598551,The Water Horse: Legend of the Deep,2007,12.0,103071443.0,42523,6.4


Unnamed: 0_level_0,char_words,gender,order,p_value,slope_change,t_stat,mov_name,year,month,revenue,numVotes,averageRating
wiki_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
4560,Morrison,M,19.0,,,,Braveheart,1995,5.0,211409945.0,1072580,8.3
4560,Edward,M,3.0,0.713319,-0.000845,0.377036,Braveheart,1995,5.0,211409945.0,1072580,8.3
4560,Campbell,M,5.0,0.113180,-0.000474,1.721204,Braveheart,1995,5.0,211409945.0,1072580,8.3
4560,Murron,F,1.0,,,,Braveheart,1995,5.0,211409945.0,1072580,8.3
4560,William,M,0.0,0.006095,-0.015277,3.384440,Braveheart,1995,5.0,211409945.0,1072580,8.3
...,...,...,...,...,...,...,...,...,...,...,...,...
36814246,Girl,F,4.0,,,,Eraserhead,1977,3.0,7000000.0,124128,7.3
36814246,Mary,F,1.0,0.017282,-0.041302,2.799750,Eraserhead,1977,3.0,7000000.0,124128,7.3
36814246,Beautiful,F,4.0,,,,Eraserhead,1977,3.0,7000000.0,124128,7.3
36814246,Hall,F,4.0,,,,Eraserhead,1977,3.0,7000000.0,124128,7.3


In [248]:
summer_movies_df.index.unique()

Index([    3746,     3947,     4231,     4726,     4727,     4728,     4729,
           4730,     8481,     9979,
       ...
       36306987, 36329343, 36354051, 36354224, 36422681, 36448415, 36478252,
       36566804, 36617100, 36699915],
      dtype='int64', name='wiki_ID', length=4096)

In [249]:
prop_summer = len(summer_movies_df[summer_movies_df['p_value']<0.1])/len(summer_movies_df['p_value'])
display(prop_summer)
prop_fall = len(fall_movies_df[fall_movies_df['p_value']<0.1])/len(fall_movies_df['p_value'])
display(prop_fall)
prop_winter = len(winter_movies_df[winter_movies_df['p_value']<0.1])/len(winter_movies_df['p_value'])
display(prop_winter)
prop_spring = len(spring_movies_df[spring_movies_df['p_value']<0.1])/len(spring_movies_df['p_value'])
display(prop_spring)

0.13600104190407972

0.14431685722080306

0.13875639832480224

0.13644032637256256

Statistical test to assess whether proportion for different season are different or not

In [250]:
from scipy.stats import chi2_contingency

# Organize the data into a contingency table
observed_data = [
    [len(summer_movies_df[summer_movies_df['p_value'] < 0.1]), len(summer_movies_df['p_value'])],
    [len(fall_movies_df[fall_movies_df['p_value'] < 0.1]), len(fall_movies_df['p_value'])],
    [len(winter_movies_df[winter_movies_df['p_value'] < 0.1]), len(winter_movies_df['p_value'])],
    [len(spring_movies_df[spring_movies_df['p_value'] < 0.1]), len(spring_movies_df['p_value'])]
]

# Perform the chi-squared test
chi2, p, _, _ = chi2_contingency(observed_data)

# Print the results
print("Chi-squared value:", chi2)
print("P-value:", p)

Chi-squared value: 10.399003008260278
P-value: 0.015461904797585156


H0 : The proportions are all equal 

We can reject the null hypothesis at the 5% significance level.

### Question 2: Movie Genre has an impact ?

In [251]:
# First, aggregate dataframe with p_value with dataframe containing movie genre
# Outer merge required in order to obtain for each name of each film, all the possible genre it can be associated to 
movie_genre_aggregate_df = name_by_movie_df.merge(movie_genres_df, how='outer', left_on='wiki_ID', right_on='wiki_ID')
movie_genre_aggregate_df.head(25)

Unnamed: 0_level_0,char_words,gender,order,p_value,slope_change,t_stat,genre
wiki_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
3217,Gold,,6.0,,,,Action
3217,Gold,,6.0,,,,Comedy
3217,Gold,,6.0,,,,Time travel
3217,Gold,,6.0,,,,Black comedy
3217,Gold,,6.0,,,,Zombie Film
3217,Gold,,6.0,,,,Horror Comedy
3217,Gold,,6.0,,,,Action/Adventure
3217,Gold,,6.0,,,,Costume drama
3217,Gold,,6.0,,,,Stop motion
3217,Gold,,6.0,,,,Horror


In [252]:
# Need to drop the duplicates i.e. the instances that have the same wiki_ID for the same genre and same char words
movie_genre_aggregate_df.reset_index().drop_duplicates(subset=['genre', 'wiki_ID'], inplace=True)

First groupby test: can be removed when cleaning notebook

In [253]:
name_by_genre_significant_df = movie_genre_aggregate_df.groupby('genre').apply(lambda x: x[x['p_value'] < alpha])
display(name_by_genre_significant_df)

Unnamed: 0_level_0,Unnamed: 1_level_0,char_words,gender,order,p_value,slope_change,t_stat,genre
genre,wiki_ID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Absurdism,19701,Tim,M,1.0,0.001787,-0.010096,4.090696,Absurdism
Absurdism,46505,Ted,M,0.0,0.046833,-0.001105,2.238396,Absurdism
Absurdism,46505,Johnny,M,10.0,0.039946,-0.002623,2.328911,Absurdism
Absurdism,75261,Robert,M,9.0,0.004233,-0.053751,3.591418,Absurdism
Absurdism,75261,Dave,M,19.0,0.030485,-0.001459,2.481634,Absurdism
...,...,...,...,...,...,...,...,...
Zombie Film,28362996,Burke,M,,0.017814,-0.000241,2.782818,Zombie Film
Zombie Film,30430079,Holly,F,1.0,0.014981,-0.003623,2.879653,Zombie Film
Zombie Film,33432215,Sarah,F,7.0,0.000060,-0.020843,6.283305,Zombie Film
Zombie Film,33432215,Mack,M,4.0,0.023270,-0.001138,2.633292,Zombie Film


Trying to see why there are for some "movie genre" NaN value for sem computation but not for mean computation.

 ANSWER: due to the fact that there is only one data point in after the groupy and filtering in a given movie genre.

In [254]:
name_by_genre_significant_df.loc['Acid western']

Unnamed: 0_level_0,char_words,gender,order,p_value,slope_change,t_stat,genre
wiki_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
113651,William,M,0.0,0.006095,-0.015277,3.38444,Acid western
413426,Walker,M,0.0,0.01032,-0.000582,3.088164,Acid western
5579768,Jake,M,0.0,0.036284,-0.001215,2.383373,Acid western


In [255]:
# Try to compute number of film per genre
display(movie_genre_aggregate_df.reset_index().groupby('genre')['wiki_ID'].nunique())

# Sanity check for "Acid Western" ––> 9 movies
display(movie_genre_aggregate_df[movie_genre_aggregate_df['genre'] == 'Acid western'])
display(len(movie_genre_aggregate_df[movie_genre_aggregate_df['genre'] == 'Acid western']))

# Look at number of names/char_words per genre, here on "Acid Western"
display(movie_genre_aggregate_df[movie_genre_aggregate_df['genre'] == 'Acid western']['char_words'].nunique())

genre
Absurdism             91
Acid western           9
Action              7859
Action Comedy        162
Action Thrillers     497
                    ... 
World History         20
World cinema        7073
Wuxia                115
Z movie                3
Zombie Film          266
Name: wiki_ID, Length: 363, dtype: int64

Unnamed: 0_level_0,char_words,gender,order,p_value,slope_change,t_stat,genre
wiki_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
113651,Blake,M,0.0,0.059814,0.006433,-2.097935,Acid western
113651,Cole,M,3.0,0.304149,-0.008285,1.077898,Acid western
113651,William,M,0.0,0.006095,-0.015277,3.38444,Acid western
113651,Marvin,,,0.903854,-8.7e-05,0.12361,Acid western
113651,Thel,F,11.0,,,,Acid western
113651,Charlie,M,9.0,0.241112,-0.000633,1.239055,Acid western
113651,Tench,M,10.0,,,,Acid western
113651,Russell,F,11.0,0.182583,-0.001799,1.422612,Acid western
113651,Conway,M,4.0,,,,Acid western
113651,John,M,6.0,0.315054,-0.005516,1.052699,Acid western


32

28

In [256]:
# Compute proportion of impacted names by genre
# Also computation of non significant and nan proportion for sanity check
name_by_genre_prop_df = movie_genre_aggregate_df.groupby('genre').apply(lambda x: pd.Series({
        # Number of film in a given movie genre 
        'nb_films_in_genre': x.reset_index()['wiki_ID'].count(),
        # Number of total different names that appear in a given movie genre
        'nb_names_in_genre': x['char_words'].count(),
        # Number of different names per genre that are significantly impacted by a movie release from that genre
        'nb_names_signi_in_genre': x[x['p_value'] < alpha]['char_words'].count(),
        # Proportion of names significantly impacted by a movie genre divided by total number of films in this movie genre
        'prop_names_signi_in_genre_per_total_film_in_genre': (x[x['p_value'] < alpha]['char_words'].count())/(x.reset_index()['wiki_ID'].count()),
        'is_na_sum': x['slope_change'].isna().sum(),
        'prop_signif_per_genre': (x['p_value'] < alpha).sum()/len(x['p_value']),
        'prop_non_signi': (x['p_value'] > alpha).sum()/len(x['p_value']),
        'prop_nan': (x['p_value'].isna()).sum()/len(x['p_value']),
        'avg_slope_change_significant': x[x['p_value'] < alpha]['slope_change'].mean(),
        'se_slope_change_significant': x[x['p_value'] < alpha]['slope_change'].sem(),
        'avg_mag_slope_change_significant': x[x['p_value'] < alpha]['slope_change'].abs().mean(),
        'se_mag_slope_change_significant': x[x['p_value'] < alpha]['slope_change'].abs().sem(),
        'avg_slope_change_global': x['slope_change'].mean()
    }))
display(name_by_genre_prop_df)
name_by_genre_prop_df.head(50)


Unnamed: 0_level_0,nb_films_in_genre,nb_names_in_genre,nb_names_signi_in_genre,prop_names_signi_in_genre_per_total_film_in_genre,is_na_sum,prop_signif_per_genre,prop_non_signi,prop_nan,avg_slope_change_significant,se_slope_change_significant,avg_mag_slope_change_significant,se_mag_slope_change_significant,avg_slope_change_global
genre,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Absurdism,740.0,721.0,60.0,0.081081,220.0,0.081081,0.621622,0.297297,-0.002309,0.003895,0.016557,0.003258,-0.000527
Acid western,32.0,30.0,3.0,0.093750,8.0,0.093750,0.656250,0.250000,-0.005691,0.004796,0.005691,0.004796,-0.002524
Action,34780.0,31575.0,2829.0,0.081340,14374.0,0.081340,0.505377,0.413283,-0.001701,0.000460,0.013829,0.000381,-0.000264
Action Comedy,1036.0,984.0,87.0,0.083977,383.0,0.083977,0.546332,0.369691,0.000067,0.002149,0.012519,0.001672,-0.000097
Action Thrillers,2911.0,2755.0,257.0,0.088286,1032.0,0.088286,0.557197,0.354517,0.000562,0.001655,0.014784,0.001373,-0.000294
...,...,...,...,...,...,...,...,...,...,...,...,...,...
World History,20.0,0.0,0.0,0.000000,20.0,0.000000,0.000000,1.000000,,,,,
World cinema,19067.0,15344.0,837.0,0.043898,10538.0,0.043898,0.403420,0.552683,-0.000895,0.000707,0.010429,0.000609,0.000080
Wuxia,215.0,134.0,6.0,0.027907,174.0,0.027907,0.162791,0.809302,-0.002240,0.000951,0.002240,0.000951,-0.000327
Z movie,3.0,0.0,0.0,0.000000,3.0,0.000000,0.000000,1.000000,,,,,


Unnamed: 0_level_0,nb_films_in_genre,nb_names_in_genre,nb_names_signi_in_genre,prop_names_signi_in_genre_per_total_film_in_genre,is_na_sum,prop_signif_per_genre,prop_non_signi,prop_nan,avg_slope_change_significant,se_slope_change_significant,avg_mag_slope_change_significant,se_mag_slope_change_significant,avg_slope_change_global
genre,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Absurdism,740.0,721.0,60.0,0.081081,220.0,0.081081,0.621622,0.297297,-0.002309,0.003895,0.016557,0.003258,-0.000527
Acid western,32.0,30.0,3.0,0.09375,8.0,0.09375,0.65625,0.25,-0.005691,0.004796,0.005691,0.004796,-0.002524
Action,34780.0,31575.0,2829.0,0.08134,14374.0,0.08134,0.505377,0.413283,-0.001701,0.00046,0.013829,0.000381,-0.000264
Action Comedy,1036.0,984.0,87.0,0.083977,383.0,0.083977,0.546332,0.369691,6.7e-05,0.002149,0.012519,0.001672,-9.7e-05
Action Thrillers,2911.0,2755.0,257.0,0.088286,1032.0,0.088286,0.557197,0.354517,0.000562,0.001655,0.014784,0.001373,-0.000294
Action/Adventure,21112.0,19502.0,1855.0,0.087865,8037.0,0.087865,0.531451,0.380684,-0.001514,0.000561,0.013752,0.000462,-0.000236
Addiction Drama,245.0,228.0,30.0,0.122449,68.0,0.122449,0.6,0.277551,-0.000592,0.004282,0.013448,0.00348,-0.000249
Adult,365.0,244.0,30.0,0.082192,189.0,0.082192,0.4,0.517808,-0.003847,0.008919,0.023334,0.007829,-0.000797
Adventure,20830.0,18801.0,1744.0,0.083725,8882.0,0.083725,0.48987,0.426404,-0.001272,0.000606,0.013986,0.000506,-8.4e-05
Adventure Comedy,875.0,827.0,74.0,0.084571,331.0,0.084571,0.537143,0.378286,-0.003227,0.002155,0.011806,0.001697,-0.000664


In [257]:
name_by_genre_prop_df.isna().sum()
# Drop NaN values
name_by_genre_prop_df.dropna(inplace=True)
display(name_by_genre_prop_df)
# Sanity check
name_by_genre_prop_df.isna().sum()

Unnamed: 0_level_0,nb_films_in_genre,nb_names_in_genre,nb_names_signi_in_genre,prop_names_signi_in_genre_per_total_film_in_genre,is_na_sum,prop_signif_per_genre,prop_non_signi,prop_nan,avg_slope_change_significant,se_slope_change_significant,avg_mag_slope_change_significant,se_mag_slope_change_significant,avg_slope_change_global
genre,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Absurdism,740.0,721.0,60.0,0.081081,220.0,0.081081,0.621622,0.297297,-0.002309,0.003895,0.016557,0.003258,-0.000527
Acid western,32.0,30.0,3.0,0.093750,8.0,0.093750,0.656250,0.250000,-0.005691,0.004796,0.005691,0.004796,-0.002524
Action,34780.0,31575.0,2829.0,0.081340,14374.0,0.081340,0.505377,0.413283,-0.001701,0.000460,0.013829,0.000381,-0.000264
Action Comedy,1036.0,984.0,87.0,0.083977,383.0,0.083977,0.546332,0.369691,0.000067,0.002149,0.012519,0.001672,-0.000097
Action Thrillers,2911.0,2755.0,257.0,0.088286,1032.0,0.088286,0.557197,0.354517,0.000562,0.001655,0.014784,0.001373,-0.000294
...,...,...,...,...,...,...,...,...,...,...,...,...,...
Women in prison films,62.0,52.0,7.0,0.112903,26.0,0.112903,0.467742,0.419355,-0.017517,0.010727,0.020497,0.009808,-0.003645
Workplace Comedy,677.0,654.0,74.0,0.109306,155.0,0.109306,0.661743,0.228951,-0.002591,0.001459,0.008569,0.001102,-0.000553
World cinema,19067.0,15344.0,837.0,0.043898,10538.0,0.043898,0.403420,0.552683,-0.000895,0.000707,0.010429,0.000609,0.000080
Wuxia,215.0,134.0,6.0,0.027907,174.0,0.027907,0.162791,0.809302,-0.002240,0.000951,0.002240,0.000951,-0.000327


nb_films_in_genre                                    0
nb_names_in_genre                                    0
nb_names_signi_in_genre                              0
prop_names_signi_in_genre_per_total_film_in_genre    0
is_na_sum                                            0
prop_signif_per_genre                                0
prop_non_signi                                       0
prop_nan                                             0
avg_slope_change_significant                         0
se_slope_change_significant                          0
avg_mag_slope_change_significant                     0
se_mag_slope_change_significant                      0
avg_slope_change_global                              0
dtype: int64

### Saving data

In [258]:
ready_for_web = './data/web_data/'
# Add the genre as a column of the dataframe and save as csv
name_by_genre_prop_df.to_csv(os.path.join(ready_for_web, 'movie_genre_significant.csv'), index=True)

## Analysis looking at time effects

In [259]:
# Need to merge datasets containing "p_value" (name_by_movie_df), "movie_genre" (movie_genres_df), "release_date" (movie_df)
# => aggregate "name_by_movie_aggregate_df" with "movie_genres_df"
movie_genre_aggregate_with_years_df = movie_genre_aggregate_df.merge(movie_df, how='left', left_on='wiki_ID', right_on='wiki_ID')
display(movie_genre_aggregate_with_years_df)

Unnamed: 0_level_0,char_words,gender,order,p_value,slope_change,t_stat,genre,mov_name,year,month,revenue,numVotes,averageRating
wiki_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
3217,Gold,,6.0,,,,Action,Army of Darkness,1992,10.0,21502796.0,191068,7.4
3217,Gold,,6.0,,,,Comedy,Army of Darkness,1992,10.0,21502796.0,191068,7.4
3217,Gold,,6.0,,,,Time travel,Army of Darkness,1992,10.0,21502796.0,191068,7.4
3217,Gold,,6.0,,,,Black comedy,Army of Darkness,1992,10.0,21502796.0,191068,7.4
3217,Gold,,6.0,,,,Zombie Film,Army of Darkness,1992,10.0,21502796.0,191068,7.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...
37241569,,,,,,,Action,Cold War,2012,11.0,,5033,6.6
37476824,,,,,,,Comedy,I Love New Year,2011,,,876,3.4
37476824,,,,,,,Crime Comedy,I Love New Year,2011,,,876,3.4
37476824,,,,,,,Caper story,I Love New Year,2011,,,876,3.4


In [260]:
# name_by_genre_per_year_prop_df = movie_genre_aggregate_with_years_filled_df.groupby(['genre','year']).apply(lambda x: x[x['p_value'] < 0.1])
name_by_genre_per_year_prop_df = movie_genre_aggregate_with_years_df.groupby(['genre','year']).apply(lambda x: x[x['p_value'] < alpha])
name_by_genre_per_year_prop_df.head(30)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,char_words,gender,order,p_value,slope_change,t_stat,genre,mov_name,year,month,revenue,numVotes,averageRating
genre,year,wiki_ID,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Absurdism,1964,248601,George,M,2.0,0.02664,-0.011959,2.557438,Absurdism,A Hard Day's Night,1964,7.0,,47276,7.5
Absurdism,1964,248601,John,M,0.0,0.047794,0.073648,-2.22679,Absurdism,A Hard Day's Night,1964,7.0,,47276,7.5
Absurdism,1974,19701,Tim,M,1.0,0.001787,-0.010096,4.090696,Absurdism,Monty Python and the Holy Grail,1974,4.0,,560662,8.2
Absurdism,1978,75261,Robert,M,9.0,0.004233,-0.053751,3.591418,Absurdism,National Lampoon's Animal House,1978,7.0,141600000.0,127114,7.4
Absurdism,1978,75261,Dave,M,19.0,0.030485,-0.001459,2.481634,Absurdism,National Lampoon's Animal House,1978,7.0,141600000.0,127114,7.4
Absurdism,1978,75261,Barbara,F,15.0,0.004086,-0.018562,3.611592,Absurdism,National Lampoon's Animal House,1978,7.0,141600000.0,127114,7.4
Absurdism,1978,75261,Kent,M,8.0,0.022216,-0.002411,2.659263,Absurdism,National Lampoon's Animal House,1978,7.0,141600000.0,127114,7.4
Absurdism,1978,75261,Donald,M,6.0,0.011954,-0.010231,3.005882,Absurdism,National Lampoon's Animal House,1978,7.0,141600000.0,127114,7.4
Absurdism,1978,75261,Dean,M,2.0,0.009035,-0.008901,3.162738,Absurdism,National Lampoon's Animal House,1978,7.0,141600000.0,127114,7.4
Absurdism,1978,75261,John,M,0.0,0.0051,-0.06019,3.485338,Absurdism,National Lampoon's Animal House,1978,7.0,141600000.0,127114,7.4


In [261]:
# Compute proportion of impacted names by genre by year
name_by_genre_per_year_prop_df = movie_genre_aggregate_with_years_df.groupby(['genre','year']).apply(lambda x: pd.Series({
        'prop_signif_per_genre_per_year': (x['p_value'] < alpha).sum()/len(x['p_value']),
        'avg_slope_change_significant': x[x['p_value'] < alpha]['slope_change'].mean(),
        'se_slope_change_significant': x[x['p_value'] < alpha]['slope_change'].sem(),
        'avg_mag_slope_change_significant': x[x['p_value'] < alpha]['slope_change'].abs().mean(),
        'se_slope_change_magnitude_significant': x[x['p_value'] < alpha]['slope_change'].abs().sem(),
        'avg_slope_change_global': x['slope_change'].mean()
    }))
display(name_by_genre_per_year_prop_df)
#name_by_genre_per_year_prop_df.head(30)

Unnamed: 0_level_0,Unnamed: 1_level_0,prop_signif_per_genre_per_year,avg_slope_change_significant,se_slope_change_significant,avg_mag_slope_change_significant,se_slope_change_magnitude_significant,avg_slope_change_global
genre,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Absurdism,1929,0.000000,,,,,
Absurdism,1930,0.000000,,,,,
Absurdism,1932,0.000000,,,,,0.000171
Absurdism,1938,0.000000,,,,,
Absurdism,1940,0.000000,,,,,
...,...,...,...,...,...,...,...
Zombie Film,2008,0.108434,-0.005228,0.002439,0.006446,0.002043,-0.000864
Zombie Film,2009,0.023256,0.026034,,0.026034,,0.001817
Zombie Film,2010,0.120690,-0.007142,0.005550,0.013962,0.002612,-0.001540
Zombie Film,2011,0.230769,-0.009043,0.006115,0.014741,0.003208,-0.003649


#### Need to fill the missing year for each genre with 0

In [262]:
# Define a function to fill gaps and add corresponding values
all_years_df = pd.DataFrame({'year': range(movie_df['year'].min(), movie_df['year'].max() + 1)}).reset_index(drop=True)
all_years_df = all_years_df.set_index('year', drop=True)
#display(all_years_df)
def fill_gaps(group):
    filled_group = pd.merge(all_years_df, group, on='year', how='left').fillna(0)
    return filled_group

name_by_genre_per_year_prop_df.reset_index(inplace=True)
display(name_by_genre_per_year_prop_df)

name_by_genre_per_year_prop_filled_df = name_by_genre_per_year_prop_df.groupby('genre').apply(fill_gaps)
display(name_by_genre_per_year_prop_filled_df)

Unnamed: 0,genre,year,prop_signif_per_genre_per_year,avg_slope_change_significant,se_slope_change_significant,avg_mag_slope_change_significant,se_slope_change_magnitude_significant,avg_slope_change_global
0,Absurdism,1929,0.000000,,,,,
1,Absurdism,1930,0.000000,,,,,
2,Absurdism,1932,0.000000,,,,,0.000171
3,Absurdism,1938,0.000000,,,,,
4,Absurdism,1940,0.000000,,,,,
...,...,...,...,...,...,...,...,...
13827,Zombie Film,2008,0.108434,-0.005228,0.002439,0.006446,0.002043,-0.000864
13828,Zombie Film,2009,0.023256,0.026034,,0.026034,,0.001817
13829,Zombie Film,2010,0.120690,-0.007142,0.005550,0.013962,0.002612,-0.001540
13830,Zombie Film,2011,0.230769,-0.009043,0.006115,0.014741,0.003208,-0.003649


Unnamed: 0_level_0,Unnamed: 1_level_0,year,genre,prop_signif_per_genre_per_year,avg_slope_change_significant,se_slope_change_significant,avg_mag_slope_change_significant,se_slope_change_magnitude_significant,avg_slope_change_global
genre,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Absurdism,0,1888,0,0.0,0.0,0.0,0.0,0.0,0.0
Absurdism,1,1889,0,0.0,0.0,0.0,0.0,0.0,0.0
Absurdism,2,1890,0,0.0,0.0,0.0,0.0,0.0,0.0
Absurdism,3,1891,0,0.0,0.0,0.0,0.0,0.0,0.0
Absurdism,4,1892,0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
Zombie Film,124,2012,Zombie Film,0.0,0.0,0.0,0.0,0.0,0.0
Zombie Film,125,2013,0,0.0,0.0,0.0,0.0,0.0,0.0
Zombie Film,126,2014,0,0.0,0.0,0.0,0.0,0.0,0.0
Zombie Film,127,2015,0,0.0,0.0,0.0,0.0,0.0,0.0


In [263]:
# Keep only movie genre for which there is at least 10 nonzero values
name_by_genre_per_year_prop_filled_df.drop(columns=['genre'], inplace=True)
name_by_genre_per_year_prop_filled_df.reset_index(inplace=True)
display(name_by_genre_per_year_prop_filled_df)
# Count the number of non-zero values for each genre
genre_counts = name_by_genre_per_year_prop_filled_df[name_by_genre_per_year_prop_filled_df['avg_slope_change_significant'] != 0].groupby('genre')['year'].nunique()
display(genre_counts)

# Filter out genres with fewer than 10 non-zero years
selected_genres = genre_counts[genre_counts >= 10].index
display(selected_genres)

# Filter the original DataFrame based on the selected genres
name_by_genre_per_year_prop_filled_filtered_df = name_by_genre_per_year_prop_filled_df[name_by_genre_per_year_prop_filled_df['genre'].isin(selected_genres)]
display(name_by_genre_per_year_prop_filled_filtered_df)



Unnamed: 0,genre,level_1,year,prop_signif_per_genre_per_year,avg_slope_change_significant,se_slope_change_significant,avg_mag_slope_change_significant,se_slope_change_magnitude_significant,avg_slope_change_global
0,Absurdism,0,1888,0.0,0.0,0.0,0.0,0.0,0.0
1,Absurdism,1,1889,0.0,0.0,0.0,0.0,0.0,0.0
2,Absurdism,2,1890,0.0,0.0,0.0,0.0,0.0,0.0
3,Absurdism,3,1891,0.0,0.0,0.0,0.0,0.0,0.0
4,Absurdism,4,1892,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
46822,Zombie Film,124,2012,0.0,0.0,0.0,0.0,0.0,0.0
46823,Zombie Film,125,2013,0.0,0.0,0.0,0.0,0.0,0.0
46824,Zombie Film,126,2014,0.0,0.0,0.0,0.0,0.0,0.0
46825,Zombie Film,127,2015,0.0,0.0,0.0,0.0,0.0,0.0


genre
Absurdism                24
Acid western              3
Action                   90
Action Comedy            23
Action Thrillers         42
                         ..
Women in prison films     3
Workplace Comedy         23
World cinema             64
Wuxia                     4
Zombie Film              18
Name: year, Length: 290, dtype: int64

Index(['Absurdism', 'Action', 'Action Comedy', 'Action Thrillers',
       'Action/Adventure', 'Addiction Drama', 'Adult', 'Adventure',
       'Adventure Comedy', 'Airplanes and airports',
       ...
       'Thriller', 'Time travel', 'Tragedy', 'Tragicomedy', 'War film',
       'Western', 'Whodunit', 'Workplace Comedy', 'World cinema',
       'Zombie Film'],
      dtype='object', name='genre', length=176)

Unnamed: 0,genre,level_1,year,prop_signif_per_genre_per_year,avg_slope_change_significant,se_slope_change_significant,avg_mag_slope_change_significant,se_slope_change_magnitude_significant,avg_slope_change_global
0,Absurdism,0,1888,0.0,0.0,0.0,0.0,0.0,0.0
1,Absurdism,1,1889,0.0,0.0,0.0,0.0,0.0,0.0
2,Absurdism,2,1890,0.0,0.0,0.0,0.0,0.0,0.0
3,Absurdism,3,1891,0.0,0.0,0.0,0.0,0.0,0.0
4,Absurdism,4,1892,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
46822,Zombie Film,124,2012,0.0,0.0,0.0,0.0,0.0,0.0
46823,Zombie Film,125,2013,0.0,0.0,0.0,0.0,0.0,0.0
46824,Zombie Film,126,2014,0.0,0.0,0.0,0.0,0.0,0.0
46825,Zombie Film,127,2015,0.0,0.0,0.0,0.0,0.0,0.0


In [264]:
# Dropping columns 
name_by_genre_per_year_prop_filled_filtered_df.drop(columns=['level_1'], inplace=True)
display(name_by_genre_per_year_prop_filled_filtered_df)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,genre,year,prop_signif_per_genre_per_year,avg_slope_change_significant,se_slope_change_significant,avg_mag_slope_change_significant,se_slope_change_magnitude_significant,avg_slope_change_global
0,Absurdism,1888,0.0,0.0,0.0,0.0,0.0,0.0
1,Absurdism,1889,0.0,0.0,0.0,0.0,0.0,0.0
2,Absurdism,1890,0.0,0.0,0.0,0.0,0.0,0.0
3,Absurdism,1891,0.0,0.0,0.0,0.0,0.0,0.0
4,Absurdism,1892,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...
46822,Zombie Film,2012,0.0,0.0,0.0,0.0,0.0,0.0
46823,Zombie Film,2013,0.0,0.0,0.0,0.0,0.0,0.0
46824,Zombie Film,2014,0.0,0.0,0.0,0.0,0.0,0.0
46825,Zombie Film,2015,0.0,0.0,0.0,0.0,0.0,0.0


In [265]:
# #reset index and reset index
# name_by_genre_per_year_prop_filled_df.reset_index(inplace=True).set_index(['genre'], inplace=True)
# display(name_by_genre_per_year_prop_filled_df)

In [266]:
display(name_by_genre_per_year_prop_filled_filtered_df.isna().sum())
# # Drop NaN values
# name_by_genre_per_year_prop_df.fillna(0, inplace=True)
# display(name_by_genre_per_year_prop_df)
# # Sanity check
# display(name_by_genre_per_year_prop_df.isna().sum())

genre                                    0
year                                     0
prop_signif_per_genre_per_year           0
avg_slope_change_significant             0
se_slope_change_significant              0
avg_mag_slope_change_significant         0
se_slope_change_magnitude_significant    0
avg_slope_change_global                  0
dtype: int64

### Saving the data

In [267]:
# Add the genre as a column of the dataframe and save as csv
name_by_genre_per_year_prop_filled_filtered_df.to_csv(os.path.join(ready_for_web, 'movie_genre_per_year_significant.csv'), index=False)

In [268]:
# Resaving data for Circle Packing with only movie genre kept in time analysis
# Add the genre as a column of the dataframe and save as csv
# Filter the original DataFrame based on the selected genres
name_by_genre_prop_df.reset_index(inplace=True)
name_by_genre_prop_filtered_df = name_by_genre_prop_df[name_by_genre_prop_df['genre'].isin(selected_genres)]
display(name_by_genre_prop_filtered_df)
name_by_genre_prop_filtered_df.to_csv(os.path.join(ready_for_web, 'movie_genre_significant_filtered.csv'), index=False)

Unnamed: 0,genre,nb_films_in_genre,nb_names_in_genre,nb_names_signi_in_genre,prop_names_signi_in_genre_per_total_film_in_genre,is_na_sum,prop_signif_per_genre,prop_non_signi,prop_nan,avg_slope_change_significant,se_slope_change_significant,avg_mag_slope_change_significant,se_mag_slope_change_significant,avg_slope_change_global
0,Absurdism,740.0,721.0,60.0,0.081081,220.0,0.081081,0.621622,0.297297,-0.002309,0.003895,0.016557,0.003258,-0.000527
2,Action,34780.0,31575.0,2829.0,0.081340,14374.0,0.081340,0.505377,0.413283,-0.001701,0.000460,0.013829,0.000381,-0.000264
3,Action Comedy,1036.0,984.0,87.0,0.083977,383.0,0.083977,0.546332,0.369691,0.000067,0.002149,0.012519,0.001672,-0.000097
4,Action Thrillers,2911.0,2755.0,257.0,0.088286,1032.0,0.088286,0.557197,0.354517,0.000562,0.001655,0.014784,0.001373,-0.000294
5,Action/Adventure,21112.0,19502.0,1855.0,0.087865,8037.0,0.087865,0.531451,0.380684,-0.001514,0.000561,0.013752,0.000462,-0.000236
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
266,Western,6028.0,4999.0,483.0,0.080126,2712.0,0.080126,0.469973,0.449900,0.000650,0.001712,0.020828,0.001425,0.000426
267,Whodunit,390.0,360.0,42.0,0.107692,116.0,0.107692,0.594872,0.297436,0.001528,0.007432,0.025626,0.006267,0.000438
269,Workplace Comedy,677.0,654.0,74.0,0.109306,155.0,0.109306,0.661743,0.228951,-0.002591,0.001459,0.008569,0.001102,-0.000553
270,World cinema,19067.0,15344.0,837.0,0.043898,10538.0,0.043898,0.403420,0.552683,-0.000895,0.000707,0.010429,0.000609,0.000080


### Question 3: Attendence/popularity + ratings

In [269]:
# The dataframe "name_by_movie_aggregate_df" already contains the wanted caracteristics
display(name_by_movie_aggregate_df)
name_by_movie_aggregate_df['numVotes'].max()

#Proportion of the film that had an influence in data segmented by number of votes

prop_0_10k = len(name_by_movie_aggregate_df[(name_by_movie_aggregate_df['numVotes'] < 10000) & (name_by_movie_aggregate_df['p_value'] < 0.1)]) / len(name_by_movie_aggregate_df[name_by_movie_aggregate_df['numVotes'] < 10000])

print(f"Proportion of movies with numVotes < 10k and p_value < 0.1: {prop_0_10k :.3%}")

prop_10k_100k = len(name_by_movie_aggregate_df[(name_by_movie_aggregate_df['numVotes'] > 10000) & (name_by_movie_aggregate_df['numVotes'] < 100000) & (name_by_movie_aggregate_df['p_value'] < 0.1)]) / len(name_by_movie_aggregate_df[(name_by_movie_aggregate_df['numVotes'] > 10000) & (name_by_movie_aggregate_df['numVotes'] < 100000)])

print(f"Proportion of movies with numVotes in [10k-100k] and p_value < 0.1: {prop_10k_100k :.3%}")

prop_100k_1M = len(name_by_movie_aggregate_df[(name_by_movie_aggregate_df['numVotes'] > 100000) & (name_by_movie_aggregate_df['numVotes'] < 1000000) & (name_by_movie_aggregate_df['p_value'] < 0.1)]) / len(name_by_movie_aggregate_df[(name_by_movie_aggregate_df['numVotes'] > 100000) & (name_by_movie_aggregate_df['numVotes'] < 1000000)])

print(f"Proportion of movies with numVotes in [100k-1M] and p_value < 0.1: {prop_100k_1M :.3%}")

prop_greater_1M = len(name_by_movie_aggregate_df[(name_by_movie_aggregate_df['numVotes'] > 1000000) & (name_by_movie_aggregate_df['p_value'] < 0.1)]) / len(name_by_movie_aggregate_df[(name_by_movie_aggregate_df['numVotes'] > 1000000)])

print(f"Proportion of movies with numVotes > 1M and p_value < 0.1: {prop_greater_1M :.3%}")


len(name_by_movie_aggregate_df[(name_by_movie_aggregate_df['numVotes'] > 1000000) & (name_by_movie_aggregate_df['p_value'] < 0.1)]['numVotes'].unique())


Unnamed: 0_level_0,char_words,gender,order,p_value,slope_change,t_stat,mov_name,year,month,revenue,numVotes,averageRating
wiki_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
3217,Gold,,6.0,,,,Army of Darkness,1992,10.0,21502796.0,191068,7.4
3217,Linda,F,7.0,0.676072,-0.000675,0.429187,Army of Darkness,1992,10.0,21502796.0,191068,7.4
3217,Henry,M,4.0,0.068422,-0.002435,2.019954,Army of Darkness,1992,10.0,21502796.0,191068,7.4
3217,Duke,M,4.0,0.582585,0.000108,-0.566260,Army of Darkness,1992,10.0,21502796.0,191068,7.4
3217,Warrior,M,9.0,,,,Army of Darkness,1992,10.0,21502796.0,191068,7.4
...,...,...,...,...,...,...,...,...,...,...,...,...
37478048,Ajay,M,9.0,0.436957,-0.000126,0.806658,Mr. Bechara,1996,,,395,5.4
37501922,Murphy,F,3.0,0.234444,0.000354,-1.257988,Terminal Bliss,1992,,,245,4.4
37501922,Hunter,M,1.0,0.000021,-0.035578,7.051709,Terminal Bliss,1992,,,245,4.4
37501922,John,M,1.0,0.052067,-0.012279,2.177768,Terminal Bliss,1992,,,245,4.4


Proportion of movies with numVotes < 10k and p_value < 0.1: 12.741%
Proportion of movies with numVotes in [10k-100k] and p_value < 0.1: 15.123%
Proportion of movies with numVotes in [100k-1M] and p_value < 0.1: 15.200%
Proportion of movies with numVotes > 1M and p_value < 0.1: 13.737%


50

Assumption: 
-Attendence is estimated by the number of votes
-A threshold of # of votes anove wich we start to study the influence of rating 

Ideas: 

-separate data according to number of votes & then separate data accordimng to rating 

-separate first according to votes and then in the segments of votes separates bad and good reviews

Question 4 : Faire la moyenne

In [270]:
#We segment the data frame according to the number of votes

votes_seg_0_10k = name_by_movie_aggregate_df[(name_by_movie_aggregate_df['numVotes'] < 10000) & (name_by_movie_aggregate_df['p_value'] < 0.1)]
votes_seg_10k_100k = name_by_movie_aggregate_df[(name_by_movie_aggregate_df['numVotes'] > 10000) & (name_by_movie_aggregate_df['numVotes'] < 100000) & (name_by_movie_aggregate_df['p_value'] < 0.1)]
votes_seg_100k_1M = name_by_movie_aggregate_df[(name_by_movie_aggregate_df['numVotes'] > 100000) & (name_by_movie_aggregate_df['numVotes'] < 1000000) & (name_by_movie_aggregate_df['p_value'] < 0.1)]
votes_seg_1M_inf = name_by_movie_aggregate_df[(name_by_movie_aggregate_df['numVotes'] > 1000000) & (name_by_movie_aggregate_df['p_value'] < 0.1)]

a = [votes_seg_0_10k['slope_change'].mean(), votes_seg_10k_100k['slope_change'].mean(), votes_seg_100k_1M['slope_change'].mean(), votes_seg_1M_inf['slope_change'].mean()]
index_names = ['0-10k', '10k-100k', '100k-1M', '1M-inf']
results = pd.DataFrame(a, index=index_names,columns = ['avg_slope_change'])
results.index.name = 'Seg_numVotes'
display(results)

Unnamed: 0_level_0,avg_slope_change
Seg_numVotes,Unnamed: 1_level_1
0-10k,-0.00129
10k-100k,-0.001823
100k-1M,-0.002344
1M-inf,-0.000799


In [271]:
name_by_movie_aggregate_df_significant = name_by_movie_aggregate_df[name_by_movie_aggregate_df['p_value'] < 0.1]

#We segment the data frame according to the number of votes

##Calculate the average cahnge of slopes for the different number of vote segments 

numVotes_bins = [0,10000,100000,1000000,np.inf]
segments_numVotes_label = ['0-10000','10000-100000','100000-1000000','1000000+']
name_by_movie_aggregate_df_significant['numVotes_segmented']  = pd.cut(name_by_movie_aggregate_df_significant['numVotes'],numVotes_bins,labels=segments_numVotes_label,right=True)

avg_magnitude_slopes_change_numVotes = name_by_movie_aggregate_df_significant.groupby('numVotes_segmented').apply(lambda x: pd.Series({
    'avg_magnitude_slopes_change': x['slope_change'].abs().mean(), 
    'avg_slope_change': x['slope_change'].mean()
    }))
display(avg_magnitude_slopes_change_numVotes)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0_level_0,avg_magnitude_slopes_change,avg_slope_change
numVotes_segmented,Unnamed: 1_level_1,Unnamed: 2_level_1
0-10000,0.013986,-0.00129
10000-100000,0.010802,-0.001823
100000-1000000,0.009755,-0.002344
1000000+,0.011324,-0.000799


#### Segementing w.r.t. movie rating

In [272]:
#We segment the data frame according to the rating
#Calculate the average change of slopes for the different rating segements

rating_quantiles = np.quantile(name_by_movie_aggregate_df_significant['averageRating'],[0.25,0.5,0.75])
#display(rating_quantiles)

# display((name_by_movie_aggregate_df_significant['averageRating']<= 5.5).sum()/len(name_by_movie_aggregate_df_significant))

rating_bins = [0,rating_quantiles[0],rating_quantiles[1],rating_quantiles[2],10]
segments_rating_label = ['0-{}'.format(rating_quantiles[0]),'{}-{}'.format(rating_quantiles[0], rating_quantiles[1]),'{}-{}'.format(rating_quantiles[1], rating_quantiles[2]),'{}-10'.format(rating_quantiles[2])]
name_by_movie_aggregate_df_significant['rating_segmented']  = pd.cut(name_by_movie_aggregate_df_significant['averageRating'],rating_bins,labels=segments_rating_label,right=True)
avg_slopes_change_rating = name_by_movie_aggregate_df_significant.groupby('rating_segmented').apply(lambda x: pd.Series({
    'avg_slopes_change': x['slope_change'].mean(),
    'avg_magnitude_slopes_change': x['slope_change'].abs().mean()
}))

display(avg_slopes_change_rating)





A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0_level_0,avg_slopes_change,avg_magnitude_slopes_change
rating_segmented,Unnamed: 1_level_1,Unnamed: 2_level_1
0-5.5,-0.001735,0.011755
5.5-6.3,-0.001851,0.012728
6.3-6.9,-0.00136,0.012693
6.9-10,-0.001422,0.012424


### Question 4: Character Importance in film

In [273]:
# The dataframe "name_by_movie_df" already contains the wanted caracteristics ("order")
# display(name_by_order_df)
name_by_order_df = name_by_movie_df.groupby("order").apply(lambda x: x[x['p_value'] < 0.1])
display(name_by_order_df)

Unnamed: 0_level_0,Unnamed: 1_level_0,char_words,gender,order,p_value,slope_change,t_stat
order,wiki_ID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0.0,4560,William,M,0.0,0.006095,-0.015277,3.384440
0.0,5035,Eric,M,0.0,0.000031,-0.025032,6.755076
0.0,5729,Harold,M,0.0,0.045354,-0.001979,2.256695
0.0,19715,Gracie,F,0.0,0.013734,-0.008358,2.928240
0.0,22751,Julia,F,0.0,0.013218,0.023395,-2.949628
...,...,...,...,...,...,...,...
94.0,9834441,Lily,F,94.0,0.000714,0.023868,-4.642400
95.0,20777420,Thomas,M,95.0,0.001181,-0.010917,4.336954
98.0,370064,Anderson,F,98.0,0.001162,-0.003267,4.346455
98.0,25079197,Tyson,M,98.0,0.001493,0.003804,-4.197156


In [274]:
name_by_order_prop_df = name_by_movie_df.groupby("order").apply(lambda x: pd.Series({
        'prop_signif_per_order': (x['p_value'] < 0.1).sum()/len(x['p_value']),
        'avg_slope_change_significant': x[x['p_value'] < 0.1]['slope_change'].mean(),
        'avg_slope_change_global': x['slope_change'].mean(),
        'avg_magnitude_slope_change_significant': x[x['p_value'] < 0.1]['slope_change'].abs().mean(),
        'total_number_signif_per_order': (x['p_value'] < 0.1).sum(),
        'proportion_negative_SC' : (x[x['p_value'] < 0.1]['slope_change'] < 0).sum() / len(x[x['p_value'] < 0.1]['slope_change']),
        'proportion_positive_SC' : (x[x['p_value'] < 0.1]['slope_change'] > 0).sum() / len(x[x['p_value'] < 0.1]['slope_change']),
        'se_slope_change_magnitude_significant': x[x['p_value'] < 0.1]['slope_change'].abs().sem()
    }))
display(name_by_order_prop_df)



invalid value encountered in scalar divide


invalid value encountered in scalar divide



Unnamed: 0_level_0,prop_signif_per_order,avg_slope_change_significant,avg_slope_change_global,avg_magnitude_slope_change_significant,total_number_signif_per_order,proportion_negative_SC,proportion_positive_SC,se_slope_change_magnitude_significant
order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0.0,0.152382,-0.001752,-0.000224,0.015177,3723.0,0.729251,0.270749,0.000388
1.0,0.153846,-0.000726,0.000031,0.014285,2910.0,0.725430,0.274570,0.000443
2.0,0.150620,-0.001973,-0.000290,0.012232,2187.0,0.758116,0.241884,0.000439
3.0,0.148164,-0.001605,-0.000378,0.012099,1767.0,0.753820,0.246180,0.000446
4.0,0.144475,-0.002070,-0.000353,0.011672,1454.0,0.763411,0.236589,0.000480
...,...,...,...,...,...,...,...,...
151.0,0.000000,,0.001574,,0.0,,,
152.0,0.000000,,-0.000905,,0.0,,,
169.0,0.000000,,-0.000067,,0.0,,,
300.0,0.000000,,-0.000038,,0.0,,,


In [275]:
# Limiter les données jusqu'à l'ordre 100
filtered_df = name_by_order_prop_df[(name_by_order_prop_df.index <= 100) & (name_by_order_prop_df.index > 0)]

# Limiter la plage de la hauteur entre 0 et 0.1
y_range = [0, 0.03]

# Création du bar chart interactif avec sous-graphiques
fig = go.Figure()

# # Tracé pour avg magnitude slope change
# fig.add_trace(go.Bar(
#     x=filtered_df.index,
#     y=filtered_df['avg_magnitude_slope_change_significant'],
#     name='Avg Magnitude Slope Change',
#     marker_color='blue'
# ))

# Tracé pour la proportion de slope change négatif à l'intérieur de la barre de magnitude
fig.add_trace(go.Bar(
    x=filtered_df.index,
    y=filtered_df['proportion_negative_SC'] * filtered_df['avg_magnitude_slope_change_significant'],
    name='Proportion Slope Change Negatif',
    marker_color='red',
    offsetgroup=1
))

# Tracé pour la proportion de slope change positif à l'intérieur de la barre de magnitude
fig.add_trace(go.Bar(
    x=filtered_df.index,
    y=filtered_df['proportion_positive_SC'] * filtered_df['avg_magnitude_slope_change_significant'],
    name='Proportion Slope Change Positif',
    marker_color='green',
    offsetgroup=1
))

# Mise en forme du tracé
fig.update_layout(
    xaxis=dict(title='Order'),
    yaxis=dict(title='Magnitude / Proportion', range=y_range),
    barmode='stack'  # 'stack' empile les barres pour chaque order
)

In [276]:
fig.write_html("CaracterRole.html")

In [277]:
# Assumez que votre dataframe s'appelle name_by_order_prop_df

# Limiter les données jusqu'à l'ordre 20
filtered_df = name_by_order_prop_df[(name_by_order_prop_df.index <= 20) & (name_by_order_prop_df.index > 0)]

# Limiter la plage de la hauteur entre 0 et 0.03
y_range = [0, 0.03]

# Création du bar chart interactif avec sous-graphiques
fig = go.Figure()

# Tracé pour avg magnitude slope change avec erreur
fig.add_trace(go.Bar(
    x=filtered_df.index,
    y=filtered_df['avg_magnitude_slope_change_significant'],
    name='Avg Magnitude Slope Change',
    marker_color='orange',
    error_y=dict(
        type='data',
        array=filtered_df['se_slope_change_magnitude_significant'],
        visible=True
    )
))

# Mise en forme du tracé
fig.update_layout(
    xaxis=dict(title='Charater Order'),
    yaxis=dict(title='Slope Change Magnitude', range=y_range),
    barmode='stack'  # 'stack' empile les barres pour chaque order
)

# Affichage du graphique
fig.show()

#### Does movie genre and caracter role are linked ?

In [278]:
# Does the order influence is impacted by movie genre ? Study of impact due to role importance per movie genre
name_by_order_by_genre_prop_df = movie_genre_aggregate_df.groupby(['order','genre']).apply(lambda x: pd.Series({
        'prop_signif_per_order_per_genre': (x['p_value'] < 0.1).sum()/len(x['p_value']),
        'avg_slope_change_significant': x[x['p_value'] < 0.1]['slope_change'].mean(),
        'avg_magnitude_slope_change_significant': x[x['p_value'] < 0.1]['slope_change'].abs().mean(),
        'avg_slope_change_global': x['slope_change'].mean(),
        'total_number_signif_per_order_per_genre': (x['p_value'] < 0.1).sum(),
    }))
display(name_by_order_by_genre_prop_df)

Unnamed: 0_level_0,Unnamed: 1_level_0,prop_signif_per_order_per_genre,avg_slope_change_significant,avg_magnitude_slope_change_significant,avg_slope_change_global,total_number_signif_per_order_per_genre
order,genre,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0.0,Absurdism,0.189873,0.014856,0.029256,0.003077,15.0
0.0,Acid western,0.500000,-0.002660,0.005877,-0.000975,4.0
0.0,Action,0.150447,-0.000455,0.014450,-0.000131,657.0
0.0,Action Comedy,0.113402,0.006659,0.015866,0.000727,11.0
0.0,Action Thrillers,0.159269,0.003217,0.017853,-0.000243,61.0
...,...,...,...,...,...,...
302.0,Biographical film,0.000000,,,,0.0
302.0,Biography,0.000000,,,,0.0
302.0,Drama,0.000000,,,,0.0
302.0,Period piece,0.000000,,,,0.0


### Does the order of a name influence differently according to gender ?
<span style="color:red"> *Prendre seulement les valeur ou p less 0.1 pour faire l'etude des slopes ? Si on les gardes ça va influencer nos moyenne avec des truc pas significantes *</span>

<span style="color:red"> **Revoir **</span>

In [279]:
# Calculate the average magnitude of slope change on all the data
# Calculate the average magnitude of slope change on data having a slope change statistically significant
# Calculate the average of slope change on data having a slope change statistically significant
name_by_order_by_gender_prop_df = name_by_movie_df.groupby(['order','gender']).apply(lambda x: pd.Series({
        'prop_signif_per_order_per_genre': (x['p_value'] < 0.1).sum()/len(x['p_value']),
        'avg_slope_change_significant': x[x['p_value'] < 0.1]['slope_change'].mean(),
        'avg_slope_change_global': x['slope_change'].mean(),
        'avg_magnitude_slope_change_significant': x[x['p_value'] < 0.1]['slope_change'].abs().mean(),
        'avg_magnitude_slope_change_global': x['slope_change'].abs().mean(),
        'total_number_signif_per_order_per_genre': (x['p_value'] < 0.1).sum(),
    }))
display(name_by_order_by_gender_prop_df)


Unnamed: 0_level_0,Unnamed: 1_level_0,prop_signif_per_order_per_genre,avg_slope_change_significant,avg_slope_change_global,avg_magnitude_slope_change_significant,avg_magnitude_slope_change_global,total_number_signif_per_order_per_genre
order,gender,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0.0,F,0.172261,-0.000833,0.000078,0.014784,0.005188,1283.0
0.0,M,0.143379,-0.002366,-0.000400,0.015421,0.004928,2392.0
1.0,F,0.164144,0.000097,0.000333,0.014767,0.005097,1540.0
1.0,M,0.144347,-0.001762,-0.000305,0.013695,0.004450,1343.0
2.0,F,0.164639,-0.001191,0.000074,0.013123,0.004721,1015.0
...,...,...,...,...,...,...,...
151.0,M,0.000000,,0.001574,,0.001574,0.0
152.0,F,0.000000,,-0.000905,,0.000905,0.0
169.0,M,0.000000,,-0.000067,,0.000067,0.0
300.0,M,0.000000,,-0.000038,,0.000038,0.0


### Question 5: Caracter gender in film
<span style="color:green"> ok</span>

In [280]:
# The dataframe "name_by_movie" has everything we need
name_by_gender_df = name_by_movie_df.groupby('gender').apply(lambda x: x[x['p_value'] < 0.1])
display(name_by_gender_df)

Unnamed: 0_level_0,Unnamed: 1_level_0,char_words,gender,order,p_value,slope_change,t_stat
gender,wiki_ID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
F,3947,Barbara,F,8.0,0.063999,-0.003945,2.058789
F,4231,Cassandra,F,8.0,0.071099,0.017066,-1.997573
F,4231,Jennifer,F,6.0,0.031782,-0.086605,2.458174
F,4560,Isabelle,F,2.0,0.000007,-0.008570,7.995228
F,4726,Vicki,F,2.0,0.084777,-0.000762,1.894210
...,...,...,...,...,...,...,...
M,36956792,Gunner,M,13.0,0.001406,-0.002385,4.232822
M,36956792,Charlie,M,5.0,0.000208,-0.006215,5.427450
M,37322106,Major,M,0.0,0.080101,-0.002548,1.927686
M,37501922,Hunter,M,1.0,0.000021,-0.035578,7.051709


In [281]:
#Average slope change 
name_by_gender_prop_df = name_by_movie_df.groupby("gender").apply(lambda x: pd.Series({
        'prop_signif_per_gender': (x['p_value'] < 0.1).sum()/len(x['p_value']),
        'avg_slope_change_per_gender_significant': x[x['p_value'] < 0.1]['slope_change'].mean(),
        'avg_slope_change_per_gender_global': x['slope_change'].mean(),
        'avg_mag_slope_change_per_gender_significant': x[x['p_value'] < 0.1]['slope_change'].abs().mean(),
        'avg_mag_slope_change_per_gender_global': x['slope_change'].abs().mean(),
        'total_number_signif_per_gender': (x['p_value'] < 0.1).sum(),
    }))
display(name_by_gender_prop_df)

Unnamed: 0_level_0,prop_signif_per_gender,avg_slope_change_per_gender_significant,avg_slope_change_per_gender_global,avg_mag_slope_change_per_gender_significant,avg_mag_slope_change_per_gender_global,total_number_signif_per_gender
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
F,0.156183,-0.001337,-0.000154,0.012687,0.004279,9793.0
M,0.12601,-0.001921,-0.000378,0.012182,0.003721,13199.0


#### Does caracter gender and movie genre are linked ?

In [282]:
# Does the gender influence is impacted by movie genre ? Study of impact due to role importance per movie genre
name_by_gender_by_genre_prop_df = movie_genre_aggregate_df.groupby(['gender','genre']).apply(lambda x: pd.Series({
        'prop_signif_per_gender_per_genre': (x['p_value'] < 0.1).sum()/len(x['p_value']),
        'avg_slope_change_per_gender_per_genre_significant': x[x['p_value'] < 0.1]['slope_change'].mean(),
        'avg_slope_change_per_gender_per_genre_global': x['slope_change'].mean(),
        'avg_mag_slope_change_per_gender_per_genre_significant': x[x['p_value'] < 0.1]['slope_change'].abs().mean(),
        'avg_mag_slope_change_per_gender_per_genre_global': x['slope_change'].abs().mean(),
        'total_number_signif_per_gender_per_genre': (x['p_value'] < 0.1).sum(),
    }))
display(name_by_gender_by_genre_prop_df)

Unnamed: 0_level_0,Unnamed: 1_level_0,prop_signif_per_gender_per_genre,avg_slope_change_per_gender_per_genre_significant,avg_slope_change_per_gender_per_genre_global,avg_mag_slope_change_per_gender_per_genre_significant,avg_mag_slope_change_per_gender_per_genre_global,total_number_signif_per_gender_per_genre
gender,genre,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
F,Absurdism,0.156863,-0.003750,-0.000930,0.008685,0.003611,32.0
F,Acid western,0.166667,-0.002164,-0.001279,0.002164,0.001279,1.0
F,Action,0.157500,-0.000576,0.000176,0.012130,0.004378,1366.0
F,Action Comedy,0.178030,0.002138,0.000645,0.011125,0.004563,47.0
F,Action Thrillers,0.181692,0.000879,0.000482,0.015221,0.005683,131.0
...,...,...,...,...,...,...,...
M,Women in prison films,0.153846,-0.030534,-0.008910,0.040963,0.012976,2.0
M,Workplace Comedy,0.141667,-0.001761,-0.000440,0.008041,0.002512,51.0
M,World cinema,0.079252,-0.001155,-0.000068,0.008788,0.002190,691.0
M,Wuxia,0.060976,-0.002400,-0.000335,0.002400,0.000871,5.0


In [283]:
display(name_by_gender_by_genre_prop_df['avg_mag_slope_change_per_gender_per_genre_significant']['F']['Absurdism'])

display(name_by_gender_by_genre_prop_df['avg_mag_slope_change_per_gender_per_genre_significant']['M']['Absurdism'])

display(name_by_gender_by_genre_prop_df['avg_mag_slope_change_per_gender_per_genre_significant']['F'].idxmax())
display(name_by_gender_by_genre_prop_df['avg_mag_slope_change_per_gender_per_genre_significant']['M'].idxmax())


0.008684834792064004

0.01372688332479104

'Outlaw biker film'

'Archaeology'

In [284]:
import plotly.graph_objects as go

fig = go.Figure(data=[go.Scatter(
    x=[1, 2, 3, 4], y=[10, 11, 12, 13],
    mode='markers',
    marker_size=[40, 60, 80, 100])
])

fig.show()

import plotly.express as px

#fig = px.scatter(x=range(10), y=range(10))
fig.write_html("desktop.html")
