In [1]:
import sys 
sys.path.append('../scripts')

In [9]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import bokeh
from sklearn.preprocessing import MinMaxScaler
from bokeh.models import Slope
from bokeh.palettes import Sunset10
from bokeh.plotting import figure, show
import pingouin as pg

from helpers import get_graph_from_pickle, merge_graph_to_df

In [3]:
df_origin = pd.read_csv('../data/processed/preprocessed.csv', index_col=0)
df_origin.set_index('wikipedia_id', inplace=True)

In [4]:
list_year = [1920, 1930, 1940, 1950, 1960, 1970, 1980, 1990, 2000, 2010]

dfs = {}  # Dictionary to store the dataframes

for year in list_year:
    G = get_graph_from_pickle(year)
    df_year = merge_graph_to_df(df_origin, G) 
    df_year.drop(columns=['languages','plot', 'freebase_id'], inplace=True)
    dfs[f"{year}"] = df_year

# Access the dataframes using dfs["df_year"]
df_all = pd.concat(dfs.values())

In [5]:
display(df_all.head(2)) 

Unnamed: 0,name,release_year,rating,num_votes,countries,genres,betweenness,degree
11396051,L'Atlantide,1921.0,6.7,419,['France'],"['Silent film', 'Indie', 'World cinema', 'Blac...",0.000532,129
32932657,The Breaking of the Drought,1920.0,5.2,27,[],['Silent film'],0.000149,57


In [14]:
# Here we scale the betweenness and degree columns for all the dataframes independently for each decade and for the whole dataset 
scaler = MinMaxScaler()
for df in dfs:
    df_used = dfs[df]
    df_used[['betweenness', 'degree']] = scaler.fit_transform(df_used[['betweenness', 'degree']])
    df_used['nb_genres'] = df_used['genres'].apply(lambda x: len(x.split(',')))

    dfs[df] = df_used
df_all[['betweenness', 'degree']] = scaler.fit_transform(df_all[['betweenness', 'degree']])
df_all['nb_genres'] = df_all['genres'].apply(lambda x: len(x.split(',')))

In [15]:
display(df_all.head(2)) 

Unnamed: 0,name,release_year,rating,num_votes,countries,genres,betweenness,degree,nb_genres
11396051,L'Atlantide,1921.0,6.7,419,['France'],"['Silent film', 'Indie', 'World cinema', 'Blac...",0.016064,0.024298,6
32932657,The Breaking of the Drought,1920.0,5.2,27,[],['Silent film'],0.004488,0.01063,1


In [27]:
partial_corr_all= df_all.drop(columns=['name','countries', 'genres']).pcorr().round(3)[['betweenness', 'degree']].loc['rating']


In [30]:
corelation_dict_betweenness = {}
corelation_dict_degree = {}
for df in dfs:
    df_used = dfs[df]
    part_corr = df_used.drop(columns=['name','countries', 'genres']).pcorr().round(3)[['betweenness', 'degree']].loc['rating']
    corelation_dict_betweenness[df] = part_corr['betweenness']
    corelation_dict_degree[df] = part_corr['degree']

print(corelation_dict_betweenness)
print(corelation_dict_degree)
print("data from all decades:")
print(partial_corr_all)

{'1920': -0.063, '1930': -0.078, '1940': -0.066, '1950': -0.041, '1960': -0.067, '1970': -0.086, '1980': -0.01, '1990': -0.045, '2000': -0.002, '2010': 0.02}
{'1920': 0.028, '1930': 0.051, '1940': -0.028, '1950': -0.044, '1960': 0.028, '1970': 0.02, '1980': -0.065, '1990': -0.085, '2000': -0.086, '2010': -0.121}
data from all decades:
betweenness   -0.050
degree        -0.088
Name: rating, dtype: float64
