# Correlation Analysis

## Imports

In [2]:
import pandas as pd
from bokeh.models import Tabs, TabPanel
from bokeh.plotting import output_file, save, show
from scipy.stats import linregress

from scripts.helpers import get_graph_from_pickle, merge_graph_to_df
from scripts.helpers import get_bokeh_table
from scripts.plotting import plot_bokeh_scatter

In [3]:
# Load the preprocessed movie table. Column 0 of the CSV is read as the initial
# index and is then discarded when 'wikipedia_id' becomes the index.
preprocessed_csv = '../data/processed/preprocessed.csv'
df_origin = pd.read_csv(preprocessed_csv, index_col=0).set_index('wikipedia_id')
print(df_origin.shape)
df_origin.head()

(22394, 9)


Unnamed: 0_level_0,name,release_year,rating,num_votes,plot,freebase_id,languages,countries,genres
wikipedia_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
10109752,Miss Jerry,1894.0,5.3,207,After finding out that her father is suffering...,/m/02q23xk,['Silent film'],['United States of America'],"['Short Film', 'Silent film', 'Indie', 'Black-..."
28703057,The Corbett-Fitzsimmons Fight,1897.0,5.3,484,The film no longer exists in its entirety; how...,/m/0czdh_n,[],[],['Sports']
142995,The Story of the Kelly Gang,1906.0,6.0,855,The Story of the Kelly Gangs tone is of sorrow...,/m/0120y4,['English Language'],['Australia'],"['Crime Fiction', 'Silent film', 'Biography', ..."
32986669,Robbery Under Arms,1907.0,4.3,25,Key scenes of the film included the branding o...,/m/04p7yxx,['Silent film'],['Australia'],"['Silent film', 'Drama']"
32987200,"Captain Midnight, the Bush King",1911.0,5.4,18,Edgar Dalimore is the son of wealthy station o...,/m/0h569x9,['Silent film'],[],['Silent film']


In [4]:
# Decades analysed: 1920, 1930, ..., 2010.
decades = list(range(1920, 2020, 10))

# One dataframe per decade: df_origin merged with the graph loaded for that decade.
# What the merge adds is defined in scripts.helpers; later cells read the
# 'degree' and 'betweenness' columns from the result.
dfs = {
    decade: merge_graph_to_df(df_origin, get_graph_from_pickle(decade))
    for decade in decades
}

In [5]:
# Correlation between 'rating' and each graph metric, one row per decade.
# Series.corr uses its default method (Pearson).
metrics = ['degree', 'betweenness']

# Collect every row first and build the frame once, instead of enlarging an
# empty DataFrame row by row through .loc.
correlation_rows = [
    [decade_df['rating'].corr(decade_df[metric]) for metric in metrics]
    for decade_df in dfs.values()
]
correlation_df = pd.DataFrame(correlation_rows, index=list(dfs), columns=metrics)

correlation_df

Unnamed: 0,degree,betweenness
1920,-0.076025,-0.149805
1930,-0.014151,-0.086933
1940,-0.08326,-0.108408
1950,-0.074576,-0.09105
1960,-0.066483,-0.102074
1970,-0.12139,-0.116902
1980,-0.104128,-0.092801
1990,-0.119577,-0.114067
2000,-0.135006,-0.120756
2010,-0.142755,-0.086834


In [6]:
# Keep only the degree correlation, turn the decade index into a regular column
# and round the coefficient to three decimals for display.
# NOTE: this rebinds correlation_df, so re-running this cell without first
# re-running the cell that builds the full table raises KeyError on 'degree'.
correlation_df = (
    correlation_df[['degree']]
    .reset_index()
    .rename(columns={'index': 'decade', 'degree': 'correlation coefficient'})
    .round({'correlation coefficient': 3})
)

In [7]:
# Display the reshaped table: one row per decade with the rounded rating-degree correlation.
correlation_df

Unnamed: 0,decade,correlation coefficient
0,1920,-0.076
1,1930,-0.014
2,1940,-0.083
3,1950,-0.075
4,1960,-0.066
5,1970,-0.121
6,1980,-0.104
7,1990,-0.12
8,2000,-0.135
9,2010,-0.143


In [8]:
# Turn the per-decade correlation table into a Bokeh object and display it.
# get_bokeh_table, show, output_file and save come from the notebook's import cell.
my_table = get_bokeh_table(correlation_df)
show(my_table)
# Uncomment to write the table to a standalone HTML file as well:
# output_file('emb_corr_table.html')
# save(my_table)

In [95]:
# One tab per decade. Each tab holds the plot returned by plot_bokeh_scatter for
# the 'degree' column, given the slope and intercept of a linear regression of
# rating (y) on degree (x).
# plot_bokeh_scatter, linregress, output_file and save come from the notebook's import cell.
panels = []

for decade in decades:
    decade_df = dfs[decade]
    fit = linregress(decade_df['degree'], decade_df['rating'])
    scatter = plot_bokeh_scatter(decade_df, 'degree', fit.slope, fit.intercept)
    panels.append(TabPanel(child=scatter, title=str(decade)))

# The panel list is kept under its own name so that `tabs` only ever refers to
# the Tabs layout.
tabs = Tabs(tabs=panels, sizing_mode='stretch_width')

# Write the tabbed layout to an HTML file next to the notebook.
output_file('scatter_tabs.html')
save(tabs)

'/Users/jan.kokla/Documents/EPFL/ada-2023-project-adaroundtheworld2023/notebooks/scatter_tabs.html'