In [1]:
import torch
import random
import numpy as np
from sklearn.utils import check_random_state
import pandas as pd


# Single seed shared by every random number generator below, so the whole
# run can be re-seeded by changing one value.
SEED = 42

# Seed Python's built-in RNG.
random.seed(SEED)

# Seed NumPy's legacy global RNG. pandas and scikit-learn have no global seed
# of their own: when called with random_state=None they draw from this NumPy
# RNG, so this call covers them as well.
np.random.seed(SEED)

# NOTE: sklearn.utils.check_random_state(seed) is intentionally not called
# here. It only builds and returns a RandomState instance and does not seed
# any global state; pass random_state=SEED to scikit-learn APIs instead.

# Seed PyTorch. As the last expression of the cell, the returned Generator is
# what the notebook displays.
torch.manual_seed(SEED)

<torch._C.Generator at 0x7fb939bda010>

In [2]:
from src.load_data import get_data
from src.EchoGAE import EchoGAE_algorithm
from src.echo_chamber_measure import EchoChamberMeasure


# TODO: Work on baselines
from networkx.algorithms.community import asyn_fluidc
from src.baselines.RWC_jit import RWC
# from src.baselines.RWC import RWC # If you don't have numba installed use this line instead but it will be slower

from src.baselines.polarization_index import add_ideology_to_graph, opinion_model, get_polarization_index

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Dataset names; the experiment loop loads each one from the directory data/<name>/.
datasets = ["gun", "abortion", "super_bowl", "sxsw"]

In [4]:
def add_ideology_to_graph_22(G, ideologies_dict):
    """Return a copy of ``G`` in which every node has an ``"ideology"`` attribute.

    Args:
        G: Graph exposing ``copy()``, ``nodes()`` and ``nodes[node]``
            attribute dictionaries. The input graph itself is not modified;
            all writes go to the copy.
        ideologies_dict: Container mapping a node to its ideology value.

    Returns:
        A ``(graph_copy, core_nodes)`` tuple. Nodes found in
        ``ideologies_dict`` receive their mapped value and are listed in
        ``core_nodes`` (in node-iteration order); every other node gets
        ideology ``0``.
    """
    annotated = G.copy()

    # Nodes with a known ideology, in the order the graph yields them.
    core_nodes = [n for n in annotated.nodes() if n in ideologies_dict]

    for n in annotated.nodes():
        has_score = n in ideologies_dict
        # Unknown nodes fall back to a neutral 0.
        annotated.nodes[n]["ideology"] = ideologies_dict[n] if has_score else 0

    return annotated, core_nodes

In [5]:
# Accumulates one result dictionary per dataset.
experiments = []

# When True, the two baseline measures (RWC and the polarization index) are
# computed in addition to the echo chamber score.
use_baseline = True

# Every name assigned inside this loop is a notebook-level global, so after
# the loop finishes they hold the values from the last dataset processed.
for ds in datasets:
    print(f"Dataset ({ds}): ", end="")
    
    # Result record for this dataset; keys become columns of the results table.
    ds_dict = {}
    ds_dict["dataset"] = ds

    # Get the data (node_id_map is returned but not used in this cell)
    G, users_embeddings, labels, allsides_scores, node_id_map = get_data(f"data/{ds}/")

    # Graph information
    ds_dict["number_of_nodes"] = G.number_of_nodes()
    ds_dict["number_of_edges"] = G.number_of_edges()
    # Number of distinct values found in `labels`.
    ds_dict["number_of_communities"] = len(np.unique(labels))


    # My metric: embed the users with EchoGAE, then score the embeddings
    # against the community labels.
    # NOTE(review): the return value is presumably one embedding per node,
    # aligned with `labels` -- confirm in src.EchoGAE.
    user_emb = EchoGAE_algorithm(G, user_embeddings= users_embeddings, show_progress=False, hidden_channels = 20, out_channels=10, epochs=300)
    ecm = EchoChamberMeasure(user_emb, labels)
    eci = ecm.echo_chamber_index()
    ds_dict["echo_chamber_score"] = eci

    # The printed label says "ECS" while the value comes from
    # echo_chamber_index(); both refer to the same number.
    print(f"ECS = {eci:.3f} -- ", end=" ")

    # For communities ECIs and Sizes: one entry per distinct label, in the
    # order returned by np.unique.
    sizes = []
    ECSs = []

    for i in np.unique(labels):
        # Counts the entries equal to i (assumes `labels` is a NumPy array so
        # that `labels == i` compares elementwise -- TODO confirm).
        sizes.append(np.sum(labels == i))
        ECSs.append(ecm.community_echo_chamber_index(i))

    ds_dict["community_sizes"] = sizes
    # Stored under "community_ECIs" although the local list is named ECSs.
    ds_dict["community_ECIs"] = ECSs


    if use_baseline:
        # Baseline: split the graph into k=2 communities with NetworkX's
        # asyn_fluidc (fixed seed 42), then materialize the result as a list
        # so the two communities can be indexed.
        mem = asyn_fluidc(G, k=2, seed=42, max_iter=1000)
        mem = list(mem)

        # Node collections of the two communities as NumPy arrays.
        nodes_0 = np.array(list(mem[0]))
        nodes_1 = np.array(list(mem[1]))


        # RWC baseline computed between the two partitions
        # (presumably "Random Walk Controversy" -- confirm in src.baselines).
        rwc = RWC(G, nodes_0, nodes_1)
        ds_dict["RWC"] = rwc
        print(f"RWC = {rwc:.3f} -- ", end=" ")


        # Valenzuela metric: annotate a copy of the graph with the AllSides
        # scores (nodes without a score get ideology 0), run the opinion
        # model seeded by the scored nodes, and keep the first element of
        # what get_polarization_index returns.
        G_ven, core_nodes = add_ideology_to_graph_22(G, allsides_scores)
        opinions = opinion_model(G_ven, core_nodes=core_nodes)
        polarization_index = get_polarization_index(opinions)[0]

        ds_dict["PI"] = polarization_index
        print(f"PI = {polarization_index:.3f}")
    else:
        # Terminate the "Dataset (...): ECS = ..." line started above.
        print("")

    experiments.append(ds_dict)

    # Emits two newlines (the "\n" plus print's own line ending) to separate
    # the output of consecutive datasets.
    print("\n")

Dataset (gun): 

100%|██████████| 6941/6941 [01:21<00:00, 85.02it/s] 
 65%|██████▍   | 4471/6913 [05:15<02:52, 14.19it/s]


KeyboardInterrupt: 

In [6]:
# Build the results table: one row per dataset, one column per key stored in
# the result dictionaries.
df = pd.DataFrame(experiments)

In [7]:
# Display the results table (last expression of the cell).
df

Unnamed: 0,dataset,number_of_nodes,number_of_edges,number_of_communities,echo_chamber_score,community_sizes,community_ECIs,RWC,PI
0,gun,6566,14322,2,0.714684,"[3984, 2582]","[0.7393391787689997, 0.676642154010187]",0.420896,0.31415
1,abortion,5087,10572,2,0.614,"[3933, 1154]","[0.5983386571853089, 0.6673743112935674]",0.513219,0.185957
2,super_bowl,5460,8732,3,0.486666,"[23, 5398, 39]","[0.8093488340891872, 0.4825389652121271, 0.867...",0.27338,0.015645
3,sxsw,2436,5325,6,0.467519,"[1532, 34, 85, 717, 54, 14]","[0.41139140266375696, 0.792245645786395, 0.815...",0.479801,0.001514
