In [1]:
import networkx as nx
import pickle
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
import scipy.sparse

In [153]:
def getBasicInfo(G):
    """Compute summary statistics for a networkx graph.

    The result is a 13-element tuple that callers index by position:

        0  number of nodes
        1  number of edges
        2  diameter of the largest connected component (None if it could
           not be computed)
        3  density
        4  average degree
        5  average of the per-node clustering coefficients
        6  global clustering (transitivity)
        7  maximum degree
        8  minimum degree
        9  fraction of nodes with degree zero
        10 degree distribution (list, one entry per node)
        11 clustering distribution (list, one entry per node)
        12 list of nodes whose degree equals the maximum degree

    Raises:
        ZeroDivisionError: if G has no nodes (the averages divide by the
            node count).
    """
    nodes = G.number_of_nodes()
    edges = G.number_of_edges()
    density = nx.density(G)

    try:
        biggest_cc = G.subgraph(max(nx.connected_components(G), key=len))
        diameter = nx.diameter(biggest_cc)
    except Exception:
        # Best effort: any ordinary failure (e.g. no components at all) yields
        # None. Unlike a bare `except:`, this lets KeyboardInterrupt through so
        # a long diameter computation can still be interrupted.
        diameter = None

    # Walk the degree view once and derive every degree statistic from it.
    degree_pairs = list(G.degree())
    degree_distribution = [d for _, d in degree_pairs]
    avg_degree = sum(degree_distribution) / nodes

    clustering_distribution = list(nx.clustering(G).values())
    average_clustering = sum(clustering_distribution) / nodes
    global_clustering = nx.transitivity(G)

    max_degree = max(degree_distribution)
    min_degree = min(degree_distribution)
    zero_degree = sum(1 for d in degree_distribution if d == 0) / nodes
    node_with_max_degree = [n for n, d in degree_pairs if d == max_degree]

    return nodes, edges, diameter, density, avg_degree, average_clustering, \
           global_clustering, \
           max_degree, min_degree, zero_degree, degree_distribution, \
           clustering_distribution, node_with_max_degree
           

def printBasicInfo(info_list):
    """Print the scalar statistics held in a positional info sequence.

    Args:
        info_list: indexable sequence; positions 0-9 and 12 are printed, each
            on its own line after its label.
    """
    labelled_positions = (
        ('Number of nodes: ', 0),
        ('Number of edges: ', 1),
        ('Diameter: ', 2),
        ('Density: ', 3),
        ('Average degree: ', 4),
        ('Average clustering: ', 5),
        ('Global clustering: ', 6),
        ('Max degree: ', 7),
        ('Min degree: ', 8),
        ('Zero degree: ', 9),
        # Positions 10 and 11 are skipped on purpose; they are not printed.
        ('Node with max degree: ', 12),
    )
    for label, position in labelled_positions:
        print(label, info_list[position])


def plot_CCDF(data: list, title: str, xlabel: str, ylabel: str, save_path: str) -> None:
    """
    Plot the values returned by CCDF(data) against their 1-based rank and
    save the figure.

    Args:
        data: values passed to CCDF().
        title: figure title.
        xlabel: x-axis label.
        ylabel: y-axis label.
        save_path: destination file; its parent directory is created when
            missing. A bare file name (no directory part) is saved into the
            current working directory.
    """
    ccdf = CCDF(data)
    plt.figure(figsize=(10, 6))
    plt.plot(range(1, len(ccdf) + 1), ccdf, 'o', markersize=3)
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    target_dir = os.path.dirname(save_path)
    # dirname is '' for a bare file name, and os.makedirs('') raises.
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)
    plt.savefig(save_path)

def CCDF(data: list) -> list:
    """
    Return 1 minus the normalised running sum of the sorted values.

    The input is copied into a numpy array and sorted ascending; the
    cumulative sum along axis 0 is divided by the grand total and subtracted
    from 1. The caller's sequence is not modified.

    NOTE(review): this is the complementary cumulative *share of the total*,
    not an empirical P(X > x) - confirm this is the intended definition.
    """
    ordered = np.sort(np.array(data))
    total = ordered.sum()
    running_share = np.cumsum(ordered, axis=0) / total
    return 1 - running_share

def modularidadePorAtributo(G, atributo):
    """Modularity of the partition obtained by grouping nodes by an attribute.

    Args:
        G: networkx graph whose nodes carry the attribute.
        atributo: name of the node attribute used to form the communities;
            nodes sharing a value end up in the same community.

    Returns:
        The value returned by networkx's community modularity function.

    Note:
        Nodes that lack the attribute are placed in no community and G is not
        modified here. networkx documents that modularity raises NotAPartition
        when the communities do not cover every node, so callers should drop
        such nodes beforehand.
    """
    node_atributes = nx.get_node_attributes(G, atributo)

    # Append in place instead of rebuilding the list on every node, which was
    # quadratic in the community size.
    groups = {}
    for node, value in node_atributes.items():
        groups.setdefault(value, []).append(node)

    communities = [set(members) for members in groups.values()]
    return nx.algorithms.community.modularity(G, communities)

def removeWeirdNodes(G, atributo='age'):
    """Remove, in place, every node of G that lacks a given node attribute.

    Args:
        G: networkx graph; it is mutated.
        atributo: attribute a node must carry to be kept. Defaults to 'age',
            the attribute this function previously hard-coded.

    Returns:
        The same graph object G, after removal.
    """
    attributed = nx.get_node_attributes(G, atributo)
    # Materialise the list first so the graph is not mutated while iterating.
    missing = [node for node in G.nodes() if node not in attributed]
    G.remove_nodes_from(missing)
    return G

import matplotlib.pyplot as plt

import matplotlib.pyplot as plt

def plotMetricEvolutionOverTime(infos_years):
    """Save one line plot per scalar metric across the years 2000-2023.

    Args:
        infos_years: mapping from year string (e.g. '2005') to the positional
            info tuple produced by getBasicInfo.

    Each figure is written to 'plots/<metric name>.png', with a red dashed
    vertical line at every marked year.

    Only positions 0-9 of the info tuple are plotted. The former eleventh
    label, 'Node with max degree', was paired with position 10, which holds
    the per-node degree distribution (a list), so it could not be drawn as a
    yearly series; it is no longer plotted.

    Raises:
        KeyError: if a year between 2000 and 2023 is missing from infos_years.
    """
    marked_years = [2022, 2018, 2014, 2010, 2006, 2002, 1998]
    # (label, position in the getBasicInfo tuple)
    scalar_metrics = [
        ('Number of nodes', 0),
        ('Number of edges', 1),
        ('Diameter', 2),
        ('Density', 3),
        ('Average degree', 4),
        ('Average clustering', 5),
        ('Global clustering', 6),
        ('Max degree', 7),
        ('Min degree', 8),
        ('Zero degree', 9),
    ]
    years = list(range(2000, 2024))

    # savefig does not create missing directories.
    os.makedirs('plots', exist_ok=True)

    for name, position in scalar_metrics:
        metric = [infos_years[str(year)][position] for year in years]
        # The diameter may be None; plot it as a gap instead of a None value.
        metric = [float('nan') if value is None else value for value in metric]

        # Make each plot from scratch, years as x label, save
        plt.figure()
        plt.plot(years, metric, label=name, marker='o')

        # Add red vertical lines at marked years without legend
        for marked_year in marked_years:
            plt.axvline(x=marked_year, color='red', linestyle='--')

        plt.title(name)
        plt.xlabel('Year')
        plt.ylabel(name)
        plt.savefig(f'plots/{name}.png')



def plotMetricEvolutionOverTime2(infos_terms):
    """Save one line plot per scalar metric across terms 51-57.

    Args:
        infos_terms: mapping from term string (e.g. '55') to the positional
            info tuple produced by getBasicInfo.

    Each figure is written to 'plots/<metric name>_terms.png'.

    Only positions 0-9 of the info tuple are plotted. The former eleventh
    label, 'Node with max degree', was paired with position 10, which holds
    the per-node degree distribution (a list), so it could not be drawn as a
    per-term series; it is no longer plotted.

    Raises:
        KeyError: if a term between 51 and 57 is missing from infos_terms.
    """
    # (label, position in the getBasicInfo tuple)
    scalar_metrics = [
        ('Number of nodes', 0),
        ('Number of edges', 1),
        ('Diameter', 2),
        ('Density', 3),
        ('Average degree', 4),
        ('Average clustering', 5),
        ('Global clustering', 6),
        ('Max degree', 7),
        ('Min degree', 8),
        ('Zero degree', 9),
    ]
    terms = list(range(51, 58))

    # savefig does not create missing directories.
    os.makedirs('plots', exist_ok=True)

    for name, position in scalar_metrics:
        metric = [infos_terms[str(term)][position] for term in terms]
        # The diameter may be None; plot it as a gap instead of a None value.
        metric = [float('nan') if value is None else value for value in metric]

        # Make each plot from scratch, terms as x label, save
        plt.figure()
        plt.plot(terms, metric, label=name, marker='o')
        plt.title(name)
        plt.xlabel('Term')
        plt.ylabel(name)
        plt.savefig(f'plots/{name}_terms.png')


In [None]:
# Load every pickled network, compute its basic statistics and the modularity
# of each attribute-induced partition, and file the results by year or term.
netwoks_folder = 'data/networks/'
# os.listdir returns entries in arbitrary order; sort for a reproducible run.
networks = sorted(os.listdir(netwoks_folder))
networks = [netwoks_folder + network for network in networks]
networks = [network for network in networks if network.endswith('.gpickle')]
infos_years = {}
infos_terms = {}
modularidade_por_atributo_ano = {}
modularidade_por_atributo_terms = {}
# Loop-invariant list of node attributes used to form partitions.
atributos = ['education', 'gender','siglaUf', 'siglaPartido', 'region', 'occupation', 'marital_status', 'ethnicity', 'age']
for network in networks:
    print(network)
    # NOTE(review): pickle.load can execute arbitrary code; only load files
    # from a trusted source.
    with open(network, 'rb') as f:
        G = pickle.load(f)
    try:
        G = removeWeirdNodes(G)
        info_list = getBasicInfo(G)
    except Exception as e:
        # Stop at the first network that cannot be summarised.
        print('Error in network: ', network)
        print(e)
        break
    printBasicInfo(info_list)
    # plot_CCDF(info_list[10], 'Degree distribution', 'Degree', 'CCDF', 'plots/degree_distribution.png')

    modularidade_por_atributo = {
        atributo: modularidadePorAtributo(G, atributo) for atributo in atributos
    }

    # File name without the 'network.gpickle' suffix, e.g. '2005' or '55'.
    key = network.split('network.gpickle')[0].split('/')[-1]
    # Classify on the key only, so a '20' elsewhere in the path cannot turn a
    # term file into a year file.
    if '20' in key:
        infos_years[key] = info_list
        modularidade_por_atributo_ano[key] = modularidade_por_atributo
    else:
        infos_terms[key] = info_list
        modularidade_por_atributo_terms[key] = modularidade_por_atributo

In [None]:
# Save one plot per basic metric across terms, using the per-term info tuples
# collected in infos_terms.
plotMetricEvolutionOverTime2(infos_terms)

In [None]:
def plotModularityOverTime(modularidade_por_atributo_ano):
    """Save one modularity-over-years plot per node attribute.

    Args:
        modularidade_por_atributo_ano: mapping from year string (e.g. '2005')
            to a dict of attribute name -> modularity value.

    Each figure covers 2000-2023, carries red and green dashed vertical lines
    at the two sets of marked years, and is written to
    'plots/modularity/<attribute>_modularity.png'.

    Raises:
        KeyError: if a year or attribute is missing from the mapping.
    """
    atributos_list = [
        'education','gender','siglaUf','siglaPartido','region','occupation','marital_status','ethnicity','age'
    ]
    years = list(range(2000, 2024))

    # savefig does not create missing directories.
    os.makedirs('plots/modularity', exist_ok=True)

    for atributo in atributos_list:
        metric = [modularidade_por_atributo_ano[str(year)][atributo] for year in years]

        # Open the new figure first: previously the marker lines were drawn
        # before plt.figure(), so they ended up on the preceding figure.
        plt.figure()

        for marked_year in [2022, 2018, 2014, 2010, 2006, 2002]:
            plt.axvline(x=marked_year, color='red', linestyle='--')

        for marked_year in [2020, 2016, 2012, 2008, 2004, 2000]:
            plt.axvline(x=marked_year, color='green', linestyle='--')

        plt.plot(years, metric, label=atributo, marker='o')
        plt.title(atributo)
        plt.xlabel('Year')
        plt.ylabel(atributo)

        plt.savefig(f'plots/modularity/{atributo}_modularity.png')

def plotModularityOverTime2(modularidade_por_atributo_terms):
    """Save one modularity-over-terms plot per node attribute.

    Args:
        modularidade_por_atributo_terms: mapping from term string (e.g. '55')
            to a dict of attribute name -> modularity value.

    Each figure covers terms 51-57 and is written to
    'plots/modularity/<attribute>_modularity_terms.png'.

    Raises:
        KeyError: if a term or attribute is missing from the mapping.
    """
    atributos_list = [
        'education','gender','siglaUf','siglaPartido','region','occupation','marital_status','ethnicity','age'
    ]
    terms = list(range(51, 58))

    # savefig does not create missing directories.
    os.makedirs('plots/modularity', exist_ok=True)

    for atributo in atributos_list:
        metric = [modularidade_por_atributo_terms[str(term)][atributo] for term in terms]

        # Make each plot from scratch, terms as x label, save
        plt.figure()
        plt.plot(terms, metric, label=atributo, marker='o')
        plt.title(atributo)
        plt.xlabel('Term')
        plt.ylabel(atributo)
        # One tick per term, rotated for legibility.
        plt.xticks(terms, rotation=45)

        plt.savefig(f'plots/modularity/{atributo}_modularity_terms.png')

# Write the per-attribute modularity plots: first keyed by year, then by term.
plotModularityOverTime(modularidade_por_atributo_ano)
plotModularityOverTime2(modularidade_por_atributo_terms)

In [2]:
def highest_pagerank(G):
    '''
    Find the node with the largest PageRank score in a network.

    Returns a pair (node, degree of that node). When several nodes tie for
    the top score, the first one in the PageRank mapping's order is returned.
    '''
    scores = nx.pagerank(G)
    top_node, _ = max(scores.items(), key=lambda item: item[1])
    degree_lookup = dict(G.degree())
    return top_node, degree_lookup[top_node]

def highest_degree_function(G):
    '''
    Find the node with the largest degree in a network.

    Returns a pair (node, degree). When several nodes tie for the largest
    degree, the first one in the degree view's order is returned.
    '''
    degree_by_node = dict(G.degree())
    top_node, top_degree = max(degree_by_node.items(), key=lambda item: item[1])
    return top_node, top_degree


In [None]:
# For every pickled network, record the top node by PageRank and by degree,
# filed by year or by term.
pagerank_year = {}
pagerank_term = {}
degree_term = {}
degree_year = {}
netwoks_folder = 'data/networks/'
# os.listdir returns entries in arbitrary order; sort for a reproducible run.
networks = sorted(os.listdir(netwoks_folder))
networks = [netwoks_folder + network for network in networks]
networks = [network for network in networks if network.endswith('.gpickle')]

for network in networks:
    print(network)
    # NOTE(review): pickle.load can execute arbitrary code; only load files
    # from a trusted source.
    with open(network, 'rb') as f:
        G = pickle.load(f)

    # File name without the 'network.gpickle' suffix, e.g. '2005' or '55'.
    key = network.split('network.gpickle')[0].split('/')[-1]
    top_by_pagerank = highest_pagerank(G)
    top_by_degree = highest_degree_function(G)

    # Classify on the key only, so a '20' elsewhere in the path cannot turn a
    # term file into a year file.
    if '20' in key:
        pagerank_year[key] = top_by_pagerank
        degree_year[key] = top_by_degree
    else:
        pagerank_term[key] = top_by_pagerank
        degree_term[key] = top_by_degree

In [4]:
# Read the congresspeople table and keep only the two columns used for the
# id -> name lookup.
congresspersons = 'data/congresspeople/enriched_congresspeople.csv'
congresspersons_df = pd.read_csv(congresspersons).loc[:, ['id', 'nomeEleitoral']]

In [22]:
# One row per year: the top-PageRank node's display name and its degree.
# Rows are gathered in a list and the frame is built once, instead of growing
# it with pd.concat on every iteration (quadratic copying).
page_rank_records = []
for year, (node_id, node_degree) in pagerank_year.items():
    # First distinct 'nomeEleitoral' registered for this id; raises IndexError
    # if the id is absent from congresspersons_df.
    nome = congresspersons_df.loc[congresspersons_df['id'] == node_id, 'nomeEleitoral'].unique()[0]
    nome = ' '.join(subnome.capitalize() for subnome in nome.split(' '))
    page_rank_records.append({'year': year, 'nome': nome, 'degree': node_degree})

# Explicit columns keep the frame well-formed (and sortable by 'year') even
# when pagerank_year is empty.
df_page_rank = pd.DataFrame(page_rank_records, columns=['year', 'nome', 'degree'])

In [23]:
# One row per year: the highest-degree node's display name and its degree.
# Rows are gathered in a list and the frame is built once, instead of growing
# it with pd.concat on every iteration (quadratic copying).
degree_records = []
for year, (node_id, node_degree) in degree_year.items():
    # First distinct 'nomeEleitoral' registered for this id; raises IndexError
    # if the id is absent from congresspersons_df.
    nome = congresspersons_df.loc[congresspersons_df['id'] == node_id, 'nomeEleitoral'].unique()[0]
    nome = ' '.join(subnome.capitalize() for subnome in nome.split(' '))
    degree_records.append({'year': year, 'nome': nome, 'degree': node_degree})

# Explicit columns keep the frame well-formed (and sortable by 'year') even
# when degree_year is empty.
df_degree = pd.DataFrame(degree_records, columns=['year', 'nome', 'degree'])

In [26]:
# Order both result tables by year; rebinding keeps the cell safe to re-run.
df_page_rank = df_page_rank.sort_values(by='year')
df_degree = df_degree.sort_values(by='year')

In [28]:
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs('data/results', exist_ok=True)
df_degree.to_csv('data/results/degree.csv', index=False)
df_page_rank.to_csv('data/results/pagerank.csv', index=False)