In [37]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [38]:
import os
import sys
sys.path.append("../..")
sys.path.append("../../..")

In [39]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx
import csv

In [40]:
# Build the combined node-metrics .csv from the typology graphs (I-d, I-e and IV-n)

In [41]:
# Every typology's GraphML export follows the same directory/file naming
# scheme, so the mapping typology -> path is derived from the typology name.
paths = {
    typology: f'new-data_{typology}_high_5_2_all/network_graph_phi.graphml'
    for typology in ('I-d', 'I-e', 'IV-n')
}

# Accumulator keyed by node id; load_graph_data fills it in place.
node_data = {}

In [42]:
def load_graph_data(path, prefix):
    """Merge the node attributes of one GraphML file into ``node_data``.

    Args:
        path: Location of the GraphML file to read.
        prefix: Tag prepended (as ``"{prefix}_{attribute}"``) to every
            attribute name so values coming from different graphs do not
            overwrite each other.

    Side effects:
        Mutates the module-level ``node_data`` dict in place. A node seen for
        the first time gets a record seeded with its ``label`` (empty string
        when the graph has none); later graphs only add prefixed attributes.
    """
    graph = nx.read_graphml(path)
    for node_id, attrs in graph.nodes(data=True):
        # The label is stored once, un-prefixed, by whichever graph
        # introduces the node first.
        record = node_data.setdefault(node_id, {'label': attrs.get('label', '')})
        # All other attributes are namespaced by the prefix; the label is
        # skipped here to avoid duplicating it per graph.
        record.update(
            (f"{prefix}_{key}", val)
            for key, val in attrs.items()
            if key != 'label'
        )


# Fold each typology's graph into node_data, using the typology name as the
# column prefix.
for prefix, path in paths.items():
    load_graph_data(path=path, prefix=prefix)

# One row per node id, one column per (typology, attribute) pair; the node id
# becomes the first CSV column, named 'node_id'.
df = pd.DataFrame.from_dict(data=node_data, orient='index')
df.to_csv(path_or_buf='combined_graph_data.csv', index_label='node_id')


In [30]:
import os
import pandas as pd
import numpy as np
import scipy.stats as st
import matplotlib.pyplot as plt

class NetworkStatisticalAnalysis:
    """Compare node-metric ranges between accounts with and without typologies.

    The metrics CSV is split into two groups of rows:

    * "positive": at least one of the quantity columns is > 0;
    * "zero": every quantity column is exactly 0.

    For each requested metric an interval ``mean ± z * std`` is computed per
    group and drawn as a vertical bar. Note that this uses the standard
    deviation of the values themselves (not the standard error of the mean),
    so the bar describes the spread of individual nodes.

    Args:
        network_metric_path: Path (or file-like object) of the metrics CSV,
            passed straight to ``pandas.read_csv``.
        attributes: Column names of the metrics to analyse.
        alpha: Significance level; the two-sided standard-normal critical
            value ``z`` is taken at ``1 - alpha / 2``.
        quantity_columns: Optional column name(s) used to split the rows into
            the two groups. Defaults to ``DEFAULT_QUANTITY_COLUMNS``.
    """

    # Columns used to split rows into the "positive" and "zero" groups when
    # the caller does not supply its own.
    DEFAULT_QUANTITY_COLUMNS = ('I-d_quantity_i-d', 'I-e_quantity_i-e')
    # Directory (relative to the working directory) receiving the figures.
    OUTPUT_DIR = 'statistical_outputs'

    def __init__(self, network_metric_path, attributes, alpha=0.05, quantity_columns=None):
        self.alpha = alpha
        # Two-sided critical value of the standard normal distribution.
        self._z_i = st.norm.ppf(1 - alpha / 2)
        self.attributes = attributes
        # Short display names used as figure titles.
        self.metric_names = {"I-d_betweenness_node": "betweenness", "I-d_degree_node": "degree"}
        self._data = pd.read_csv(network_metric_path)
        self.get_data_coefficients()

        if isinstance(quantity_columns, str):
            # A bare string would otherwise be split into characters.
            quantity_columns = [quantity_columns]
        self.quantity_columns = list(quantity_columns or self.DEFAULT_QUANTITY_COLUMNS)

        # Split the rows: any quantity > 0 versus all quantities == 0.
        # Rows with missing quantities match neither test and are left out
        # of both groups.
        quantities = self._data[self.quantity_columns]
        self.filtered_data_positive = self._data[(quantities > 0).any(axis=1)]
        self.filtered_data_zero = self._data[(quantities == 0).all(axis=1)]

    def get_data_coefficients(self):
        """Store, in ``self.data_coefficients``, one array per requested attribute."""
        self.data_coefficients = [self._data[attr].to_numpy() for attr in self.attributes]

    def calculate_lower_bound(self, coefficients):
        """Return ``mean - z * std`` of ``coefficients`` (a NumPy array)."""
        return coefficients.mean() - self._z_i * coefficients.std()

    def calculate_upper_bound(self, coefficients):
        """Return ``mean + z * std`` of ``coefficients`` (a NumPy array)."""
        return coefficients.mean() + self._z_i * coefficients.std()

    def _draw_interval(self, ax, values, x, color, label):
        """Draw one vertical interval bar at ``x`` and return ``(lower, upper)``."""
        lower = self.calculate_lower_bound(values)
        upper = self.calculate_upper_bound(values)
        ax.plot([x, x], [lower, upper], label=label, color=color)
        # Dashed caps and numeric annotations at both ends of the bar; the
        # lower label hangs below its cap, the upper label sits above.
        for bound, valign in ((lower, 'top'), (upper, 'bottom')):
            ax.hlines(bound, x - 0.1, x + 0.1, color=color, linestyle='--')
            ax.text(x, bound, f'{bound:.3f}', color=color, ha='center', va=valign)
        return lower, upper

    def plot_confidence_interval_for_metric(self, metric, plt_title=''):
        """Plot and save the per-group intervals for one metric.

        The figure is written to
        ``statistical_outputs/{metric}_confidence_interval.png`` and the
        bounds of each non-empty group are printed. A group with no rows is
        skipped (with a printed notice) instead of raising.

        Args:
            metric: Column name of the metric to plot.
            plt_title: Accepted for backward compatibility; it is not
                rendered in the figure.
        """
        os.makedirs(self.OUTPUT_DIR, exist_ok=True)
        metric_name = self.metric_names.get(metric, metric)

        # (printed name, rows, x position, colour, legend label)
        groups = (
            ('Positive', self.filtered_data_positive, 1.0, 'blue',
             "Accounts with at least one typology"),
            ('Zero', self.filtered_data_zero, 1.2, 'red',
             "Accounts with no typology"),
        )

        fig, ax = plt.subplots(figsize=(8, 5))
        try:
            drew_any = False
            for group_name, frame, x, color, label in groups:
                values = frame[metric].to_numpy()
                if len(values) == 0:
                    # Nothing to summarise: no bounds exist for this group.
                    print(f'{metric} | {group_name}: no rows in this group, interval skipped')
                    continue
                lower, upper = self._draw_interval(ax, values, x, color, label)
                drew_any = True
                print(f'{metric} | {group_name} Lower bound: {lower:.3f}, Upper bound: {upper:.3f}')

            ax.set_xticks([])
            ax.set_xlabel("")
            ax.set_ylabel("Attribute Value")
            ax.set_title(f'{metric_name}')
            if drew_any:
                # A legend with no labelled artists only triggers a warning.
                ax.legend()

            fig.savefig(f'{self.OUTPUT_DIR}/{metric}_confidence_interval.png')
        finally:
            # Always release the figure, even if drawing or saving failed.
            plt.close(fig)

    def plot_all_metrics(self):
        """Plot and save the interval figure for every configured attribute."""
        for metric in self.attributes:
            self.plot_confidence_interval_for_metric(metric, plt_title=f" - {metric}")

if __name__ == '__main__':
    # Metric columns of the combined CSV to summarise and plot.
    metrics = ["I-d_betweenness_node", "I-d_degree_node"]

    network_analysis = NetworkStatisticalAnalysis(
        network_metric_path='./combined_graph_data.csv',
        attributes=metrics,
    )
    # Writes one PNG per metric and prints the bounds of each group.
    network_analysis.plot_all_metrics()


I-d_betweenness_node | Positive Lower bound: -0.008, Upper bound: 0.012
I-d_betweenness_node | Zero Lower bound: -0.004, Upper bound: 0.004
I-d_degree_node | Positive Lower bound: -26.830, Upper bound: 50.544
I-d_degree_node | Zero Lower bound: -10.066, Upper bound: 12.637
