In [1]:
from datasets import Dataset, load_dataset, features, ClassLabel, load_from_disk

import pandas as pd
import numpy as np
import scipy.stats as stats
import researchpy as rp
from itertools import chain

import networkx as nx
import community as community_louvain
from networkx.algorithms.community import k_clique_communities
from networkx.algorithms import bipartite

import seaborn as sns
import matplotlib.pyplot as plt
import pydot
from networkx.drawing.nx_pydot import graphviz_layout

from llm_mri import LLM_MRI

  from .autonotebook import tqdm as notebook_tqdm


# Classe de métricas

In [2]:
class LLM_metrics:
    """Network metrics for one layered activation graph.

    Node names are expected to look like ``"<layer>_<rest>"``: the integer
    before the first underscore is taken as the layer the node belongs to.
    On construction every node is tagged with that layer and two weighted
    one-mode projections are built (one onto the even-layer nodes, one onto
    the odd-layer nodes).

    Parameters
    ----------
    Graph : networkx graph
        Graph whose edges carry a ``weight`` attribute. It is mutated in
        place: a ``layer`` attribute is written on every node.
    label : str
        Free-form tag copied into every metrics dictionary.
    model_name : str
        Model identifier copied into every metrics dictionary.
    map_dimensions : int
        Map size copied into every metrics dictionary.
    total_num_of_layers : int
        Highest layer index; layers ``0..total_num_of_layers`` are considered.
    """

    def __init__(self, Graph, label, model_name, map_dimensions, total_num_of_layers):
        self.Graph = Graph
        self.label = label
        self.model_name = model_name
        self.map_dimensions = map_dimensions
        self.layers = total_num_of_layers

        # Tag each node with the layer it belongs to: the leading integer of
        # its name. Raises ValueError if a node name does not start with one.
        for n in self.Graph.nodes:
            self.Graph.nodes[n]['layer'] = int(n.split('_')[0])

        self.projection_even, self.projection_odd = self.project_graph()

    @staticmethod
    def _group_by_layer_prefix(node_values):
        """Group a ``{node_name: value}`` mapping by the text before the first ``_``."""
        grouped = {}
        for node_name, value in node_values.items():
            grouped.setdefault(node_name.split("_")[0], []).append(value)
        return grouped

    def _layer_keys(self):
        """Return the layer indices ``0..self.layers`` as strings, in order."""
        return [str(layer) for layer in range(self.layers + 1)]

    def project_graph(self):
        """Build the even-layer and odd-layer weighted projections.

        Requires every node to already carry its ``layer`` attribute (set in
        ``__init__``). Nodes whose layer is outside ``0..self.layers`` are
        placed in neither node set.

        Returns
        -------
        tuple
            ``(projection_even, projection_odd)`` as produced by
            ``bipartite.collaboration_weighted_projected_graph``.
        """
        nodes_even_layers = set()
        nodes_odd_layers = set()

        # Single pass over the nodes instead of rescanning them once per layer.
        for n, d in self.Graph.nodes(data=True):
            layer = d['layer']
            if not 0 <= layer <= self.layers:
                continue
            if layer % 2 == 0:
                nodes_even_layers.add(n)
            else:
                nodes_odd_layers.add(n)

        # NOTE(review): this treats the graph as bipartite between even and odd
        # layers, i.e. presumably edges only link adjacent layers - confirm.
        return (
            bipartite.collaboration_weighted_projected_graph(self.Graph, nodes_even_layers),
            bipartite.collaboration_weighted_projected_graph(self.Graph, nodes_odd_layers),
        )

    def get_degree_by_layer(self):
        """Return mean and variance of the node degree for every layer.

        Returns
        -------
        pandas.DataFrame
            Columns ``layer`` (layer index as a string), ``mean`` and ``var``.
            Rows are ordered from layer 0 upwards; the index labels run from
            ``self.layers`` down to 0, as this method has always returned them.
            Layers with no nodes get NaN for both statistics, layers with a
            single node get NaN variance.
        """
        # Compute the degree table once and bucket it by layer prefix.
        degrees_by_layer = self._group_by_layer_prefix(dict(self.Graph.degree()))

        rows = []
        for layer_key in self._layer_keys():
            layer_degrees = pd.Series(degrees_by_layer.get(layer_key, []), dtype=float)
            rows.append([layer_key, layer_degrees.mean(), layer_degrees.var()])

        return pd.DataFrame(
            rows,
            columns=['layer', 'mean', 'var'],
            index=pd.RangeIndex(self.layers, -1, -1),
        )

    def get_graph_center_of_mass(self):
        """Return the node-count-weighted mean layer offset.

        Each layer ``i`` in ``0..self.layers`` contributes
        ``count(nodes in i) * (i - self.layers / 2)``; the sum is divided by
        the total number of nodes in the graph. Returns NaN for a graph with
        no nodes instead of raising ZeroDivisionError.
        """
        node_names = list(self.Graph.nodes())
        if not node_names:
            return float('nan')

        nodes_by_layer = self._group_by_layer_prefix(dict.fromkeys(node_names))
        half_depth = self.layers / 2

        center_of_mass = 0
        for layer in range(self.layers + 1):
            center_of_mass += len(nodes_by_layer.get(str(layer), [])) * (layer - half_depth)

        return center_of_mass / len(node_names)

    def get_graph_center_of_strength(self):
        """Return the sum over layers of ``std(strength) * (layer - self.layers / 2)``.

        Strength is the weighted degree (``weight`` edge attribute). The sample
        standard deviation is undefined (NaN) for a layer with fewer than two
        nodes; such layers are skipped so that a single sparse layer does not
        turn the whole metric into NaN.
        """
        strengths_by_layer = self._group_by_layer_prefix(
            dict(self.Graph.degree(weight='weight'))
        )
        half_depth = self.layers / 2

        center_of_strength = 0.0
        for layer in range(self.layers + 1):
            spread = pd.Series(strengths_by_layer.get(str(layer), []), dtype=float).std()
            if pd.isna(spread):
                # 0 or 1 node in this layer: no measurable spread.
                continue
            center_of_strength += spread * (layer - half_depth)

        return center_of_strength

    def get_graph(self):
        """Return the underlying graph object."""
        return self.Graph

    def get_basic_metrics(self):
        """Return whole-graph metrics as a flat dictionary.

        Includes mean/variance/skewness/kurtosis of degree and of strength
        (weighted degree), weighted degree assortativity, density, the two
        layer "center" metrics, and the identifying fields ``model_name``,
        ``map_dimensions`` and ``label``.
        """
        # Build each distribution once instead of once per statistic.
        degrees = pd.Series(list(dict(self.Graph.degree()).values()), dtype=float)
        strengths = pd.Series(
            list(dict(self.Graph.degree(weight='weight')).values()), dtype=float
        )

        return {
            "mean_degree": degrees.mean(),
            "var_degree": degrees.var(),
            "skew_degree": degrees.skew(),
            "kurt_degree": degrees.kurt(),
            "mean_strength": strengths.mean(),
            "var_strength": strengths.var(),
            "skew_strength": strengths.skew(),
            "kurt_strength": strengths.kurt(),
            # "average_node_connectivity": nx.average_node_connectivity(self.Graph),
            "assortativity": nx.degree_assortativity_coefficient(self.Graph, weight='weight'),
            "density": nx.density(self.Graph),
            "center_of_mass": self.get_graph_center_of_mass(),
            "center_of_strength": self.get_graph_center_of_strength(),
            "model_name": self.model_name,
            "map_dimensions": self.map_dimensions,
            "label": self.label
        }

    def _projection_metrics(self, projection, side):
        """Return the metrics dictionary for one projection, tagged with ``side``."""
        degrees = pd.Series(list(dict(projection.degree()).values()), dtype=float)
        strengths = pd.Series(
            list(dict(projection.degree(weight='weight')).values()), dtype=float
        )

        return {
            "mean_degree": degrees.mean(),
            "var_degree": degrees.var(),
            "mean_strength": strengths.mean(),
            "var_strength": strengths.var(),
            "average_clustering": nx.average_clustering(projection, weight="weight"),
            # "average_node_connectivity": nx.average_node_connectivity(projection),
            "assortativity": nx.degree_assortativity_coefficient(projection, weight='weight'),
            "density": nx.density(projection),
            # The average path length is only defined on a connected graph.
            # NOTE(review): edge weights are used as distances here - confirm
            # that is intended for collaboration weights.
            "average_shortest_path": nx.average_shortest_path_length(projection, weight="weight") if nx.is_connected(projection) else float('NaN'),
            "model_name": self.model_name,
            "map_dimensions": self.map_dimensions,
            "label": self.label,
            "side": side
        }

    def get_projection_metrics_even(self):
        """Return the metrics of the even-layer projection (``side == "even"``)."""
        return self._projection_metrics(self.projection_even, "even")

    def get_projection_metrics_odd(self):
        """Return the metrics of the odd-layer projection (``side == "odd"``)."""
        return self._projection_metrics(self.projection_odd, "odd")

    def get_basic_metrics_list_of_names(self):
        """Return a fixed list of metric/column names.

        NOTE(review): this list does not match the keys of
        ``get_basic_metrics`` (it has ``average_clustering`` and ``side`` but
        lacks ``center_of_mass`` and ``center_of_strength``), nor those of the
        projection metrics. It is returned unchanged - confirm which set of
        columns callers expect.
        """
        return [
            'mean_degree',
            'var_degree',
            'skew_degree',
            'kurt_degree',
            "mean_strength",
            "var_strength",
            "skew_strength",
            "kurt_strength",
            'average_clustering',
            # 'average_node_connectivity',
            'assortativity',
            'density',
            'model_name',
            'map_dimensions',
            'label',
            'side',
        ]

# Analisando Ativações do Grafo

In [3]:
# NOTE: everything in this cell is commented-out exploratory scratch code; nothing here executes.
# G_top = nx.read_gml('graphs/dpoc_bert-base-portuguese-cased_10_top.gml')
# G_botton = nx.read_gml('graphs/dpoc_bert-base-portuguese-cased_10_botton.gml')
# top_metrics = LLM_metrics(G_top, 'top', 'bert-base-portuguese-cased', 10, 12)
# top_metrics.get_basic_metrics()

# camadas = []
# for x in range(12):
#     camadas.append(str(x))
# center_of_edge = 0
# top_edges_by_layer = []
# for i in camadas:
#     center_of_edge += ((pd.Series([v['weight'] for k, v in dict(G_top.edges()).items() if (k[0].split("_")[0] == str(int(i)+1) and  k[1].split("_")[0] == str(int(i)+1)) or (k[1].split("_")[0] == str(int(i)+1) and  k[0].split("_")[0] == str(int(i)+1))]).sum()) * (int(i) - (12 / 2)))
#     top_edges_by_layer.append(pd.Series([v['weight'] for k, v in dict(G_top.edges()).items() if (k[0].split("_")[0] == i and  k[1].split("_")[0] == str(int(i)+1)) or (k[1].split("_")[0] == i and  k[0].split("_")[0] == str(int(i)+1))]).sum())
#     print(((pd.Series([v['weight'] for k, v in dict(G_top.edges()).items() if (k[0].split("_")[0] == i and  k[1].split("_")[0] == str(int(i)+1)) or (k[1].split("_")[0] == i and  k[0].split("_")[0] == str(int(i)+1))]).sum())))
# print("centro", center_of_edge)
# center_of_edge = 0
# top_edges_by_layer = []
# for i in camadas:
#     center_of_edge += ((pd.Series([v['weight'] for k, v in dict(G_botton.edges()).items() if (k[0].split("_")[0] == str(int(i)+1) and  k[1].split("_")[0] == str(int(i)+1)) or (k[1].split("_")[0] == str(int(i)+1) and  k[0].split("_")[0] == str(int(i)+1))]).sum()) * (int(i) - (12 / 2)))
#     top_edges_by_layer.append(pd.Series([v['weight'] for k, v in dict(G_botton.edges()).items() if (k[0].split("_")[0] == i and  k[1].split("_")[0] == str(int(i)+1)) or (k[1].split("_")[0] == i and  k[0].split("_")[0] == str(int(i)+1))]).sum())
#     print(((pd.Series([v['weight'] for k, v in dict(G_botton.edges()).items() if (k[0].split("_")[0] == i and  k[1].split("_")[0] == str(int(i)+1)) or (k[1].split("_")[0] == i and  k[0].split("_")[0] == str(int(i)+1))]).sum())))
# print("centro", center_of_edge)
# center_of_edge/pd.Series([v['weight'] for k, v in dict(G_top.edges()).items()]).sum()
# top_edges_by_layer
# pd.Series([v['weight'] for k, v in dict(G_top.edges()).items()]).
# plt.scatter(camadas,top_edges_by_layer)
# import math 
# camadas = []
# for x in range(12 + 1):
#     camadas.append(str(x))
# center_of_strenght = 0
# sum_var = 0
# array_of_strenght = []
# for i in camadas:
#     center_of_strenght += ((pd.Series([v for k, v in dict(G_top.degree(weight='weight')).items() if k.split("_")[0] == i]).std()) * (int(i) - (12 / 2)))
#     array_of_strenght.append(((pd.Series([v for k, v in dict(G_top.degree(weight='weight')).items() if k.split("_")[0] == i]).std()) * (int(i) - (12 / 2))))
#     sum_var += abs(((pd.Series([v for k, v in dict(G_top.degree(weight='weight')).items() if k.split("_")[0] == i]).std()) * (int(i) - (12 / 2))))
#     # print(center_of_strenght)
#     # print('momento:', ((pd.Series([v for k, v in dict(G_top.degree(weight='weight')).items() if k.split("_")[0] == i]).var())))
# pd.Series(array_of_strenght).mean()
# print("Centro of std:", center_of_strenght)
# print("Centro por soma de vairância:", center_of_strenght / sum_var)
# print("Centro por soma total:", center_of_strenght/pd.Series([v for k, v in dict(G_top.degree(weight='weight')).items() if k.split("_")[0] == i]).sum())
# print("Centro por soma média:", center_of_strenght/pd.Series([v for k, v in dict(G_top.degree(weight='weight')).items() if k.split("_")[0] == i]).mean())


In [4]:
# Experiments to process; one pair of CSV files is written per experiment.
experiments = [
    "iam",
    "dpoc"
]

lang = "pt"
# # lang = "en"

map_dimensions = [
    5,
    10,
    25
]

# Hugging Face model id -> number of layers passed to LLM_metrics.
models = { 
    # "nlpie/tiny-clinicalbert": 4,          
    # "distilbert-base-uncased": 6,
    # "google-bert/bert-base-uncased": 12,
    # "emilyalsentzer/Bio_ClinicalBERT": 12,
    # "google-bert/bert-large-uncased": 24,

    "neuralmind/bert-base-portuguese-cased": 12, # 110M
    "pucpr/biobertpt-all": 12, # 110M
    "google-bert/bert-base-multilingual-cased": 12, # 110M
    "neuralmind/bert-large-portuguese-cased": 24, # 330M
    
    # "openai-community/gpt2": 12,
    # "openai-community/gpt2-large": 36
    # "FacebookAI/xlm-roberta-large": 24,
    # "facebook/xlm-roberta-xl": 36,          # 3.48B

    "pierreguillou/gpt2-small-portuguese": 12, # 1.5B
    "pucpr/gpt2-bio-pt": 12, # 1.5B
}

for exp in experiments:
    # One batch of metric rows per (model, dimension) combination.
    basic_batches = []
    projection_batches = []

    for model_name, number_of_layers in models.items():
        # Graph files and metric rows are keyed by the id without the org prefix.
        model_id = model_name.split('/')[-1]

        for dimension in map_dimensions:
            print("Getting metrics:", exp, " - ", model_name, " - ", dimension, " - ", number_of_layers)

            graph_prefix = 'graphs/' + exp + '_' + model_id + '_' + str(dimension)
            G_top = nx.read_gml(graph_prefix + '_top.gml')
            G_botton = nx.read_gml(graph_prefix + '_botton.gml')
            # NOTE(review): G_composed is not used in this cell; the load is kept
            # in case a later cell relies on it - drop it if nothing does.
            G_composed = nx.read_gml(graph_prefix + '_composed.gml')

            top_metrics = LLM_metrics(G_top, 'top', model_id, dimension, number_of_layers)
            botton_metrics = LLM_metrics(G_botton, 'botton', model_id, dimension, number_of_layers)

            basic_batches.append([
                top_metrics.get_basic_metrics(),
                botton_metrics.get_basic_metrics()
            ])

            projection_batches.append([
                top_metrics.get_projection_metrics_even(),
                top_metrics.get_projection_metrics_odd(),
                botton_metrics.get_projection_metrics_even(),
                botton_metrics.get_projection_metrics_odd()
            ])

    # Build each frame once. Batches are emitted newest-first, which is the row
    # order the previous prepend-with-concat loop produced.
    df_basic = pd.DataFrame(
        [row for batch in reversed(basic_batches) for row in batch]
    )
    df_projection = pd.DataFrame(
        [row for batch in reversed(projection_batches) for row in batch]
    )

    # Write inside the experiment loop: the frames are rebuilt for every
    # experiment, so saving after the loop would keep only the last one.
    df_basic.to_csv('data/comparison_basic_metrics_pt_' + exp + '.csv', index=False)
    df_projection.to_csv('data/comparison_projection_metrics_pt_' + exp + '.csv', index=False)


Getting metrics: iam  -  neuralmind/bert-base-portuguese-cased  -  5  -  12
Getting metrics: iam  -  neuralmind/bert-base-portuguese-cased  -  10  -  12
Getting metrics: iam  -  neuralmind/bert-base-portuguese-cased  -  25  -  12
Getting metrics: iam  -  pucpr/biobertpt-all  -  5  -  12
Getting metrics: iam  -  pucpr/biobertpt-all  -  10  -  12
Getting metrics: iam  -  pucpr/biobertpt-all  -  25  -  12
Getting metrics: iam  -  google-bert/bert-base-multilingual-cased  -  5  -  12
Getting metrics: iam  -  google-bert/bert-base-multilingual-cased  -  10  -  12
Getting metrics: iam  -  google-bert/bert-base-multilingual-cased  -  25  -  12
Getting metrics: iam  -  neuralmind/bert-large-portuguese-cased  -  5  -  24
Getting metrics: iam  -  neuralmind/bert-large-portuguese-cased  -  10  -  24
Getting metrics: iam  -  neuralmind/bert-large-portuguese-cased  -  25  -  24
Getting metrics: iam  -  pierreguillou/gpt2-small-portuguese  -  5  -  12
Getting metrics: iam  -  pierreguillou/gpt2-smal