In [240]:
import networkx as nx
import pandas as pd
import matplotlib.pyplot as plt
from joblib import Parallel, delayed
import csv
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

In [247]:
# Lectura de datos
nodes = pd.read_csv("../datos/musae_git_target.csv")
nodes = nodes.sort_values(by='id_node')
atributeDegreeCentrality = pd.read_csv("../metricas/degree_centrality.csv")
atributeDegreeCentrality = atributeDegreeCentrality.sort_values(by="id_node")
atributeClosenessCentrality = pd.read_csv("../metricas/closeness_centrality.csv")
atributeClosenessCentrality = atributeClosenessCentrality.sort_values(by='id_node')
atributeBetweennessCentrality = pd.read_csv("../metricas/betweenness_centrality.csv")
atributeBetweennessCentrality = atributeBetweennessCentrality.sort_values(by='id_node')
atributeClusteringCoefficient = pd.read_csv("../metricas/clustering_coefficient.csv")
atributeClusteringCoefficient = atributeClusteringCoefficient.sort_values(by='id_node')
atributeGeneralizedDegree = pd.read_csv("../metricas/generalized_degree.csv")
atributeGeneralizedDegree = atributeGeneralizedDegree.sort_values(by='id_node')
atributeTriangles = pd.read_csv("../metricas/triangles.csv")
atributeTriangles = atributeTriangles.sort_values(by='id_node')
atributeGreedyModularityCommunities = pd.read_csv("../metricas/greedy_modularity_communities.csv")
atributeGreedyModularityCommunities = atributeGreedyModularityCommunities.sort_values(by='id_node')
atributeGreedyModularityCommunities = atributeGreedyModularityCommunities.iloc[:, ::-1]
atributeCoreNumber = pd.read_csv("../metricas/core_number.csv")
atributeCoreNumber = atributeCoreNumber.sort_values(by='id_node')
atributeAsynLpaCommunities = pd.read_csv("../metricas/asyn_lpa_communities.csv")
atributeAsynLpaCommunities = atributeAsynLpaCommunities.sort_values(by='id_node')


In [249]:
# Renombrar columnas
atributeDegreeCentrality.rename(columns={'Degree Centrality': 'degree_centrality'}, inplace=True)
atributeClosenessCentrality.rename(columns={'Closeness Centrality': 'closeness_centrality'}, inplace=True)
atributeBetweennessCentrality.rename(columns={'Betweenness Centrality': 'betweenness_centrality'}, inplace=True)
atributeClusteringCoefficient.rename(columns={'Clustering Coefficient': 'clustering_coefficient'}, inplace=True)
atributeGeneralizedDegree.rename(columns={'Generalized Degree': 'generalized_degree'}, inplace=True)
atributeTriangles.rename(columns={'Triangles': 'triangles'}, inplace=True)
atributeGreedyModularityCommunities.rename(columns={'Community': 'greedy_modularity_communities'}, inplace=True)
atributeCoreNumber.rename(columns={'Core Number': 'core_number'}, inplace=True)
atributeAsynLpaCommunities.rename(columns={'Community': 'asyn_lpa_communities'}, inplace=True)

# Convertir la columna 'id_node' a tipo de datos str en todos los DataFrames
nodes['id_node'] = nodes['id_node'].astype(str)
atributeDegreeCentrality['id_node'] = atributeDegreeCentrality['id_node'].astype(str)
atributeClosenessCentrality['id_node'] = atributeClosenessCentrality['id_node'].astype(str)
atributeBetweennessCentrality['id_node'] = atributeBetweennessCentrality['id_node'].astype(str)
atributeClusteringCoefficient['id_node'] = atributeClusteringCoefficient['id_node'].astype(str)
atributeGeneralizedDegree['id_node'] = atributeGeneralizedDegree['id_node'].astype(str)
atributeTriangles['id_node'] = atributeTriangles['id_node'].astype(str)
atributeGreedyModularityCommunities['id_node'] = atributeGreedyModularityCommunities['id_node'].astype(str)
atributeCoreNumber['id_node'] = atributeCoreNumber['id_node'].astype(str)
atributeAsynLpaCommunities['id_node'] = atributeAsynLpaCommunities['id_node'].astype(str)

# Fusionar todos los DataFrames en uno
nodes = pd.merge(nodes, atributeDegreeCentrality, on='id_node', how='outer')
nodes = pd.merge(nodes, atributeClosenessCentrality, on='id_node', how='outer')
nodes = pd.merge(nodes, atributeBetweennessCentrality, on='id_node', how='outer')
nodes = pd.merge(nodes, atributeClusteringCoefficient, on='id_node', how='outer')
nodes = pd.merge(nodes, atributeGeneralizedDegree, on='id_node', how='outer')
nodes = pd.merge(nodes, atributeTriangles, on='id_node', how='outer')
nodes = pd.merge(nodes, atributeGreedyModularityCommunities, on='id_node', how='outer')
nodes = pd.merge(nodes, atributeCoreNumber, on='id_node', how='outer')
nodes = pd.merge(nodes, atributeAsynLpaCommunities, on='id_node', how='outer')

# Convertir 'id_node' a tipo de datos numérico, manejando errores
nodes['id_node'] = pd.to_numeric(nodes['id_node'], errors='coerce')

# Eliminar filas con id_node NaN (resultantes de conversiones fallidas)
nodes = nodes.dropna(subset=['id_node'])

# Convertir 'id_node' a tipo entero
nodes['id_node'] = nodes['id_node'].astype(int)

# Ordenar por id_node
nodes_sorted = nodes.sort_values(by='id_node', ascending=True)

# Convertir 'id_node' de nuevo a string si es necesario
nodes_sorted['id_node'] = nodes_sorted['id_node'].astype(str)

nodes_sorted.head(7)


Unnamed: 0,id_node,name,ml_target,degree_centrality,closeness_centrality,betweenness_centrality,clustering_coefficient,generalized_degree,triangles,greedy_modularity_communities,Core number,asyn_lpa_communities
0,0,Eiryyy,0.0,2.7e-05,0.275005,0.0,0.0,Counter({0: 1}),0.0,0,1.0,3.0
1,1,shawflying,0.0,0.000212,0.294956,1.149733e-06,0.178571,"Counter({0: 3, 1: 2, 3: 2, 2: 1})",5.0,1,6.0,3.0
11112,2,JpMCarrilho,1.0,2.7e-05,0.261845,0.0,0.0,Counter({0: 1}),0.0,0,1.0,3.0
22223,3,SuhwanCha,0.0,0.000133,0.278718,5.316292e-05,0.0,Counter({0: 5}),0.0,2,4.0,3.0
31034,4,sunilangadi2,1.0,5.3e-05,0.243084,6.134318e-09,0.0,Counter({0: 2}),0.0,5,2.0,3.0
32145,5,j6montoya,0.0,2.7e-05,0.343412,0.0,0.0,Counter({0: 1}),0.0,0,1.0,3.0
33256,6,sfate,0.0,0.000159,0.372244,2.098552e-06,0.333333,"Counter({2: 2, 1: 2, 0: 1, 4: 1})",5.0,0,6.0,3.0


In [251]:
# Obtener la fila correspondiente al ID_NODE específico (por ejemplo, ID = 0)
id_especifico = '29982'  # Convertir el ID_NODE a tipo de datos str
fila_id_especifico = result_sorted[result_sorted['id_node'] == id_especifico]

fila_id_especifico.head()

Unnamed: 0,id_node,name,ml_target,degree_centrality,closeness_centrality,betweenness_centrality,clustering_coefficient,generalized_degree,triangles,greedy_modularity_communities,Core number,asyn_lpa_communities
22204,29982,dead-horse,0.0,0.01297,0.403407,0.001086,0.035335,"Counter({3: 37, 5: 36, 8: 33, 6: 32, 2: 30, 4:...",4216.0,1,34.0,3.0


In [244]:
# Ahora discretizamos el valor de la columna "generalized_degree" para que lo podamos utilizar más cómodamente


# Convertir la columna 'generalized_degree' a tipo numérico
nodes['generalized_degree'] = pd.to_numeric(nodes['generalized_degree'], errors='coerce')

# Eliminar filas con valores NaN en la columna 'generalized_degree'
nodes = nodes.dropna(subset=['generalized_degree'])

# Discretizar la columna 'generalized_degree'
# Supongamos que quieres discretizar en 5 categorías
bins = pd.cut(nodes['generalized_degree'], bins=5, labels=False)
nodes['generalized_degree_discretized'] = bins

nodes.head()

KeyError: 'generalized_degree'

In [246]:
#Normalizar columnas
columns_to_normalize = ['clustering_coefficient', 'triangles', 'asyn_lpa_communities','greedy_modularity_communities', 'Core number']  # Reemplaza con los nombres de las columnas que deseas normalizar

# Inicializa el escalador
scaler = MinMaxScaler()

# Aplica el escalador a las columnas seleccionadas
nodes[columns_to_normalize] = scaler.fit_transform(nodes[columns_to_normalize])
nodes.head()

KeyError: "None of [Index(['clustering_coefficient', 'triangles', 'asyn_lpa_communities',\n       'greedy_modularity_communities', 'Core number'],\n      dtype='object')] are in the [columns]"

In [None]:
# Tablas con todos los atributos sin las centralidades

# Renombrar columnas para evitar conflictos
atributeClusteringCoefficient.rename(columns={'Clustering Coefficient': 'clustering_coefficient'}, inplace=True)
atributeGeneralizedDegree.rename(columns={'Generalized Degree': 'generalized_degree'}, inplace=True)
atributeTriangles.rename(columns={'Triangles': 'triangles'}, inplace=True)
atributeGreedyModularityCommunities.rename(columns={'Community': 'greedy_modularity_communities'}, inplace=True)
atributeCoreNumber.rename(columns={'Core Number': 'core_number'}, inplace=True)
atributeAsynLpaCommunities.rename(columns={'Community': 'asyn_lpa_communities'}, inplace=True)


# Añadir columnas directamente al DataFrame nodes
nodes = pd.concat([nodes, atributeClusteringCoefficient,
                   atributeGeneralizedDegree, atributeTriangles,
                   atributeGreedyModularityCommunities, atributeCoreNumber,
                   atributeAsynLpaCommunities], axis=1)

# Eliminar las tres primeras columnas
nodes = nodes.loc[:, ~nodes.columns.duplicated()]


# Ordenar el DataFrame por la columna 'id'
nodes_sorted = nodes.sort_values(by='id')

nodes.head()


Unnamed: 0,id,name,ml_target,clustering_coefficient,generalized_degree,triangles,greedy_modularity_communities,Core number,asyn_lpa_communities
0,0.0,Eiryyy,0.0,0.0,Counter({0: 1}),0.0,0.0,0.0,0.0
1,1.0,shawflying,0.0,0.09879,"Counter({1: 9, 3: 6, 0: 5, 2: 5, 5: 2, 4: 1, 8...",0.00061,0.0,0.515152,0.0
2,2.0,JpMCarrilho,1.0,0.178571,"Counter({0: 3, 1: 2, 3: 2, 2: 1})",6.2e-05,0.0,0.151515,0.0
3,3.0,SuhwanCha,0.0,0.0,Counter({0: 2}),0.0,0.0,0.030303,0.0
4,4.0,sunilangadi2,1.0,0.176923,"Counter({1: 5, 4: 5, 3: 4, 10: 4, 8: 3, 5: 3, ...",0.001719,0.0,0.757576,0.0


In [None]:
# Obtener la fila correspondiente al ID específico (por ejemplo, ID = 0)
id_especifico = 6
fila_id_especifico = nodes_sorted.loc[nodes_sorted['id'] == id_especifico]

# Mostrar la fila
print(fila_id_especifico)

    id   name  ml_target  clustering_coefficient  \
6  6.0  sfate        0.0                0.035335   

                                  generalized_degree  triangles  \
6  Counter({3: 37, 5: 36, 8: 33, 6: 32, 2: 30, 4:...   0.052512   

   greedy_modularity_communities  Core number  asyn_lpa_communities  
6                            0.0          1.0                   0.0  


In [None]:
#Tablas con todos los atributos sin los de agrupamiento



Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,id,degree_centrality,closeness_centrality,betweenness_centrality,clustering_coefficient,generalized_degree,triangles,greedy_modularity_communities,Core number,asyn_lpa_communities
0,0.0,Eiryyy,0.0,2.7e-05,0.275005,0.0,0.0,Counter({0: 1}),0.0,0,...,0.0,2.7e-05,0.275005,0.0,0.0,Counter({0: 1}),0.0,0,1.0,3.0
1,1.0,shawflying,0.0,0.000849,0.379327,0.0001452081,0.09879,"Counter({1: 9, 3: 6, 0: 5, 2: 5, 5: 2, 4: 1, 8...",49.0,0,...,23977.0,0.000849,0.379327,0.0001452081,0.09879,"Counter({1: 9, 3: 6, 0: 5, 2: 5, 5: 2, 4: 1, 8...",49.0,0,18.0,3.0
2,2.0,JpMCarrilho,1.0,0.000212,0.294956,1.149733e-06,0.178571,"Counter({0: 3, 1: 2, 3: 2, 2: 1})",5.0,0,...,1.0,0.000212,0.294956,1.149733e-06,0.178571,"Counter({0: 3, 1: 2, 3: 2, 2: 1})",5.0,0,6.0,3.0
3,3.0,SuhwanCha,0.0,5.3e-05,0.287208,3.49129e-07,0.0,Counter({0: 2}),0.0,0,...,34526.0,5.3e-05,0.287208,3.49129e-07,0.0,Counter({0: 2}),0.0,0,2.0,3.0
4,4.0,sunilangadi2,1.0,0.001061,0.375335,2.197155e-05,0.176923,"Counter({1: 5, 4: 5, 3: 4, 10: 4, 8: 3, 5: 3, ...",138.0,0,...,2370.0,0.001061,0.375335,2.197155e-05,0.176923,"Counter({1: 5, 4: 5, 3: 4, 10: 4, 8: 3, 5: 3, ...",138.0,0,26.0,3.0


In [None]:
#Tablas con todos los atributos sin los de comunidades



In [None]:
#Tablas con todos los atributos sin los de nucleos

