In [1]:
import networkx as nx
import pandas as pd
import matplotlib.pyplot as plt
from joblib import Parallel, delayed
import csv
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from collections import Counter


In [2]:
# Load the node table and every precomputed metric table, each ordered by 'id_node'.
nodes = pd.read_csv("../datos/musae_git_target.csv").sort_values(by='id_node')

# Centrality metrics
atributeDegreeCentrality = pd.read_csv("../metricas/degree_centrality.csv").sort_values(by="id_node")
atributeClosenessCentrality = pd.read_csv("../metricas/closeness_centrality.csv").sort_values(by='id_node')
atributeBetweennessCentrality = pd.read_csv("../metricas/betweenness_centrality.csv").sort_values(by='id_node')

# Clustering metrics
atributeClusteringCoefficient = pd.read_csv("../metricas/clustering_coefficient.csv").sort_values(by='id_node')
atributeSquareClustering = pd.read_csv("../metricas/square_clustering.csv").sort_values(by='id_node')
atributeTriangles = pd.read_csv("../metricas/triangles.csv").sort_values(by='id_node')

# Greedy-modularity communities: after sorting, the column order is reversed.
atributeGreedyModularityCommunities = (
    pd.read_csv("../metricas/greedy_modularity_communities.csv")
    .sort_values(by='id_node')
    .iloc[:, ::-1]
)

# Core number and label-propagation communities
atributeCoreNumber = pd.read_csv("../metricas/core_number.csv").sort_values(by='id_node')
atributeAsynLpaCommunities = pd.read_csv("../metricas/asyn_lpa_communities.csv").sort_values(by='id_node')


In [3]:
# Attach every precomputed metric to the node table, joining on 'id_node'.

# NOTE(review): the output recorded for this cell still lists 'Square clustering'
# and 'Core number' as column names, so the 'Squared Clustering' and 'Core Number'
# keys below apparently match no header and those two renames do nothing
# (DataFrame.rename ignores missing keys by default). Later cells refer to the
# un-renamed names, so correcting the keys requires updating those cells as
# well -- confirm against the CSV headers.

# Each metric frame paired with the header -> column-name mapping applied to it.
metric_frames = [
    (atributeDegreeCentrality, {'Degree Centrality': 'degree_centrality'}),
    (atributeClosenessCentrality, {'Closeness Centrality': 'closeness_centrality'}),
    (atributeBetweennessCentrality, {'Betweenness Centrality': 'betweenness_centrality'}),
    (atributeClusteringCoefficient, {'Clustering Coefficient': 'clustering_coefficient'}),
    (atributeSquareClustering, {'Squared Clustering': 'square_clustering'}),
    (atributeTriangles, {'Triangles': 'triangles'}),
    (atributeGreedyModularityCommunities, {'Community': 'greedy_modularity_communities'}),
    (atributeCoreNumber, {'Core Number': 'core_number'}),
    (atributeAsynLpaCommunities, {'Community': 'asyn_lpa_communities'}),
]

# Rename the metric column and cast the join key to str; both steps modify
# the metric frame itself.
for metric_frame, column_names in metric_frames:
    metric_frame.rename(columns=column_names, inplace=True)
    metric_frame['id_node'] = metric_frame['id_node'].astype(str)

# The node table needs the same str join key.
nodes['id_node'] = nodes['id_node'].astype(str)

# Outer-join the metric frames onto the node table one at a time, in list order.
for metric_frame, _ in metric_frames:
    nodes = nodes.merge(metric_frame, on='id_node', how='outer')

# Normalise the key: ids that do not parse as numbers become NaN and their rows
# are dropped; the rest are cast to int so the sort is numeric rather than
# lexicographic, then cast back to str.
numeric_ids = pd.to_numeric(nodes['id_node'], errors='coerce')
nodes = (
    nodes.assign(id_node=numeric_ids)
    .dropna(subset=['id_node'])
    .astype({'id_node': int})
    .sort_values(by='id_node', ascending=True)
    .astype({'id_node': str})
)

nodes.head(7)


Unnamed: 0,id_node,name,ml_target,degree_centrality,closeness_centrality,betweenness_centrality,clustering_coefficient,Square clustering,triangles,greedy_modularity_communities,Core number,asyn_lpa_communities
0,0,Eiryyy,0.0,2.7e-05,0.275005,0.0,0.0,0.0,0.0,0,1.0,3.0
1,1,shawflying,0.0,0.000212,0.294956,1.149733e-06,0.178571,0.072344,5.0,1,6.0,3.0
2,2,JpMCarrilho,1.0,2.7e-05,0.261845,0.0,0.0,0.0,0.0,0,1.0,3.0
3,3,SuhwanCha,0.0,0.000133,0.278718,5.316292e-05,0.0,0.019178,0.0,2,4.0,3.0
4,4,sunilangadi2,1.0,5.3e-05,0.243084,6.134318e-09,0.0,0.0,0.0,5,2.0,3.0
5,5,j6montoya,0.0,2.7e-05,0.343412,0.0,0.0,0.0,0.0,0,1.0,3.0
6,6,sfate,0.0,0.000159,0.372244,2.098552e-06,0.333333,0.038866,5.0,0,6.0,3.0


In [4]:
# Look up the row of one specific node. The id is written as a str because it
# is compared against the values of nodes['id_node'].
id_especifico = '29982'
fila_id_especifico = nodes.loc[nodes['id_node'] == id_especifico]

fila_id_especifico.head()

Unnamed: 0,id_node,name,ml_target,degree_centrality,closeness_centrality,betweenness_centrality,clustering_coefficient,Square clustering,triangles,greedy_modularity_communities,Core number,asyn_lpa_communities
29982,29982,dead-horse,0.0,0.01297,0.403407,0.001086,0.035335,0.020454,4216.0,1,34.0,3.0


In [5]:
# Rescale the selected columns with a MinMaxScaler.
columns_to_normalize = [
    'triangles',
    'asyn_lpa_communities',
    'greedy_modularity_communities',
    'Core number',
]

# Scaler built with its default settings.
scaler = MinMaxScaler()

# Fit on the selected columns and write the scaled values back into `nodes`,
# overwriting the original values of those columns.
nodes[columns_to_normalize] = scaler.fit_transform(nodes[columns_to_normalize])
nodes.head()

Unnamed: 0,id_node,name,ml_target,degree_centrality,closeness_centrality,betweenness_centrality,clustering_coefficient,Square clustering,triangles,greedy_modularity_communities,Core number,asyn_lpa_communities
0,0,Eiryyy,0.0,2.7e-05,0.275005,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,shawflying,0.0,0.000212,0.294956,1.149733e-06,0.178571,0.072344,6.2e-05,0.002227,0.151515,0.0
2,2,JpMCarrilho,1.0,2.7e-05,0.261845,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,SuhwanCha,0.0,0.000133,0.278718,5.316292e-05,0.0,0.019178,0.0,0.004454,0.090909,0.0
4,4,sunilangadi2,1.0,5.3e-05,0.243084,6.134318e-09,0.0,0.0,0.0,0.011136,0.030303,0.0


In [6]:
# Export: table with every attribute.
# Work on an independent copy so `nodes` itself is left untouched.
tableWithAllAtributes = nodes.copy(deep=True)
tableWithAllAtributes.to_csv(path_or_buf='../tablas/tableWithAllAtributes.csv', index=False)


In [7]:
# Export: table with every attribute except the clustering ones.
clustering_columns = ['clustering_coefficient', 'Square clustering', 'triangles']

# Copy first so `nodes` is left untouched, then remove the clustering columns.
nodesWithoutClustering = nodes.copy().drop(columns=clustering_columns)
nodesWithoutClustering.to_csv('../tablas/tableWithoutClustering.csv', index=False)

nodesWithoutClustering.head(7)


Unnamed: 0,id_node,name,ml_target,degree_centrality,closeness_centrality,betweenness_centrality,greedy_modularity_communities,Core number,asyn_lpa_communities
0,0,Eiryyy,0.0,2.7e-05,0.275005,0.0,0.0,0.0,0.0
1,1,shawflying,0.0,0.000212,0.294956,1.149733e-06,0.002227,0.151515,0.0
2,2,JpMCarrilho,1.0,2.7e-05,0.261845,0.0,0.0,0.0,0.0
3,3,SuhwanCha,0.0,0.000133,0.278718,5.316292e-05,0.004454,0.090909,0.0
4,4,sunilangadi2,1.0,5.3e-05,0.243084,6.134318e-09,0.011136,0.030303,0.0
5,5,j6montoya,0.0,2.7e-05,0.343412,0.0,0.0,0.0,0.0
6,6,sfate,0.0,0.000159,0.372244,2.098552e-06,0.0,0.151515,0.0


In [8]:
# Export: table with every attribute except the community ones.
community_columns = ['greedy_modularity_communities', 'asyn_lpa_communities']

# Copy first so `nodes` is left untouched, then remove the community columns.
nodesWithoutCommunity = nodes.copy().drop(columns=community_columns)
nodesWithoutCommunity.to_csv('../tablas/tableWithoutCommunity.csv', index=False)

nodesWithoutCommunity.head(7)

Unnamed: 0,id_node,name,ml_target,degree_centrality,closeness_centrality,betweenness_centrality,clustering_coefficient,Square clustering,triangles,Core number
0,0,Eiryyy,0.0,2.7e-05,0.275005,0.0,0.0,0.0,0.0,0.0
1,1,shawflying,0.0,0.000212,0.294956,1.149733e-06,0.178571,0.072344,6.2e-05,0.151515
2,2,JpMCarrilho,1.0,2.7e-05,0.261845,0.0,0.0,0.0,0.0,0.0
3,3,SuhwanCha,0.0,0.000133,0.278718,5.316292e-05,0.0,0.019178,0.0,0.090909
4,4,sunilangadi2,1.0,5.3e-05,0.243084,6.134318e-09,0.0,0.0,0.0,0.030303
5,5,j6montoya,0.0,2.7e-05,0.343412,0.0,0.0,0.0,0.0,0.0
6,6,sfate,0.0,0.000159,0.372244,2.098552e-06,0.333333,0.038866,6.2e-05,0.151515


In [9]:
# Export: table with every attribute except the core number.
core_columns = ['Core number']

# Copy first so `nodes` is left untouched, then remove the core-number column.
nodesWithoutKernel = nodes.copy().drop(columns=core_columns)
nodesWithoutKernel.to_csv('../tablas/tableWithoutKernel.csv', index=False)

nodesWithoutKernel.head(7)


Unnamed: 0,id_node,name,ml_target,degree_centrality,closeness_centrality,betweenness_centrality,clustering_coefficient,Square clustering,triangles,greedy_modularity_communities,asyn_lpa_communities
0,0,Eiryyy,0.0,2.7e-05,0.275005,0.0,0.0,0.0,0.0,0.0,0.0
1,1,shawflying,0.0,0.000212,0.294956,1.149733e-06,0.178571,0.072344,6.2e-05,0.002227,0.0
2,2,JpMCarrilho,1.0,2.7e-05,0.261845,0.0,0.0,0.0,0.0,0.0,0.0
3,3,SuhwanCha,0.0,0.000133,0.278718,5.316292e-05,0.0,0.019178,0.0,0.004454,0.0
4,4,sunilangadi2,1.0,5.3e-05,0.243084,6.134318e-09,0.0,0.0,0.0,0.011136,0.0
5,5,j6montoya,0.0,2.7e-05,0.343412,0.0,0.0,0.0,0.0,0.0,0.0
6,6,sfate,0.0,0.000159,0.372244,2.098552e-06,0.333333,0.038866,6.2e-05,0.0,0.0


In [10]:
# Export: table with every attribute except the centrality ones.
centrality_columns = ['degree_centrality', 'closeness_centrality', 'betweenness_centrality']

# Copy first so `nodes` is left untouched, then remove the centrality columns.
nodesWithoutCentrality = nodes.copy().drop(columns=centrality_columns)
nodesWithoutCentrality.to_csv('../tablas/tableWithoutCentrality.csv', index=False)

nodesWithoutCentrality.head(7)

Unnamed: 0,id_node,name,ml_target,clustering_coefficient,Square clustering,triangles,greedy_modularity_communities,Core number,asyn_lpa_communities
0,0,Eiryyy,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,shawflying,0.0,0.178571,0.072344,6.2e-05,0.002227,0.151515,0.0
2,2,JpMCarrilho,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,SuhwanCha,0.0,0.0,0.019178,0.0,0.004454,0.090909,0.0
4,4,sunilangadi2,1.0,0.0,0.0,0.0,0.011136,0.030303,0.0
5,5,j6montoya,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,6,sfate,0.0,0.333333,0.038866,6.2e-05,0.0,0.151515,0.0
