In [41]:
import pandas as pd
import csv
import numpy as np
np.random.seed(357823)
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.naive_bayes import CategoricalNB
from sklearn.metrics import confusion_matrix, recall_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler


In [42]:
# Load the CSV for the table with all the metrics
nodes = pd.read_csv("../tablas/tableWithAllAtributes.csv")

# Load the CSV for the table with every attribute except the clustering ones
nodesWithoutClustering = pd.read_csv("../tablas/tableWithoutClustering.csv")

# Load the CSV for the table with every attribute except the community ones
nodesWithoutCommunity = pd.read_csv("../tablas/tableWithoutCommunity.csv")

# Load the CSV for the table with every attribute except the core ("nucleos") ones
nodesWithoutKernel = pd.read_csv("../tablas/tableWithoutKernel.csv")


In [43]:
# Preview the first rows of the table with all the metrics
nodes.head()

Unnamed: 0,id_node,name,ml_target,degree_centrality,closeness_centrality,betweenness_centrality,clustering_coefficient,Square clustering,triangles,greedy_modularity_communities,Core number,asyn_lpa_communities
0,0,Eiryyy,0.0,2.7e-05,0.275005,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,shawflying,0.0,0.000212,0.294956,1.149733e-06,0.178571,0.072344,6.2e-05,0.002227,0.151515,0.0
2,2,JpMCarrilho,1.0,2.7e-05,0.261845,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,SuhwanCha,0.0,0.000133,0.278718,5.316292e-05,0.0,0.019178,0.0,0.004454,0.090909,0.0
4,4,sunilangadi2,1.0,5.3e-05,0.243084,6.134318e-09,0.0,0.0,0.0,0.011136,0.030303,0.0


In [44]:
# DATA PREPARATION FOR THE TABLE WITH ALL ATTRIBUTES
# 'name' is the only discrete attribute; the graph metrics below are continuous.
atributos_discretos = ['name']
atributos_continuos = [
    'degree_centrality',
    'closeness_centrality',
    'betweenness_centrality',
    'clustering_coefficient',
    'Square clustering',
    'triangles',
    'greedy_modularity_communities',
    'Core number',
    'asyn_lpa_communities',
]
# Feature table: id_node first, then the discrete and the continuous columns.
# NOTE(review): id_node and name look like per-row identifiers, yet both stay in
# the table that is later split and fed to the model — confirm this is intended.
atributos = nodes.loc[:, ['id_node', *atributos_discretos, *atributos_continuos]]
atributos.head()

Unnamed: 0,id_node,name,degree_centrality,closeness_centrality,betweenness_centrality,clustering_coefficient,Square clustering,triangles,greedy_modularity_communities,Core number,asyn_lpa_communities
0,0,Eiryyy,2.7e-05,0.275005,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,shawflying,0.000212,0.294956,1.149733e-06,0.178571,0.072344,6.2e-05,0.002227,0.151515,0.0
2,2,JpMCarrilho,2.7e-05,0.261845,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,SuhwanCha,0.000133,0.278718,5.316292e-05,0.0,0.019178,0.0,0.004454,0.090909,0.0
4,4,sunilangadi2,5.3e-05,0.243084,6.134318e-09,0.0,0.0,0.0,0.011136,0.030303,0.0


In [45]:
# Target: the ml_target column of the full table (the class to predict)
objetivo = nodes['ml_target']
objetivo.head() 

0    0.0
1    0.0
2    1.0
3    0.0
4    1.0
Name: ml_target, dtype: float64

In [46]:
# Entrenamiento del modelo de Naive Bayes para la tabla con todos los atributos

In [47]:
# Encoding of the discrete attributes, followed by min-max scaling of 'name'.
# NOTE(review): both the encoder and the scaler are fit on the whole table,
# i.e. before the train/test split performed in a later cell.
# NOTE(review): this cell overwrites columns of `atributos` in place, so it is
# not idempotent — re-running it re-encodes the already-encoded values.
codificador_atributos_discretos = OrdinalEncoder() # Create an instance of the encoder class
codificador_atributos_discretos.fit(atributos[atributos_discretos]) # Use fit to adjust the encoding parameters to the data

# Show information about the discrete attributes
print("///Información sobre los atributos discretos///")
print('Número de atributos detectados:',
      f'{codificador_atributos_discretos.n_features_in_}')
print()
print('Nombres de los atributos detectados:')
print(f'{codificador_atributos_discretos.feature_names_in_}')
print()
print('Categorías detectadas de cada atributo:')
for atributo, categorías in zip(
    codificador_atributos_discretos.feature_names_in_,
    codificador_atributos_discretos.categories_):
    print(f'{atributo}: {categorías}')

# Now apply transform to encode the data (replaces the string column with
# its ordinal code)
atributos[atributos_discretos] = codificador_atributos_discretos.transform(
    atributos[atributos_discretos]
)


# Normalise the encoded name
normalizador = MinMaxScaler(
    # Each attribute is rescaled to the interval [0, 1]
    feature_range=(0, 1)
)


# Apply the normalisation to the 'name' column only
atributos['name'] = normalizador.fit_transform(atributos[['name']])

print()
print("///Tabla con la columna name cambiada a números y normalizado el name///")
atributos.head()


///Información sobre los atributos discretos///
Número de atributos detectados: 1

Nombres de los atributos detectados:
['name']

Categorías detectadas de cada atributo:
name: ['007arunwilson' '007jedgar' '00Kai0' ... 'timothykimemia' 'timoxley'
 'timqian']

///Tabla con la columna name cambiada a números y normalizado el name///


Unnamed: 0,id_node,name,degree_centrality,closeness_centrality,betweenness_centrality,clustering_coefficient,Square clustering,triangles,greedy_modularity_communities,Core number,asyn_lpa_communities
0,0,0.061673,2.7e-05,0.275005,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,0.929866,0.000212,0.294956,1.149733e-06,0.178571,0.072344,6.2e-05,0.002227,0.151515,0.0
2,2,0.106687,2.7e-05,0.261845,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,0.191517,0.000133,0.278718,5.316292e-05,0.0,0.019178,0.0,0.004454,0.090909,0.0
4,4,0.969442,5.3e-05,0.243084,6.134318e-09,0.0,0.0,0.0,0.011136,0.030303,0.0


In [48]:
# Split the data into training and test sets.
#
# Two hold-out sizes were tried:
#   * Split 1: test_size=.2 (20 % test) — earlier experiment, not executed here.
#   * Split 2: test_size=.1 (10 % test) — the one used below.
# Both stratify on the target so each subset keeps its class distribution.
#
# NOTE(review): no random_state is passed, so the shuffle draws from NumPy's
# global RNG (seeded once in the imports cell). Re-running only this cell
# yields a different split; pass random_state=<int> to pin it.
(atributos_entrenamiento, atributos_prueba,
 objetivo_entrenamiento, objetivo_prueba) = train_test_split(
        # Datasets to split; the same row indices are used for both
        atributos, objetivo,
        # Test-set size (10 % in this case)
        test_size=.1,
        # Stratify according to the class distribution of the target
        stratify=objetivo)



In [49]:
# Discretisation.
# The 'uniform' strategy is used instead of 'quantile' (the one given in the
# course assignment): per the authors, quantile binning dropped data because the
# resulting intervals were too narrow.
discretizador = KBinsDiscretizer(
    strategy='uniform',  # equal-width intervals
    encode='ordinal',    # each interval is encoded numerically (bin index)
    n_bins=700,          # best-performing value among those the authors tried
)

# Work on a copy of the attribute table so the original continuous values in
# `atributos` are preserved.
atributos_discretizados = atributos.copy()
atributos_discretizados[atributos_continuos] = discretizador.fit_transform(
    atributos[atributos_continuos]
)
atributos_discretizados.head()

# id_node and name are left out of the discretisation performed in this cell.
# NOTE(review): the pipelines built from `discretizador` receive every column of
# the training table, so there id_node and name ARE binned as well.

Unnamed: 0,id_node,name,degree_centrality,closeness_centrality,betweenness_centrality,clustering_coefficient,Square clustering,triangles,greedy_modularity_communities,Core number,asyn_lpa_communities
0,0,0.061673,0.0,245.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,0.929866,0.0,281.0,0.0,124.0,70.0,0.0,1.0,106.0,0.0
2,2,0.106687,0.0,220.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,0.191517,0.0,251.0,0.0,0.0,18.0,0.0,3.0,63.0,0.0
4,4,0.969442,0.0,186.0,0.0,0.0,0.0,0.0,7.0,21.0,0.0


In [50]:
# Pipeline with alpha = 1, which turned out to be the best smoothing value
tubería_NB1 = Pipeline(steps=[
    ('preprocesador', discretizador),
    ('naive_Bayes', CategoricalNB(alpha=1)),
])

In [51]:
# Pipeline with a smoothing value other than 1 (alpha = 2), for comparison
tubería_NB2 = Pipeline(steps=[
    ('preprocesador', discretizador),
    ('naive_Bayes', CategoricalNB(alpha=2)),
])

In [52]:
# 10-fold cross-validation of the alpha = 1 pipeline on the training split,
# scored by recall
resultados_validación_cruzadaConAlphaOptimo = cross_validate(
    estimator=tubería_NB1,
    X=atributos_entrenamiento,
    y=objetivo_entrenamiento,
    scoring='recall',
    cv=10,
)
resultados_validación_cruzadaConAlphaOptimo

{'fit_time': array([0.03686166, 0.03129148, 0.02992058, 0.03992105, 0.03441739,
        0.03188252, 0.03724003, 0.03690147, 0.0392158 , 0.03191423]),
 'score_time': array([0.00598192, 0.00598359, 0.00598526, 0.00796914, 0.00601625,
        0.00698113, 0.00698113, 0.00797772, 0.00897622, 0.00698161]),
 'test_score': array([0.52054795, 0.51369863, 0.49771689, 0.49543379, 0.50799087,
        0.51425314, 0.51539339, 0.51197263, 0.5359179 , 0.51539339])}

In [53]:
# 10-fold cross-validation of the alpha = 2 pipeline on the training split,
# scored by recall
resultados_validación_cruzadaConAlphaNoOptimo = cross_validate(
    estimator=tubería_NB2,
    X=atributos_entrenamiento,
    y=objetivo_entrenamiento,
    scoring='recall',
    cv=10,
)
resultados_validación_cruzadaConAlphaNoOptimo

{'fit_time': array([0.03415895, 0.03172445, 0.0319488 , 0.03291178, 0.03091669,
        0.03142428, 0.03139234, 0.03202152, 0.0304327 , 0.03055191]),
 'score_time': array([0.00598407, 0.00598454, 0.0059495 , 0.00598359, 0.00598407,
        0.00498581, 0.00598073, 0.00598502, 0.00598383, 0.00499129]),
 'test_score': array([0.456621  , 0.45205479, 0.41894977, 0.43378995, 0.44406393,
        0.44355758, 0.45153934, 0.45039909, 0.48004561, 0.44697834])}

In [54]:
# Mean recall over the 10 cross-validation folds for the alpha = 1 pipeline
resultados_validación_cruzadaConAlphaOptimo['test_score'].mean()

0.5128318572551714

In [55]:
# Mean recall over the 10 cross-validation folds for the alpha = 2 pipeline
resultados_validación_cruzadaConAlphaNoOptimo['test_score'].mean()

0.44779994064447604

In [56]:
# Grid-search set-up: look for the best Laplace smoothing value (alpha)
tubería_NB = Pipeline(steps=[
    ('preprocesador', discretizador),
    ('naive_Bayes', CategoricalNB()),
])
rejilla_de_hiperparámetros = {
    # Smoothing (alpha) of the step named naive_Bayes: integers 1 through 10
    'naive_Bayes__alpha': range(1, 11),
}

In [57]:
# Exhaustive search over the alpha grid with 10-fold CV, scored by recall.
# The base estimator is tubería_NB (the pipeline built for this search, with a
# default CategoricalNB) rather than tubería_NB1: the grid sets
# naive_Bayes__alpha on every candidate, so the scores are unchanged, but the
# search no longer depends on a pipeline hard-wired to alpha = 1.
búsqueda_en_rejilla = GridSearchCV(tubería_NB,
                                   rejilla_de_hiperparámetros,
                                   scoring='recall',
                                   cv=10)
búsqueda_en_rejilla.fit(atributos_entrenamiento, objetivo_entrenamiento)

In [58]:
búsqueda_en_rejilla.best_params_ # The value 1 for the Laplace smoothing turns out to be the best option

{'naive_Bayes__alpha': 1}

In [59]:
búsqueda_en_rejilla.best_score_ # Matches the cross-validation result obtained for alpha = 1

0.5128318572551714

In [60]:
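# With Split 2 (10 % test set) the model scores slightly higher than with Split 1 (20 % test set):
# mean 10-fold cross-validated recall of 0.50 for Split 1 vs 0.51 for Split 2.
# The gap is small compared with the fold-to-fold spread seen above (about 0.50 to 0.54 for alpha = 1),
# so it should be read as indicative rather than conclusive.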