In [448]:
import pandas as pd
import csv
import numpy as np
np.random.seed(357823)
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.naive_bayes import CategoricalNB
from sklearn.metrics import confusion_matrix, recall_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score


In [449]:
# Lectura del csv para la tabla con todas las métricas
nodes = pd.read_csv("../tablas/tableWithAllAtributes.csv")

# Lectura del csv para la tabla con todos los atributos sin los de agrupamiento
nodesWithoutClustering = pd.read_csv("../tablas/tableWithoutClustering.csv")

# Lectura del csv para la tabla con todos los atributos sin los de comunidades
nodesWithoutCommunity = pd.read_csv("../tablas/tableWithoutCommunity.csv")

#Tablas con todos los atributos sin los de nucleos
nodesWithoutKernel = pd.read_csv("../tablas/tableWithoutKernel.csv")


In [450]:
nodes.head()

Unnamed: 0,id_node,name,ml_target,degree_centrality,closeness_centrality,betweenness_centrality,clustering_coefficient,Square clustering,triangles,greedy_modularity_communities,Core number,asyn_lpa_communities
0,0,Eiryyy,0.0,2.7e-05,0.275005,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,shawflying,0.0,0.000212,0.294956,1.149733e-06,0.178571,0.072344,6.2e-05,0.002227,0.151515,0.0
2,2,JpMCarrilho,1.0,2.7e-05,0.261845,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,SuhwanCha,0.0,0.000133,0.278718,5.316292e-05,0.0,0.019178,0.0,0.004454,0.090909,0.0
4,4,sunilangadi2,1.0,5.3e-05,0.243084,6.134318e-09,0.0,0.0,0.0,0.011136,0.030303,0.0


In [451]:
# PREPARACIÓN DE LOS DATOS PARA LA TABLA CON TODOS LOS ATRIBUTOS
atributos_discretos = ['name']
atributos_continuos = ['degree_centrality','closeness_centrality','betweenness_centrality','clustering_coefficient','Square clustering','triangles','greedy_modularity_communities','Core number','asyn_lpa_communities']
atributos = nodes.loc[:,['id_node'] + atributos_discretos + atributos_continuos]
atributos.head()

Unnamed: 0,id_node,name,degree_centrality,closeness_centrality,betweenness_centrality,clustering_coefficient,Square clustering,triangles,greedy_modularity_communities,Core number,asyn_lpa_communities
0,0,Eiryyy,2.7e-05,0.275005,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,shawflying,0.000212,0.294956,1.149733e-06,0.178571,0.072344,6.2e-05,0.002227,0.151515,0.0
2,2,JpMCarrilho,2.7e-05,0.261845,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,SuhwanCha,0.000133,0.278718,5.316292e-05,0.0,0.019178,0.0,0.004454,0.090909,0.0
4,4,sunilangadi2,5.3e-05,0.243084,6.134318e-09,0.0,0.0,0.0,0.011136,0.030303,0.0


In [452]:
# Objetivo
objetivo = nodes['ml_target']
objetivo.head() 

0    0.0
1    0.0
2    1.0
3    0.0
4    1.0
Name: ml_target, dtype: float64

In [453]:
# Entrenamiento del modelo KNN para la tabla con todos los atributos

In [454]:
# Codificación
codificador_atributos_discretos = OrdinalEncoder() # Crear una instancia de la clase correspondiente
codificador_atributos_discretos.fit(atributos[atributos_discretos]) # Usar el método fit para ajustar a los datos los parámetros de la codificación

# Vemos información sobre los atributos discretos
print("///Información sobre los atributos discretos///")
print('Número de atributos detectados:',
      f'{codificador_atributos_discretos.n_features_in_}')
print()
print('Nombres de los atributos detectados:')
print(f'{codificador_atributos_discretos.feature_names_in_}')
print()
print('Categorías detectadas de cada atributo:')
for atributo, categorías in zip(
    codificador_atributos_discretos.feature_names_in_,
    codificador_atributos_discretos.categories_):
    print(f'{atributo}: {categorías}')

# Ahora aplicamos el método transform para codificar los datos
atributos[atributos_discretos] = codificador_atributos_discretos.transform(
    atributos[atributos_discretos]
)
print()
print("///Tabla con la columna name cambiada a números///")
atributos.head()


///Información sobre los atributos discretos///
Número de atributos detectados: 1

Nombres de los atributos detectados:
['name']

Categorías detectadas de cada atributo:
name: ['007arunwilson' '007jedgar' '00Kai0' ... 'timothykimemia' 'timoxley'
 'timqian']

///Tabla con la columna name cambiada a números///


Unnamed: 0,id_node,name,degree_centrality,closeness_centrality,betweenness_centrality,clustering_coefficient,Square clustering,triangles,greedy_modularity_communities,Core number,asyn_lpa_communities
0,0,2325.0,2.7e-05,0.275005,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,35055.0,0.000212,0.294956,1.149733e-06,0.178571,0.072344,6.2e-05,0.002227,0.151515,0.0
2,2,4022.0,2.7e-05,0.261845,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,7220.0,0.000133,0.278718,5.316292e-05,0.0,0.019178,0.0,0.004454,0.090909,0.0
4,4,36547.0,5.3e-05,0.243084,6.134318e-09,0.0,0.0,0.0,0.011136,0.030303,0.0


In [455]:
# Como los valores ya los hemos normalizado previamente, sólo falta por normalizar el name, por tanto:

# Normalizamos el name
normalizador = MinMaxScaler(
    # Cada atributo se normaliza al intervalo [0, 1]
    feature_range=(0, 1)
)

# Aplicamos la normalización solo a la columna 'name'
atributos['name'] = normalizador.fit_transform(atributos[['name']])
atributos.head()

Unnamed: 0,id_node,name,degree_centrality,closeness_centrality,betweenness_centrality,clustering_coefficient,Square clustering,triangles,greedy_modularity_communities,Core number,asyn_lpa_communities
0,0,0.061673,2.7e-05,0.275005,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,0.929866,0.000212,0.294956,1.149733e-06,0.178571,0.072344,6.2e-05,0.002227,0.151515,0.0
2,2,0.106687,2.7e-05,0.261845,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,0.191517,0.000133,0.278718,5.316292e-05,0.0,0.019178,0.0,0.004454,0.090909,0.0
4,4,0.969442,5.3e-05,0.243084,6.134318e-09,0.0,0.0,0.0,0.011136,0.030303,0.0


In [456]:
# Dividimos los conjuntos de prueba y entrenamiento

# División 1
(atributos_entrenamiento, atributos_prueba,
 objetivo_entrenamiento, objetivo_prueba) = train_test_split(
        # Conjuntos de datos a dividir, usando los mismos índices para ambos
        atributos, objetivo,
        # Tamaño del conjunto de prueba (30 % en este caso)
        test_size=.3, # El más óptimo es con el 30% de conjunto de prueba
        # Estratificación según la distribución de clases en el atributo objetivo
        stratify=objetivo)

"""
# División 2
(atributos_entrenamiento, atributos_prueba,
 objetivo_entrenamiento, objetivo_prueba) = train_test_split(
        # Conjuntos de datos a dividir, usando los mismos índices para ambos
        atributos, objetivo,
        # Tamaño del conjunto de prueba (10 % en este caso)
        test_size=.1,
        # Estratificación según la distribución de clases en el atributo objetivo
        stratify=objetivo)
"""

'\n# División 2\n(atributos_entrenamiento, atributos_prueba,\n objetivo_entrenamiento, objetivo_prueba) = train_test_split(\n        # Conjuntos de datos a dividir, usando los mismos índices para ambos\n        atributos, objetivo,\n        # Tamaño del conjunto de prueba (10 % en este caso)\n        test_size=.1,\n        # Estratificación según la distribución de clases en el atributo objetivo\n        stratify=objetivo)\n'

In [457]:
tubería_kNN = Pipeline([
                        ('kNN', KNeighborsClassifier())])
rejilla_de_parámetros = {
    # Número de vecinos impar (tarea de clasificación binaria)
    'kNN__n_neighbors': range(1, 10, 2),
    # Considerar las distancias Manhattan y euclídea
    'kNN__metric': ['manhattan', 'euclidean']
}

In [458]:
búsqueda_en_rejilla = GridSearchCV(tubería_kNN,
                                   rejilla_de_parámetros,
                                   scoring='recall',
                                   cv=10)
búsqueda_en_rejilla.fit(atributos_entrenamiento, objetivo_entrenamiento)

In [459]:
búsqueda_en_rejilla.best_params_

{'kNN__metric': 'manhattan', 'kNN__n_neighbors': 1}

In [460]:
búsqueda_en_rejilla.best_score_

0.2750442466443603

In [461]:
clasificador_kNN_division1 = KNeighborsClassifier(
    # Para cada ejemplo se consideran los 1 ejemplos más cercanos
    n_neighbors=1,
    # La cercanía viene determinada por la distancia euclídea
    metric='manhattan'
)
clasificador_kNN_division1.fit(atributos_entrenamiento, objetivo_entrenamiento)

In [462]:
predicciones = clasificador_kNN_division1.predict(atributos_prueba)
predicciones

array([0., 0., 0., ..., 0., 0., 0.])

In [463]:
confusion_matrix(objetivo_prueba, predicciones)

array([[6262, 2126],
       [2128,  794]], dtype=int64)

In [464]:
recall_score(objetivo_prueba, predicciones)

0.27173169062286107

In [445]:

scores = cross_val_score(clasificador_kNN_division1, atributos_entrenamiento, objetivo_entrenamiento, cv=10, scoring='recall')
print("Scores de recall en el conjunto de prueba:", scores)
print("Promedio de scores de recall en el conjunto de prueba:", np.mean(scores))


Scores de recall en el conjunto de prueba: [0.28193833 0.25110132 0.26872247 0.271261   0.28592375 0.30791789
 0.26392962 0.2829912  0.24633431 0.29032258]
Promedio de scores de recall en el conjunto de prueba: 0.2750442466443603


In [446]:
clasificador_kNN_division2 = KNeighborsClassifier(
    # Para cada ejemplo se consideran los 1 ejemplos más cercanos
    n_neighbors=1,
    # La cercanía viene determinada por la distancia euclídea
    metric='manhattan'
)
clasificador_kNN_division2.fit(atributos_entrenamiento, objetivo_entrenamiento)

In [447]:
predicciones = clasificador_kNN_division2.predict(atributos_prueba)
predicciones

array([0., 0., 0., ..., 0., 0., 0.])

In [465]:
confusion_matrix(objetivo_prueba, predicciones)

array([[6262, 2126],
       [2128,  794]], dtype=int64)

In [466]:
recall_score(objetivo_prueba, predicciones)

0.27173169062286107

In [None]:
scores = cross_val_score(clasificador_kNN_division2, atributos_entrenamiento, objetivo_entrenamiento, cv=10, scoring='recall')
print("Scores de recall en el conjunto de prueba:", scores)
print("Promedio de scores de recall en el conjunto de prueba:", np.mean(scores))
