In [1]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
dry_bean = fetch_ucirepo(id=602) 
  
# data (as pandas dataframes) 
X = dry_bean.data.features 
y = dry_bean.data.targets 
  
# metadata 
print(dry_bean.metadata) 
  
# variable information 
print(dry_bean.variables) 


{'uci_id': 602, 'name': 'Dry Bean', 'repository_url': 'https://archive.ics.uci.edu/dataset/602/dry+bean+dataset', 'data_url': 'https://archive.ics.uci.edu/static/public/602/data.csv', 'abstract': 'Images of 13,611 grains of 7 different registered dry beans were taken with a high-resolution camera. A total of 16 features; 12 dimensions and 4 shape forms, were obtained from the grains.', 'area': 'Biology', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 13611, 'num_features': 16, 'feature_types': ['Integer', 'Real'], 'demographics': [], 'target_col': ['Class'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 2020, 'last_updated': 'Thu Mar 28 2024', 'dataset_doi': '10.24432/C50S4B', 'creators': [], 'intro_paper': {'ID': 244, 'type': 'NATIVE', 'title': 'Multiclass classification of dry beans using computer vision and machine learning techniques', 'authors': 'M. Koklu, Ilker Ali Özkan', 'venue': 'Co

In [2]:
# Une la columna de la variable objetivo al conjunto de datos (CLASES)
X=X.join(y)

# Modifica el nombre de las columnas a números enteros
X.columns = range(17)

#Se imprime el conjunto de datos
X

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,28395,610.291,208.178117,173.888747,1.197191,0.549812,28715,190.141097,0.763923,0.988856,0.958027,0.913358,0.007332,0.003147,0.834222,0.998724,SEKER
1,28734,638.018,200.524796,182.734419,1.097356,0.411785,29172,191.272751,0.783968,0.984986,0.887034,0.953861,0.006979,0.003564,0.909851,0.998430,SEKER
2,29380,624.110,212.826130,175.931143,1.209713,0.562727,29690,193.410904,0.778113,0.989559,0.947849,0.908774,0.007244,0.003048,0.825871,0.999066,SEKER
3,30008,645.884,210.557999,182.516516,1.153638,0.498616,30724,195.467062,0.782681,0.976696,0.903936,0.928329,0.007017,0.003215,0.861794,0.994199,SEKER
4,30140,620.134,201.847882,190.279279,1.060798,0.333680,30417,195.896503,0.773098,0.990893,0.984877,0.970516,0.006697,0.003665,0.941900,0.999166,SEKER
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13606,42097,759.696,288.721612,185.944705,1.552728,0.765002,42508,231.515799,0.714574,0.990331,0.916603,0.801865,0.006858,0.001749,0.642988,0.998385,DERMASON
13607,42101,757.499,281.576392,190.713136,1.476439,0.735702,42494,231.526798,0.799943,0.990752,0.922015,0.822252,0.006688,0.001886,0.676099,0.998219,DERMASON
13608,42139,759.321,281.539928,191.187979,1.472582,0.734065,42569,231.631261,0.729932,0.989899,0.918424,0.822730,0.006681,0.001888,0.676884,0.996767,DERMASON
13609,42147,763.779,283.382636,190.275731,1.489326,0.741055,42667,231.653247,0.705389,0.987813,0.907906,0.817457,0.006724,0.001852,0.668237,0.995222,DERMASON


In [3]:
# DATOS CONOCIDOS: 50% registros aleatorios
d_conocidos = X.sample(frac=0.5, random_state=0)

# DATOS DESCONOCIDOS: 50% restante
d_desconocidos = X.drop(d_conocidos.index)

In [4]:
#Demostración del 50% de datos conocidos y desconocidos
print(f"{d_conocidos.shape[0]} registros conocidos y {d_desconocidos.shape[0]} registros desconocidos")

6806 registros conocidos y 6805 registros desconocidos


In [5]:
atributos = [0, 2, 4, 6, 10, 16] #ID = 426440 INDEX 16 es la clase
d_conocidos = d_conocidos[atributos]
d_desconocidos = d_desconocidos[atributos]

In [6]:
from scipy.spatial import distance
from tqdm.notebook import tqdm
import concurrent.futures
import numpy as np
import multiprocessing
import pandas as pd

# Convierte los datos a arreglos de numpy (las primeras cinco columnas)
d_desconocidos_array = d_desconocidos.iloc[:, :5].values
d_conocidos_array = d_conocidos.iloc[:, :5].values

# Clases de los datos conocidos
d_conocidos_classes = d_conocidos[16].values


# Computa la similitud entre un registro desconocido y todos los registros conocidos
def compute_similitudes(i):

    # Calcula la distancia euclidiana entre el registro desconocido y todos los registros conocidos
    distances = np.linalg.norm(d_conocidos_array - d_desconocidos_array[i], axis=1)
    return {
        'index': np.arange(d_conocidos_array.shape[0]),
        'similitud': distances,
        'class': d_conocidos_classes
    }

# Ejecuta la computación de similitudes en paralelo
def run_similarity_computation():
    registros_similitudes = []

    # Utiliza un ThreadPoolExecutor para ejecutar la computación en paralelo
    with concurrent.futures.ThreadPoolExecutor(max_workers=multiprocessing.cpu_count()) as executor:

        # Ejecuta la computación en paralelo para cada registro desconocido
        futures = {executor.submit(compute_similitudes, i): i for i in range(d_desconocidos_array.shape[0])}
        
        # Recopila los resultados de la computación en paralelo a medida que se completan
        for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures)):
            try:
                registros_similitudes.append(future.result())
            except Exception as e:
                print(f"Error processing future: {e}")
        
        similitudes_df = pd.DataFrame(registros_similitudes)
    return similitudes_df

registros_similitudes = run_similarity_computation()


  0%|          | 0/6805 [00:00<?, ?it/s]

In [7]:
registros_similitudes

Unnamed: 0,index,similitud,class
0,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[20622.092389658734, 32395.82344413365, 19255....","[DERMASON, DERMASON, DERMASON, SIRA, SEKER, CA..."
1,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[1321.9547887013425, 10453.974165667258, 2690....","[DERMASON, DERMASON, DERMASON, SIRA, SEKER, CA..."
2,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[21366.027499287487, 33140.03853250174, 19998....","[DERMASON, DERMASON, DERMASON, SIRA, SEKER, CA..."
3,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[14404.515111624365, 26178.5785465717, 13036.9...","[DERMASON, DERMASON, DERMASON, SIRA, SEKER, CA..."
4,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[17511.72039557044, 29282.941241743534, 16146....","[DERMASON, DERMASON, DERMASON, SIRA, SEKER, CA..."
...,...,...,...
6800,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[6632.687893518379, 18406.75081444794, 5265.33...","[DERMASON, DERMASON, DERMASON, SIRA, SEKER, CA..."
6801,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[6892.209338488729, 18666.251555278686, 5525.2...","[DERMASON, DERMASON, DERMASON, SIRA, SEKER, CA..."
6802,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[6666.059090836348, 18440.05601238625, 5298.38...","[DERMASON, DERMASON, DERMASON, SIRA, SEKER, CA..."
6803,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[6740.203143900562, 18514.262882413754, 5372.9...","[DERMASON, DERMASON, DERMASON, SIRA, SEKER, CA..."


In [8]:
# Ordena las similitudes de cada registro desconocido
def sort_row(row):
    sorted_indices = np.argsort(row['similitud'])
    row['index'] = np.array(row['index'])[sorted_indices].tolist()
    row['similitud'] = np.array(row['similitud'])[sorted_indices].tolist()
    row['class'] = np.array(row['class'])[sorted_indices].tolist()
    return row

# Aplica la función de ordenamiento a cada fila
registros_similitudes = registros_similitudes.apply(sort_row, axis=1)

In [9]:
# K's predefinidas
K = [3, 5, 7, 11, 13]

# Función para predecir las clases de los registros desconocidos
def predict_classes(k):
    predictions = []
    for i, row in registros_similitudes.iterrows():
        top_k_classes = row['class'][:k]
        predicted_class = max(set(top_k_classes), key=top_k_classes.count)
        predictions.append(predicted_class)
    return predictions

for k in K:
    registros_similitudes[f'predicted_class_k_{k}'] = predict_classes(k)

registros_similitudes

Unnamed: 0,index,similitud,class,predicted_class_k_3,predicted_class_k_5,predicted_class_k_7,predicted_class_k_11,predicted_class_k_13
0,"[3345, 1831, 2214, 2080, 3891, 2351, 2143, 238...","[41.28262148934, 73.96372098537354, 79.6751847...","[SIRA, SIRA, SIRA, HOROZ, HOROZ, SIRA, HOROZ, ...",SIRA,SIRA,SIRA,SIRA,HOROZ
1,"[4488, 3030, 2068, 513, 5290, 2538, 3233, 1146...","[50.711019103118645, 56.14997527764809, 64.954...","[DERMASON, DERMASON, SEKER, DERMASON, SEKER, S...",DERMASON,DERMASON,DERMASON,DERMASON,SEKER
2,"[2502, 1460, 4467, 6322, 3875, 747, 4809, 147,...","[57.92290977096799, 91.13562994718343, 101.906...","[SIRA, SEKER, HOROZ, HOROZ, HOROZ, HOROZ, HORO...",HOROZ,HOROZ,HOROZ,HOROZ,HOROZ
3,"[4523, 970, 4981, 5235, 5913, 860, 607, 634, 2...","[35.0137804195386, 62.82197255497016, 79.32017...","[SEKER, SIRA, SEKER, SIRA, SEKER, SIRA, SIRA, ...",SEKER,SEKER,SIRA,SIRA,SIRA
4,"[3266, 3322, 4787, 4408, 1624, 4588, 596, 249,...","[136.90500733414027, 151.48940688918918, 200.3...","[SIRA, SIRA, SIRA, SIRA, SIRA, SIRA, SIRA, HOR...",SIRA,SIRA,SIRA,SIRA,SIRA
...,...,...,...,...,...,...,...,...
6800,"[6182, 6069, 490, 2785, 6439, 1941, 1142, 3420...","[19.606630419328365, 25.734030091075248, 31.47...","[SIRA, DERMASON, SIRA, SEKER, SEKER, DERMASON,...",SIRA,SEKER,SEKER,SEKER,SEKER
6801,"[4347, 1263, 4768, 4980, 5148, 5317, 1860, 501...","[33.02481230679712, 39.31995819253115, 55.9499...","[SEKER, DERMASON, SIRA, SEKER, SIRA, SIRA, SIR...",DERMASON,SEKER,SIRA,SEKER,SEKER
6802,"[1941, 3420, 6439, 6182, 2785, 651, 2026, 6069...","[9.170551258255989, 25.78246859434094, 29.0918...","[DERMASON, SEKER, SEKER, SIRA, SEKER, DERMASON...",SEKER,SEKER,DERMASON,DERMASON,DERMASON
6803,"[6077, 920, 1483, 3414, 4288, 6550, 1876, 651,...","[15.360450471087523, 30.62969253146595, 30.804...","[SIRA, DERMASON, DERMASON, SIRA, SIRA, SEKER, ...",DERMASON,SIRA,SIRA,DERMASON,DERMASON


In [10]:
def confusion_matrix(k):
    classes = d_desconocidos[16].unique().tolist()
    num_classes = len(classes)

    # Inicializar con ceros
    confusion_matrix = np.zeros((num_classes, num_classes), dtype=int)

    # Extraer los valores de las clases reales y predichas
    actual_values = d_desconocidos[16].values
    predicted_values = registros_similitudes[f'predicted_class_k_{k}'].values

    # Calcular la matriz de confusión
    for actual, predicted in zip(actual_values, predicted_values):
        # Obtener los índices de las clases
        actual_index = classes.index(actual)
        predicted_index = classes.index(predicted)

        # Incrementar la celda correspondiente
        confusion_matrix[actual_index, predicted_index] += 1
        
    # Crearlo como un DataFrame
    confusion_df = pd.DataFrame(
        confusion_matrix, 
        index=[i for i in classes], 
        columns=[i for i in classes]
    )
    print(f"Confusion matrix for k={k} with {confusion_df.sum().sum()} total predictions")

    return confusion_df

confusion_matrices = [confusion_matrix(k) for k in K]

Confusion matrix for k=3 with 6805 total predictions
Confusion matrix for k=5 with 6805 total predictions
Confusion matrix for k=7 with 6805 total predictions
Confusion matrix for k=11 with 6805 total predictions
Confusion matrix for k=13 with 6805 total predictions


In [11]:
import math

# Calcula la sensibilidad
def calc_sens(confusion_matrix):
    # Inicializa la lista de sensibilidades
    sensitivities = []

    # Recorre cada clase
    for i in range(len(confusion_matrix.columns)):
        # Calcula los valores de la matriz de confusión (VP y FN)
        TP = confusion_matrix.iloc[i,i]
        FN = confusion_matrix.iloc[i].sum() - TP

        # Calcula la sensibilidad
        sensitivity = TP/(TP+FN)

        # Agrega la sensibilidad a la lista
        sensitivities.append(sensitivity)
    return sensitivities

# Calcula la especificidad
def calc_spec(confusion_matrix):
    # Inicializa la lista de especificidades
    specificities = []

    # Recorre cada clase
    for column in confusion_matrix.columns:
        # Calcula los valores de la matriz de confusión (TN, FP y TP)
        TN = confusion_matrix.sum().sum() - confusion_matrix[column].sum() - confusion_matrix.loc[column].sum() + confusion_matrix.loc[column][column]
        TP = confusion_matrix.loc[column][column]
        FP = confusion_matrix[column].sum() - TP

        # Calcula la especificidad
        specificity = TN/(TN+FP)

        # Agrega la especificidad a la lista
        specificities.append(specificity)
    return specificities

# Calcula el DDP
def calc_DDP(sensitivity, specificity):
    return math.sqrt((1-sensitivity)**2 + (1-specificity)**2)

ddps = []
for i in range(len(K)):
    #Calcula la sensibilidad y especificidad
    sens = calc_sens(confusion_matrices[i])
    spec = calc_spec(confusion_matrices[i])

    #Calcula el DDP usando el promedio de sensibilidad y especificidad
    ddp = calc_DDP(np.mean(sens), np.mean(spec))
    ddps.append(ddp)
    print(f"DDP for K {K[i]}: {ddp}")

for i in range(len(K)):
    # Imprime el mejor DDP según el valor menor
    if ddps[i] == min(ddps):
        print(f"\nEL MEJOR DDP ES PARA K={K[i]} CON UN VALOR DE DDP DE: {ddps[i]}")



DDP for K 3: 0.3350783702561002
DDP for K 5: 0.33297239853348115
DDP for K 7: 0.3406647053167041
DDP for K 11: 0.34249764495883417
DDP for K 13: 0.3483375156668893

EL MEJOR DDP ES PARA K=5 CON UN VALOR DE DDP DE: 0.33297239853348115
