# Preprocesamiento de los datos

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tabulate import tabulate

## Cargar los datos del archivo `data.csv`

### Leer el Dataset

In [2]:
# Load the dataset from the CSV file.
# header=None makes pandas treat the first line as data, so the columns get
# integer labels (0, 1, 2, ...) instead of names.
file_path = "../data/data.csv"
df = pd.read_csv(file_path, header=None)

## Valores faltantes en el DataFrame

In [3]:
# Count the missing entries in each column
missing_values = df.isnull().sum()

# Report the per-column counts
print("Valores faltantes por columna:")
print(missing_values)

# Closing verdict: warn when at least one column has a non-zero count
print(
    "\n¡Advertencia! Hay columnas con valores faltantes en el dataset."
    if missing_values.any()
    else "\nNo hay valores faltantes en el dataset."
)

Valores faltantes por columna:
0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
14    0
15    0
16    0
17    0
18    0
19    0
20    0
21    0
22    0
23    0
24    0
25    0
26    0
27    0
28    0
29    0
30    0
31    0
dtype: int64

No hay valores faltantes en el dataset.


## Nombre de las 30 características

| **Mean**     |                |                             | I     | **Error**    |                |                             | I     | **Worst**    |                  |                                |
|--------------|----------------|-----------------------------|-------|--------------|----------------|-----------------------------|-------|--------------|------------------|--------------------------------|
| Nombre Corto | Característica | Nombre de la Característica | **I** | Nombre Corto | Característica | Nombre de la Característica | **I** | Nombre Corto | Característica   | Nombre de la Característica    |
|--------------|----------------|-----------------------------|       |--------------|----------------|-----------------------------|       |--------------|------------------|--------------------------------|
| feat01       | feature01      | mean radius                 | **I** | feat11       | feature11      | radius error                | **I** |  feat21      | feature21        | worst radius                   |
| feat02       | feature02      | mean texture                | **I** | feat12       | feature12      | texture error               | **I** |  feat22      | feature22        | worst texture                  |
| feat03       | feature03      | mean perimeter              | **I** | feat13       | feature13      | perimeter error             | **I** |  feat23      | feature23        | worst perimeter                |
| feat04       | feature04      | mean area                   | **I** | feat14       | feature14      | area error                  | **I** |  feat24      | feature24        | worst area                     |
| feat05       | feature05      | mean smoothness             | **I** | feat15       | feature15      | smoothness error            | **I** |  feat25      | feature25        | worst smoothness               |
| feat06       | feature06      | mean compactness            | **I** | feat16       | feature16      | compactness error           | **I** |  feat26      | feature26        | worst compactness              |
| feat07       | feature07      | mean concavity              | **I** | feat17       | feature17      | concavity error             | **I** |  feat27      | feature27        | worst concavity                |
| feat08       | feature08      | mean concave points         | **I** | feat18       | feature18      | concave points error        | **I** |  feat28      | feature28        | worst concave points           |
| feat09       | feature09      | mean symmetry               | **I** | feat19       | feature19      | symmetry error              | **I** |  feat29      | feature29        | worst symmetry                 |
| feat10       | feature10      | mean fractal dimension      | **I** | feat20       | feature20      | fractal dimension error     | **I** |  feat30      | feature30        | worst fractal dimension        |

In [4]:
# Build the short feature names feat01 .. feat30
feature_names = [f'feat{i:02d}' for i in range(1, 31)]

# Drop the ID column (integer label 0, because the CSV was read without a
# header); the ID carries no information for the analysis.
# errors='ignore' keeps this cell re-runnable: `df` is rebound here, so on a
# second run label 0 no longer exists and a plain drop would raise KeyError.
df = df.drop(columns=[0], errors='ignore')

# Name the remaining columns: the diagnosis label followed by the 30 features
df.columns = ['diagnosis'] + feature_names

# Show the first records as a table
print("\nPrimeros registros del dataset:\n")
print(tabulate(df.head(21), headers='keys', tablefmt='rst', showindex=True, floatfmt='.4f'))


Primeros registros del dataset:

  ..  diagnosis      feat01    feat02    feat03     feat04    feat05    feat06    feat07    feat08    feat09    feat10    feat11    feat12    feat13    feat14    feat15    feat16    feat17    feat18    feat19    feat20    feat21    feat22    feat23     feat24    feat25    feat26    feat27    feat28    feat29    feat30
   0  M             17.9900   10.3800  122.8000  1001.0000    0.1184    0.2776    0.3001    0.1471    0.2419    0.0787    1.0950    0.9053    8.5890  153.4000    0.0064    0.0490    0.0537    0.0159    0.0300    0.0062   25.3800   17.3300  184.6000  2019.0000    0.1622    0.6656    0.7119    0.2654    0.4601    0.1189
   1  M             20.5700   17.7700  132.9000  1326.0000    0.0847    0.0786    0.0869    0.0702    0.1812    0.0567    0.5435    0.7339    3.3980   74.0800    0.0052    0.0131    0.0186    0.0134    0.0139    0.0035   24.9900   23.4100  158.8000  1956.0000    0.1238    0.1866    0.2416    0.1860    0.2750    0.0890
   2  

## Información sobre el DataFrame

In [5]:
# Print a summary of the frame: index range, column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 31 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   diagnosis  569 non-null    object 
 1   feat01     569 non-null    float64
 2   feat02     569 non-null    float64
 3   feat03     569 non-null    float64
 4   feat04     569 non-null    float64
 5   feat05     569 non-null    float64
 6   feat06     569 non-null    float64
 7   feat07     569 non-null    float64
 8   feat08     569 non-null    float64
 9   feat09     569 non-null    float64
 10  feat10     569 non-null    float64
 11  feat11     569 non-null    float64
 12  feat12     569 non-null    float64
 13  feat13     569 non-null    float64
 14  feat14     569 non-null    float64
 15  feat15     569 non-null    float64
 16  feat16     569 non-null    float64
 17  feat17     569 non-null    float64
 18  feat18     569 non-null    float64
 19  feat19     569 non-null    float64
 20  feat20    

## Mostrar información básica del dataset

In [6]:
# Report the basic dimensions of the dataset
n_rows, n_cols = df.shape
print("Dimensiones del dataset:")
print(f"Número de muestras: {n_rows}")
print(f"Número de características: {n_cols - 1}")  # minus 1 to leave out 'diagnosis'

# Class distribution: absolute counts and percentages
class_distribution = df['diagnosis'].value_counts()
class_percentages = df['diagnosis'].value_counts(normalize=True) * 100

print("\nDistribución de clases:")
for class_code, class_label in (("B", "Benigno"), ("M", "Maligno")):
    print(
        f"{class_code} ({class_label}):",
        class_distribution[class_code],
        f"muestras ({class_percentages[class_code]:.1f}%)",
    )

Dimensiones del dataset:
Número de muestras: 569
Número de características: 30

Distribución de clases:
B (Benigno): 357 muestras (62.7%)
M (Maligno): 212 muestras (37.3%)


# Estadística descriptiva
Descripción de los datos de entrenamiento utilizando una serie de métricas que permiten describir las columnas del Dataset.

In [9]:
import os
# Check which directory we are in. It will most likely be:
# ............../perceptron/notebooks
print(os.getcwd())

/Users/apa/Documents/github/ai/perceptron/notebooks


In [10]:
import sys
sys.path.append('..')  # Add the parent of the working directory to the import path so the `src` package can be imported
# NOTE(review): wildcard import — presumably the source of the ft_* helpers
# (ft_count, ft_mean, ...) used by the cells below; consider importing them by name.
from src.ft_functions import *

In [11]:
def calculate_metrics(df):
    """Compute descriptive statistics for every float64 column of ``df``.

    Args:
        df: pandas DataFrame; only its float64 columns are analysed.

    Returns:
        dict mapping each float64 column name to a dict of statistics keyed by
        "Count", "Mean", "Std", "Min", "25%", "50%", "75%", "Max", "IQR",
        "Skewness", "Kurtosis" and "CV" (in that order). Missing values are
        dropped from a column before its statistics are computed.
    """
    # (label, statistic) pairs, listed in the order they appear in the result.
    # The ft_* helpers are not defined in this cell; they must already be in scope.
    statistics = (
        ("Count", ft_count),
        ("Mean", ft_mean),
        ("Std", ft_std),
        ("Min", ft_min),
        ("25%", lambda sample: ft_percentile(sample, 0.25)),
        ("50%", ft_median),
        ("75%", lambda sample: ft_percentile(sample, 0.75)),
        ("Max", ft_max),
        ("IQR", ft_iqr),
        ("Skewness", ft_skewness),
        ("Kurtosis", ft_kurtosis),
        ("CV", ft_cv),
    )

    summary = {}
    for column in df.select_dtypes(include=['float64']).columns:
        # The helpers receive a plain Python list with the NaN entries removed
        sample = df[column].dropna().tolist()
        summary[column] = {label: compute(sample) for label, compute in statistics}
    return summary

In [12]:
def print_metrics_table(metrics):
    """Print the metrics as a grid: one row per statistic, one column per feature.

    Args:
        metrics: dict mapping a column name to a dict of statistics; each inner
            dict must contain every key listed in ``stat_order`` below.
    """
    # Row order of the printed table
    stat_order = (
        "Count", "Mean", "Std", "Min", "25%", "50%", "75%", "Max",
        "IQR", "Skewness", "Kurtosis", "CV",
    )

    def _render(value):
        # Floats are shown with six decimals; anything else uses its default text form
        return f"{value:.6f}" if isinstance(value, float) else f"{value}"

    column_names = list(metrics.keys())
    rows = [
        [stat] + [_render(metrics[name][stat]) for name in column_names]
        for stat in stat_order
    ]

    # The empty first header sits above the column of statistic names
    print(tabulate(rows, headers=[""] + column_names, tablefmt="fancy_grid"))

In [16]:
def analyze_dataframe(df):
    """Compute the descriptive metrics of ``df`` and print them as a table.

    Args:
        df: DataFrame handed to ``calculate_metrics``; the resulting dict is
            passed straight to ``print_metrics_table``.
    """
    print_metrics_table(calculate_metrics(df))

analyze_dataframe(df)

╒══════════╤════════════╤════════════╤════════════╤═════════════╤════════════╤════════════╤════════════╤════════════╤════════════╤════════════╤════════════╤════════════╤════════════╤═══════════╤════════════╤════════════╤════════════╤════════════╤════════════╤════════════╤════════════╤════════════╤════════════╤═════════════╤════════════╤════════════╤════════════╤════════════╤════════════╤════════════╕
│          │     feat01 │     feat02 │     feat03 │      feat04 │     feat05 │     feat06 │     feat07 │     feat08 │     feat09 │     feat10 │     feat11 │     feat12 │     feat13 │    feat14 │     feat15 │     feat16 │     feat17 │     feat18 │     feat19 │     feat20 │     feat21 │     feat22 │     feat23 │      feat24 │     feat25 │     feat26 │     feat27 │     feat28 │     feat29 │     feat30 │
╞══════════╪════════════╪════════════╪════════════╪═════════════╪════════════╪════════════╪════════════╪════════════╪════════════╪════════════╪════════════╪════════════╪════════════╪══════════