# Discretización de columnas numéricas
Este notebook aplica varios métodos de discretización sobre columnas numéricas.

## 1) Evaluar columnas que necesita discretizar

In [1]:
import pandas as pd

# Cargar el archivo proporcionado
file_path = "data/fact_laptops_preprocesado.csv"
df = pd.read_csv(file_path)

# Evaluar columnas numéricas candidatas para discretización
numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Eliminar 'LaptopID' de la lista de columnas numéricas si existe
if 'LaptopID' in numeric_cols:
    numeric_cols.remove('LaptopID')

# Crear tabla resumen de columnas numéricas candidatas
numeric_summary = pd.DataFrame(df[numeric_cols].dtypes, columns=["Tipo de Dato"])
numeric_summary["Valores Únicos"] = df[numeric_cols].nunique()
numeric_summary = numeric_summary.reset_index().rename(columns={"index": "Columna"})

numeric_summary


Unnamed: 0,Columna,Tipo de Dato,Valores Únicos
0,Inches,float64,58
1,Ram,int64,4
2,Weight,float64,251
3,PriceEuros,float64,3962
4,CPUFrequency,float64,36
5,PrimaryStorage,int64,4
6,SecondaryStorage,int64,4


## 2) Discretización y Codificación con Equal-width Binning

In [2]:

from sklearn.preprocessing import KBinsDiscretizer

# Copia del dataset
df_ew = df[["Inches", "Ram", "Weight", "PriceEuros"]].copy()

# Aplicamos Equal-width Binning con 3 bins
ew = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='uniform')
df_ew_discretized = pd.DataFrame(ew.fit_transform(df_ew), columns=[col + "_ew" for col in df_ew.columns])

df_ew_discretized = df_ew_discretized.astype(int)
df_ew_discretized

Unnamed: 0,Inches_ew,Ram_ew,Weight_ew,PriceEuros_ew
0,0,0,2,2
1,2,2,0,1
2,1,1,1,2
3,2,0,2,2
4,1,0,1,0
...,...,...,...,...
3995,2,2,1,2
3996,0,1,1,0
3997,2,0,0,1
3998,1,1,0,1


## 3) Discretización y Codificación con Equal-frequency Binning

In [3]:

# Aplicamos Equal-frequency Binning con 3 bins
ef = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='quantile')
df_ef_discretized = pd.DataFrame(ef.fit_transform(df_ew), columns=[col + "_ef" for col in df_ew.columns])

df_ef_discretized = df_ef_discretized.astype(int)
df_ef_discretized



Unnamed: 0,Inches_ef,Ram_ef,Weight_ef,PriceEuros_ef
0,0,1,2,2
1,2,2,0,1
2,1,2,1,2
3,2,0,2,2
4,1,1,1,0
...,...,...,...,...
3995,2,2,1,2
3996,0,2,1,0
3997,2,0,0,1
3998,1,2,0,1


## 4) Discretización y Codificación con One-Hot Encoding

In [4]:
from sklearn.preprocessing import OneHotEncoder
# Aplicamos One-Hot Encoding sobre 'Ram' (categórica discreta)
encoder = OneHotEncoder(sparse_output=False, dtype=int) # Correccion sparse por sparce_output
ram_encoded = encoder.fit_transform(df[["Ram"]])
df_oh = pd.DataFrame(ram_encoded, columns=[f"Ram_{int(i)}" for i in encoder.categories_[0]])
df_oh

Unnamed: 0,Ram_4,Ram_8,Ram_16,Ram_32
0,0,1,0,0
1,0,0,0,1
2,0,0,1,0
3,1,0,0,0
4,0,1,0,0
...,...,...,...,...
3995,0,0,0,1
3996,0,0,1,0
3997,1,0,0,0
3998,0,0,1,0


## 5) Discretización con K-means

In [5]:
from sklearn.cluster import KMeans
df_km = df[["Inches", "Weight", "PriceEuros"]].copy()

# Aplicamos K-means discretización
kmeans_models = {}
df_kmeans_discretized = pd.DataFrame()

for col in df_km.columns:
    model = KMeans(n_clusters=3, random_state=0)
    labels = model.fit_predict(df_km[[col]])
    df_kmeans_discretized[col + "_km"] = labels
    kmeans_models[col] = model

# Concatenar columnas originales y discretizadas
df_kmeans_resultado = pd.concat([df_km, df_kmeans_discretized], axis=1)

# Mostrar las primeras filas como tabla
df_kmeans_resultado.head(10)

Unnamed: 0,Inches,Weight,PriceEuros,Inches_km,Weight_km,PriceEuros_km
0,13.0,2.84,2127.09,1,2,0
1,15.7,1.55,1891.02,0,1,2
2,13.6,2.51,2479.25,1,0,0
3,16.0,3.14,2639.51,0,2,0
4,15.1,2.34,960.99,2,0,1
5,12.0,1.53,1647.92,1,1,2
6,14.4,3.15,712.67,2,2,1
7,15.5,2.92,1218.47,2,2,2
8,12.5,2.35,2402.29,1,0,0
9,12.1,3.04,2940.56,1,2,0
