Importación y carga de datos de CoverType

In [None]:
import sklearn 
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Cargar dataset CoverType
data = sklearn.datasets.fetch_covtype(as_frame=True)

X = data.data      
y = data.target    

print(X.head())
print(y.head())

   Elevation  Aspect  Slope  Horizontal_Distance_To_Hydrology  \
0     2596.0    51.0    3.0                             258.0   
1     2590.0    56.0    2.0                             212.0   
2     2804.0   139.0    9.0                             268.0   
3     2785.0   155.0   18.0                             242.0   
4     2595.0    45.0    2.0                             153.0   

   Vertical_Distance_To_Hydrology  Horizontal_Distance_To_Roadways  \
0                             0.0                            510.0   
1                            -6.0                            390.0   
2                            65.0                           3180.0   
3                           118.0                           3090.0   
4                            -1.0                            391.0   

   Hillshade_9am  Hillshade_Noon  Hillshade_3pm  \
0          221.0           232.0          148.0   
1          220.0           235.0          151.0   
2          234.0           238.0   

In [None]:
binary_cols = [col for col in X.columns if set(X[col].unique()) <= {0,1}]
numeric_cols = [col for col in X.columns if col not in binary_cols]

# Scale only numeric columns
scaler = StandardScaler()
X_scaled = X.copy()
X_scaled[numeric_cols] = scaler.fit_transform(X[numeric_cols])

print("\nScaled values:\n", X_scaled[numeric_cols].head())
print("\nBinary columns left untouched:\n", X_scaled[binary_cols].head())

Original values:
    Elevation  Aspect  Slope  Horizontal_Distance_To_Hydrology  \
0     2596.0    51.0    3.0                             258.0   
1     2590.0    56.0    2.0                             212.0   
2     2804.0   139.0    9.0                             268.0   
3     2785.0   155.0   18.0                             242.0   
4     2595.0    45.0    2.0                             153.0   

   Vertical_Distance_To_Hydrology  Horizontal_Distance_To_Roadways  \
0                             0.0                            510.0   
1                            -6.0                            390.0   
2                            65.0                           3180.0   
3                           118.0                           3090.0   
4                            -1.0                            391.0   

   Hillshade_9am  Hillshade_Noon  Hillshade_3pm  \
0          221.0           232.0          148.0   
1          220.0           235.0          151.0   
2          234.0 

De acuerdo con las variables vistas en este conjunto de datos se observa lo siguiente: 

In [8]:
y_binary = (y == 2).astype(int)  # 1 = normal (tipo 2), 0 = anómalo

# Revisar
print(y.value_counts())       # distribución original
print(y_binary.value_counts()) # distribución binaria

Cover_Type
2    283301
1    211840
3     35754
7     20510
6     17367
5      9493
4      2747
Name: count, dtype: int64
Cover_Type
0    297711
1    283301
Name: count, dtype: int64


Como se puede observar, este es el tipo de 

In [19]:
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

# Split normal and anomaly subsets
X_normal = X[y_binary == 1]
X_anomaly = X[y_binary == 0]

# ---- TRAIN & VALIDATION ----
# Only normals
X_train, X_val = train_test_split(
    X_normal, test_size=0.2, random_state=42
)

# ---- TEST ----
# Contains both normals and anomalies
X_test = pd.concat([X_normal.sample(5000, random_state=42),
                    X_anomaly.sample(5000, random_state=42)])
y_test = np.concatenate([np.ones(5000), np.zeros(5000)])  # 1=normal, 0=anomaly

# Shuffle test set (mix normal + anomaly)
test_df = X_test.copy()
test_df["label"] = y_test
test_df = test_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Final test sets
X_test = test_df.drop(columns=["label"])
y_test = test_df["label"].values
y_test = y_test.astype(int)

print("Train shape:", X_train.shape)
print("Validation shape:", X_val.shape)
print("Test shape:", X_test.shape, "with labels:", np.bincount(y_test))

Train shape: (226640, 54)
Validation shape: (56661, 54)
Test shape: (10000, 54) with labels: [5000 5000]
