In [21]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

In [5]:
# Load the raw dataset; the path is relative to the notebook's working directory.
DATA_PATH = "../data/The_Cancer_data_1500_V2.csv"
df = pd.read_csv(DATA_PATH)

In [22]:
# Print the column names, non-null counts and dtypes as a quick sanity check
# of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1500 entries, 0 to 1499
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Age               1500 non-null   int64  
 1   Gender            1500 non-null   int64  
 2   BMI               1500 non-null   float64
 3   Smoking           1500 non-null   int64  
 4   GeneticRisk       1500 non-null   int64  
 5   PhysicalActivity  1500 non-null   float64
 6   AlcoholIntake     1500 non-null   float64
 7   CancerHistory     1500 non-null   int64  
 8   Diagnosis         1500 non-null   int64  
dtypes: float64(3), int64(6)
memory usage: 105.6 KB


### Feature engineering

In [26]:
# Feature engineering: every new column is computed from the original columns
# only and written back onto `df`.

# Interaction terms (element-wise products of two existing columns).
df["Age_Smoking"] = df["Age"].mul(df["Smoking"])
df["BMI_Activity"] = df["BMI"].mul(df["PhysicalActivity"])
df["Genetic_Smoking"] = df["GeneticRisk"].mul(df["Smoking"])
df["Alcohol_Smoking"] = df["AlcoholIntake"].mul(df["Smoking"])

# Derived 0/1 flags obtained by thresholding a continuous column.
df["Obese"] = df["BMI"].ge(30).astype(int)
df["LowActivity"] = df["PhysicalActivity"].lt(2).astype(int)
df["HeavyDrinker"] = df["AlcoholIntake"].gt(3).astype(int)

# Transformed variables: log(1 + x) of the intake / activity columns.
df["Log_Alcohol"] = np.log1p(df["AlcoholIntake"])
df["Log_Activity"] = np.log1p(df["PhysicalActivity"])

# Non-linear terms: squares and a ratio.
df["Age_squared"] = df["Age"].pow(2)
df["BMI_squared"] = df["BMI"].pow(2)
df["BMI_per_Age"] = df["BMI"].div(df["Age"])

In [27]:
# Preview the first rows to confirm the engineered columns were added.
df.head()

Unnamed: 0,Age,Gender,BMI,Smoking,GeneticRisk,PhysicalActivity,AlcoholIntake,CancerHistory,Diagnosis,Age_Smoking,...,Genetic_Smoking,Alcohol_Smoking,Obese,LowActivity,HeavyDrinker,Log_Alcohol,Log_Activity,Age_squared,BMI_squared,BMI_per_Age
0,58,1,16.085313,0,1,8.146251,4.148219,1,1,0,...,0,0.0,0,0,1,1.638651,2.213344,3364,258.737305,0.277333
1,71,0,30.828784,0,1,9.36163,3.519683,0,0,0,...,0,0.0,1,0,1,1.508442,2.33811,5041,950.413947,0.434208
2,48,1,38.785084,0,2,5.135179,4.728368,0,1,0,...,0,0.0,1,0,1,1.745431,1.814039,2304,1504.282706,0.808023
3,34,0,30.040296,0,0,9.502792,2.044636,0,0,0,...,0,0.0,1,0,0,1.113381,2.351641,1156,902.419354,0.883538
4,62,1,35.479721,0,0,5.35689,3.309849,0,1,0,...,0,0.0,1,0,1,1.460903,1.849539,3844,1258.810637,0.572254


### Selección final de variables

In [28]:
# Name of the label column.
target = "Diagnosis"

# Continuous / count-like features (raw and engineered).
numerical_cols = [
    # raw measurements
    "Age",
    "BMI",
    "PhysicalActivity",
    "AlcoholIntake",
    # interaction terms
    "Age_Smoking",
    "BMI_Activity",
    "Genetic_Smoking",
    "Alcohol_Smoking",
    # log transforms
    "Log_Alcohol",
    "Log_Activity",
    # non-linear terms
    "Age_squared",
    "BMI_squared",
    "BMI_per_Age",
]

# 0/1 indicator features (raw and engineered).
binary_cols = [
    "Gender",
    "Smoking",
    "CancerHistory",
    "Obese",
    "LowActivity",
    "HeavyDrinker",
]

# Integer-coded feature kept in its own group.
ordinal_cols = ["GeneticRisk"]

In [29]:
# Separate the label vector from the feature matrix: `y` is the target column,
# `X` is every other column of `df`.
y = df[target]
X = df.drop(columns=target)

In [30]:
# Hold out 20% of the rows for evaluation. The split is stratified on the
# target so the train and test partitions keep the same class proportions as
# the full dataset; the fixed seed makes the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [31]:
# Standardise the numerical features using statistics learned from the
# training rows only, so nothing about the test rows leaks into the scaler.
#
# The unscaled values are always read from `X` (which this cell never
# modifies) by row label, rather than from `X_train` / `X_test`. That makes
# the cell safe to re-run: the scaler is never refitted on data it has already
# standardised, so `scaler.mean_` / `scaler.scale_` stay valid for new data.
train_numeric_raw = X.loc[X_train.index, numerical_cols]
test_numeric_raw = X.loc[X_test.index, numerical_cols]

scaler = StandardScaler()
scaler.fit(train_numeric_raw)

# Work on explicit copies so the column assignments below write to independent
# frames (depending on the pandas / scikit-learn versions, assigning into the
# frames returned by train_test_split can emit SettingWithCopyWarning).
X_train = X_train.copy()
X_test = X_test.copy()

X_train[numerical_cols] = scaler.transform(train_numeric_raw)
X_test[numerical_cols] = scaler.transform(test_numeric_raw)