<a href="https://colab.research.google.com/github/danielruizuleta/Proyecto-Grupo-Argos/blob/main/TP4/feature_Engineering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Exploración de datos y descripción

Creación feature engineering basado en este analisis de datos del siguiente Notebook https://github.com/danielruizuleta/Proyecto-Grupo-Argos/blob/main/TP3/Analisis_de_datos.ipynb

se realizará utilizando pipelines y transformadores de scikit-learn

In [51]:
from pathlib import Path

import pandas as pd
import sklearn as sk
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

In [52]:

# URL del archivo en GitHub
url = "https://raw.githubusercontent.com/danielruizuleta/Proyecto-Grupo-Argos/main/dataset_alpha_betha.csv"


alpha_df = pd.read_csv(url)


print(alpha_df.head())

       autoID  SeniorCity Partner Dependents Service1          Service2  \
0  7590-VHVEG           0     Yes         No       No  No phone service   
1  5575-GNVDE           0      No         No      Yes                No   
2  3668-QPYBK           0      No         No      Yes                No   
3  7795-CFOCW           0      No         No       No  No phone service   
4  9237-HQITU           0      No         No      Yes                No   

  Security OnlineBackup DeviceProtection TechSupport        Contract  \
0       No          Yes               No          No  Month-to-month   
1      Yes           No              Yes          No        One year   
2      Yes          Yes               No          No  Month-to-month   
3      Yes           No              Yes         Yes        One year   
4       No           No               No          No  Month-to-month   

  PaperlessBilling              PaymentMethod  Charges   Demand  Class  
0              Yes           Electronic che

In [53]:
# Crear la carpeta si no existe
DATA_DIR.mkdir(parents=True, exist_ok=True)
print(f"Se creó la carpeta: {DATA_DIR}")


Se creó la carpeta: /content/data


In [54]:
DATA_DIR.mkdir(parents=True, exist_ok=True)
print(f"Carpeta creada en: {DATA_DIR}")



Carpeta creada en: /content/data


In [55]:


DATA_DIR = Path.cwd() / "data" / "01_raw"


print("Archivos en 01_raw después de subir:", list(DATA_DIR.iterdir()))

Archivos en 01_raw después de subir: [PosixPath('/content/data/01_raw/dataset_alpha_betha.csv')]


In [56]:
from pathlib import Path


DATA_DIR = Path.cwd() / "data"
intermediate_dir = DATA_DIR / "02_intermediate"

intermediate_dir.mkdir(parents=True, exist_ok=True)


alpha_df = pd.read_csv(DATA_DIR / "01_raw/dataset_alpha_betha.csv")

alpha_df.to_parquet(intermediate_dir / "dataset_alpha_betha.parquet", engine="pyarrow")

print(f"Archivo guardado en: {intermediate_dir / 'dataset_alpha_betha.parquet'}")

Archivo guardado en: /content/data/02_intermediate/dataset_alpha_betha.parquet


In [57]:
alpha_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   autoID            7043 non-null   object 
 1   SeniorCity        7043 non-null   int64  
 2   Partner           7043 non-null   object 
 3   Dependents        7043 non-null   object 
 4   Service1          7043 non-null   object 
 5   Service2          7043 non-null   object 
 6   Security          7043 non-null   object 
 7   OnlineBackup      7043 non-null   object 
 8   DeviceProtection  7043 non-null   object 
 9   TechSupport       7043 non-null   object 
 10  Contract          7043 non-null   object 
 11  PaperlessBilling  7043 non-null   object 
 12  PaymentMethod     7043 non-null   object 
 13  Charges           7043 non-null   float64
 14  Demand            7043 non-null   object 
 15  Class             7043 non-null   object 
dtypes: float64(1), int64(1), object(14)
memory

# Data preparation

In [58]:
# se elimina autoId porque no es relevante para el modelo
selected_features = [
    "SeniorCity",
    "Partner",
    "Dependents",
    "Service1",
    "Service2",
    "Security",
    "OnlineBackup",
    "DeviceProtection",
    "TechSupport",
    "Contract",
    "PaperlessBilling",
    "PaymentMethod",
    "Charges",
    "Demand",
    "Class",
]

alpha_features = alpha_df[selected_features].copy()
alpha_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 15 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   SeniorCity        7043 non-null   int64  
 1   Partner           7043 non-null   object 
 2   Dependents        7043 non-null   object 
 3   Service1          7043 non-null   object 
 4   Service2          7043 non-null   object 
 5   Security          7043 non-null   object 
 6   OnlineBackup      7043 non-null   object 
 7   DeviceProtection  7043 non-null   object 
 8   TechSupport       7043 non-null   object 
 9   Contract          7043 non-null   object 
 10  PaperlessBilling  7043 non-null   object 
 11  PaymentMethod     7043 non-null   object 
 12  Charges           7043 non-null   float64
 13  Demand            7043 non-null   object 
 14  Class             7043 non-null   object 
dtypes: float64(1), int64(1), object(13)
memory usage: 825.5+ KB


**valores faltantes**

In [59]:
alpha_features.isna().sum()

Unnamed: 0,0
SeniorCity,0
Partner,0
Dependents,0
Service1,0
Service2,0
Security,0
OnlineBackup,0
DeviceProtection,0
TechSupport,0
Contract,0


***Duplicados***

In [60]:
duplicate_rows = alpha_features.duplicated().sum()
print("Number of duplicate rows: ", duplicate_rows)

Number of duplicate rows:  41


In [61]:
alpha_features.sample(10, random_state=42)

titanic_features = alpha_features.drop_duplicates()
titanic_features.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7002 entries, 0 to 7042
Data columns (total 15 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   SeniorCity        7002 non-null   int64  
 1   Partner           7002 non-null   object 
 2   Dependents        7002 non-null   object 
 3   Service1          7002 non-null   object 
 4   Service2          7002 non-null   object 
 5   Security          7002 non-null   object 
 6   OnlineBackup      7002 non-null   object 
 7   DeviceProtection  7002 non-null   object 
 8   TechSupport       7002 non-null   object 
 9   Contract          7002 non-null   object 
 10  PaperlessBilling  7002 non-null   object 
 11  PaymentMethod     7002 non-null   object 
 12  Charges           7002 non-null   float64
 13  Demand            7002 non-null   object 
 14  Class             7002 non-null   object 
dtypes: float64(1), int64(1), object(13)
memory usage: 875.2+ KB


In [62]:
# Mapea los valores 'Alpha' y 'Beta' a 1 y 0 respectivamente
alpha_features["Class"] = alpha_features["Class"].map({"Alpha": 1, "Betha": 0})

# Luego convierte la columna 'Class' a tipo 'int'
alpha_features["Class"] = alpha_features["Class"].astype(int)


In [63]:
 #Encode target variable
alpha_features["Class"] = alpha_features["Class"].astype("int")

# True = 1, False = 0
alpha_features.sample(10)

Unnamed: 0,SeniorCity,Partner,Dependents,Service1,Service2,Security,OnlineBackup,DeviceProtection,TechSupport,Contract,PaperlessBilling,PaymentMethod,Charges,Demand,Class
6067,0,No,No,No,No phone service,Yes,No,Yes,No,One year,No,Mailed check,35.3,264.8,1
5904,0,No,No,Yes,Yes,No,Yes,No,No,One year,No,Electronic check,98.9,6838.6,1
2621,0,Yes,No,Yes,No,No,Yes,Yes,Yes,Two year,Yes,Credit card (automatic),79.1,5564.85,1
2674,1,No,No,Yes,No,No internet service,No internet service,No internet service,No internet service,Month-to-month,No,Mailed check,20.05,91.45,1
586,0,No,No,Yes,No,Yes,Yes,No,No,Two year,Yes,Credit card (automatic),63.25,3342.45,1
6023,1,No,No,Yes,No,No internet service,No internet service,No internet service,No internet service,Two year,No,Credit card (automatic),19.6,967.9,1
4045,0,No,No,Yes,No,No,No,No,No,Month-to-month,Yes,Credit card (automatic),71.0,914.0,0
4564,0,No,No,Yes,No,No internet service,No internet service,No internet service,No internet service,Month-to-month,No,Mailed check,20.0,20.0,0
6966,0,No,Yes,No,No phone service,Yes,Yes,No,No,One year,Yes,Bank transfer (automatic),43.05,2208.05,1
3312,0,No,No,Yes,No,No internet service,No internet service,No internet service,No internet service,Month-to-month,No,Mailed check,20.3,20.3,1


In [64]:
# Definición de columnas por tipo
cols_numeric = ["SeniorCity", "Charges"]  # Variables numéricas
cols_categoric = [
    "Partner", "Dependents", "Service1", "Service2", "Security",
    "OnlineBackup", "DeviceProtection", "TechSupport", "Contract",
    "PaperlessBilling", "PaymentMethod"
]  # Variables categóricas sin orden específico
cols_categoric_ord = ["Class"]  # Variable categórica ordinal (Alpha/Beta)

# Pipeline para datos numéricos
numeric_pipe = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),  # Reemplazar valores faltantes con la mediana
    ]
)

# Pipeline para datos categóricos sin orden
categorical_pipe = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),  # Reemplazar valores faltantes con la moda
        ("onehot", OneHotEncoder(handle_unknown="ignore")),  # Codificación One-Hot
    ]
)

# Pipeline para datos categóricos ordinales
categorical_ord_pipe = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),  # Reemplazar valores faltantes con la moda
        ("ordinal", OrdinalEncoder(categories=[["Beta", "Alpha"]])),  # Codificación ordinal
    ]
)

# Combinación de preprocesamientos
preprocessor = ColumnTransformer(
    transformers=[
        ("numeric", numeric_pipe, cols_numeric),
        ("categoric", categorical_pipe, cols_categoric),
        ("categoric ordinales", categorical_ord_pipe, cols_categoric_ord),
    ]
)

# Mostrar la configuración del preprocesador
print(preprocessor)

ColumnTransformer(transformers=[('numeric',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='median'))]),
                                 ['SeniorCity', 'Charges']),
                                ('categoric',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('onehot',
                                                  OneHotEncoder(handle_unknown='ignore'))]),
                                 ['Partner', 'Dependents', 'Service1',
                                  'Service2', 'Security', 'OnlineBackup',
                                  'DeviceProtection', 'TechSupport', 'Contract',
                                  'PaperlessBilling', 'PaymentMethod']),
                                ('categoric ordinales',
                                 Pipelin

# Train / Test split

In [66]:
from sklearn.model_selection import train_test_split

# Seleccionar características (X) y objetivo (y)
X = alpha_features.drop(columns=["Class"])  # Eliminar columna objetivo
y = alpha_features["Class"]  # Columna objetivo

# Dividir en conjunto de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Verificar las dimensiones de los conjuntos
print("X_train:", X_train.shape)
print("X_test:", X_test.shape)
print("y_train:", y_train.shape)
print("y_test:", y_test.shape)


X_train: (5634, 14)
X_test: (1409, 14)
y_train: (5634,)
y_test: (1409,)


In [69]:

cols_numeric = ["SeniorCity", "Charges"]  # Variables numéricas
cols_categoric = ["Partner", "Dependents", "Service1", "Service2", "Security",
                  "OnlineBackup", "DeviceProtection", "TechSupport",
                  "PaperlessBilling", "PaymentMethod"]
cols_categoric_ord = ["Contract"]  # Variables categóricas ordinales


for col in cols_numeric + cols_categoric + cols_categoric_ord:
    if col not in alpha_features.columns:
        print(f"La columna '{col}' no está en el DataFrame.")

# Crear pipelines para cada tipo de dato
numeric_pipe = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="median"))]
)

categorical_pipe = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore"))
    ]
)

categorical_ord_pipe = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("ordinal", OrdinalEncoder())
    ]
)

# Definir ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ("numeric", numeric_pipe, cols_numeric),
        ("categoric", categorical_pipe, cols_categoric),
        ("categoric_ord", categorical_ord_pipe, cols_categoric_ord)
    ]
)

# Aplicar preprocesador
pipeline = Pipeline(steps=[("preprocessor", preprocessor)])

# Dividir datos
X = alpha_features.drop(columns=["Class"])
y = alpha_features["Class"]

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Ajustar pipeline
pipeline.fit(X_train)
X_train_preprocessed = pipeline.transform(X_train)


In [70]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

# Modelo
model = RandomForestClassifier()

# Pipeline completo
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", model),
    ]
)

# Entrenamiento del modelo
pipeline.fit(X_train, y_train)

# Predicciones
y_pred = pipeline.predict(X_test)


In [71]:
from sklearn.model_selection import train_test_split

X_features = alpha_features.drop("Class", axis="columns")
Y_target = alpha_features["Class"]

# Dividir datos en entrenamiento (80%) y prueba (20%) con estratificación
x_train, x_test, y_train, y_test = train_test_split(
    X_features, Y_target, test_size=0.2, stratify=Y_target, random_state=42
)
print("Tamaños de los conjuntos:")
print("x_train:", x_train.shape)
print("y_train:", y_train.shape)
print("x_test:", x_test.shape)
print("y_test:", y_test.shape)


Tamaños de los conjuntos:
x_train: (5634, 14)
y_train: (5634,)
x_test: (1409, 14)
y_test: (1409,)


**Preprocessing pipeline**

In [72]:
# Ajustar el preprocesador con los datos de entrenamiento y transformar
x_train_transformed = preprocessor.fit_transform(x_train)

# Obtener los nombres de las características transformadas
feature_names = preprocessor.get_feature_names_out()

# Convertir los datos transformados en un DataFrame para facilitar la visualización
x_train_transformed = pd.DataFrame(x_train_transformed, columns=feature_names)

# Información del DataFrame transformado
x_train_transformed.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5634 entries, 0 to 5633
Data columns (total 30 columns):
 #   Column                                              Non-Null Count  Dtype  
---  ------                                              --------------  -----  
 0   numeric__SeniorCity                                 5634 non-null   float64
 1   numeric__Charges                                    5634 non-null   float64
 2   categoric__Partner_No                               5634 non-null   float64
 3   categoric__Partner_Yes                              5634 non-null   float64
 4   categoric__Dependents_No                            5634 non-null   float64
 5   categoric__Dependents_Yes                           5634 non-null   float64
 6   categoric__Service1_No                              5634 non-null   float64
 7   categoric__Service1_Yes                             5634 non-null   float64
 8   categoric__Service2_No                              5634 non-null   float64
 9

In [73]:
x_train_transformed.head()

Unnamed: 0,numeric__SeniorCity,numeric__Charges,categoric__Partner_No,categoric__Partner_Yes,categoric__Dependents_No,categoric__Dependents_Yes,categoric__Service1_No,categoric__Service1_Yes,categoric__Service2_No,categoric__Service2_No phone service,...,categoric__TechSupport_No,categoric__TechSupport_No internet service,categoric__TechSupport_Yes,categoric__PaperlessBilling_No,categoric__PaperlessBilling_Yes,categoric__PaymentMethod_Bank transfer (automatic),categoric__PaymentMethod_Credit card (automatic),categoric__PaymentMethod_Electronic check,categoric__PaymentMethod_Mailed check,categoric_ord__Contract
0,0.0,20.3,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,90.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
2,1.0,104.35,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0
3,1.0,49.15,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
4,0.0,89.9,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,2.0


In [74]:
x_train.head()

Unnamed: 0,SeniorCity,Partner,Dependents,Service1,Service2,Security,OnlineBackup,DeviceProtection,TechSupport,Contract,PaperlessBilling,PaymentMethod,Charges,Demand
6576,0,No,No,Yes,No,No internet service,No internet service,No internet service,No internet service,Month-to-month,No,Mailed check,20.3,41.2
4814,0,No,No,Yes,Yes,No,No,No,Yes,Month-to-month,Yes,Electronic check,90.0,1993.8
1781,1,Yes,No,Yes,Yes,No,Yes,Yes,No,One year,Yes,Electronic check,104.35,6339.45
674,1,No,No,No,No phone service,No,Yes,No,No,Month-to-month,Yes,Electronic check,49.15,2550.9
2983,0,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Two year,No,Credit card (automatic),89.9,6457.15
