# Data Preparation
## Import Libraries

In [1]:
import pandas as pd 
import numpy as np 
import scipy as sp 

import matplotlib.pyplot as plt
import seaborn as sns

# estas son las clases para sustitución con sklearn
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# dividir dataset
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.dummy import DummyClassifier

# one hot encoding con feature-engine
from feature_engine.encoding import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier

# GroupKFold en un pipeline con StandardScaler y SVC
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.model_selection import GroupKFold
from sklearn.model_selection import GridSearchCV

## Import Data

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [None]:
# Build and apply the preprocessing pipeline that turns `df` into a numeric
# feature matrix (imputation + scaling for numeric columns, imputation +
# one-hot encoding for categorical columns).
# NOTE(review): `df` is not created in any cell visible here; it must be loaded
# before this cell runs. If `df` still contains the label column it will be
# encoded as a feature as well — TODO confirm it has been dropped.

# Split the columns by dtype: numeric ones get scaled, object ones get encoded.
numeric_columns = df.select_dtypes(include=['int64', 'float64']).columns
categorical_columns = df.select_dtypes(include=['object']).columns

# Cast categorical values to str so each column holds a single type, but keep
# missing entries as NaN. A plain `.astype(str)` turns NaN into the literal
# string 'nan', which hides it from the imputer below and produces a spurious
# 'nan' category in the one-hot encoding.
df[categorical_columns] = (
    df[categorical_columns].astype(str).where(df[categorical_columns].notna())
)

# Numeric columns: fill gaps with the column mean, then standardize.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode. `handle_unknown='ignore'` is an argument of scikit-learn's encoder;
# the name OneHotEncoder is also imported from feature_engine in the first
# cell, so this relies on the later sklearn.preprocessing import having run.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Route each group of columns to its own transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_columns),
        ('cat', categorical_transformer, categorical_columns)
    ])

# Wrap the preprocessor in a pipeline so further steps can be appended later.
pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

# Fit on `df` and transform it. Despite its name, the result is an array
# (possibly a SciPy sparse matrix), not a DataFrame.
transformed_df = pipeline.fit_transform(df)

# Names of the output columns, in the same order as the columns of
# `transformed_df`; later cells use them to label importances and selections.
all_columns_transformed = pipeline.get_feature_names_out()

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Rank the preprocessed features by random-forest importance and plot the
# strongest ones.
# NOTE(review): `target_variable` is not defined in any cell visible here; it
# must hold one label per row of `transformed_df` — TODO confirm where it is set.

# Fix the seed so the importances are reproducible across kernel restarts.
model = RandomForestClassifier(random_state=42)
model.fit(transformed_df, target_variable)

# Impurity-based importance of each transformed column.
feature_importance = model.feature_importances_

# Take the column names from the fitted pipeline so they are guaranteed to
# line up with the columns of `transformed_df` (and so this cell does not
# depend on a variable that no visible cell defines).
all_columns_transformed = pipeline.get_feature_names_out()

# Pair every feature name with its importance, most important first.
feature_importance_df = pd.DataFrame({'Feature': all_columns_transformed, 'Importance': feature_importance})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Plot only the strongest features: one-hot encoding can create many columns,
# and drawing all of them makes the chart unreadable. The full ranking stays
# available in `feature_importance_df`.
top_n = 20
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x='Importance', y='Feature', data=feature_importance_df.head(top_n), ax=ax)
ax.set_title('Importancia de las Características')
plt.show()


In [None]:
from sklearn.feature_selection import SelectKBest, f_classif

# Keep the k features with the highest ANOVA F-score against the target and
# expose them as a labelled DataFrame.
# NOTE(review): `target_variable` is not defined in any cell visible here —
# TODO confirm where it is set.

# Column names from the fitted pipeline, as an array so it can be indexed with
# the boolean support mask below.
all_columns_transformed = np.asarray(pipeline.get_feature_names_out())

# Number of features to keep (tune as needed). Clamp it to the number of
# available columns so a small dataset does not make SelectKBest reject k.
k_best = 10
k_best = min(k_best, transformed_df.shape[1])

selector = SelectKBest(score_func=f_classif, k=k_best)
selected_features = selector.fit_transform(transformed_df, target_variable)

# Names of the columns that survived the selection.
selected_feature_names = all_columns_transformed[selector.get_support()]

# The preprocessing step may yield a SciPy sparse matrix, which the plain
# DataFrame constructor is not meant to take; densify first (only k columns
# remain, so this is cheap).
if hasattr(selected_features, 'toarray'):
    selected_dense = selected_features.toarray()
else:
    selected_dense = selected_features

# Selected features as a DataFrame with readable column names.
selected_features_df = pd.DataFrame(selected_dense, columns=selected_feature_names)


## Explore Data