In [5]:
import pandas as pd
from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
import joblib

## Preprocesamiento

In [6]:
train = pd.read_csv('../data/processed/train/train-dengue.csv')

In [7]:
train.dropna(inplace=True)

In [10]:
# Seleccionar columnas relevantes
relevant_columns = ['SEXO', 'EDAD_ANOS', 'TIPO_PACIENTE', 'HEMORRAGICOS', 'DIABETES',
                    'HIPERTENSION', 'ENFERMEDAD_ULC_PEPTICA', 'ENFERMEDAD_RENAL',
                    'INMUNOSUPR', 'CIRROSIS_HEPATICA', 'EMBARAZO', 'RESULTADO_PCR', 'DEFUNCION']

df_train = train[relevant_columns]

# Preprocesar datos (dummy variables para categorías)
df_train = pd.get_dummies(df_train, columns=['SEXO', 'TIPO_PACIENTE', 'RESULTADO_PCR'], drop_first=True)

# Separar características y etiqueta
X_train = df_train.drop('DEFUNCION', axis=1)
y_train = df_train['DEFUNCION']

# Normalizar datos
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)

# Convertir nombres de columnas a string
X_train_df = pd.DataFrame(X_train, columns=[str(i) for i in range(X_train.shape[1])])
X_train_df['DEFUNCION'] = y_train.values

# Guardar los datos preprocesados
X_train_df.to_parquet('processed_train_data.parquet')
joblib.dump(scaler, 'scaler.joblib')

['scaler.joblib']

In [11]:
# Construir y evaluar el modelo
model = LogisticRegression()

# K-Fold cross-validation
kf = KFold(n_splits=5, random_state=42, shuffle=True)
cv_results = cross_val_score(model, X_train_df.drop('DEFUNCION', axis=1), y_train, cv=kf, scoring='accuracy')

print(f"Cross-validation accuracy scores: {cv_results}")
print(f"Mean cross-validation accuracy: {cv_results.mean()}")

# Entrenar el modelo en todo el conjunto de entrenamiento
model.fit(X_train_df.drop('DEFUNCION', axis=1), y_train)

# Guardar el modelo
joblib.dump(model, 'logistic_regression_model.joblib')

Cross-validation accuracy scores: [0.99645094 0.99624139 0.99561495 0.99665901 0.99791188]
Mean cross-validation accuracy: 0.9965756337047627


['logistic_regression_model.joblib']