In [None]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
import joblib

# Load the pre-split training and test CSVs (paths are relative to the
# notebook's working directory).
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/processed/train/train-dengue.csv',
        '../data/processed/test/test-dengue.csv',
    )
)

In [None]:
# Preprocess the training data: one-hot encode the categorical columns,
# dropping the first level of each so it acts as the baseline.
categorical_cols = ['SEXO', 'TIPO_PACIENTE', 'RESULTADO_PCR']
df_train = pd.get_dummies(train, columns=categorical_cols, drop_first=True)

# Split the label from the feature columns.
y_train = df_train['ESTATUS_CASO']
X_train = df_train.drop(columns='ESTATUS_CASO')

# Standardize the features. The scaler is fitted on the training features
# only; after this step X_train is whatever the scaler returns (no longer the
# DataFrame built above).
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)

# Rebuild a DataFrame for persistence. Feature columns are named by their
# positional index as strings ("0", "1", ...), and the label is appended as
# the final column.
X_train_df = pd.DataFrame(
    X_train,
    columns=list(map(str, range(X_train.shape[1]))),
).assign(ESTATUS_CASO=y_train.values)

# Persist the preprocessed training set and the fitted scaler.
X_train_df.to_parquet('processed_train_data.parquet')
joblib.dump(scaler, 'scaler.joblib')

In [None]:
# Build and evaluate the baseline model: logistic regression scored by
# accuracy over a shuffled 5-fold split with a fixed seed.
# NOTE(review): X_train was standardized with a scaler fitted on the whole
# training set before this split, so each validation fold has influenced the
# scaling statistics.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
model = LogisticRegression(max_iter=1000)

cv_results = cross_val_score(
    estimator=model,
    X=X_train,
    y=y_train,
    scoring='accuracy',
    cv=kf,
)

print(
    f"Cross-validation accuracy scores: {cv_results}",
    f"Mean cross-validation accuracy: {cv_results.mean()}",
    sep="\n",
)

# Fit the model on the entire training set.
model.fit(X_train, y_train)

In [None]:
# Preprocess the test data so that it has exactly the feature layout the
# scaler (and therefore the model) was fitted on.
#
# get_dummies derives its output columns from the category levels present in
# the frame it is given, so encoding the test split on its own can yield
# missing, extra, or differently-dropped dummy columns compared with df_train.
# To avoid that, every level is encoded here (no drop_first) and the result is
# then reindexed to df_train's columns:
#   * dummies absent from the test split are added and filled with 0,
#   * columns that training never produced (its dropped baseline level, or
#     levels unseen during training) are discarded,
#   * column order follows df_train.
df_test = pd.get_dummies(test, columns=['SEXO', 'TIPO_PACIENTE', 'RESULTADO_PCR'])

# Take the label before reindexing so a missing label column still raises a
# KeyError instead of being silently filled with zeros.
y_test = df_test['ESTATUS_CASO']

# df_train is the encoded training frame (features + label) built in the
# training preprocessing cell.
df_test = df_test.reindex(columns=df_train.columns, fill_value=0)
X_test = df_test.drop('ESTATUS_CASO', axis=1)

# Scale with the statistics learned from the training data (transform only,
# never refit on the test set).
X_test = scaler.transform(X_test)

In [None]:
# Hyperparameter tuning: search over the regularization parameter C of a
# logistic regression using 5-fold cross-validation on the training data.
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100]}  # example values to search over
model = LogisticRegression(max_iter=1000)
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5).fit(X_train, y_train)

# Best model found by the search.
best_model = grid_search.best_estimator_

# Predict on the held-out test set.
y_pred = best_model.predict(X_test)

# Evaluate the predictions against the test labels.
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(
    f'Accuracy: {accuracy}',
    'Classification Report:',
    report,
    sep='\n',
)