In [2]:
import pandas as pd
from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn.metrics import confusion_matrix
import joblib

## Preprocesamiento

In [3]:
train = pd.read_csv('../data/processed/train/train-dengue.csv')
test = pd.read_csv('../data/processed/test/test-dengue.csv')

In [4]:
print(train.shape)
print(test.shape)

(26146, 35)
(26146, 35)


In [5]:
train.isna().sum()

SEXO                      1094
EDAD_ANOS                 1094
ENTIDAD_RES               1094
MUNICIPIO_RES             1094
FECHA_SIGN_SINTOMAS       1094
TIPO_PACIENTE             1094
HEMORRAGICOS              1094
DIABETES                  1094
HIPERTENSION              1094
ENFERMEDAD_ULC_PEPTICA    1094
ENFERMEDAD_RENAL          1094
INMUNOSUPR                1094
CIRROSIS_HEPATICA         1094
EMBARAZO                  1094
DEFUNCION                 1094
RESULTADO_PCR             1094
ESTATUS_CASO              1094
PCA_GEOGRAFICO_1          1106
PCA_GEOGRAFICO_2          1106
PCA_GEOGRAFICO_3          1106
PCA_GEOGRAFICO_4          1106
PCA_GEOGRAFICO_5          1106
PCA_GEOGRAFICO_6          1106
PCA_GEOGRAFICO_7          1106
PCA_GEOGRAFICO_8          1106
PCA_GEOGRAFICO_9          1106
PCA_GEOGRAFICO_10         1106
PCA_GEOGRAFICO_11         1106
PCA_COMORBILIDAD_1        1106
PCA_COMORBILIDAD_2        1106
PCA_COMORBILIDAD_3        1106
ANO_SIGN_SINTOMAS         1094
MES_SIGN

In [6]:
test.isna().sum()

SEXO                      1119
EDAD_ANOS                 1119
ENTIDAD_RES               1119
MUNICIPIO_RES             1119
FECHA_SIGN_SINTOMAS       1119
TIPO_PACIENTE             1119
HEMORRAGICOS              1119
DIABETES                  1119
HIPERTENSION              1119
ENFERMEDAD_ULC_PEPTICA    1119
ENFERMEDAD_RENAL          1119
INMUNOSUPR                1119
CIRROSIS_HEPATICA         1119
EMBARAZO                  1119
DEFUNCION                 1119
RESULTADO_PCR             1119
ESTATUS_CASO              1119
PCA_GEOGRAFICO_1          1107
PCA_GEOGRAFICO_2          1107
PCA_GEOGRAFICO_3          1107
PCA_GEOGRAFICO_4          1107
PCA_GEOGRAFICO_5          1107
PCA_GEOGRAFICO_6          1107
PCA_GEOGRAFICO_7          1107
PCA_GEOGRAFICO_8          1107
PCA_GEOGRAFICO_9          1107
PCA_GEOGRAFICO_10         1107
PCA_GEOGRAFICO_11         1107
PCA_COMORBILIDAD_1        1107
PCA_COMORBILIDAD_2        1107
PCA_COMORBILIDAD_3        1107
ANO_SIGN_SINTOMAS         1119
MES_SIGN

In [10]:
train.dropna(inplace=True)
test.dropna(inplace=True)

## Clases desbalanceadas

In [8]:
train['DEFUNCION'].value_counts()

DEFUNCION
2.0    23864
1.0       82
Name: count, dtype: int64

In [9]:
test['DEFUNCION'].value_counts()

DEFUNCION
2.0    23829
1.0       91
Name: count, dtype: int64

Realizamos un oversampling de la clase minoritaria

In [13]:
#pip install imbalanced-learn

In [18]:
from imblearn.over_sampling import RandomOverSampler

# Separar las características (X) y la variable objetivo (y)
X_train = train.drop(columns=['DEFUNCION'])
y_train = train['DEFUNCION']

# Crear una instancia de RandomOverSampler
ros = RandomOverSampler(random_state=42)

# Aplicar el sobremuestreo
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

# Verificar las nuevas distribuciones de clases
print(y_resampled.value_counts())

DEFUNCION
2.0    23864
1.0    23864
Name: count, dtype: int64


In [19]:
# Separar las características (X) y la variable objetivo (y)
X_test = test.drop(columns=['DEFUNCION'])
y_test = test['DEFUNCION']

# Crear una instancia de RandomOverSampler
ros = RandomOverSampler(random_state=42)

# Aplicar el sobremuestreo
X_resampled, y_resampled = ros.fit_resample(X_test, y_test)

# Verificar las nuevas distribuciones de clases
print(y_resampled.value_counts())

DEFUNCION
2.0    23829
1.0    23829
Name: count, dtype: int64


In [32]:
# Seleccionar columnas relevantes
relevant_columns = ['SEXO', 'EDAD_ANOS', 'TIPO_PACIENTE', 'HEMORRAGICOS', 'DIABETES',
                    'HIPERTENSION', 'ENFERMEDAD_ULC_PEPTICA', 'ENFERMEDAD_RENAL',
                    'INMUNOSUPR', 'CIRROSIS_HEPATICA', 'EMBARAZO', 'RESULTADO_PCR', 'DEFUNCION']

X_train = train[relevant_columns]
X_test = test[relevant_columns]

# Preprocesar datos (dummy variables para categorías)
X_train = pd.get_dummies(X_train, columns=['SEXO', 'TIPO_PACIENTE', 'RESULTADO_PCR'], drop_first=True)
X_test = pd.get_dummies(X_test, columns=['SEXO', 'TIPO_PACIENTE', 'RESULTADO_PCR'], drop_first=True)

# Normalizar datos
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Generar los dataframes de entrenamiento y test
train_df = pd.DataFrame(X_train, columns=[str(i) for i in range(X_train.shape[1])])
train_df['DEFUNCION'] = y_train.values

test_df = pd.DataFrame(X_test, columns=[str(i) for i in range(X_train.shape[1])])
test_df['DEFUNCION'] = y_test.values

# Guardar los datos preprocesados
train_df.to_parquet('processed_train_data.parquet')
test_df.to_parquet('processed_test_data.parquet')
joblib.dump(scaler, 'scaler.joblib')

['scaler.joblib']

In [34]:
train_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,DEFUNCION
0,2.512231,0.088238,0.145585,0.120013,0.02044,0.048416,0.043871,0.028179,0.112447,0.058619,1.065,1.93534,-0.07302,-0.258417,-0.021438,0.284409,2.0
1,-0.150693,0.088238,0.145585,0.120013,0.02044,0.048416,0.043871,0.028179,0.112447,0.058619,-0.938967,-0.516705,-0.07302,-0.258417,46.646641,-3.516066,2.0
2,-0.660614,0.088238,0.145585,0.120013,0.02044,0.048416,0.043871,0.028179,0.112447,0.058619,-0.938967,-0.516705,-0.07302,-0.258417,-0.021438,0.284409,2.0
3,-0.207351,0.088238,0.145585,0.120013,0.02044,0.048416,0.043871,0.028179,0.112447,0.058619,-0.938967,-0.516705,-0.07302,-0.258417,-0.021438,0.284409,2.0
4,-1.113878,0.088238,0.145585,0.120013,0.02044,0.048416,0.043871,0.028179,0.112447,0.058619,1.065,1.93534,-0.07302,-0.258417,-0.021438,0.284409,2.0


In [35]:
test_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,DEFUNCION
0,-0.320667,0.088238,0.145585,0.120013,0.02044,0.048416,0.043871,0.028179,0.112447,0.058619,1.065,-0.516705,-0.07302,-0.258417,-0.021438,0.284409,2.0
1,-0.320667,0.088238,0.145585,0.120013,0.02044,0.048416,0.043871,0.028179,0.112447,0.058619,-0.938967,-0.516705,-0.07302,3.869709,-0.021438,-3.516066,2.0
2,-1.283852,0.088238,0.145585,0.120013,0.02044,0.048416,0.043871,0.028179,0.112447,0.058619,1.065,-0.516705,-0.07302,-0.258417,-0.021438,0.284409,2.0
3,-0.887246,0.088238,0.145585,0.120013,0.02044,0.048416,0.043871,0.028179,0.112447,0.058619,1.065,1.93534,-0.07302,-0.258417,-0.021438,0.284409,2.0
4,0.019281,0.088238,0.145585,0.120013,0.02044,0.048416,0.043871,0.028179,0.112447,0.058619,-0.938967,-0.516705,-0.07302,-0.258417,-0.021438,0.284409,2.0


In [38]:
# Construir y evaluar el modelo
model = LogisticRegression()

# K-Fold cross-validation
kf = KFold(n_splits=5, random_state=42, shuffle=True)
cv_results = cross_val_score(model, train_df.drop('DEFUNCION', axis=1), y_train, cv=kf, scoring='accuracy')

print(f"Cross-validation accuracy scores: {cv_results}")
print(f"Mean cross-validation accuracy: {cv_results.mean()}")

# Entrenar el modelo en todo el conjunto de entrenamiento
model.fit(train_df.drop('DEFUNCION', axis=1), train_df['DEFUNCION'])

# Guardar el modelo
joblib.dump(model, 'logistic_regression_model.joblib')

Cross-validation accuracy scores: [1. 1. 1. 1. 1.]
Mean cross-validation accuracy: 1.0


['logistic_regression_model.joblib']

In [39]:
conf_matrix = confusion_matrix(y_test, y_train)

ValueError: Found input variables with inconsistent numbers of samples: [23920, 23946]