In [1]:
import pandas as pd

# Load the dataset from the Excel workbook into a DataFrame
excel_path = 'bd_dg.xlsx'
database = pd.read_excel(excel_path)

In [2]:
# Display the raw DataFrame as the cell output
database

Unnamed: 0,Case Number,Age,No of Pregnancy,Gestation in previous Pregnancy,BMI,HDL,Family History,unexplained prenetal loss,Large Child or Birth Default,PCOS,Sys BP,Dia BP,OGTT,Hemoglobin,Sedentary Lifestyle,Prediabetes,Class Label(GDM /Non GDM)
0,1,22,2,1,,55.0,0,0,0,0,102.0,69,,12.0,0,0,0
1,2,26,2,1,,53.0,0,0,0,0,101.0,63,,12.4,0,0,0
2,3,29,1,0,,50.0,0,0,0,0,118.0,79,,14.3,0,0,0
3,4,28,2,1,,51.0,0,0,0,0,99.0,70,,15.0,0,0,0
4,5,21,2,1,,52.0,0,0,0,0,116.0,65,,15.0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3520,3521,31,4,1,24.1,32.0,0,0,1,0,150.0,107,187.0,13.4,1,1,1
3521,3522,26,3,1,34.5,43.0,1,1,0,1,166.0,85,164.0,14.2,0,0,1
3522,3523,35,2,2,23.6,56.0,1,0,1,0,178.0,81,141.0,15.3,0,1,1
3523,3524,37,2,0,23.3,28.0,1,0,1,1,139.0,115,133.0,13.3,0,1,0


In [3]:
# Discard every row that contains at least one NaN value; the raw frame is left untouched
database_cleaned = database.dropna(axis=0, how='any')
database_cleaned

Unnamed: 0,Case Number,Age,No of Pregnancy,Gestation in previous Pregnancy,BMI,HDL,Family History,unexplained prenetal loss,Large Child or Birth Default,PCOS,Sys BP,Dia BP,OGTT,Hemoglobin,Sedentary Lifestyle,Prediabetes,Class Label(GDM /Non GDM)
2999,3000,39,3,2,32.1,31.0,1,0,0,0,139.0,80,214.0,12.8,1,1,1
3000,3001,42,1,0,30.6,30.0,1,0,0,1,145.0,92,187.0,17.8,0,1,1
3001,3002,42,1,0,32.0,29.0,1,0,0,1,157.0,98,193.0,18.0,0,1,1
3002,3003,44,3,2,30.1,30.0,1,0,0,1,154.0,105,200.0,17.1,0,1,1
3003,3004,41,2,1,31.3,31.0,1,0,0,1,159.0,94,217.0,17.1,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3520,3521,31,4,1,24.1,32.0,0,0,1,0,150.0,107,187.0,13.4,1,1,1
3521,3522,26,3,1,34.5,43.0,1,1,0,1,166.0,85,164.0,14.2,0,0,1
3522,3523,35,2,2,23.6,56.0,1,0,1,0,178.0,81,141.0,15.3,0,1,1
3523,3524,37,2,0,23.3,28.0,1,0,1,1,139.0,115,133.0,13.3,0,1,0


In [4]:
# Split the cleaned data into the target column (y) and the remaining predictor columns (X)
target_column = 'Class Label(GDM /Non GDM)'
y = database_cleaned[target_column]
X = database_cleaned.drop(columns=[target_column])
# NOTE(review): 'Case Number' remains in X and is picked up by the feature-subset search
# below; it looks like a row identifier rather than a clinical predictor — confirm whether
# it should be excluded.

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
# Hold out 33% of the rows for testing; the fixed random_state keeps the split reproducible
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.330,
    random_state=45,
)

# Logistic-regression estimator created with the library's default settings
log_regres = LogisticRegression()

In [12]:
from itertools import combinations
import numpy as np

# Enumerate every 4-column and 5-column subset of the predictor columns
all_columns = X.columns
combinations_4 = list(combinations(all_columns, 4))
combinations_5 = list(combinations(all_columns, 5))

# Pool of all candidate subsets
all_combinations = combinations_4 + combinations_5

# Shuffle the candidates with a fixed seed so the selection below is reproducible
rng = np.random.default_rng(seed=8172398273)
rng.shuffle(all_combinations)

# Keep the first 10 subsets whose pairwise correlations all lie within [-0.3, 0.3]
selected_combinations = []
for combination in all_combinations:
    subset = X[list(combination)]
    corr_matrix = subset.corr()

    # Zero the diagonal (self-correlation is always 1) on an explicit copy:
    # writing through `corr_matrix.values` depends on it being a writable view,
    # which is not guaranteed (it is read-only under pandas Copy-on-Write).
    off_diagonal = corr_matrix.to_numpy(copy=True)
    np.fill_diagonal(off_diagonal, 0)

    # |r| <= 0.3 is the whole criterion; an additional "|r| >= -0.3" check is
    # always true for real numbers. NaN correlations (e.g. constant columns)
    # compare as False, so such subsets are rejected.
    if (np.abs(off_diagonal) <= 0.3).all():
        selected_combinations.append(combination)
    if len(selected_combinations) >= 10:
        break

selected_combinations


[('BMI', 'unexplained prenetal loss', 'Large Child or Birth Default', 'PCOS'),
 ('Gestation in previous Pregnancy',
  'HDL',
  'Large Child or Birth Default',
  'Sys BP',
  'Prediabetes'),
 ('Age',
  'Gestation in previous Pregnancy',
  'Large Child or Birth Default',
  'Dia BP',
  'Prediabetes'),
 ('Age', 'unexplained prenetal loss', 'Large Child or Birth Default', 'OGTT'),
 ('No of Pregnancy', 'Large Child or Birth Default', 'PCOS', 'Dia BP', 'OGTT'),
 ('BMI', 'Large Child or Birth Default', 'PCOS', 'Dia BP'),
 ('Gestation in previous Pregnancy', 'BMI', 'PCOS', 'Sys BP'),
 ('Case Number', 'Gestation in previous Pregnancy', 'Sys BP', 'OGTT'),
 ('Gestation in previous Pregnancy',
  'BMI',
  'unexplained prenetal loss',
  'PCOS'),
 ('Gestation in previous Pregnancy',
  'unexplained prenetal loss',
  'PCOS',
  'Sys BP',
  'OGTT')]

In [16]:
# Fit the shared estimator once per selected feature subset and record its test-set score
scores = []

for combination in selected_combinations:
    feature_names = list(combination)
    log_regres.fit(x_train[feature_names], y_train)
    score = log_regres.score(x_test[feature_names], y_test)
    scores.append(dict(combination=combination, score=score))
    print(f'Columns: {combination}, Score: {score}')

# Pick the record with the highest score (the first one wins on ties)
best_score = max(scores, key=lambda record: record['score'])
print(f'El mejor score es: {best_score["score"]} con las columnas {best_score["combination"]}')

Columns: ('BMI', 'unexplained prenetal loss', 'Large Child or Birth Default', 'PCOS'), Score: 0.8275862068965517
Columns: ('Gestation in previous Pregnancy', 'HDL', 'Large Child or Birth Default', 'Sys BP', 'Prediabetes'), Score: 0.7701149425287356
Columns: ('Age', 'Gestation in previous Pregnancy', 'Large Child or Birth Default', 'Dia BP', 'Prediabetes'), Score: 0.7758620689655172
Columns: ('Age', 'unexplained prenetal loss', 'Large Child or Birth Default', 'OGTT'), Score: 0.7586206896551724
Columns: ('No of Pregnancy', 'Large Child or Birth Default', 'PCOS', 'Dia BP', 'OGTT'), Score: 0.7931034482758621
Columns: ('BMI', 'Large Child or Birth Default', 'PCOS', 'Dia BP'), Score: 0.8045977011494253
Columns: ('Gestation in previous Pregnancy', 'BMI', 'PCOS', 'Sys BP'), Score: 0.8103448275862069
Columns: ('Case Number', 'Gestation in previous Pregnancy', 'Sys BP', 'OGTT'), Score: 0.8103448275862069
Columns: ('Gestation in previous Pregnancy', 'BMI', 'unexplained prenetal loss', 'PCOS'), Sc

In [17]:
import numpy as np
import itertools

# Hyperparameter grid: max_iter in 50..160, two solvers, three penalties.
# Not every solver/penalty pair is supported; unsupported pairs are reported and skipped below.
iterations_max = np.arange(50, 161, 1)
solver = ['liblinear', 'newton-cholesky']
penalty = ['l1', 'l2', 'elasticnet']
# NOTE: this name shadows `itertools.combinations` imported by the feature-subset cell;
# it is kept so existing references to `combinations` continue to work.
combinations = list(itertools.product(iterations_max, solver, penalty))

# Restrict the train/test frames to the best feature subset.
# The subset is derived from `scores` rather than `best_score`, because `best_score`
# is overwritten at the end of this cell with the hyperparameter result; reading it
# here would make a second run of the cell index the frames with hyperparameter
# values and fail with a KeyError.
best_features = list(max(scores, key=lambda x: x['score'])['combination'])
x_train = x_train[best_features]
x_test = x_test[best_features]

results = []

# Train one model per hyperparameter combination and record its test-set score
for combination in combinations:
    try:
        logistic = LogisticRegression(max_iter=combination[0], solver=combination[1], penalty=combination[2])
        logistic.fit(x_train, y_train)

        print(f'Iteraciones: {combination[0]}, Solver: {combination[1]}, Penalty: {combination[2]}')

        score = logistic.score(x_test, y_test)
        print(f'---> Score: {score}\n')
        results.append({
            'combination': combination,
            'score': score
        })
    except ValueError as e:
        # scikit-learn reports unsupported solver/penalty pairs as ValueError;
        # anything else is unexpected and is allowed to propagate.
        print(f'Error: {e}')

# Pick the hyperparameter combination with the highest score (the first one wins on ties)
best_score = max(results, key=lambda x: x['score'])
print(f'El mejor score, con optimizacion de hiperparametros es: {best_score["score"]} con la combinación {best_score["combination"]}')


Iteraciones: 50, Solver: liblinear, Penalty: l1
---> Score: 0.8160919540229885

Iteraciones: 50, Solver: liblinear, Penalty: l2
---> Score: 0.8103448275862069

Error: Only 'saga' solver supports elasticnet penalty, got solver=liblinear.
Error: Solver newton-cholesky supports only 'l2' or None penalties, got l1 penalty.
Iteraciones: 50, Solver: newton-cholesky, Penalty: l2
---> Score: 0.8275862068965517

Error: Solver newton-cholesky supports only 'l2' or None penalties, got elasticnet penalty.
Iteraciones: 51, Solver: liblinear, Penalty: l1
---> Score: 0.8160919540229885

Iteraciones: 51, Solver: liblinear, Penalty: l2
---> Score: 0.8103448275862069

Error: Only 'saga' solver supports elasticnet penalty, got solver=liblinear.
Error: Solver newton-cholesky supports only 'l2' or None penalties, got l1 penalty.
Iteraciones: 51, Solver: newton-cholesky, Penalty: l2
---> Score: 0.8275862068965517

Error: Solver newton-cholesky supports only 'l2' or None penalties, got elasticnet penalty.
It