In [20]:
import warnings
# Ignore every warning category for the rest of the session.
# NOTE(review): this is a blanket filter, so warnings emitted by pandas or
# scikit-learn in later cells are hidden too — consider narrowing it.
warnings.filterwarnings("ignore")

In [21]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import IsolationForest

- Auxiliary functions

In [44]:
def check_outlier_by_IQR(value, threshold=1):
    """Binarise a count of IQR-flagged features.

    Parameters
    ----------
    value : number
        Count to evaluate. The notebook passes, for one row, the number of
        features whose value fell outside the IQR fences.
    threshold : number, default 1
        ``value`` must be strictly greater than this to be flagged. With the
        default, counts of 0 or 1 return 0 and counts of 2 or more return 1
        (the original hard-coded rule).

    Returns
    -------
    int
        1 if ``value > threshold``, otherwise 0.
    """
    return 1 if value > threshold else 0

In [22]:
def check_data_in_range(value, min_value, max_value):
    """Tell whether ``value`` falls OUTSIDE ``[min_value, max_value]``.

    Note the polarity: despite the function name, the result is True when the
    value is out of range and False when it is inside. Both bounds are
    inclusive, so a value equal to either bound returns False. A NaN compares
    False against both bounds and therefore returns False as well.

    Parameters
    ----------
    value : number
        Value to test.
    min_value : number
        Lower bound of the accepted interval.
    max_value : number
        Upper bound of the accepted interval.

    Returns
    -------
    bool
        True if ``value > max_value`` or ``value < min_value``, else False.
    """
    above_maximum = value > max_value
    below_minimum = value < min_value
    return bool(above_maximum or below_minimum)

In [23]:
def generate_df_count(df_values, name_columns):
    """Count True and False occurrences in every column of a boolean frame.

    Parameters
    ----------
    df_values : pandas.DataFrame
        Frame whose columns hold boolean flags (for example the result of
        ``DataFrame.isna()``).
    name_columns : list of str
        Three output column names, in order: the label column, the column
        receiving the True count, and the column receiving the False count.

    Returns
    -------
    pandas.DataFrame
        One row per input column: ``[column name, True count, False count]``.
        A count is 0 when the corresponding value never appears.
    """
    summary_rows = []

    for column_name in df_values.columns:
        counts = df_values[column_name].value_counts()

        # value_counts only lists values that occur, so default missing ones to 0.
        true_count = counts[True] if True in counts.index else 0
        false_count = counts[False] if False in counts.index else 0

        summary_rows.append([column_name, true_count, false_count])

    return pd.DataFrame(data=summary_rows, columns=name_columns)

- Lectura y descripción del dataset

In [24]:
# Load the dataset from the working directory and preview its first four rows.
dataset_path = "gdm_first_trimester_ml_dataset.csv"
df_data = pd.read_csv(dataset_path)
df_data.head(n=4)

Unnamed: 0,age_years,bmi_prepreg_kg_m2,systolic_bp_mmHg,diastolic_bp_mmHg,map_mmHg,gestational_weeks,fpg_mmol_l,hba1c_percent,insulin_uIU_ml,homa_ir,triglycerides_mmol_l,hdl_mmol_l,parity,family_history_t2d,previous_gdm,pcos,smoking_first_trimester,physical_activity_level,diet_score_0_100,label_gdm
0,26.8,26.9,119.0,57.0,77.7,8.7,4.1,5.33,7.8,1.42,1.37,1.36,0,0,1,0,1,2.0,62.0,0
1,22.6,27.3,,69.0,80.8,9.0,3.53,5.73,7.2,1.12,1.13,1.6,0,1,0,1,0,1.0,42.0,0
2,29.9,33.1,103.0,84.0,89.9,10.3,4.45,,4.3,0.86,1.27,1.16,0,1,0,0,0,1.0,72.0,0
3,26.3,23.6,112.0,69.0,83.3,12.2,4.79,4.69,13.7,2.91,1.22,1.73,0,0,0,0,0,0.0,81.0,0


- Inspeccione dimensiones del dataset, tipos de datos y valores faltantes.

In [25]:
# (rows, columns) of the loaded dataset.
df_data.shape

(1500, 20)

In [26]:
# Storage dtype of every column.
df_data.dtypes

age_years                  float64
bmi_prepreg_kg_m2          float64
systolic_bp_mmHg           float64
diastolic_bp_mmHg          float64
map_mmHg                   float64
gestational_weeks          float64
fpg_mmol_l                 float64
hba1c_percent              float64
insulin_uIU_ml             float64
homa_ir                    float64
triglycerides_mmol_l       float64
hdl_mmol_l                 float64
parity                       int64
family_history_t2d           int64
previous_gdm                 int64
pcos                         int64
smoking_first_trimester      int64
physical_activity_level    float64
diet_score_0_100           float64
label_gdm                    int64
dtype: object

In [27]:
# Count missing vs. present values per column and list the most incomplete first.
null_summary_names = ["descriptor", "count_null", "count_not_null"]
df_null = df_data.isna()
df_summary_null = generate_df_count(df_null, null_summary_names)
df_summary_null.sort_values(by="count_null", ascending=False)

Unnamed: 0,descriptor,count_null,count_not_null
7,hba1c_percent,162,1338
10,triglycerides_mmol_l,148,1352
6,fpg_mmol_l,109,1391
8,insulin_uIU_ml,108,1392
11,hdl_mmol_l,86,1414
17,physical_activity_level,70,1430
18,diet_score_0_100,69,1431
2,systolic_bp_mmHg,45,1455
3,diastolic_bp_mmHg,36,1464
1,bmi_prepreg_kg_m2,0,1500


- Analice estadísticos descriptivos de cada variable.

In [28]:
# Columns treated as categorical in the rest of the notebook (includes the
# target "label_gdm"); every other column is handled as continuous.
categorical_columns = [
    "parity",
    "family_history_t2d",
    "previous_gdm",
    "pcos",
    "smoking_first_trimester",
    "physical_activity_level",
    "label_gdm",
]

In [29]:
# Summarise every non-categorical column: mean, std, quartiles, IQR and the
# 1.5 * IQR fences (min_range / max_range) that later cells use to flag outliers.
matrix_descriptors = []

continuous_columns = [name for name in df_data.columns if name not in categorical_columns]

for column in continuous_columns:
    column_stats = df_data[column].describe()

    first_quartile = column_stats["25%"]
    third_quartile = column_stats["75%"]
    interquartile_range = third_quartile - first_quartile

    matrix_descriptors.append({
        "descriptor": column,
        "mean": column_stats["mean"],
        "std": column_stats["std"],
        "25%": first_quartile,
        "50%": column_stats["50%"],
        "75%": third_quartile,
        "IQR": interquartile_range,
        "max_range": third_quartile + interquartile_range*1.5,
        "min_range": first_quartile - interquartile_range*1.5,
    })

df_descriptors_statistical = pd.DataFrame(matrix_descriptors)
df_descriptors_statistical

Unnamed: 0,descriptor,mean,std,25%,50%,75%,IQR,max_range,min_range
0,age_years,29.1296,4.988332,25.6,29.1,32.525,6.925,42.9125,15.2125
1,bmi_prepreg_kg_m2,26.284738,5.00536,22.9,26.35,29.4,6.5,39.15,13.15
2,systolic_bp_mmHg,112.029553,12.254459,104.0,112.0,120.0,16.0,144.0,80.0
3,diastolic_bp_mmHg,70.535519,8.516566,65.0,70.0,76.0,11.0,92.5,48.5
4,map_mmHg,84.398244,7.215258,79.7,84.3,88.9,9.2,102.7,65.9
5,gestational_weeks,10.811667,1.611753,9.4,10.8,12.2,2.8,16.4,5.2
6,fpg_mmol_l,4.801589,0.977795,4.3,4.71,5.15,0.85,6.425,3.025
7,hba1c_percent,5.218923,0.376506,5.0,5.2,5.4,0.4,6.0,4.4
8,insulin_uIU_ml,10.99052,4.487567,8.075,10.2,12.9,4.825,20.1375,0.8375
9,homa_ir,2.394468,1.683579,1.64,2.12,2.72,1.08,4.34,0.02


- Descriptores categóricos

In [30]:
# Print the frequency table of each categorical column, one after another.
for categorical_name in categorical_columns:
    frequency_table = df_data[categorical_name].value_counts()
    print(frequency_table)

parity
1    561
0    545
2    269
3    101
4     20
5      4
Name: count, dtype: int64
family_history_t2d
0    1043
1     457
Name: count, dtype: int64
previous_gdm
0    1351
1     149
Name: count, dtype: int64
pcos
0    1351
1     149
Name: count, dtype: int64
smoking_first_trimester
0    1331
1     169
Name: count, dtype: int64
physical_activity_level
1.0    654
0.0    487
2.0    289
Name: count, dtype: int64
label_gdm
0    1239
1     261
Name: count, dtype: int64


- Identifique posibles outliers.

In [31]:
# For every non-categorical column, flag the values that fall outside the
# [min_range, max_range] fences computed in df_descriptors_statistical.
fence_lookup = df_descriptors_statistical.set_index("descriptor")

outlier_flags = {}
for column in df_data.columns:
    if column in categorical_columns:
        continue

    min_value = fence_lookup.loc[column, "min_range"]
    max_value = fence_lookup.loc[column, "max_range"]

    # check_data_in_range returns True for OUT-of-range values.
    outlier_flags[column] = df_data[column].apply(check_data_in_range, args=(min_value, max_value))

df_outliers = pd.DataFrame(outlier_flags)
df_outliers

Unnamed: 0,age_years,bmi_prepreg_kg_m2,systolic_bp_mmHg,diastolic_bp_mmHg,map_mmHg,gestational_weeks,fpg_mmol_l,hba1c_percent,insulin_uIU_ml,homa_ir,triglycerides_mmol_l,hdl_mmol_l,diet_score_0_100
0,False,False,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1495,False,True,True,False,False,False,True,True,True,True,True,False,False
1496,False,False,False,True,True,False,False,False,False,False,False,False,False
1497,False,False,False,False,False,False,False,False,False,False,False,False,False
1498,False,False,False,False,False,False,False,False,False,False,True,False,False


In [32]:
# Count flagged vs. unflagged values per column, most-flagged columns first.
outlier_summary_names = ["descriptor", "count_outlier", "count_not_outlier"]
df_summary_outlier = generate_df_count(df_outliers, outlier_summary_names)
df_summary_outlier.sort_values(by="count_outlier", ascending=False)

Unnamed: 0,descriptor,count_outlier,count_not_outlier
9,homa_ir,59,1441
8,insulin_uIU_ml,46,1454
10,triglycerides_mmol_l,45,1455
6,fpg_mmol_l,24,1476
7,hba1c_percent,24,1476
4,map_mmHg,23,1477
11,hdl_mmol_l,20,1480
3,diastolic_bp_mmHg,18,1482
12,diet_score_0_100,14,1486
1,bmi_prepreg_kg_m2,13,1487


In [33]:
# Keep only the categorical predictors: select the categorical columns and
# remove the target column "label_gdm".
df_data_categorical = df_data[categorical_columns].drop(columns=["label_gdm"])
df_data_categorical

Unnamed: 0,parity,family_history_t2d,previous_gdm,pcos,smoking_first_trimester,physical_activity_level
0,0,0,1,0,1,2.0
1,0,1,0,1,0,1.0
2,0,1,0,0,0,1.0
3,0,0,0,0,0,0.0
4,1,0,0,0,0,2.0
...,...,...,...,...,...,...
1495,1,0,0,0,0,1.0
1496,0,0,1,0,0,0.0
1497,2,0,0,0,1,2.0
1498,0,0,1,0,0,0.0


In [34]:
# Fit an IsolationForest on the categorical predictors only.
# The following cell stores the predictions in df_data_categorical["is_isolated"];
# that column is excluded here so that re-running this cell never fits the model
# on its own previous output.
categorical_features = df_data_categorical.drop(columns=["is_isolated"], errors="ignore")

instance_IF = IsolationForest(random_state=42)
instance_IF.fit(categorical_features)
predictions_outlier = instance_IF.predict(categorical_features)

In [35]:
# Store the model's predictions next to the categorical predictors and show
# how many rows received each predicted label.
df_data_categorical = df_data_categorical.assign(is_isolated=predictions_outlier)
df_data_categorical["is_isolated"].value_counts()

is_isolated
 1    953
-1    547
Name: count, dtype: int64

In [36]:
# Fit an IsolationForest on the non-categorical columns only.
columns_no_categorical = [column for column in df_data.columns if column not in categorical_columns]
# Work on an explicit copy: a new column is assigned below, and assigning onto a
# column-selection of df_data is the pattern pandas reports as SettingWithCopy.
df_no_categorical = df_data[columns_no_categorical].copy()

instance_IF = IsolationForest(random_state=42)
instance_IF.fit(df_no_categorical)
predictions_outlier = instance_IF.predict(df_no_categorical)
df_no_categorical["is_isolated"] = predictions_outlier
df_no_categorical["is_isolated"].value_counts()

is_isolated
 1    1424
-1      76
Name: count, dtype: int64

In [37]:
# Fit an IsolationForest on every column except the target "label_gdm",
# then keep the predictions alongside the features.
df_values = df_data.drop(columns=["label_gdm"])

instance_IF = IsolationForest(random_state=42).fit(df_values)
predictions_outlier = instance_IF.predict(df_values)

df_values["is_isolated"] = predictions_outlier
df_values["is_isolated"].value_counts()

is_isolated
 1    1315
-1     185
Name: count, dtype: int64

In [40]:
# Number of IQR-flagged features per row.
df_outliers = df_outliers.astype(int)
# Sum only the per-feature flags: if this cell is re-run, the existing
# "count_outlier" column must not be added into its own total.
feature_flag_columns = [name for name in df_outliers.columns if name != "count_outlier"]
df_outliers["count_outlier"] = df_outliers[feature_flag_columns].sum(axis=1)
df_outliers["count_outlier"].value_counts()

count_outlier
0    1347
1     101
2      28
6       8
7       5
5       4
4       2
3       2
9       2
8       1
Name: count, dtype: int64

In [42]:
# Copy each detector's result into df_data. Values are assigned positionally
# (raw arrays), so row order — not index labels — determines the pairing.
df_data["outlier_by_IQR"] = df_outliers["count_outlier"].to_numpy()

isolation_results = (
    ("is_isolated_full", df_values),
    ("is_isolated_just_values", df_no_categorical),
    ("is_isolated_just_cat", df_data_categorical),
)
for target_column, result_frame in isolation_results:
    df_data[target_column] = result_frame["is_isolated"].to_numpy()

df_data


Unnamed: 0,age_years,bmi_prepreg_kg_m2,systolic_bp_mmHg,diastolic_bp_mmHg,map_mmHg,gestational_weeks,fpg_mmol_l,hba1c_percent,insulin_uIU_ml,homa_ir,...,previous_gdm,pcos,smoking_first_trimester,physical_activity_level,diet_score_0_100,label_gdm,outlier_by_IQR,is_isolated_full,is_isolated_just_values,is_isolated_just_cat
0,26.8,26.900000,119.0,57.0,77.7,8.7,4.100000,5.330000,7.800000,1.420000,...,1,0,1,2.0,62.0,0,0,-1,1,-1
1,22.6,27.300000,,69.0,80.8,9.0,3.530000,5.730000,7.200000,1.120000,...,0,1,0,1.0,42.0,0,0,-1,1,-1
2,29.9,33.100000,103.0,84.0,89.9,10.3,4.450000,,4.300000,0.860000,...,0,0,0,1.0,72.0,0,0,1,1,1
3,26.3,23.600000,112.0,69.0,83.3,12.2,4.790000,4.690000,13.700000,2.910000,...,0,0,0,0.0,81.0,0,0,1,1,1
4,31.9,31.000000,117.0,69.0,84.9,8.9,,5.120000,9.500000,2.230000,...,0,0,0,2.0,53.0,0,0,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1495,16.0,44.281931,152.0,74.0,100.0,13.1,12.549164,7.184281,35.447597,19.770565,...,0,0,0,1.0,57.0,0,7,-1,-1,1
1496,23.0,26.000000,101.0,47.0,65.0,8.3,4.130000,4.830000,6.800000,1.260000,...,1,0,0,0.0,57.0,0,2,-1,1,-1
1497,27.6,25.300000,110.0,67.0,81.5,8.4,4.600000,5.470000,10.900000,2.240000,...,0,0,1,2.0,58.0,0,0,1,1,-1
1498,38.4,26.700000,114.0,73.0,86.7,9.2,4.280000,5.390000,10.700000,2.030000,...,1,0,0,0.0,71.0,0,1,1,1,-1


In [43]:
# Recode the IsolationForest labels so that 1 marks an outlier (was -1) and
# 0 marks an inlier (was 1).
# The flags are rebuilt from the frames that still hold the raw predictions
# instead of from df_data's own columns: recoding df_data in place would turn
# the already-converted 1s into 0s if this cell were executed a second time.
isolation_sources = {
    "is_isolated_full": df_values,
    "is_isolated_just_values": df_no_categorical,
    "is_isolated_just_cat": df_data_categorical,
}
for flag_column, source_frame in isolation_sources.items():
    raw_labels = source_frame["is_isolated"].values
    df_data[flag_column] = (raw_labels == -1).astype(int)

In [45]:
# Turn the per-row count of IQR-flagged features into a 0/1 vote.
# The vote is computed from df_outliers["count_outlier"] (the raw counts) rather
# than from df_data["outlier_by_IQR"], so executing this cell again does not
# re-threshold values that are already 0/1.
# NOTE(review): check_outlier_by_IQR uses a strict "> 1", so a row with exactly
# one flagged feature gets a vote of 0 — confirm this is intended.
df_data["outlier_by_IQR"] = df_outliers["count_outlier"].apply(check_outlier_by_IQR).values

In [47]:
# Add up the four 0/1 detector flags: vote_outlier is the number of detectors
# that marked the row.
detector_flag_columns = [
    "outlier_by_IQR",
    "is_isolated_full",
    "is_isolated_just_values",
    "is_isolated_just_cat",
]
df_data["vote_outlier"] = df_data[detector_flag_columns].sum(axis=1)

df_data["vote_outlier"].value_counts()

vote_outlier
0    888
1    417
2    150
3     37
4      8
Name: count, dtype: int64

In [53]:
# Rows with exactly three votes that are labelled label_gdm == 1.
has_three_votes = df_data["vote_outlier"] == 3
is_gdm_positive = df_data["label_gdm"] == 1
df_data[has_three_votes & is_gdm_positive]

Unnamed: 0,age_years,bmi_prepreg_kg_m2,systolic_bp_mmHg,diastolic_bp_mmHg,map_mmHg,gestational_weeks,fpg_mmol_l,hba1c_percent,insulin_uIU_ml,homa_ir,...,pcos,smoking_first_trimester,physical_activity_level,diet_score_0_100,label_gdm,outlier_by_IQR,is_isolated_full,is_isolated_just_values,is_isolated_just_cat,vote_outlier
37,26.0,24.3,120.0,49.0,72.3,11.4,5.24,5.79,10.1,2.35,...,1,0,0.0,35.0,1,0,1,1,1,3
276,38.1,28.1,113.0,54.0,73.9,13.2,5.37,,9.7,2.32,...,1,0,1.0,46.0,1,0,1,1,1,3
319,37.6,37.846963,131.0,89.0,103.0,8.1,12.951687,7.100647,40.27432,23.183128,...,0,0,1.0,57.0,1,1,1,1,0,3
375,30.3,39.1,114.0,84.0,93.8,8.2,5.11,5.15,13.8,3.14,...,0,1,0.0,75.0,1,0,1,1,1,3
406,36.6,31.6,119.0,72.0,87.8,9.3,4.61,5.19,21.6,4.43,...,0,1,2.0,64.0,1,1,1,0,1,3
880,27.5,30.982365,131.0,95.0,107.0,10.8,9.584298,7.127807,54.377082,23.162941,...,0,0,0.0,46.0,1,1,1,1,0,3
925,30.0,16.0,102.0,67.0,78.4,8.0,3.22,5.48,14.9,2.13,...,1,0,1.0,53.0,1,0,1,1,1,3
970,28.5,22.8,117.0,51.0,73.3,12.8,5.43,4.77,15.1,3.65,...,0,1,1.0,65.0,1,0,1,1,1,3
1040,28.5,26.9,108.0,74.0,85.5,10.9,5.43,5.04,24.5,5.92,...,1,0,1.0,44.0,1,1,1,0,1,3
1060,28.6,35.279085,156.0,98.0,117.333333,10.4,10.14753,7.382042,,2.51,...,0,0,1.0,72.0,1,1,1,1,0,3


In [51]:
# Class balance of the target column.
df_data["label_gdm"].value_counts()

label_gdm
0    1239
1     261
Name: count, dtype: int64

In [54]:
# Split the rows by vote count: fewer than three votes are kept in
# df_data_post, three or more go to df_data_out.
vote_counts = df_data["vote_outlier"]
df_data_post = df_data.loc[vote_counts < 3]
df_data_out = df_data.loc[vote_counts >= 3]

In [57]:
# Remove the helper columns created for the outlier vote so both subsets keep
# only the original dataset columns.
# The list is defined once for both frames, and errors="ignore" lets the cell be
# executed again without raising KeyError once the columns are already gone.
voting_helper_columns = ['outlier_by_IQR', 'is_isolated_full',
                         'is_isolated_just_values', 'is_isolated_just_cat', 'vote_outlier']

df_data_post = df_data_post.drop(columns=voting_helper_columns, errors="ignore")
df_data_out = df_data_out.drop(columns=voting_helper_columns, errors="ignore")


In [58]:
# Write both subsets to CSV in the working directory, without the index column.
csv_exports = (
    (df_data_post, "data_selection.csv"),
    (df_data_out, "data_isolated_for_testing.csv"),
)
for export_frame, export_name in csv_exports:
    export_frame.to_csv(export_name, index=False)