In [1]:
# !pip install imbalanced-learn

In [17]:
import pandas as pd
import numpy as np

from imblearn.over_sampling import SMOTE

In [18]:
# !gsutil cp /home/eanegrin/buckets/b1/datasets/competencia_02_fe_v01.parquet /home/eanegrin/datasets/

In [19]:
# Project root: keep exactly one of these lines active for the machine in use.
# base_path = '/content/drive/MyDrive/DMEyF/2024/'
base_path = 'C:/Eugenio/Maestria/DMEyF/'
# base_path = '/home/eanegrin/buckets/b1/'

# Sub-folders are plain strings ending in '/', because later cells build
# file paths by string concatenation.
dataset_path = f'{base_path}datasets/'
modelos_path = f'{base_path}modelos/'
db_path = f'{base_path}db/'

# Input dataset, read in full into memory.
dataset_file = 'competencia_02_fe_v01_undersampled_10_24M_altbinaria.parquet'
data = pd.read_parquet(f'{dataset_path}{dataset_file}')

In [20]:
# Random seeds; semillas[0] is the one passed to SMOTE as random_state further down.
semillas = [
    122219,
    109279,
    400391,
    401537,
    999961,
]

In [21]:
# Distinct foto_mes values present in the loaded data
data['foto_mes'].unique()

array([201906, 201907, 201908, 201909, 201910, 201911, 201912, 202001,
       202002, 202003, 202004, 202005, 202006, 202007, 202008, 202009,
       202010, 202011, 202012, 202101, 202102, 202103, 202104, 202105,
       202106])

In [22]:
# (rows, columns) of the dataset as loaded, before any resampling
data.shape

(393861, 679)

In [23]:
# Share of each clase_binaria value inside every foto_mes (normalised per month).
class_share = data.groupby('foto_mes')['clase_binaria'].value_counts(normalize=True)

# One row per month, one column per class; a class absent from a month gets 0.
proportions = class_share.unstack(fill_value=0).reset_index()

# Positional relabelling: expects exactly two class columns, in the order 0, 1.
proportions = proportions.set_axis(
    ['foto_mes', 'proportion_0', 'proportion_1'], axis=1
)
proportions

Unnamed: 0,foto_mes,proportion_0,proportion_1
0,201906,0.95505,0.04495
1,201907,0.949931,0.050069
2,201908,0.960052,0.039948
3,201909,0.958878,0.041122
4,201910,0.956339,0.043661
5,201911,0.949661,0.050339
6,201912,0.95922,0.04078
7,202001,0.966304,0.033696
8,202002,0.98758,0.01242
9,202003,0.975315,0.024685


In [24]:
# Row count per clase_binaria value across all months, before oversampling
data['clase_binaria'].value_counts()

clase_binaria
0    377325
1     16536
Name: count, dtype: int64

In [25]:
# Assign a weight to every row according to its ternary class.
# Rows whose class is not listed below keep the default weight of 1.0.
# NOTE(review): the tiny offsets presumably let the ternary class be told apart
# from the weight once the string column is gone — confirm with downstream use.
class_weights = {
    'BAJA+2': 1.00002,
    'BAJA+1': 1.00001,
}

data['clase_peso'] = 1.0
for ternary_label, weight in class_weights.items():
    data.loc[data['clase_ternaria'] == ternary_label, 'clase_peso'] = weight

# Drop clase_ternaria because SMOTE does not accept string columns.
# This rebinds `data`, so the cell cannot be re-run without reloading the dataset.
data = data.drop(columns='clase_ternaria')

In [26]:
# Row count per weight value, to confirm the weights assigned above
data['clase_peso'].value_counts()

clase_peso
1.00000    375738
1.00002     16536
1.00001      1587
Name: count, dtype: int64

In [27]:
# Oversample the minority class (clase_binaria == 1) with SMOTE, one foto_mes at
# a time, so synthetic rows are only interpolated between rows of the same month.
#
# NOTE(review): every column except clase_binaria is fed to SMOTE and therefore
# interpolated in the synthetic rows (clase_peso and any identifier columns
# included) — confirm this is intended.
# NOTE: this cell rebinds `data`; running it twice oversamples twice.

MINORITY_GROWTH = 1.3          # target minority size = 1.3x the original (+30%), not a doubling
NAN_PLACEHOLDER = -999         # SMOTE rejects NaN, so missing values are filled temporarily
SMOTE_K_NEIGHBORS = 5          # imblearn default; SMOTE needs more minority rows than this
NAN_FLAG_SUFFIX = "__was_nan"  # suffix of the temporary missing-value indicator columns

df_resampled = []

for mes, group in data.groupby("foto_mes"):

    X = group.drop(columns="clase_binaria")
    y = group["clase_binaria"]

    # Class counts for this month
    class_counts = y.value_counts()
    minority_count = class_counts.get(1, 0)
    majority_count = class_counts.get(0, 0)

    # A month without majority rows would divide by zero below, and SMOTE cannot
    # be fitted with k_neighbors or fewer minority rows: keep such months untouched.
    if majority_count == 0 or minority_count <= SMOTE_K_NEIGHBORS:
        print(
            f"foto_mes {mes}: SMOTE skipped "
            f"(minority={minority_count}, majority={majority_count})"
        )
        df_resampled.append(pd.concat([X, y], axis=1).reset_index(drop=True))
        continue

    # A float sampling_strategy is the minority/majority ratio wanted after resampling
    sampling_strategy = minority_count * MINORITY_GROWTH / majority_count

    # Remember where values are missing. Each column that has NaN gets a 0/1
    # indicator column that goes through SMOTE together with the data. SMOTE
    # interpolates linearly between two rows, so a synthetic row ends up with an
    # indicator > 0 whenever at least one of its two source rows was missing that
    # value, which is exactly when the interpolated number is contaminated by the
    # placeholder. (The indicators add at most 1 to a squared distance that the
    # placeholder already inflates, so the neighbour search is barely affected.)
    nan_mask = X.isna()
    nan_cols = nan_mask.columns[nan_mask.any()]
    nan_flags = nan_mask[nan_cols].astype(float).add_suffix(NAN_FLAG_SUFFIX)
    X_temp = pd.concat([X.fillna(NAN_PLACEHOLDER), nan_flags], axis=1)

    # Apply SMOTE with the calculated sampling strategy
    smote = SMOTE(
        random_state=semillas[0],
        sampling_strategy=sampling_strategy,
        k_neighbors=SMOTE_K_NEIGHBORS,
    )
    X_res, y_res = smote.fit_resample(X_temp, y)
    X_res = pd.DataFrame(X_res, columns=X_temp.columns)

    # Put NaN back wherever an indicator is > 0 and discard the indicator columns.
    # Unlike a blanket replace(-999, NaN), this also clears values interpolated
    # from the placeholder and leaves genuine -999 values in the data alone.
    X_clean = X_res[X.columns].copy()
    if len(nan_cols) > 0:
        tainted = X_res[nan_flags.columns].to_numpy() > 0
        X_clean[nan_cols] = X_clean[nan_cols].mask(tainted)

    # Rebuild the resampled DataFrame for the group
    group_resampled = pd.concat(
        [X_clean, pd.Series(y_res, name="clase_binaria")], axis=1
    )
    group_resampled["foto_mes"] = mes  # keep the month label exact on synthetic rows

    df_resampled.append(group_resampled)

# Concatenate all resampled groups into a single DataFrame
data = pd.concat(df_resampled, ignore_index=True)


In [28]:
# Check: clase_peso distribution after the per-month SMOTE oversampling

data['clase_peso'].value_counts()

clase_peso
1.00000    375738
1.00002     21485
1.00001      1587
Name: count, dtype: int64

In [29]:
# Row count per clase_binaria value after oversampling
data['clase_binaria'].value_counts()

clase_binaria
0    377325
1     21485
Name: count, dtype: int64

In [30]:
# (rows, columns) of the dataset after oversampling
data.shape

(398810, 679)

In [31]:
# Per-month share of each clase_binaria value, recomputed on the current `data`.
class_share = data.groupby('foto_mes')['clase_binaria'].value_counts(normalize=True)

# One row per month, one column per class; a class absent from a month gets 0.
proportions = class_share.unstack(fill_value=0).reset_index()

# Positional relabelling: expects exactly two class columns, in the order 0, 1.
proportions = proportions.set_axis(
    ['foto_mes', 'proportion_0', 'proportion_1'], axis=1
)
proportions

Unnamed: 0,foto_mes,proportion_0,proportion_1
0,201906,0.942369,0.057631
1,201907,0.93592,0.06408
2,201908,0.948723,0.051277
3,201909,0.947246,0.052754
4,201910,0.943988,0.056012
5,201911,0.935564,0.064436
6,201912,0.947652,0.052348
7,202001,0.956672,0.043328
8,202002,0.983946,0.016054
9,202003,0.968171,0.031829


In [32]:
# Write the current `data` to the datasets folder, without the DataFrame index.
output_file = 'competencia_02_fe_v01_undersampled_10_24M_altbinaria_SMOTE.parquet'
data.to_parquet(f'{dataset_path}{output_file}', index=False)

In [None]:
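# NOTE(review): this command uploads the pre-SMOTE input file; the file written by this notebook ends in `_SMOTE.parquet` — confirm which one should be copied.
# !gsutil cp /home/eanegrin/datasets/competencia_02_fe_v01_undersampled_10_24M_altbinaria.parquet /home/eanegrin/buckets/b1/datasets/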

Copying file:///home/eanegrin/datasets/competencia_02_fe_v01_undersampled_10_24M_altbinaria.parquet...
- [1 files][607.0 MiB/607.0 MiB]                                                
Operation completed over 1 objects/607.0 MiB.                                    
