In [1]:
!pip install imbalanced-learn

Collecting imbalanced-learn
  Downloading imbalanced_learn-0.12.4-py3-none-any.whl.metadata (8.3 kB)
Downloading imbalanced_learn-0.12.4-py3-none-any.whl (258 kB)
Installing collected packages: imbalanced-learn
Successfully installed imbalanced-learn-0.12.4

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
import pandas as pd
import numpy as np

from imblearn.under_sampling import RandomUnderSampler

In [3]:
!gsutil cp /home/eanegrin/buckets/b1/datasets/competencia_02_fe_v01.parquet /home/eanegrin/datasets/

Copying file:///home/eanegrin/buckets/b1/datasets/competencia_02_fe_v01.parquet...
- [1 files][  7.1 GiB/  7.1 GiB]                                                
Operation completed over 1 objects/7.1 GiB.                                      


In [4]:
# base_path = '/content/drive/MyDrive/DMEyF/2024/'
# base_path = 'C:/Eugenio/Maestria/DMEyF/'
base_path = '/home/eanegrin/buckets/b1/'

dataset_path = base_path + 'datasets/'
modelos_path = base_path + 'modelos/'
db_path = base_path + 'db/'
dataset_file = 'competencia_02_fe_v01.parquet'

ganancia_acierto = 273000
costo_estimulo = 7000

# agregue sus semillas
semillas = [122219, 109279, 400391, 401537, 999961]

data = pd.read_parquet('/home/eanegrin/datasets/' + dataset_file)

In [5]:
data['clase_binaria'] = 0
data['clase_binaria'] = np.where(data['clase_ternaria'] == 'CONTINUA', 0, 1)

In [6]:
meses_train = [202006,202007,202008,202009,202010,202011,202012,
               202101,202102,202103,202104,202105,202106]

data = data[data['foto_mes'].isin(meses_train)]

In [7]:
data.shape

(2088895, 679)

In [8]:
proportions = (
    data.groupby('foto_mes')['clase_binaria']
    .value_counts(normalize=True)
    .unstack(fill_value=0)
    .reset_index()
)

proportions.columns = ['foto_mes', 'proportion_0', 'proportion_1']
proportions

Unnamed: 0,foto_mes,proportion_0,proportion_1
0,202006,0.991831,0.008169
1,202007,0.992495,0.007505
2,202008,0.993531,0.006469
3,202009,0.993446,0.006554
4,202010,0.993408,0.006592
5,202011,0.992933,0.007067
6,202012,0.992057,0.007943
7,202101,0.991236,0.008764
8,202102,0.988921,0.011079
9,202103,0.987775,0.012225


In [9]:
df_subsampled = []

for mes, group in data.groupby("foto_mes"):

    X = group.drop(columns="clase_binaria")
    y = group["clase_binaria"]
    
    # Calculo la proporcion de bajas
    minority_proportion = y.value_counts(normalize=True).get(1, 0)
    print(f'Original proportion:{minority_proportion}')
    
    # voy a incrementar esa proporcion por 5 (hasta la v03 habia probado con 10)
    new_proportion = minority_proportion * 5

    # Apply RandomUnderSampler with the calculated strategy
    rus = RandomUnderSampler(sampling_strategy=new_proportion, random_state=semillas[0])
    X_res, y_res = rus.fit_resample(X, y)

    # Rebuild the resampled DataFrame for the group
    group_resampled = pd.concat([X_res, y_res], axis=1)
    group_resampled["foto_mes"] = mes  # Add back the 'foto_mes' column

    # Add the resampled DataFrame to the list
    df_subsampled.append(group_resampled)

# Concatenate all resampled groups into a single DataFrame
data = pd.concat(df_subsampled, ignore_index=True)

Original proportion:0.00816873378122622
Original proportion:0.007504943375876326
Original proportion:0.006468947777254263
Original proportion:0.006554230256802066
Original proportion:0.006592333360462277
Original proportion:0.0070672257406278385
Original proportion:0.007942993697609054
Original proportion:0.008764025526767309
Original proportion:0.011079276465452578
Original proportion:0.012224699880868741
Original proportion:0.01323054421354135
Original proportion:0.012756419212382231
Original proportion:0.012021155292462215


In [10]:
data.shape

(437201, 679)

In [11]:
proportions = (
    data.groupby('foto_mes')['clase_binaria']
    .value_counts(normalize=True)
    .unstack(fill_value=0)
    .reset_index()
)

proportions.columns = ['foto_mes', 'proportion_0', 'proportion_1']
proportions

Unnamed: 0,foto_mes,proportion_0,proportion_1
0,202006,0.960759,0.039241
1,202007,0.963832,0.036168
2,202008,0.968668,0.031332
3,202009,0.968269,0.031731
4,202010,0.96809,0.03191
5,202011,0.965869,0.034131
6,202012,0.961802,0.038198
7,202101,0.958019,0.041981
8,202102,0.947511,0.052489
9,202103,0.942397,0.057603


In [12]:
output_file = 'competencia_02_fe_v01_undersampled.parquet'

data.to_parquet('/home/eanegrin/datasets/' + output_file, index=False)

In [13]:
!gsutil cp /home/eanegrin/datasets/competencia_02_fe_v01_undersampled.parquet /home/eanegrin/buckets/b1/datasets/

Copying file:///home/eanegrin/datasets/competencia_02_fe_v01_undersampled.parquet...
- [1 files][654.9 MiB/654.9 MiB]                                                
Operation completed over 1 objects/654.9 MiB.                                    
