In [1]:
!pip install imbalanced-learn

Collecting imbalanced-learn
  Downloading imbalanced_learn-0.12.4-py3-none-any.whl.metadata (8.3 kB)
Downloading imbalanced_learn-0.12.4-py3-none-any.whl (258 kB)
Installing collected packages: imbalanced-learn
Successfully installed imbalanced-learn-0.12.4

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
import pandas as pd
import numpy as np

from imblearn.under_sampling import RandomUnderSampler

In [3]:
# Copy the feature-engineered parquet from the bucket directory into the local
# datasets directory, which is where the notebook reads it from.
# NOTE(review): both paths are local filesystem paths, so the bucket is presumably
# mounted under /home/eanegrin/buckets/b1 — confirm.
!gsutil cp /home/eanegrin/buckets/b1/datasets/competencia_03_fe_v08.parquet /home/eanegrin/datasets/

Copying file:///home/eanegrin/buckets/b1/datasets/competencia_03_fe_v08.parquet...
- [1 files][  8.4 GiB/  8.4 GiB]                                                
Operation completed over 1 objects/8.4 GiB.                                      


In [4]:
# Where the input dataset lives on the local disk.
base_path = '/home/eanegrin/'
dataset_path = f'{base_path}datasets/'
dataset_file = 'competencia_03_fe_v08.parquet'

# Random seeds available to the notebook (the undersampling step uses the first one).
semillas = [
    122219,
    109279,
    400391,
    401537,
    999961,
]

In [5]:
# Read the whole feature-engineered dataset into a DataFrame.
data = pd.read_parquet(f'{dataset_path}{dataset_file}')

In [6]:
# Binary target: 0 when clase_ternaria is 'CONTINUA', 1 for every other value.
# Rows where the comparison is False for any reason (including a missing
# clase_ternaria) also end up as 1.
# The earlier `data['clase_binaria'] = 0` initialisation was a dead store: this
# single assignment fully defines the column.
data['clase_binaria'] = np.where(data['clase_ternaria'] == 'CONTINUA', 0, 1)

In [7]:
# Months (YYYYMM) that are removed from the dataset before undersampling.
meses_excluidos = [
    201901, 201902, 201903, 201904,
    202006,
    202108, 202109,
]

# Keep only the rows whose foto_mes is not in the exclusion list.
data = data.loc[~data['foto_mes'].isin(meses_excluidos)]

# Display the months that remain.
data['foto_mes'].unique()

array([201905, 201906, 201907, 201908, 201909, 201910, 201911, 201912,
       202001, 202002, 202003, 202004, 202005, 202007, 202008, 202009,
       202010, 202011, 202012, 202101, 202102, 202103, 202104, 202105,
       202106, 202107])

In [8]:
# (rows, columns) of the frame at this point in the notebook.
data.shape

(3912411, 1192)

In [9]:
# Share of each clase_binaria value within every foto_mes, one row per month.
# value_counts(normalize=True) gives per-month fractions; unstack spreads the two
# classes into columns, which are then relabelled positionally.
proportions = (
    data.groupby('foto_mes')['clase_binaria']
    .value_counts(normalize=True)
    .unstack(fill_value=0)
    .reset_index()
    .set_axis(['foto_mes', 'proportion_0', 'proportion_1'], axis=1)
)
proportions

Unnamed: 0,foto_mes,proportion_0,proportion_1
0,201905,0.990287,0.009713
1,201906,0.990169,0.009831
2,201907,0.990071,0.009929
3,201908,0.990691,0.009309
4,201909,0.991594,0.008406
5,201910,0.991169,0.008831
6,201911,0.990207,0.009793
7,201912,0.99053,0.00947
8,202001,0.992311,0.007689
9,202002,0.995289,0.004711


In [10]:
# Undersample the majority class (clase_binaria == 0) independently within each
# month, keeping every minority-class (clase_binaria == 1) row.

# Fraction of majority-class rows retained per month.
majority_keep_fraction = 0.05

df_subsampled = []

for mes, group in data.groupby("foto_mes"):

    X = group.drop(columns="clase_binaria")
    y = group["clase_binaria"]

    # Target row count per class for this month: a fixed fraction of the
    # majority class (truncated to an integer) and all of the minority class.
    estrategia = {0: int((y == 0).sum() * majority_keep_fraction),
                  1: int((y == 1).sum())}

    print(f"Se retienen {estrategia[0]} de la clase mayoritaria y {estrategia[1]} de la minoritaria")

    # Same seed for every month so the sampling is reproducible.
    rus = RandomUnderSampler(sampling_strategy=estrategia, random_state=semillas[0])
    X_res, y_res = rus.fit_resample(X, y)

    # Put features and target back together. X_res still contains foto_mes
    # (only clase_binaria was dropped), so the column is not reassigned here;
    # overwriting it with the scalar group key could silently change its dtype.
    df_subsampled.append(pd.concat([X_res, y_res], axis=1))

# Stack the per-month samples into a single frame with a fresh index.
data = pd.concat(df_subsampled, ignore_index=True)

data.shape

Se retienen 6320 de la clase mayoritaria y 1240 de la minoritaria
Se retienen 6395 de la clase mayoritaria y 1270 de la minoritaria
Se retienen 6471 de la clase mayoritaria y 1298 de la minoritaria
Se retienen 6571 de la clase mayoritaria y 1235 de la minoritaria
Se retienen 6659 de la clase mayoritaria y 1129 de la minoritaria
Se retienen 6773 de la clase mayoritaria y 1207 de la minoritaria
Se retienen 6865 de la clase mayoritaria y 1358 de la minoritaria
Se retienen 6966 de la clase mayoritaria y 1332 de la minoritaria
Se retienen 7142 de la clase mayoritaria y 1107 de la minoritaria
Se retienen 7320 de la clase mayoritaria y 693 de la minoritaria
Se retienen 7439 de la clase mayoritaria y 564 de la minoritaria
Se retienen 7448 de la clase mayoritaria y 910 de la minoritaria
Se retienen 7504 de la clase mayoritaria y 1165 de la minoritaria
Se retienen 7729 de la clase mayoritaria y 1169 de la minoritaria
Se retienen 7802 de la clase mayoritaria y 1016 de la minoritaria
Se retienen 7

(228903, 1192)

In [11]:
# Class proportions per month after the undersampling step.
# Same computation as before the adjustment: per-month fractions of each
# clase_binaria value, spread into one column per class and relabelled positionally.
proportions = (
    data.groupby('foto_mes')['clase_binaria']
    .value_counts(normalize=True)
    .unstack(fill_value=0)
    .reset_index()
    .set_axis(['foto_mes', 'proportion_0', 'proportion_1'], axis=1)
)
proportions

Unnamed: 0,foto_mes,proportion_0,proportion_1
0,201905,0.835979,0.164021
1,201906,0.834312,0.165688
2,201907,0.832926,0.167074
3,201908,0.841788,0.158212
4,201909,0.855033,0.144967
5,201910,0.848747,0.151253
6,201911,0.834853,0.165147
7,201912,0.839479,0.160521
8,202001,0.865802,0.134198
9,202002,0.913516,0.086484


In [12]:
# Write the undersampled dataset next to the input file. The directory comes from
# the dataset_path config variable rather than a repeated hard-coded path, so the
# read and write locations cannot drift apart.
output_file = 'competencia_03_fe_v08_undersampled.parquet'

# index=False: the frame has a fresh RangeIndex that carries no information.
data.to_parquet(dataset_path + output_file, index=False)

In [13]:
# Copy the undersampled parquet from the local datasets directory to the bucket
# directory.
# NOTE(review): the destination is a local filesystem path, so the bucket is
# presumably mounted under /home/eanegrin/buckets/b1 — confirm.
!gsutil cp /home/eanegrin/datasets/competencia_03_fe_v08_undersampled.parquet /home/eanegrin/buckets/b1/datasets/

Copying file:///home/eanegrin/datasets/competencia_03_fe_v08_undersampled.parquet...
- [1 files][582.8 MiB/582.8 MiB]                                                
Operation completed over 1 objects/582.8 MiB.                                    
