In [1]:
!pip install imbalanced-learn

Collecting imbalanced-learn
  Downloading imbalanced_learn-0.12.4-py3-none-any.whl.metadata (8.3 kB)
Downloading imbalanced_learn-0.12.4-py3-none-any.whl (258 kB)
Installing collected packages: imbalanced-learn
Successfully installed imbalanced-learn-0.12.4

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
import pandas as pd
import numpy as np

from imblearn.under_sampling import RandomUnderSampler

In [3]:
# Copy the input parquet from the bucket directory into the local datasets
# directory (both ends are local filesystem paths).
!gsutil cp /home/eanegrin/buckets/b1/datasets/competencia_03_fe_v04.parquet /home/eanegrin/datasets/

Copying file:///home/eanegrin/buckets/b1/datasets/competencia_03_fe_v04.parquet...
- [1 files][  2.7 GiB/  2.7 GiB]                                                
Operation completed over 1 objects/2.7 GiB.                                      


In [4]:
# Filesystem layout: datasets are read from <base_path>datasets/.
base_path = '/home/eanegrin/'
dataset_path = f'{base_path}datasets/'

# Name of the input parquet file inside dataset_path.
dataset_file = 'competencia_03_fe_v04.parquet'

# Random seeds available to the notebook.
semillas = [
    122219,
    109279,
    400391,
    401537,
    999961,
]

In [5]:
# Load the whole input parquet into memory as a DataFrame.
data = pd.read_parquet(f'{dataset_path}{dataset_file}')

In [6]:
# Binary target: 1 where clase_ternaria is 'BAJA+2', 0 for every other value.
# A single assignment creates the column; pre-filling it with zeros first was a
# dead store that was immediately overwritten.
data['clase_binaria'] = np.where(data['clase_ternaria'] == 'BAJA+2', 1, 0)

In [7]:
# Months whose clase_ternaria label is incomplete; their rows are dropped.
meses_excluidos = [202108, 202109]

# Keep only rows whose foto_mes is not in the excluded list.
data = data.loc[lambda frame: ~frame['foto_mes'].isin(meses_excluidos)]

# Show the months that remain after the filter.
data['foto_mes'].unique()

array([201901, 201902, 201903, 201904, 201905, 201906, 201907, 201908,
       201909, 201910, 201911, 201912, 202001, 202002, 202003, 202004,
       202005, 202006, 202007, 202008, 202009, 202010, 202011, 202012,
       202101, 202102, 202103, 202104, 202105, 202106, 202107])

In [8]:
# (rows, columns) of the working frame at this point in the notebook.
data.shape

(4570151, 764)

In [9]:
# Per-month share of each clase_binaria value: one row per foto_mes, one column
# per class, with the final column labels applied as the last step of the chain.
proportions = (
    data.groupby('foto_mes')['clase_binaria']
    .value_counts(normalize=True)
    .unstack(fill_value=0)
    .reset_index()
    .set_axis(['foto_mes', 'proportion_0', 'proportion_1'], axis=1)
)
proportions

Unnamed: 0,foto_mes,proportion_0,proportion_1
0,201901,0.994245,0.005755
1,201902,0.994531,0.005469
2,201903,0.993989,0.006011
3,201904,0.995441,0.004559
4,201905,0.99483,0.00517
5,201906,0.995294,0.004706
6,201907,0.994729,0.005271
7,201908,0.995839,0.004161
8,201909,0.995712,0.004288
9,201910,0.995435,0.004565


In [11]:
# Fraction of majority-class rows (clase_binaria == 0) kept in each month.
# Every minority-class row (clase_binaria == 1) is always kept.
MAJORITY_KEEP_FRACTION = 0.02

# NOTE: this cell overwrites `data` with the undersampled frame, so running it a
# second time without reloading would undersample the already-reduced data.
df_subsampled = []

for mes, group in data.groupby("foto_mes"):

    X = group.drop(columns="clase_binaria")
    y = group["clase_binaria"]

    # Absolute number of rows to retain per class for this month.
    estrategia = {
        0: int((y == 0).sum() * MAJORITY_KEEP_FRACTION),
        1: int((y == 1).sum()),
    }

    print(f"Se retienen {estrategia[0]} de la clase mayoritaria y {estrategia[1]} de la minoritaria")

    # The same seed is used for every month.
    rus = RandomUnderSampler(sampling_strategy=estrategia, random_state=semillas[0])
    X_res, y_res = rus.fit_resample(X, y)

    # Re-attach the target column. foto_mes is already one of the columns of
    # X_res (only clase_binaria was dropped), so it is not written back.
    df_subsampled.append(pd.concat([X_res, y_res], axis=1))

# Stack the per-month samples into a single frame with a fresh index.
data = pd.concat(df_subsampled, ignore_index=True)

data.shape

Se retienen 2480 de la clase mayoritaria y 718 de la minoritaria
Se retienen 2502 de la clase mayoritaria y 688 de la minoritaria
Se retienen 2513 de la clase mayoritaria y 760 de la minoritaria
Se retienen 2528 de la clase mayoritaria y 579 de la minoritaria
Se retienen 2539 de la clase mayoritaria y 660 de la minoritaria
Se retienen 2571 de la clase mayoritaria y 608 de la minoritaria
Se retienen 2600 de la clase mayoritaria y 689 de la minoritaria
Se retienen 2642 de la clase mayoritaria y 552 de la minoritaria
Se retienen 2674 de la clase mayoritaria y 576 de la minoritaria
Se retienen 2721 de la clase mayoritaria y 624 de la minoritaria
Se retienen 2758 de la clase mayoritaria y 735 de la minoritaria
Se retienen 2801 de la clase mayoritaria y 598 de la minoritaria
Se retienen 2869 de la clase mayoritaria y 502 de la minoritaria
Se retienen 2938 de la clase mayoritaria y 185 de la minoritaria
Se retienen 2979 de la clase mayoritaria y 378 de la minoritaria
Se retienen 2986 de la cl

(112198, 764)

In [12]:
# Class proportions per month after the undersampling adjustment: one row per
# foto_mes, one column per class, relabelled as the last step of the chain.
proportions = (
    data.groupby('foto_mes')['clase_binaria']
    .value_counts(normalize=True)
    .unstack(fill_value=0)
    .reset_index()
    .set_axis(['foto_mes', 'proportion_0', 'proportion_1'], axis=1)
)
proportions

Unnamed: 0,foto_mes,proportion_0,proportion_1
0,201901,0.775485,0.224515
1,201902,0.784326,0.215674
2,201903,0.767797,0.232203
3,201904,0.813647,0.186353
4,201905,0.793686,0.206314
5,201906,0.808745,0.191255
6,201907,0.790514,0.209486
7,201908,0.827176,0.172824
8,201909,0.822769,0.177231
9,201910,0.813453,0.186547


In [13]:
# Name of the undersampled output file.
output_file = 'competencia_03_fe_v04_undersampled.parquet'

# Write to the configured datasets directory (dataset_path) instead of
# repeating the absolute path; the row index is not stored in the file.
data.to_parquet(dataset_path + output_file, index=False)

In [14]:
!gsutil cp /home/eanegrin/datasets/competencia_03_fe_v04_undersampled.parquet /home/eanegrin/buckets/b1/datasets/

Copying file:///home/eanegrin/datasets/competencia_03_fe_v04_undersampled.parquet...
- [1 files][103.8 MiB/103.8 MiB]                                                
Operation completed over 1 objects/103.8 MiB.                                    
