# Codigo sacado de Eugenio Negrin

In [1]:
import pandas as pd
import numpy as np

from imblearn.under_sampling import RandomUnderSampler

In [2]:
base_path = '/home/fedepicado/'
modelos_path = base_path + 'buckets/b1/modelos/'
db_path = base_path + 'buckets/b1/db/'
dataset_path = base_path + 'buckets/b1/datasets/'
exp_path = base_path + 'buckets/b1/exp/'
dataset_file = 'competencia_02_DQ_ft.parquet'
full_path = dataset_path + dataset_file

In [3]:
#semillas
semillas = [540079,250829,314299,302111,801007]

#meses entrenamiento
meses_train = [202012,202101,202102,202103,202104]

In [4]:
data = pd.read_parquet(dataset_path + dataset_file)

In [6]:
data['clase_binaria'] = 0
data['clase_binaria'] = np.where(data['clase_ternaria'] == 'BAJA+2', 1, 0)

In [7]:
data = data[data['foto_mes'].isin(meses_train)]

In [9]:
df_prop = (
    data.groupby('foto_mes')['clase_binaria']
    .value_counts(normalize=True)
    .unstack(fill_value=0)
    .reset_index()
)

df_prop.columns = ['foto_mes', 'proportion_0', 'proportion_1']
df_prop

Unnamed: 0,foto_mes,proportion_0,proportion_1
0,202012,0.996075,0.003925
1,202101,0.995155,0.004845
2,202102,0.993747,0.006253
3,202103,0.994007,0.005993
4,202104,0.992754,0.007246


In [11]:
df_subsampled = []

for mes, group in data.groupby("foto_mes"):

    X = group.drop(columns="clase_binaria")
    y = group["clase_binaria"]
    
    # Calculo la proporcion de bajas
    minority_proportion = y.value_counts(normalize=True).get(1, 0)
    print(f'Original proportion:{minority_proportion}')
    
    # voy a incrementar esa proporcion por 10
    new_proportion = minority_proportion * 10

    # Apply RandomUnderSampler with the calculated strategy
    rus = RandomUnderSampler(sampling_strategy=new_proportion, random_state=semillas[0])
    X_res, y_res = rus.fit_resample(X, y)

    # Rebuild the resampled DataFrame for the group
    group_resampled = pd.concat([X_res, y_res], axis=1)
    group_resampled["foto_mes"] = mes  # Add back the 'foto_mes' column

    # Add the resampled DataFrame to the list
    df_subsampled.append(group_resampled)

# Concatenate all resampled groups into a single DataFrame
data = pd.concat(df_subsampled, ignore_index=True)

Original proportion:0.003925064695466983
Original proportion:0.004844901435572068
Original proportion:0.006252843598981838
Original proportion:0.0059932186822250055
Original proportion:0.007246023523676031


In [12]:
data.shape

(86001, 289)

In [14]:
df_prop = (
    data.groupby('foto_mes')['clase_binaria']
    .value_counts(normalize=True)
    .unstack(fill_value=0)
    .reset_index()
)

df_prop.columns = ['foto_mes', 'proportion_0', 'proportion_1']
df_prop

Unnamed: 0,foto_mes,proportion_0,proportion_1
0,202012,0.96223,0.03777
1,202101,0.953788,0.046212
2,202102,0.941149,0.058851
3,202103,0.943455,0.056545
4,202104,0.932436,0.067564


In [15]:
dataset_file1 = 'competencia_02_DQ_ft_under_5meses.parquet'
full_path1 = dataset_path + dataset_file1

# Guarda el DataFrame en formato Parquet
data.to_parquet(full_path1, engine='pyarrow') 