In [None]:
%pip install imblearn

In [None]:
import pandas as pd
import numpy as np
from imblearn.under_sampling import RandomUnderSampler

In [None]:
# Root of the bucket; every other location is derived from it.
base_path = '/home/cburich_pymnts/buckets/b1/'

# Folder and file name of the input dataset.
dataset_path = f'{base_path}datasets/'
dataset_file = 'competencia_02_fe_k210.csv'

# Seed list ("semillas"); not referenced by the cells shown in this notebook.
semillas = [165229, 165211, 165203, 165237, 165247]

# Load the full dataset into memory.
data = pd.read_csv(f'{dataset_path}{dataset_file}')

In [None]:
# Row weight derived from clase_ternaria: 1.00002 for 'BAJA+2', 1.00001 for
# 'BAJA+1', and 1.0 for everything else (including missing labels).
# NOTE(review): the tiny offsets presumably let downstream code recover the
# class from the weight while keeping all weights ~1 — confirm with the consumer.
data['clase_peso'] = (
    data['clase_ternaria']
    .map({'BAJA+2': 1.00002, 'BAJA+1': 1.00001})
    .fillna(1.0)
)

In [None]:
# Two binary targets derived from clase_ternaria. Rows whose clase_ternaria is
# missing stay NaN in both columns, so the columns are float (1.0 / 0.0 / NaN).
ternaria = data['clase_ternaria']
sin_clase = ternaria.isna()

# clase_binaria1: 1 only for 'BAJA+2', 0 for any other known label.
data['clase_binaria1'] = (ternaria == 'BAJA+2').astype(float).mask(sin_clase)

# clase_binaria2: 0 only for 'CONTINUA', 1 for any other known label.
data['clase_binaria2'] = (ternaria != 'CONTINUA').astype(float).mask(sin_clase)

In [None]:
# Sanity check: (rows, columns) of the frame after adding clase_peso and the two binary targets.
data.shape

### UNDERSAMPLING

In [None]:
# --- Parameters of the undersampling step -----------------------------------
# Month held out as test; its rows are appended untouched at the end.
test_month = 202108
# Months left out of the training sample.
# NOTE(review): 202107 is in neither train_data nor test_data, so its rows are
# absent from data_resampled — confirm that dropping them is intended.
excluded_train_months = [202107, test_month]
# Desired minority/majority ratio: 0.1 keeps 10 class-0 rows per class-1 row
# (class 1 then makes up 1/11 of the training rows, not 10% of them).
minority_to_majority_ratio = 0.1

# Split off the test month.
test_data = data[data['foto_mes'] == test_month]

# Rows eligible for undersampling.
train_data = data[~data['foto_mes'].isin(excluded_train_months)]

# Class sizes of the binary target in the training rows.
# NOTE(review): value_counts() skips NaN targets, but fit_resample below receives
# them; confirm no row in the training months has a missing clase_ternaria.
class_counts = train_data['clase_binaria2'].value_counts()
class_1_count = int(class_counts[1])  # rows with clase_binaria2 == 1
class_0_count = int(class_counts[0])  # rows with clase_binaria2 == 0

# Requested size per class. The majority target is capped at the rows actually
# available: an undersampler cannot be asked for more samples than exist, and
# without the cap a small class 0 would make fit_resample raise.
majority_target = min(class_0_count, int(class_1_count / minority_to_majority_ratio))
undersample_ratio = {0: majority_target, 1: class_1_count}

# Features (every column except the target) and target.
X_train = train_data.drop(columns=['clase_binaria2'])
y_train = train_data['clase_binaria2']

# Randomly drop class-0 rows down to the requested size (fixed seed for reproducibility).
rus = RandomUnderSampler(sampling_strategy=undersample_ratio, random_state=42)
X_resampled, y_resampled = rus.fit_resample(X_train, y_train)

# Re-attach the target to the resampled features.
resampled_train = X_resampled.copy()
resampled_train['clase_binaria2'] = y_resampled

# Final dataset: undersampled training rows followed by the untouched test month.
# ignore_index=True gives the combined frame a fresh RangeIndex instead of
# stacking the row labels of the two pieces.
data_resampled = pd.concat([resampled_train, test_data], ignore_index=True)


In [None]:
# Check the class balance after undersampling.
binaria2_counts = data_resampled['clase_binaria2'].value_counts()
print(binaria2_counts)


In [None]:
# (rows, columns) of the combined frame: undersampled training rows plus the test month.
data_resampled.shape

In [None]:
# Export the undersampled dataset to CSV (without the index column). The file
# goes into dataset_path, the same folder the input was read from, so the
# bucket location is defined in one place only.
output_file = 'competencia_02_fe_U_k210.csv'
data_resampled.to_csv(dataset_path + output_file, index=False)
