In [None]:
import pandas as pd
import numpy as np
from os.path import join
import os
from itertools import product
from tqdm import tqdm

In [None]:
# --- Locations, all relative to the notebook's working directory ---
INPUT_DIR = '../input'
SUBMISSIONS_DIR = '../submissions'
PSEUDOLABELS_DIR = join(SUBMISSIONS_DIR, 'pseudo-labels/')
SMOOTHEDLABELS_DIR = join(PSEUDOLABELS_DIR, 'smoothed-labels/')
# Creates any missing parent directories too; a no-op when the folder already exists.
os.makedirs(SMOOTHEDLABELS_DIR, exist_ok=True)

# --- Experiment identity and the column names used in the pseudo-label files ---
exp_train_name, target, target_class = 'exp_train_02', 'target', 'target_class'

# Percentages of each pseudo-label set to keep (100 means "use the whole file").
pcts = [50, 80, 100]

# One pseudo-label CSV per variant, all sharing the experiment prefix.
pseudo_label_sets_names = [
    exp_train_name + '_' + suffix + '.csv'
    for suffix in ('rnd_undspml', 'str_undspml', 'rdf_bnd')
]

# Define label-smoothing functions and the subset helper

In [None]:
def _two_level(value_for_zero, value_otherwise):
    """Build a function returning `value_for_zero` for 0 and `value_otherwise` for anything else."""
    def _smooth(x):
        if x == 0:
            return value_for_zero
        return value_otherwise
    return _smooth


# (code, function) pairs; the integer code is what identifies the function
# in the name of each generated experiment file.
smooth_funcs = [
    (1, lambda x: x),            # identity: the label is left untouched
    (2, _two_level(0.10, 0.90)),
    (3, _two_level(0.05, 0.95)),
    (4, _two_level(0.05, 1.00)),
    (5, _two_level(0.10, 1.00)),
    (6, _two_level(0.20, 1.00)),
]

In [None]:
def take_pct_of_df(df: pd.DataFrame, pct: float, 
                   target: str = 'target', 
                   target_class: str = 'target_class'):
    """Keep the leading `pct` percent of rows of class 0 and of class 1.

    Rows with `target_class == 0` are ranked by ascending `target`, rows with
    `target_class == 1` by descending `target` (i.e. the most extreme scores of
    each class come first). From each ranking the first
    `int(n_rows_in_class * pct / 100)` rows are kept.

    Args:
        df: Frame holding at least the `target` and `target_class` columns.
            Its index labels are used to select rows, so they should be unique.
        pct: Percentage (0-100) of each class to keep.
        target: Name of the score column used for ranking.
        target_class: Name of the column holding the 0/1 class.

    Returns:
        A new DataFrame with a fresh 0..n-1 index: the kept class-0 rows first,
        then the kept class-1 rows, each in ranking order. Rows whose class is
        neither 0 nor 1 are never included.
    """
    part = pct / 100

    def _leading_labels(rows: pd.DataFrame, ascending: bool) -> np.ndarray:
        # Positional slice on the sorted index: exactly `n_keep` labels.
        # (A label-based `.loc[:n_keep]` would include the endpoint and
        # return one row too many.)
        n_keep = int(len(rows) * part)
        ranked = rows.sort_values(by=target, ascending=ascending)
        return ranked.index[:n_keep].to_numpy()

    indices_0 = _leading_labels(df[df[target_class] == 0], ascending=True)
    indices_1 = _leading_labels(df[df[target_class] == 1], ascending=False)

    indices = np.concatenate([indices_0, indices_1])
    return df.loc[indices].reset_index(drop=True)

# Create experiments (a stratified-fold `.csv` for each experiment)

In [None]:
experiments_params = list(product(pseudo_label_sets_names, pcts, smooth_funcs))

for name, pct, (func_code, smooth_func) in tqdm(experiments_params):
    df = pd.read_csv(join(PSEUDOLABELS_DIR, name)) 
    if pct != 100: 
        df = take_pct_of_df(df, pct)
    df['target'] = df['target_class'].apply(smooth_func)
    
    experiment_path = join(SMOOTHEDLABELS_DIR, f'{name[:-4]}-{pct}-{func_code}.csv')
    
    df.to_csv(experiment_path, index=False)
    !python create_folds_stratified.py -i $experiment_path -f image_name -t target_class > /dev/null 2>&1