In [1]:
import pandas as pd
import numpy as np
from os.path import join
import os

In [2]:
# Locations of the competition data and of the generated submissions / pseudo-labels.
INPUT_DIR = '../input'
SUBMISSIONS_DIR = '../submissions'
PSEUDOLABELS_DIR = join(SUBMISSIONS_DIR, 'pseudo-labels/')
os.makedirs(PSEUDOLABELS_DIR, exist_ok=True)

# Name of the experiment whose submission file is turned into pseudo-labels,
# plus the column names used throughout the notebook.
exp_train_name = 'exp_train_02'
target = 'target'
target_class = 'target_class'

# The undersampling cells draw rows with np.random.choice; seeding the global
# NumPy RNG here makes a fresh "Restart & Run All" produce the same files.
SEED = 42
np.random.seed(SEED)

In [3]:
# Read the training and test tables from the input directory.
train_csv_path = join(INPUT_DIR, 'train.csv')
test_csv_path = join(INPUT_DIR, 'test.csv')
train_df = pd.read_csv(train_csv_path)
test_df = pd.read_csv(test_csv_path)

# Class balance of the rounded training target: relative shares, then absolute counts.
train_classes = train_df[target].apply(round)
print(train_classes.value_counts(normalize=True))
print(train_classes.value_counts())

0    0.98237
1    0.01763
Name: target, dtype: float64
0    32542
1      584
Name: target, dtype: int64


In [4]:
# Read the predictions of the chosen experiment.
submission_csv_path = join(SUBMISSIONS_DIR, f'{exp_train_name}.csv')
submission_df = pd.read_csv(submission_csv_path)

# Class balance of the rounded predictions: relative shares, then absolute counts.
submission_classes = submission_df[target].apply(round)
print(submission_classes.value_counts(normalize=True))
print(submission_classes.value_counts())  # just to check

0    0.99481
1    0.00519
Name: target, dtype: float64
0    10925
1       57
Name: target, dtype: int64


# Create Undersampled Datasets

## Random Undersampling

In [5]:
def get_randomly_undersampled_df(df: pd.DataFrame, ethalon_df: pd.DataFrame,
                                 target: str = 'target',
                                 target_class: str = 'target_class',
                                 random_state=None):
    """Randomly drop rows of the over-represented class of ``df``.

    The class of every row is ``round(df[target])``. The class whose share in
    ``df`` falls furthest below its share in ``ethalon_df[target]`` is treated
    as under-represented: all of its rows are kept, while only
    ``int(n_other / k)`` randomly chosen rows of the remaining rows are kept,
    where ``k`` is the reference share of that class divided by its share in
    ``df``.

    Args:
        df: Frame with a score column ``target``. NOTE: it is modified in
            place (the ``target_class`` column is added), so pass a copy if
            the original must stay untouched.
        ethalon_df: Reference frame whose ``target`` column holds the class
            labels that define the desired class distribution.
        target: Name of the score / label column.
        target_class: Name of the class column to create in ``df``.
        random_state: Optional seed. ``None`` (default) draws from NumPy's
            global RNG, exactly as before; any other value is passed to
            ``np.random.RandomState`` for a reproducible draw.

    Returns:
        A new frame (index reset) with the under-represented rows first,
        followed by the sampled rows of the other class.
    """
    df[target_class] = df[target].apply(round)
    df_dis = df[target_class].value_counts(normalize=True)
    ethalon_dis = ethalon_df[target].value_counts(normalize=True)

    # idxmin() yields the class *label*; argmin() yields a position, which only
    # coincides with the label when the index happens to be ordered 0, 1, ...
    underrepresented_class = (df_dis - ethalon_dis).idxmin()
    undersampling_k = ethalon_dis[underrepresented_class] / df_dis[underrepresented_class]

    is_underrepresented = df[target_class] == underrepresented_class
    overrepresented_df = df[~is_underrepresented]
    underrepresented_df = df[is_underrepresented]

    # Never ask for more rows than exist: sampling is done without replacement.
    samples_n_to_choose = min(int(len(overrepresented_df) / undersampling_k),
                              len(overrepresented_df))

    rng = np.random if random_state is None else np.random.RandomState(random_state)
    random_indices_to_choose = rng.choice(overrepresented_df.index, samples_n_to_choose, replace=False)
    randomly_undersampled_indices = np.concatenate([underrepresented_df.index, random_indices_to_choose])

    return df.loc[randomly_undersampled_indices].reset_index(drop=True)


# Work on copies: the helper adds a class column to the frame it receives.
rnd_undspml_df = get_randomly_undersampled_df(df=submission_df.copy(),
                                              ethalon_df=train_df.copy())

# Sanity check of the resulting class balance: shares, then counts.
rnd_class_column = rnd_undspml_df[target_class]
print(rnd_class_column.value_counts(normalize=True))
print(rnd_class_column.value_counts())

0    0.982585
1    0.017415
Name: target_class, dtype: float64
0    3216
1      57
Name: target_class, dtype: int64


In [6]:
# Attach each image's patient id from the test metadata (left join keeps every sampled row).
rnd_patient_lookup = test_df[['image_name', 'patient_id']]
rnd_undspml_df = rnd_undspml_df.merge(rnd_patient_lookup, on=['image_name'], how='left')
rnd_undspml_df

Unnamed: 0,image_name,target,target_class,patient_id
0,ISIC_0470699,0.530370,1,IP_6587568
1,ISIC_0637104,0.633393,1,IP_7242799
2,ISIC_0945202,0.521758,1,IP_5422189
3,ISIC_1294368,0.688024,1,IP_5661694
4,ISIC_1364884,0.647823,1,IP_5038795
...,...,...,...,...
3268,ISIC_0528304,0.013196,0,IP_0791728
3269,ISIC_8122278,0.009904,0,IP_3678754
3270,ISIC_4679024,0.000138,0,IP_0270988
3271,ISIC_6316211,0.001922,0,IP_2920133


In [7]:
# Persist the randomly undersampled pseudo-labels next to the other pseudo-label files.
rnd_undspml_path = join(PSEUDOLABELS_DIR, f'{exp_train_name}_rnd_undspml.csv')
rnd_undspml_df.to_csv(rnd_undspml_path, index=False)

## Stratified Undersampling

In [8]:
def get_stratificially_undersampled_df(df: pd.DataFrame, ethalon_df: pd.DataFrame,
                                       target: str = 'target',
                                       target_class: str = 'target_class',
                                       random_state=None):
    """Undersample the over-represented class of ``df`` bin by bin.

    The class of every row is ``round(df[target])``. The class whose share in
    ``df`` falls furthest below its share in ``ethalon_df[target]`` is kept in
    full. The remaining rows are grouped into score bins
    (``[0, 0.1), [0.1, 0.2), ..., [0.9, 1.1)``) and from every bin
    ``int(len(bin) / k)`` rows are drawn at random, where ``k`` is the
    reference share of the under-represented class divided by its share in
    ``df``.

    Args:
        df: Frame with a score column ``target``. NOTE: it is modified in
            place (``target_bins`` and ``target_class`` columns are added),
            so pass a copy if the original must stay untouched.
        ethalon_df: Reference frame whose ``target`` column holds the class
            labels that define the desired class distribution.
        target: Name of the score / label column.
        target_class: Name of the class column to create in ``df``.
        random_state: Optional seed. ``None`` (default) draws from NumPy's
            global RNG, exactly as before; any other value is passed to
            ``np.random.RandomState`` for a reproducible draw.

    Returns:
        A new frame (index reset, without the helper ``target_bins`` column)
        with the under-represented rows first, followed by the rows sampled
        from each bin.
    """
    bins = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.1]
    # Bin labels are stored as strings so that out-of-range scores (NaN bins)
    # still form a regular group instead of being dropped.
    df['target_bins'] = pd.cut(df[target], bins=bins, right=False).astype(str)

    df[target_class] = df[target].apply(round)
    df_dis = df[target_class].value_counts(normalize=True)
    ethalon_dis = ethalon_df[target].value_counts(normalize=True)

    # idxmin() yields the class *label*; argmin() yields a position, which only
    # coincides with the label when the index happens to be ordered 0, 1, ...
    underrepresented_class = (df_dis - ethalon_dis).idxmin()
    undersampling_k = ethalon_dis[underrepresented_class] / df_dis[underrepresented_class]

    # Use the caller-supplied column name (not f'{target}_class') for the split.
    is_underrepresented = df[target_class] == underrepresented_class
    overrepresented_df = df[~is_underrepresented]
    underrepresented_df = df[is_underrepresented]

    rng = np.random if random_state is None else np.random.RandomState(random_state)

    stratified_indices = []
    # sort=False visits the bins in order of first appearance.
    for _, target_bin_df in overrepresented_df.groupby('target_bins', sort=False):
        # Never ask for more rows than the bin holds: sampling is without replacement.
        samples_n_to_choose = min(int(len(target_bin_df) / undersampling_k), len(target_bin_df))
        chosen = rng.choice(target_bin_df.index, samples_n_to_choose, replace=False)
        stratified_indices.extend(chosen.tolist())

    # A plain list keeps the label dtype intact even when nothing was sampled
    # (concatenating with an empty Python list would promote labels to float).
    stratificially_undersampled_indices = list(underrepresented_df.index) + stratified_indices
    stratificially_undersampled_df = df.loc[stratificially_undersampled_indices].reset_index(drop=True)
    return stratificially_undersampled_df.drop(columns=['target_bins'])

# Work on copies: the helper adds columns to the frame it receives.
str_undspml_df = get_stratificially_undersampled_df(df=submission_df.copy(),
                                                    ethalon_df=train_df.copy())

# Sanity check of the resulting class balance: shares, then counts.
str_class_column = str_undspml_df[target_class]
print(str_class_column.value_counts(normalize=True))
print(str_class_column.value_counts())

0    0.982574
1    0.017426
Name: target_class, dtype: float64
0    3214
1      57
Name: target_class, dtype: int64


In [9]:
# Attach each image's patient id from the test metadata (left join keeps every sampled row).
str_patient_lookup = test_df[['image_name', 'patient_id']]
str_undspml_df = str_undspml_df.merge(str_patient_lookup, on=['image_name'], how='left')
str_undspml_df

Unnamed: 0,image_name,target,target_class,patient_id
0,ISIC_0470699,0.530370,1,IP_6587568
1,ISIC_0637104,0.633393,1,IP_7242799
2,ISIC_0945202,0.521758,1,IP_5422189
3,ISIC_1294368,0.688024,1,IP_5661694
4,ISIC_1364884,0.647823,1,IP_5038795
...,...,...,...,...
3266,ISIC_7132534,0.499000,0,IP_5868128
3267,ISIC_3869964,0.414945,0,IP_2496585
3268,ISIC_1030252,0.464344,0,IP_6490054
3269,ISIC_3361906,0.489682,0,IP_1790831


In [10]:
# Persist the stratified-undersampled pseudo-labels next to the other pseudo-label files.
str_undspml_path = join(PSEUDOLABELS_DIR, f'{exp_train_name}_str_undspml.csv')
str_undspml_df.to_csv(str_undspml_path, index=False)

# Create Dataset with Redefined Class-Assigning Boundary

In [11]:
def get_redefined_boundary_df(df: pd.DataFrame, ethalon_df: pd.DataFrame,
                              target: str = 'target',
                              target_class: str = 'target_class'):
    """Relabel rows of ``df`` so its class balance approaches the reference one.

    The class of every row starts as ``round(df[target])``. The class whose
    share in ``df`` falls furthest below its share in ``ethalon_df[target]``
    is treated as under-represented. Its row count is multiplied by ``k``
    (reference share divided by share in ``df``) and that many rows at the
    matching end of the score ranking are assigned to it: the highest scores
    when it is the largest class label present in ``df``, the lowest scores
    otherwise. The logic assumes a binary 0/1 problem.

    Args:
        df: Frame with a score column ``target``. NOTE: it is modified in
            place and returned.
        ethalon_df: Reference frame whose ``target`` column holds the class
            labels that define the desired class distribution.
        target: Name of the score / label column.
        target_class: Name of the class column to create in ``df``.

    Returns:
        The same ``df`` object with the ``target_class`` column filled in.
    """
    df[target_class] = df[target].apply(round)
    df_dis = df[target_class].value_counts(normalize=True)
    ethalon_dis = ethalon_df[target].value_counts(normalize=True)

    # idxmin() yields the class *label*; argmin() yields a position, which only
    # coincides with the label when the index happens to be ordered 0, 1, ...
    underrepresented_class = (df_dis - ethalon_dis).idxmin()
    undersampling_k = ethalon_dis[underrepresented_class] / df_dis[underrepresented_class]

    abs_n_underrepresented = df[target_class].value_counts()[underrepresented_class]
    n_to_set_as_underrepresented = min(int(abs_n_underrepresented * undersampling_k), len(df))

    # Guard against n == 0: a slice such as [-0:] would select every row.
    if n_to_set_as_underrepresented > 0:
        # Rank by the caller-supplied score column (ascending).
        ranked_index = df.sort_values(by=target).index
        if underrepresented_class == df[target_class].max():
            indices_to_set_as_underrepresented = ranked_index[-n_to_set_as_underrepresented:]
        else:
            indices_to_set_as_underrepresented = ranked_index[:n_to_set_as_underrepresented]
        df.loc[indices_to_set_as_underrepresented, target_class] = underrepresented_class

    return df

# Work on copies: the helper modifies and returns the frame it receives.
rdf_bnd_df = get_redefined_boundary_df(df=submission_df.copy(),
                                       ethalon_df=train_df.copy())

# Sanity check of the resulting class balance: shares, then counts.
rdf_class_column = rdf_bnd_df[target_class]
print(rdf_class_column.value_counts(normalize=True))
print(rdf_class_column.value_counts())

0    0.982426
1    0.017574
Name: target_class, dtype: float64
0    10789
1      193
Name: target_class, dtype: int64


In [12]:
# Attach each image's patient id from the test metadata (left join keeps every row).
rdf_patient_lookup = test_df[['image_name', 'patient_id']]
rdf_bnd_df = rdf_bnd_df.merge(rdf_patient_lookup, on=['image_name'], how='left')
rdf_bnd_df

Unnamed: 0,image_name,target,target_class,patient_id
0,ISIC_0052060,0.001729,0,IP_3579794
1,ISIC_0052349,0.000069,0,IP_7782715
2,ISIC_0058510,0.000073,0,IP_7960270
3,ISIC_0073313,0.000065,0,IP_6375035
4,ISIC_0073502,0.022205,0,IP_0589375
...,...,...,...,...
10977,ISIC_9992485,0.000617,0,IP_4152479
10978,ISIC_9996992,0.023771,0,IP_4890115
10979,ISIC_9997917,0.044372,0,IP_2852390
10980,ISIC_9998234,0.000000,0,IP_8861963


In [13]:
# Persist the redefined-boundary pseudo-labels next to the other pseudo-label files.
rdf_bnd_path = join(PSEUDOLABELS_DIR, f'{exp_train_name}_rdf_bnd.csv')
rdf_bnd_df.to_csv(rdf_bnd_path, index=False)