In [1]:
import pandas as pd
import numpy as np
from os.path import join

In [2]:
INPUT_DIR = '../input'
SUBMISSIONS_DIR = '../submissions'
exp_name = 'exp_train_02'
target = 'target'

# Create UnderSampling Datasets

In [3]:
train_df = pd.read_csv(join(INPUT_DIR, 'train.csv'))
test_df  = pd.read_csv(join(INPUT_DIR, 'test.csv' ))
print(train_df[target].apply(round).value_counts(normalize=True))
print(train_df[target].apply(round).value_counts())

0    0.98237
1    0.01763
Name: target, dtype: float64
0    32542
1      584
Name: target, dtype: int64


In [4]:
submission_df = pd.read_csv(join(SUBMISSIONS_DIR, f'{exp_name}.csv'))
print(submission_df[target].apply(round).value_counts(normalize=True))
print(submission_df[target].apply(round).value_counts()) # just to chectarget

0    0.99481
1    0.00519
Name: target, dtype: float64
0    10925
1       57
Name: target, dtype: int64


In [5]:
def get_randomly_undersampled_df(df: pd.DataFrame, ethalon_df: pd.DataFrame, 
                                target: str = 'target'):
    df_dis = df[target].apply(round).value_counts(normalize=True)
    ethalon_dis = ethalon_df[target].value_counts(normalize=True)
    
    underrepersented_class = (df_dis - ethalon_dis).argmin()
    undersampling_k = ethalon_dis[underrepersented_class] / df_dis[underrepersented_class]
    
    overrepresented_df  = df[~(df[target].apply(round) == underrepersented_class)]
    underrepresented_df = df[ (df[target].apply(round) == underrepersented_class)]
    
    samples_N_to_choose = int(len(overrepresented_df) / undersampling_k)
    random_indices_to_choose = np.random.choice(overrepresented_df.index, samples_N_to_choose, replace=False)
    randomly_undersampled_indices = np.concatenate([underrepresented_df.index, random_indices_to_choose])
    
    randomly_undersampled_df = df.loc[randomly_undersampled_indices].reset_index(drop=True)
    return randomly_undersampled_df


rnd_undspml_df = get_randomly_undersampled_df(submission_df, train_df)
print(rnd_undspml_df[target].apply(round).value_counts(normalize=True)) # just to check
print(rnd_undspml_df[target].apply(round).value_counts()) # just to check

0    0.982585
1    0.017415
Name: target, dtype: float64
0    3216
1      57
Name: target, dtype: int64


In [6]:
rnd_undspml_df = pd.merge(rnd_undspml_df, test_df[['image_name', 'patient_id']], on=['image_name'], how='left')
rnd_undspml_df['target_digit'] = rnd_undspml_df['target'].apply(round)
rnd_undspml_df

Unnamed: 0,image_name,target,patient_id,target_digit
0,ISIC_0470699,0.530370,IP_6587568,1
1,ISIC_0637104,0.633393,IP_7242799,1
2,ISIC_0945202,0.521758,IP_5422189,1
3,ISIC_1294368,0.688024,IP_5661694,1
4,ISIC_1364884,0.647823,IP_5038795,1
...,...,...,...,...
3268,ISIC_1989507,0.000614,IP_3579794,0
3269,ISIC_1841330,0.076807,IP_2020841,0
3270,ISIC_2982140,0.018963,IP_7706695,0
3271,ISIC_1599454,0.000713,IP_9225429,0


In [7]:
rnd_undspml_df.to_csv(join(INPUT_DIR, f'{exp_name}_rnd_undspml.csv'), index=False)

In [8]:
def get_stratificially_undersampled_df(df: pd.DataFrame, ethalon_df: pd.DataFrame, 
                                       target: str = 'target'):
    bins = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.1]
    df['target_bins'] = pd.cut(df[target], bins=bins, right=False).astype(str)
    
    df_dis = df['target'].apply(round).value_counts(normalize=True)
    ethalon_dis = ethalon_df[target].value_counts(normalize=True)
    underrepersented_class = (df_dis - ethalon_dis).argmin()
    undersampling_k = ethalon_dis[underrepersented_class] / df_dis[underrepersented_class]

    df['oversampled'] = df[target].apply(round).apply(lambda x: x != underrepersented_class)
    
    overrepresented_df = df[df['oversampled']]
    underrepresented_df = df[~df['oversampled']]
    
    stratified_indices = []

    for target_bin in overrepresented_df['target_bins'].unique():
        target_bin_df = overrepresented_df[overrepresented_df['target_bins'] == target_bin]
        samples_N_to_choose = int(len(target_bin_df) / undersampling_k)
        stratified_indices_to_choose = np.random.choice(target_bin_df.index, samples_N_to_choose, replace=False).tolist()
        stratified_indices.extend(stratified_indices_to_choose)

    stratificially_undersampled_indices = np.concatenate([underrepresented_df.index, stratified_indices])
    stratificially_undersampled_df = df.loc[stratificially_undersampled_indices].reset_index(drop=True)
    return stratificially_undersampled_df.drop(['target_bins', 'oversampled'], axis=1)

str_undspml_df = get_stratificially_undersampled_df(submission_df, train_df)
print(str_undspml_df[target].apply(round).value_counts(normalize=True)) # just to check
print(str_undspml_df[target].apply(round).value_counts()) # just to check

0    0.982574
1    0.017426
Name: target, dtype: float64
0    3214
1      57
Name: target, dtype: int64


In [9]:
str_undspml_df = pd.merge(str_undspml_df, test_df[['image_name', 'patient_id']], on=['image_name'], how='left')
str_undspml_df['target_digit'] = str_undspml_df['target'].apply(round)
str_undspml_df

Unnamed: 0,image_name,target,patient_id,target_digit
0,ISIC_0470699,0.530370,IP_6587568,1
1,ISIC_0637104,0.633393,IP_7242799,1
2,ISIC_0945202,0.521758,IP_5422189,1
3,ISIC_1294368,0.688024,IP_5661694,1
4,ISIC_1364884,0.647823,IP_5038795,1
...,...,...,...,...
3266,ISIC_7904829,0.456778,IP_3872865,0
3267,ISIC_8234611,0.452973,IP_1061000,0
3268,ISIC_5994923,0.442949,IP_6488213,0
3269,ISIC_1198439,0.490370,IP_3872004,0


In [10]:
str_undspml_df.to_csv(join(INPUT_DIR, f'{exp_name}_str_undspml.csv'), index=False)