In [1]:
import pandas as pd
import numpy as np
from os.path import join
from typing import List, Union, Tuple

In [2]:
# Directory that train.csv is read from (relative to the notebook's working directory).
INPUT_DIR = '../input'
# Directory that the submission CSV under analysis is read from.
SUBMISSIONS_DIR = '../submissions'

In [3]:
# File name, inside SUBMISSIONS_DIR, of the submission whose predictions are analysed below.
submission_name = 'exp_train_02.csv'

In [4]:
def print_distribution_info(df: pd.DataFrame, target: str, name_of_df: str):
    """Print the row count and the class balance (in percent) of a binary target.

    Every value of ``df[target]`` is mapped to a class with the rule
    ``1 if value >= 0.5 else 0`` and the share of each class is printed.
    The rule is deliberately an explicit threshold rather than ``round``:
    Python rounds halves to the nearest even integer, so ``round(0.5) == 0``,
    which would put a score of exactly 0.5 into class 0.

    Args:
        df: Frame that contains the column named by ``target``.
        target: Name of the column holding 0/1 labels or scores in [0, 1].
        name_of_df: Human-readable name of the frame, used in the header line.

    Raises:
        ValueError: If the target column contains missing values (they cannot
            be assigned to a class).
    """
    print(f'Number of samples in {name_of_df}: {df.shape[0]}')

    scores = df[target]
    if scores.isna().any():
        raise ValueError(f'Column {target!r} of {name_of_df} contains missing values')

    # Vectorised thresholding; astype(int) keeps the class labels as 0/1 integers.
    predicted_class = (scores >= 0.5).astype(int)

    print('Target distribution:')
    print(predicted_class.value_counts(normalize=True) * 100)


# Load the training data and the submission under analysis.
train_df = pd.read_csv(join(INPUT_DIR, 'train.csv'))
submission_df = pd.read_csv(join(SUBMISSIONS_DIR, submission_name))
# Report the class balance of the 'target' column for both frames.
print_distribution_info(train_df, 'target', 'train')
print()  # blank line between the two reports
print_distribution_info(submission_df, 'target', 'submission')

Number of samples in train: 33126
Target dictribution:
0    98.237034
1     1.762966
Name: target, dtype: float64

Number of samples in submission: 10982
Target dictribution:
0    99.480969
1     0.519031
Name: target, dtype: float64


In [5]:
def print_bins_distribution_info(df: pd.DataFrame,
                                 custom_bins: List[Tuple[int, Tuple[float, float]]]):
    """Print how many values of ``df['target']`` fall into each custom bin.

    For every ``(class_, edges)`` entry one table row is printed with:

    * ``Abs`` - number of values inside the bin (counted with ``np.histogram``,
      so a two-edge bin is closed on both sides),
    * ``%`` - that count as a percentage of all rows of ``df``,
    * ``% of its class`` - that count as a percentage of the rows belonging to
      ``class_``, where class 0 is ``target < 0.5`` and anything else is
      treated as class 1 (``target >= 0.5``).

    A two-edge bin whose upper edge lies a hair below 0.5 (e.g.
    ``[0, 0.5 - 1e-12]``) is labelled as the half-open interval ``[lo, 0.5)``;
    every other bin is labelled with ``str(edges)``. A percentage whose
    denominator is zero (empty frame or empty class) is reported as 0.00.

    Args:
        df: Frame with a numeric ``'target'`` column.
        custom_bins: Sequence of ``(class_, edges)`` pairs; ``edges`` is passed
            to ``np.histogram`` as its ``bins`` argument and only the first
            resulting bin is reported.
    """
    threshold = 0.50
    scores = df['target']
    total = df.shape[0]
    class_0_len = int((scores < threshold).sum())
    class_1_len = int((scores >= threshold).sum())

    def percent(part, whole):
        # Guard against an empty frame / empty class instead of emitting nan or inf.
        return round(part / whole * 100, 2) if whole else 0.0

    def bin_label(custom_bin):
        # Derive the half-open label from the bin itself rather than from its
        # position in the list, so any ordering or number of bins is labelled
        # correctly.
        if len(custom_bin) == 2 and 0 < threshold - custom_bin[1] < 1e-9:
            return f'[{custom_bin[0]}, {threshold})'
        return str(custom_bin)

    rows = []
    indices = []
    for class_, custom_bin in custom_bins:
        class_df_len = class_0_len if class_ == 0 else class_1_len

        hist, _ = np.histogram(scores, bins=custom_bin)
        abs_val = hist[0]

        pct_val = percent(abs_val, total)
        pct_class_val = percent(abs_val, class_df_len)

        rows.append([abs_val, f'{pct_val:.2f}', f'{pct_class_val:.2f}'])
        indices.append(bin_label(custom_bin))

    rows_df = pd.DataFrame(rows, index=indices, columns=['Abs', '%', '% of its class'])

    print('Number (N) of predictions in bins (Assuming class = 0 if p<0.5 else 1)')
    print(rows_df)


# Each entry is [class, [lower edge, upper edge]].
# Class-0 bins all start at 0 and widen; the last one stops just short of 0.5
# (0.50 - 1e-12) so that a score of exactly 0.5 is not counted in it.
# Class-1 bins all end at 1 and narrow towards the most confident predictions.
custom_bins = (
    [[0, [0, upper]] for upper in (0.05, 0.10, 0.15, 0.20, 0.50 - 1e-12)]
    + [[1, [lower, 1]] for lower in (0.50, 0.80, 0.85, 0.90, 0.95)]
)
print_bins_distribution_info(submission_df, custom_bins)

Number (N) of predictions in bins (Assuming class = 0 if p<0.5 else 1)
             Abs      % % of its class
[0, 0.05]  10057  91.58          92.05
[0, 0.1]   10430  94.97          95.47
[0, 0.15]  10599  96.51          97.02
[0, 0.2]   10712  97.54          98.05
[0, 0.5)   10925  99.48         100.00
[0.5, 1]      57   0.52         100.00
[0.8, 1]       5   0.05           8.77
[0.85, 1]      1   0.01           1.75
[0.9, 1]       0   0.00           0.00
[0.95, 1]      0   0.00           0.00
