It seems that the test set consists of real samples as well as synthetic samples that were generated by sampling the real samples' feature distributions (these are probably the "rows which are not included in scoring").

If this is correct, then finding out which samples are synthetic and which are real should be a relatively easy task:

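Given a sample, we can go over its features and check whether each feature value is unique, i.e. whether it appears only once in that feature's column across the whole test set. If at least one of the sample's features is unique, then the sample must be a real sample: under the hypothesis above, a synthetic sample only contains values copied from real samples, so each of its values appears at least twice. It turns out that if a given sample has no unique values then it is a synthetic sample. (It doesn't have to be like that, but in this dataset the probability that this would not be the case is seemingly too low.)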

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm import tqdm_notebook as tqdm

import os
# Print the names of the entries found in the ../input directory.
print(os.listdir("../input"))

['train.csv', 'sample_submission.csv', 'test.csv']


In [2]:
# Load the test set and discard the ID column, leaving only the var_* columns.
test_path = '../input/test.csv'
df_test = pd.read_csv(test_path).drop(['ID_code'], axis=1)

# Example: select every row whose var_0 equals one specific value, to show
# that the same value can appear in more than one row.
df_test.loc[df_test["var_0"] == 0.1887]

Unnamed: 0,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,var_9,var_10,var_11,var_12,var_13,var_14,var_15,var_16,var_17,var_18,var_19,var_20,var_21,var_22,var_23,var_24,var_25,var_26,var_27,var_28,var_29,var_30,var_31,var_32,var_33,var_34,var_35,var_36,var_37,var_38,var_39,...,var_160,var_161,var_162,var_163,var_164,var_165,var_166,var_167,var_168,var_169,var_170,var_171,var_172,var_173,var_174,var_175,var_176,var_177,var_178,var_179,var_180,var_181,var_182,var_183,var_184,var_185,var_186,var_187,var_188,var_189,var_190,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199
14030,0.1887,1.3469,7.369,6.4229,9.7698,-19.7083,5.0344,17.1194,-2.5376,8.9821,4.9432,1.3322,13.8438,11.6458,5.7759,14.724,11.3932,-0.9143,18.344,19.6869,15.3898,11.1909,12.428,2.8495,8.2514,13.74,-12.1011,-0.9131,6.381,8.3949,-14.3678,9.243,-0.5019,13.0421,11.441,5.1421,-0.9394,3.9722,3.5156,-2.9109,...,23.8901,5.9222,7.7799,14.3943,3.9127,15.1503,2.9656,-4.5078,6.29,5.4496,-1.4895,0.7827,28.6452,-4.8072,19.4231,6.1513,2.6274,8.3818,-2.6191,1.7493,2.9097,8.9215,-6.9867,9.5565,11.178,-8.2555,10.1598,-4.0551,13.8613,0.8376,3.1416,10.3891,0.3807,4.3128,19.1017,-1.2793,0.9163,9.4415,14.3593,-21.7602
19559,0.1887,4.1117,12.6385,4.2417,12.1715,-11.1637,4.6478,15.4989,-0.9129,8.309,-7.9889,-0.2495,13.594,10.0287,10.1399,13.876,7.2155,-8.026,8.5082,0.5264,12.3112,16.211,7.6729,2.89,13.3853,13.7862,-6.1797,1.6903,6.6979,7.5874,-0.2008,7.7974,0.5564,19.0862,11.7486,-6.3175,4.8885,8.1542,13.8662,3.3834,...,21.0238,5.5305,7.0291,8.1889,-0.6458,19.1795,3.4865,5.8764,13.518,5.3193,4.4257,-13.1535,26.1037,-1.6268,21.2384,10.9894,4.9616,9.6259,-0.4432,4.6724,-4.6052,9.0861,6.8216,13.5149,15.9706,-2.5545,13.5381,-7.8629,14.2754,0.748,3.4132,4.7445,2.5331,9.3356,13.909,-2.0675,2.2671,10.0102,21.2817,2.8813


In [3]:
# Switch from the DataFrame to its underlying NumPy matrix
# (rows are samples, columns are features).
df_test = df_test.values

unique_samples = []
# Same shape and dtype as the data. An entry is set to 1 where that row holds
# a value occurring exactly once in its column; every other entry stays 0.
unique_count = np.zeros_like(df_test)
n_features = df_test.shape[1]
for col in tqdm(range(n_features)):
    # np.unique yields the distinct values, the row index of each value's
    # first occurrence, and how many times each value occurs.
    _values, first_rows, occurrences = np.unique(
        df_test[:, col], return_index=True, return_counts=True
    )
    # For a value seen exactly once, the first occurrence is its only row.
    singleton_rows = first_rows[occurrences == 1]
    unique_count[singleton_rows, col] += 1

# A row with at least one column-unique value is classed as real;
# a row with none is classed as synthetic.
unique_per_row = unique_count.sum(axis=1)
real_samples_indexes = np.flatnonzero(unique_per_row > 0)
synthetic_samples_indexes = np.flatnonzero(unique_per_row == 0)

print(len(real_samples_indexes))
print(len(synthetic_samples_indexes))

HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


100000
100000


If the split between private and public LB sets was done before the resampling process that generated the synthetic samples, then it's also possible to reconstruct the two different sets. For each synthetic sample, we can go over its features and pick out those whose value appears in exactly one sample of the real samples set; that real sample has to be one of the synthetic sample's generators.

In [4]:
# Independent copy of the rows that were classified as real samples.
df_test_real = df_test[real_samples_indexes].copy()

generator_for_each_synthetic_sample = []
# 20,000 synthetic samples should be enough; running over all 100,000 gives
# the same results but takes about five times as long.
for synthetic_row in tqdm(synthetic_samples_indexes[:20000]):
    # Boolean matrix: True where a real sample holds exactly the same value as
    # the synthetic sample in that feature (the row is broadcast over all rows).
    value_matches = df_test_real == df_test[synthetic_row]

    # Only trust features whose value occurs in exactly one real sample:
    # that single real sample is then a verified generator.
    single_match_features = value_matches.sum(axis=0) == 1
    is_generator = value_matches[:, single_match_features].any(axis=1)

    # Map positions inside df_test_real back to row indexes of the full test set.
    generator_rows = real_samples_indexes[np.flatnonzero(is_generator)]
    generator_for_each_synthetic_sample.append(set(generator_rows))

HBox(children=(IntProgress(value=0, max=20000), HTML(value='')))

After collecting the "verified generators" for each fake sample, finding the Public/Private LB split is no more than a few set operations.

In [5]:
def _grow_component(seed, generator_sets):
    """Return every index transitively linked to *seed* through shared members.

    A generator set is merged into the component as soon as it shares at least
    one index with it. Passes are repeated until a full pass merges nothing,
    so the result does not depend on the order of *generator_sets* (a single
    pass can miss a set that only becomes connected after a later merge).

    Neither *seed* nor any element of *generator_sets* is modified.
    An empty *seed* yields an empty component.
    """
    component = set(seed)  # copy, so the caller's set is left untouched
    pending = list(generator_sets)
    merged_any = True
    while merged_any:
        merged_any = False
        leftover = []
        # One progress bar is shown per pass.
        for candidate in tqdm(pending):
            if component.isdisjoint(candidate):
                leftover.append(candidate)
            else:
                component.update(candidate)
                merged_any = True
        # Merged sets never need to be examined again, so every repeated pass
        # is strictly shorter and the loop is guaranteed to terminate.
        pending = leftover
    return component


# First group: everything connected to the first synthetic sample's generators.
public_LB = _grow_component(
    generator_for_each_synthetic_sample[0], generator_for_each_synthetic_sample
)

# Second group: seed it from the first non-empty generator set that shares
# nothing with the first group, instead of assuming that the sample at index 1
# happens to fall on the other side of the split.
private_seed = next(
    (
        candidate
        for candidate in generator_for_each_synthetic_sample
        if candidate and public_LB.isdisjoint(candidate)
    ),
    set(),
)
private_LB = _grow_component(private_seed, generator_for_each_synthetic_sample)

print(len(public_LB))
print(len(private_LB))

HBox(children=(IntProgress(value=0, max=20000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=20000), HTML(value='')))


50000
50000
