In [3]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm import tqdm_notebook as tqdm

In [4]:
df_trian = pd.read_csv('../mnt/inputs/origin/train.csv.zip')
df_test = pd.read_csv('../mnt/inputs/origin/test.csv.zip')

In [5]:
df_test.drop(['ID_code'], axis=1, inplace=True)
df_test = df_test.values

unique_samples = []
unique_count = np.zeros_like(df_test)
for feature in tqdm(range(df_test.shape[1])):
    _, index_, count_ = np.unique(df_test[:, feature], return_counts=True, return_index=True)
    unique_count[index_[count_ == 1], feature] += 1

# Samples which have unique values are real the others are fake
real_samples_indexes = np.argwhere(np.sum(unique_count, axis=1) > 0)[:, 0]
synthetic_samples_indexes = np.argwhere(np.sum(unique_count, axis=1) == 0)[:, 0]

print(len(real_samples_indexes))
print(len(synthetic_samples_indexes))

HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


100000
100000


In [7]:
ITER = 100
df_test_real = df_test[real_samples_indexes].copy()

generator_for_each_synthetic_sample = []
# Using 20,000 samples should be enough. 
# You can use all of the 100,000 and get the same results (but 5 times slower)
for cur_sample_index in tqdm(synthetic_samples_indexes[:ITER]):
    # target の synthetic_samples をとってくる
    cur_synthetic_sample = df_test[cur_sample_index]
    # ↑ の target の各要素と real データの各要素をマッチング
    potential_generators = df_test_real == cur_synthetic_sample

    # A verified generator for a synthetic sample is achieved
    # only if the value of a feature appears only once in the
    # entire real samples set
    # fake のある行と同じ feature が real において 1 つしかないところを feature mask としてとってきている
    # つまりマッチングが１つだけだった列を持ってくる
    features_mask = np.sum(potential_generators, axis=0) == 1
    # マッチングが１つだけだった行をとってくる
    # .sum とかでも良さそうだが、bool にしたかったから any?
    verified_generators_mask = np.any(potential_generators[:, features_mask], axis=1)
    verified_generators_for_sample = real_samples_indexes[np.argwhere(verified_generators_mask)[:, 0]]
    generator_for_each_synthetic_sample.append(set(verified_generators_for_sample))

HBox(children=(IntProgress(value=0), HTML(value='')))

In [34]:
len(generator_for_each_synthetic_sample)

100

In [31]:
np.argwhere(verified_generators_mask)[:, 0]

array([ 1058,  1399,  1710,  3090,  6622,  6887,  7197,  8705,  9276,
        9656, 11499, 11622, 12277, 12363, 13268, 15972, 20099, 20377,
       23063, 23162, 23572, 35170, 35284, 36533, 37513, 37690, 37811,
       38076, 43394, 43819, 43880, 44948, 45972, 46611, 47624, 49233,
       50266, 51068, 51680, 52354, 54362, 54975, 56087, 57250, 59363,
       59604, 59635, 59769, 61274, 61493, 63913, 63973, 64069, 64417,
       64613, 69604, 69665, 70569, 70591, 71098, 71370, 72415, 73208,
       73464, 76269, 76489, 76880, 77291, 80070, 82840, 83019, 83867,
       87606, 88506, 89042, 91165, 92972, 95917, 97898, 98127])

In [30]:
verified_generators_for_sample

array([  2096,   2785,   3427,   6205,  13207,  13728,  14391,  17416,
        18540,  19297,  23067,  23338,  24665,  24836,  26684,  32008,
        40306,  40871,  46302,  46518,  47326,  70506,  70739,  73226,
        75149,  75486,  75718,  76240,  86981,  87835,  87947,  89998,
        92035,  93330,  95258,  98475, 100634, 102256, 103522, 104877,
       108908, 110125, 112339, 114658, 118809, 119318, 119373, 119640,
       122658, 123096, 127983, 128084, 128262, 128974, 129349, 139315,
       139448, 141324, 141368, 142394, 142907, 144974, 146586, 147112,
       152777, 153196, 153995, 154825, 160446, 165923, 166295, 167912,
       175396, 177159, 178225, 182481, 186099, 191910, 195977, 196453])

In [17]:
features_mask.shape

(200,)

In [27]:
potential_generators[:, features_mask].sum(axis=1).sum(), np.any(potential_generators[:, features_mask], axis=1).sum()

(80, 80)

In [49]:
generator_for_each_synthetic_sample[0].intersection(generator_for_each_synthetic_sample[8])

{87004}

In [50]:
public_LB = generator_for_each_synthetic_sample[0]
# かぶる場合は同じデータセット (public, private)　というロジックで iteration
for x in tqdm(generator_for_each_synthetic_sample):
    # &
    if public_LB.intersection(x):
        public_LB = public_LB.union(x)

private_LB = generator_for_each_synthetic_sample[1]
for x in tqdm(generator_for_each_synthetic_sample):
    # &
    if private_LB.intersection(x):
        private_LB = private_LB.union(x)
        
print(len(public_LB))
print(len(private_LB))

HBox(children=(IntProgress(value=0), HTML(value='')))

HBox(children=(IntProgress(value=0), HTML(value='')))

3417
3217


In [51]:
public_LB

{139264,
 73730,
 139267,
 131080,
 24586,
 65547,
 188428,
 40975,
 163857,
 24593,
 65555,
 196627,
 57362,
 57367,
 131100,
 81948,
 155678,
 57375,
 49188,
 172070,
 139303,
 57384,
 40999,
 98351,
 163887,
 32816,
 81970,
 188466,
 139316,
 147512,
 57401,
 172090,
 163901,
 155713,
 8258,
 114757,
 188486,
 106568,
 147533,
 32847,
 79,
 180308,
 32854,
 139351,
 163930,
 73820,
 163932,
 92,
 65631,
 147550,
 172127,
 163939,
 147559,
 147564,
 196720,
 73841,
 49266,
 57459,
 24699,
 127,
 180358,
 57480,
 172170,
 90253,
 98446,
 65679,
 196751,
 188561,
 32914,
 196754,
 82069,
 180377,
 196763,
 49314,
 24741,
 57510,
 49318,
 57511,
 90285,
 139438,
 177,
 65713,
 98486,
 41144,
 147643,
 123070,
 164031,
 8382,
 123071,
 191,
 188618,
 90320,
 188626,
 131283,
 147672,
 172249,
 41178,
 180441,
 82142,
 16608,
 41189,
 24808,
 196843,
 65772,
 139502,
 172272,
 98547,
 164087,
 57602,
 172292,
 16644,
 147721,
 114955,
 82188,
 106766,
 98577,
 65810,
 278,
 24856,
 114969