In [1]:
import sys
sys.path.append('..')
from tqdm import tqdm

import pandas as pd
import numpy as np

from adat.utils import calculate_normalized_wer
from adat.masker import get_default_masker

In [2]:
# Read the raw train and test splits from CSV; both are combined further down.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/train.csv', '../data/test.csv')
)

In [3]:
# Stack the two splits row-wise into one frame. Original index labels are kept
# (ignore_index=False is the pandas default), so labels may repeat.
data = pd.concat(objs=[train, test], axis=0, ignore_index=False)

In [4]:
# Instantiate the project's default masker (imported from adat.masker).
# It is used further down as masker.mask(sequence), which returns a
# (masked_sequence, applied) pair.
masker = get_default_masker()

In [5]:
# Pull the `transactions` column out as a NumPy array so that individual
# sequences can be picked by integer position in the sampling loops below.
transactions = data['transactions'].values

In [6]:
# Build pairs of two transaction sequences drawn independently at random and
# label each pair with its WER-based similarity (1 - normalized WER).
# The name reflects the expectation that unrelated sequences score close to 0.
num_close_to_zero = 200000

# A dedicated, seeded generator makes the sampled index pairs identical on
# every run (Restart & Run All) without touching NumPy's global random state.
close_to_zero_rng = np.random.RandomState(23)
close_to_zero_indexes = close_to_zero_rng.randint(
    0, len(transactions), size=(num_close_to_zero, 2)
)

close_to_zero_examples = []
for id1, id2 in tqdm(close_to_zero_indexes):
    tr1 = transactions[id1]
    tr2 = transactions[id2]
    # Similarity is the complement of the normalized word error rate.
    wer_sim = 1 - calculate_normalized_wer(tr1, tr2)
    close_to_zero_examples.append((tr1, tr2, wer_sim))

100%|██████████| 200000/200000 [00:04<00:00, 45349.46it/s]


In [20]:
# Build pairs of (original sequence, masked version of it) and label each pair
# with its WER-based similarity (1 - normalized WER).
num_some_examples = 1000000

# A dedicated, seeded generator makes the choice of source sequences identical
# on every run without touching NumPy's global random state. The seed differs
# from the one used for the random pairs so the two index streams do not overlap.
# NOTE(review): masker.mask may draw randomness of its own; if so it has to be
# seeded separately for fully reproducible pairs — confirm in adat.masker.
some_examples_rng = np.random.RandomState(24)
some_examples_indexes = some_examples_rng.randint(
    0, len(transactions), size=num_some_examples
)

some_examples = []
for idx in tqdm(some_examples_indexes):
    tr1 = transactions[idx]
    tr2, applied = masker.mask(tr1)
    # Keep the pair only when masker.mask flags it through `applied`;
    # all other draws are discarded.
    if applied:
        wer_sim = 1 - calculate_normalized_wer(tr1, tr2)
        some_examples.append((tr1, tr2, wer_sim))

100%|██████████| 1000000/1000000 [06:22<00:00, 2615.32it/s]


In [21]:
# Number of masked pairs that were kept. Only iterations where masker.mask
# returned a truthy `applied` contribute, so this can be below num_some_examples.
len(some_examples)

499443

In [22]:
# Number of random pairs. One tuple is appended per sampled index pair,
# so this equals num_close_to_zero.
len(close_to_zero_examples)

200000

In [23]:
# Merge both pair collections into one new list: random pairs first,
# masked pairs after them.
examples = [*close_to_zero_examples, *some_examples]

In [24]:
# Turn the (seq_a, seq_b, similarity) tuples into a three-column table.
examples = pd.DataFrame(
    data=examples,
    columns=['seq_a', 'seq_b', 'similarity'],
)

In [25]:
# Peek at the first rows. At this point the rows are still in build order:
# random pairs first, masked pairs after them.
examples.head()

Unnamed: 0,seq_a,seq_b,similarity
0,5912_0_0 5912_0_0 5411_1_3 5661_1_3 5977_1_3 5...,5912_0_0 5977_1_0 5411_0_0 5977_0_0 5411_0_0 5...,0.076923
1,5977_0_0 5814_0_0 5499_0_0 5541_0_0,5411_0_0 5912_0_0 5691_3_0 5912_0_0,0.0
2,5541_0_0 5814_0_0 5993_0_0 5541_1_0 7832_3_3 5...,5814_0_0 5411_0_0 8999_4_2 8999_4_2 4121_2_2,0.111111
3,8999_4_2 7995_4_2 8999_4_2 5411_0_0 4816_4_2 4...,5999_0_0 5977_0_0 5814_0_0,0.0
4,5411_0_0 6012_4_2 6012_4_2 6012_1_2 6012_1_2,5816_2_5 5816_2_5 5816_2_5 5816_2_5 5816_2_5 7...,0.0


In [26]:
# Median of the similarity labels over all pairs.
np.median(examples['similarity'])

0.75

In [27]:
# Mean of the similarity labels over all pairs.
examples['similarity'].mean()

0.5748349766053704

In [28]:
# Largest similarity label in the table.
examples['similarity'].max()

1.0

In [29]:
# Smallest similarity label in the table.
examples['similarity'].min()

0.0

In [30]:
# Shuffle all rows so random and masked pairs are interleaved, then renumber
# the index from 0. random_state pins the permutation so that a re-run yields
# the same row order.
examples = examples.sample(frac=1, random_state=23).reset_index(drop=True)

In [31]:
# First rows after shuffling, as a quick sanity check of the new row order.
examples.head()

Unnamed: 0,seq_a,seq_b,similarity
0,7399_2_2 5814_0_0 5499_0_0 5411_0_0 5812_0_0,7399_2_2 5814_0_0 5499_0_0 5411_0_0 5812_0_0 5...,0.714286
1,7311_4_2 7399_4_2 5499_0_0 5499_0_0 5200_0_0 5...,5691_1_3 5691_1_3 4812_1_3 5691_1_3 5691_1_3 5...,0.071429
2,5812_3_0 7832_0_0 5411_0_0 5411_3_0 4121_2_2 5...,5812_3_0 7832_0_0 5411_0_0 5411_3_0 4121_2_2 5...,0.818182
3,4121_2_2 5411_1_0 5977_1_0 4121_2_2,4121_2_2 5411_1_0 5977_1_0 4121_2_2,1.0
4,7832_0_0 5812_0_0 5732_0_0 5309_3_3 5411_3_0 5...,7832_0_0 5812_0_0 5732_0_0 5309_3_3 5411_3_0 5...,0.923077


In [32]:
# (n_rows, n_columns) of the combined pair table.
examples.shape

(699443, 3)

In [33]:
! mkdir ../data/deep_lev

In [34]:
from sklearn.model_selection import train_test_split

In [41]:
# Hold out 7% of the pairs as the test part; the fixed random_state keeps the
# split identical between runs for the same input table.
tr, te = train_test_split(
    examples,
    random_state=23,
    test_size=0.07,
)

In [42]:
# (n_rows, n_columns) of the train part and of the held-out test part.
tr.shape, te.shape

((650481, 3), (48962, 3))

In [43]:
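# Export of the split is disabled; uncomment both lines to write the train and
# test parts to ../data/deep_lev/ (this overwrites any existing files there).
# tr.to_csv('../data/deep_lev/train.csv', index=False)
# te.to_csv('../data/deep_lev/test.csv', index=False)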

In [58]:
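# Duplicate of the disabled export cell In [43]; kept commented out so that a
# full re-run does not overwrite previously written files.
# tr.to_csv('../data/deep_lev/train.csv', index=False)
# te.to_csv('../data/deep_lev/test.csv', index=False)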

In [68]:
# Display the held-out test part (pandas truncates the rendering of long frames).
# NOTE(review): the recorded output of this cell does not appear to come from
# the same run as the cells above — it shows `id_*` tokens and index labels
# larger than the row count reported by examples.shape. Presumably `te` was
# rebuilt from other data between executions; re-run from a fresh kernel to confirm.
te

Unnamed: 0,seq_a,seq_b,similarity
225835,id_5 id_38 id_476 id_6 id_34 id_39 id_22 id_30...,id_5 id_38 id_476 id_6 id_34 id_844 id_22 id_3...,0.888889
303042,id_320 id_6 id_6 id_21 id_266 id_256 id_79 id_...,id_320 id_6 id_6 id_21 id_2189 id_256 id_79 id...,0.900000
1070359,id_3 id_3 id_1 id_4 id_3,id_3 id_3 id_1 id_4 id_5376 id_3,0.833333
879393,id_1243 id_703 id_7 id_68 id_221 id_453 id_429...,id_1243 id_703 id_7 id_68 id_221 id_453 id_429...,0.823529
231085,id_22 id_21 id_35 id_22 id_21 id_4 id_38 id_18...,id_22 id_21 id_35 id_22 id_21 id_4 id_38 id_18...,0.750000
...,...,...,...
623359,id_80 id_10 id_10 id_106 id_109 id_67 id_109 i...,id_80 id_2554 id_10 id_106 id_109 id_67 id_109...,0.933333
887151,id_22 id_8 id_9 id_23 id_287 id_15 id_9 id_14 ...,id_2027 id_8 id_9 id_23 id_1729 id_1876 id_9 i...,0.600000
1077476,id_10 id_10 id_8 id_109 id_9 id_8 id_10,id_10 id_5341 id_5341 id_10 id_10 id_8 id_2774...,0.636364
1038475,id_3 id_4 id_4 id_4 id_4,id_3 id_2804 id_4 id_4 id_4,0.800000
