In [1]:
import sys

## find 'core' package
# The project root sits two levels above the working directory. It is resolved
# to an absolute path so that imports of 'core' keep working if the working
# directory changes later, and it is only inserted when missing so that
# re-running this cell does not stack duplicate entries on sys.path.
import os

project_root = os.path.abspath('../..')
if project_root not in sys.path:
    sys.path.insert(0, project_root)

sys.version

'3.7.3 (default, Mar 27 2019, 09:23:15) \n[Clang 10.0.1 (clang-1001.0.46.3)]'

In [2]:
from core.models import unit
from core.models.column import Column
from core.models.table import Table
from core.factories import column_factory

In [3]:
from core.generators.label_generator import LabelGenerator
from core.generators.unit_generator import UnitGenerator
from core.generators.column_generator import ColumnGenerator
from core.generators.table_generator import TableGenerator

In [4]:
# Character vocabulary: the 26 lowercase ASCII letters, one entry per letter.
vocab = list('abcdefghijklmnopqrstuvwxyz')

# Two-way lookups between a character and its position in `vocab`.
index_to_char = dict(enumerate(vocab))
char_to_index = {char: position for position, char in enumerate(vocab)}

# The label generator is constructed from the index -> character lookup.
label_generator = LabelGenerator(index_to_char)
# The unit generator is constructed from whatever unit.get_unit_lookup() returns.
unit_generator = UnitGenerator(unit.get_unit_lookup())

# The column generator is handed both generators built above.
column_generator = ColumnGenerator(label_generator, unit_generator)

In [5]:
# Describe the synthetic table: spacing, label cap and three column layouts.
table = Table()

table.n_spaces_between_columns = 5
table.max_number_of_labels = 10

# Each tuple is (min_label_length, max_label_length, max_number_of_label_parts).
# A column's `order` is the position of its tuple in the list, and every
# column uses the same max_unit_length of 6.
table.columns = [
    column_factory.create_column(
        order = position,
        min_label_length = shortest,
        max_label_length = longest,
        max_unit_length = 6,
        max_number_of_label_parts = parts
    )
    for position, (shortest, longest, parts) in enumerate([
        (2, 12, 4),
        (4, 15, 3),
        (2, 12, 4),
    ])
]

In [6]:
# Build a table generator from the table layout and the column generator.
table_generator = TableGenerator(table, column_generator)

In [7]:
# get_table() returns three values: an id, the rendered table (printed below by
# joining its entries with newlines) and the target text.
# NOTE(review): no random seed is set in the visible cells, so the generated
# content presumably changes from run to run; confirm whether the generators
# accept or need a seed for reproducible output.
table_id, train, target = table_generator.get_table()

In [8]:
# Display the rendered table, one entry of `train` per output line.
print(*train, sep='\n')

aqyl        1.88       jkduxqkwuwen   0.03 m     juwla       1.64 m
            cm/s       jiuojaaqefvyps            ovwshojdun  2.63
yznkuzbfakw 0.17       ltfcqcbbskrhbq 2.9                    cm/s
jlnnnyfaj   cm         oachjxrcekl    cm/s       lgknvkbr oo 0.95
ev jldkn    1.86       hdnqthtjln     0.12       gp          cm/s
roealsgefz  cm/s       uerybgfogqr    cm/s       htkcxdqrnmo 1.97
tcxmunueylj 1.11       qyrgsgcnd      2.96       fog rutj    cm/s
cqykgrxvjfn cm/s                      m/s        imnjnrssk   1.21
okuezlvqwgo 2.51 m     niba ijemfmaiqb1.0        uatxcvqbjnj cm/s
uqshigagpw                            m/s        fwockguo    0.75
xcuwgq                                           kpzgzamc    cm/s
                                                 nqukmd      2.22
                                                 kroopgpafim m/s
                                                 obaxrgorg
                                                 yfnq        3.71
                

In [9]:
# Display the target text that belongs to the table printed above.
print(target)

aqyl 1.88 cm/s
yznkuzbfakw jlnnnyfaj 0.17 cm
ev jldkn roealsgefz 1.86 cm/s
tcxmunueylj cqykgrxvjfn 1.11 cm/s
okuezlvqwgo uqshigagpw xcuwgq 2.51 m
jkduxqkwuwen jiuojaaqefvyps 0.03 m
ltfcqcbbskrhbq oachjxrcekl 2.9 cm/s
hdnqthtjln uerybgfogqr 0.12 cm/s
qyrgsgcnd 2.96 m/s
niba ijemfmaiqb 1.0 m/s
juwla 1.64 m
ovwshojdun 2.63 cm/s
lgknvkbr oo gp 0.95 cm/s
htkcxdqrnmo fog rutj 1.97 cm/s
imnjnrssk uatxcvqbjnj 1.21 cm/s
fwockguo kpzgzamc 0.75 cm/s
nqukmd kroopgpafim obaxrgorg 2.22 m/s
yfnq 3.71 cm
mnbuk kw 4.25 m


In [10]:
from core.generators.dataset_generator import DatasetGenerator
from core.generators.mashup_dataset_generator import MashupDatasetGenerator

In [11]:
# One options entry per table generator to draw from; only one is used here.
# NOTE(review): 'n_instances' is presumably the number of table instances to
# generate for that entry; confirm against MashupDatasetGenerator.
options = [
    {
        'table_generator': table_generator,
        'n_instances': 10,
    }
]

mashup_generator = MashupDatasetGenerator()
df = mashup_generator.get_data(options)

# NOTE(review): the displayed frame has an 'Unnamed: 0' column, which usually
# points to a CSV round-trip that saved the index; verify inside get_data.
df.head()

Unnamed: 0,table_id,table,target
0,a7362271597f46a4ba713312e6f7d982,yznkuzbfakw 1.84 niicvayowbenu 0.6 cm ...,yznkuzbfakw jlnnnyfaj 1.84 cm\ntcxmunueylj cqy...
1,a7362271597f46a4ba713312e6f7d982,okuezlvqwgo 4.86 m jkduxqkwuwen 0.2 m ...,okuezlvqwgo uqshigagpw xcuwgq 4.86 m\njkduxqkw...
2,a7362271597f46a4ba713312e6f7d982,ev jldkn 1.97 uqzibxlho 0.66 ...,ev jldkn roealsgefz 1.97 cm/s\ntcxmunueylj cqy...
3,a7362271597f46a4ba713312e6f7d982,aqyl 2.71 qyrgsgcnd 1.36 ...,aqyl 2.71 cm/s\nev jldkn roealsgefz 0.24 cm/s\...
4,a7362271597f46a4ba713312e6f7d982,tcxmunueylj 1.38 gvkgfqca xosja 2.39 ...,tcxmunueylj cqykgrxvjfn 1.38 cm/s\nyznkuzbfakw...


In [12]:
# Show the 'table' text stored in row 0 of the generated dataset.
print(df['table'][0])

yznkuzbfakw 1.84       niicvayowbenu  0.6 cm     lgknvkbr oo 2.01
jlnnnyfaj   cm         pbic                      gp          cm/s
tcxmunueylj 2.89       hdnqthtjln     1.31       imnjnrssk   1.66
cqykgrxvjfn cm/s       uerybgfogqr    cm/s       uatxcvqbjnj cm/s
okuezlvqwgo 1.35 m     gvkgfqca xosja 0.75       ovwshojdun  0.1
uqshigagpw                            m/s                    cm/s
xcuwgq                 jjev wbsjwos   1.29       htkcxdqrnmo 2.41
upviwtuvys  4.13                      cm         fog rutj    cm/s
            cm         qyrgsgcnd      1.11       cqzdlueuvtf 2.33
ka          3.35                      m/s        jwpllhok    m/s
ppevhjrfst  cm         uvvtfgna       1.03       zctqfyxtgx
ev jldkn    1.13       myynxwypfiq    cm         juwla       1.92 m
roealsgefz  cm/s       ltfcqcbbskrhbq 1.14       nqukmd      1.09
                       oachjxrcekl    cm/s       kroopgpafim m/s
                                                 obaxrgorg
                        

In [13]:
# Show the 'target' text stored in row 0, i.e. the counterpart of the table above.
print(df['target'][0])

yznkuzbfakw jlnnnyfaj 1.84 cm
tcxmunueylj cqykgrxvjfn 2.89 cm/s
okuezlvqwgo uqshigagpw xcuwgq 1.35 m
upviwtuvys 4.13 cm
ka ppevhjrfst 3.35 cm
ev jldkn roealsgefz 1.13 cm/s
niicvayowbenu pbic 0.6 cm
hdnqthtjln uerybgfogqr 1.31 cm/s
gvkgfqca xosja 0.75 m/s
jjev wbsjwos 1.29 cm
qyrgsgcnd 1.11 m/s
uvvtfgna myynxwypfiq 1.03 cm
ltfcqcbbskrhbq oachjxrcekl 1.14 cm/s
lgknvkbr oo gp 2.01 cm/s
imnjnrssk uatxcvqbjnj 1.66 cm/s
ovwshojdun 0.1 cm/s
htkcxdqrnmo fog rutj 2.41 cm/s
cqzdlueuvtf jwpllhok zctqfyxtgx 2.33 m/s
juwla 1.92 m
nqukmd kroopgpafim obaxrgorg 1.09 m/s
fwockguo kpzgzamc 0.58 cm/s
mnbuk kw 3.7 m


In [14]:
## split the generated dataset into train, dev and test sets

In [15]:
from core.utils.modeling_utils import train_test_dev_split

In [16]:
# Split the dataset, passing 0.70 as the split fraction.
# NOTE(review): the helper is named train_test_dev_split while its result is
# unpacked here as (train, dev, test); confirm the order it actually returns.
train_data, dev_data, test_data = train_test_dev_split(df, 0.70)

# Report the size of each partition on a single line.
print(*(len(part) for part in (train_data, dev_data, test_data)))

7 1 2
