In [1]:
import sys

# Make the `core` package importable by putting '../..' at the front of the
# module search path. The entry is relative, so it is resolved against the
# kernel's current working directory.
# The membership guard keeps the cell idempotent: re-running it no longer
# stacks duplicate '../..' entries on sys.path.
if '../..' not in sys.path:
    sys.path.insert(0, '../..')

# Record the interpreter version this notebook was executed with.
sys.version

'3.7.3 (default, Mar 27 2019, 09:23:15) \n[Clang 10.0.1 (clang-1001.0.46.3)]'

In [2]:
from core.models import unit
from core.models.column import Column
from core.models.table import Table
from core.factories import column_factory

In [3]:
from core.generators.label_generator import LabelGenerator
from core.generators.unit_generator import UnitGenerator
from core.generators.column_generator import ColumnGenerator
from core.generators.table_generator import TableGenerator

In [4]:
# The 26 lower-case Latin letters, one list element per character.
vocab = list('abcdefghijklmnopqrstuvwxyz')

# Two-way lookup between a character and its position in `vocab`.
index_to_char = dict(enumerate(vocab))
char_to_index = {char: position for position, char in enumerate(vocab)}

# LabelGenerator receives the index -> character map; UnitGenerator receives
# the lookup returned by unit.get_unit_lookup().
label_generator = LabelGenerator(index_to_char)
unit_lookup = unit.get_unit_lookup()
unit_generator = UnitGenerator(unit_lookup)

# The column generator is composed from the two generators above.
column_generator = ColumnGenerator(
    label_generator,
    unit_generator,
)

In [5]:
# Keyword arguments for each column, in left-to-right order; the list
# position is passed to the factory as the column's `order`.
# The first and third entries are intentionally identical.
column_limits = [
    {
        'min_label_length': 2,
        'max_label_length': 12,
        'max_unit_length': 6,
        'max_number_of_label_parts': 4,
    },
    {
        'min_label_length': 4,
        'max_label_length': 15,
        'max_unit_length': 6,
        'max_number_of_label_parts': 3,
    },
    {
        'min_label_length': 2,
        'max_label_length': 12,
        'max_unit_length': 6,
        'max_number_of_label_parts': 4,
    },
]

# Table description handed to the table generator later in the notebook.
table = Table()
table.max_number_of_labels = 5
table.columns = [
    column_factory.create_column(order=position, **limits)
    for position, limits in enumerate(column_limits)
]

In [6]:
# Build the table-level generator on top of the column generator.
table_generator = TableGenerator(column_generator)

In [7]:
# Generate one sample for the table description. The result is unpacked into
# `train` (joined with newlines and printed in the next cell) and `target`
# (printed as-is in the cell after that).
# NOTE(review): no random seed is set anywhere in this notebook; if the
# generators draw random values, a re-run will not reproduce the saved
# outputs -- confirm and seed if reproducibility matters.
train, target = table_generator.get_table(table)

In [8]:
# Print the generated table, one entry of `train` per output line.
print(*train, sep='\n')

yzaobdjmq   1.92       mukzzv         1.72       laaxkrl     0.53
bzrufmd     cm/s                      cm                     m/s
ewooifgnmt  2.77       rbjoyyggiley   0.66       xlomcpbxjc  0.78
qczgyxa     m/s        mvrqbfwcnpfme  m/s                    cm
kqxujjxils  2.3                                  khfvpy      0.16
vfdys bnrve m/s                                              m/s
vosm bkb    1.3


In [9]:
# Print the target text that belongs to the table shown in the previous cell.
# NOTE(review): in the saved output some target lines contain text that is not
# visible in the printed table (e.g. a trailing label part and unit on the
# last first-column entry) -- confirm whether that truncation is intended.
print(target)

yzaobdjmq bzrufmd 1.92 cm/s
ewooifgnmt qczgyxa 2.77 m/s
kqxujjxils vfdys bnrve 2.3 m/s
vosm bkb hjkkogro 1.3 cm/s
mukzzv 1.72 cm
rbjoyyggiley mvrqbfwcnpfme 0.66 m/s
laaxkrl 0.53 m/s
xlomcpbxjc 0.78 cm
khfvpy 0.16 m/s


In [10]:
from core.generators.dataset_generator import DatasetGenerator
from core.generators.mashup_dataset_generator import MashupDatasetGenerator

In [11]:
# Generator that produces instances from the table generator built earlier.
dataset_generator = DatasetGenerator(table_generator)

# A single request entry: the table description, an instance count
# (presumably the number of samples to generate -- verify against
# MashupDatasetGenerator) and the generator to use for it.
options = [
    {
        'table': table,
        'n_instances': 10,
        'dataset_generator': dataset_generator,
    },
]

mashup_generator = MashupDatasetGenerator()
dataset = mashup_generator.get_data(options)

In [12]:
import pandas as pd

In [13]:
# Load the generated dataset into a DataFrame.
df = pd.DataFrame(dataset)
# Name the columns; assigning two labels requires the frame to have exactly
# two columns, otherwise pandas raises a length-mismatch ValueError.
df.columns = ['table', 'target']

# Preview the first rows.
df.head()

Unnamed: 0,table,target
0,bxcejtcrfho 4.15 pfgei 0.38 m ...,bxcejtcrfho jogvg ojlp 4.15 cm\nnavgdknphp 2.6...
1,wtfiw vdixr 3.69 qqilpghknfa 0.09 ...,wtfiw vdixr mseu 3.69 cm\nobwohuava zrzjkrlm o...
2,gti 1.57 nmunrpysc 0.65 ...,gti 1.57 cm/s\njznrda 3.01 m\nnmunrpysc 0.65 m...
3,kwxwygbfpll 3.62 fsubrbgl 2.18 ...,kwxwygbfpll kb 3.62 cm\nvjqgqwb uujl qeda 0.02...
4,sntd 1.41 srlnujdbsxilw 0.36 ...,sntd lczhahwzf cev 1.41 m/s\ngoozavhs teiffibt...


In [14]:
# Print the rendered table text of the row labelled 0.
print(df.loc[0, 'table'])

bxcejtcrfho 4.15       pfgei          0.38 m     gqyybrhn    4.89 m
jogvg ojlp  cm         rkxuineyucc    2.98       dkoo faf    2.96
navgdknphp  2.67       sbddfimsxeutc  m/s        qlvbo       cm/s
            cm         bsgdlnjqoceosc 3.5 cm


In [15]:
# Print the target text of the row labelled 0 (same row as the table above).
print(df.loc[0, 'target'])

bxcejtcrfho jogvg ojlp 4.15 cm
navgdknphp 2.67 cm
pfgei 0.38 m
rkxuineyucc sbddfimsxeutc 2.98 m/s
bsgdlnjqoceosc ymtirqgohyx 3.5 cm
gqyybrhn 4.89 m
dkoo faf qlvbo 2.96 cm/s
