In [1]:
import sys

# Make the project's `core` package importable from this notebook.
# The path is relative, so it is resolved against the kernel's current working
# directory (two levels above it).
PROJECT_ROOT = '../..'

# Guard the insert so re-running this cell does not keep prepending duplicate
# entries to sys.path.
if PROJECT_ROOT not in sys.path:
    sys.path.insert(0, PROJECT_ROOT)

# Show the interpreter version this notebook was executed with.
sys.version

'3.7.3 (default, Mar 27 2019, 09:23:15) \n[Clang 10.0.1 (clang-1001.0.46.3)]'

In [2]:
from core.models import unit
from core.models.column import Column
from core.models.table import Table
from core.factories import column_factory

In [4]:
from core.generators.label_generator import LabelGenerator
from core.generators.unit_generator import UnitGenerator
from core.generators.column_generator import ColumnGenerator
from core.generators.table_generator import TableGenerator

In [5]:
# Character vocabulary: the 26 lowercase ASCII letters in alphabetical order.
vocab = list('abcdefghijklmnopqrstuvwxyz')

# Two-way lookup between a character and its position in `vocab`.
index_to_char = dict(enumerate(vocab))
char_to_index = {char: position for position, char in enumerate(vocab)}

# The label generator is constructed with the index -> character mapping.
label_generator = LabelGenerator(index_to_char)
# The unit generator is constructed with whatever unit.get_unit_lookup() returns.
unit_generator = UnitGenerator(unit.get_unit_lookup())

# The column generator is composed from the two generators built above.
column_generator = ColumnGenerator(label_generator, unit_generator)

In [6]:
# Table specification that is handed to the generators in later cells.
table = Table()
table.max_number_of_labels = 5

# Every column shares the same limits and only `order` differs, so the columns
# are built in a loop rather than with three copy-pasted factory calls.
N_COLUMNS = 3
COLUMN_LIMITS = dict(
    min_label_length = 2,
    max_label_length = 12,
    max_unit_length = 6,
    max_number_of_label_parts = 4,
)

# Columns are created in ascending `order` (0, 1, 2), exactly as before.
table.columns = [
    column_factory.create_column(order = position, **COLUMN_LIMITS)
    for position in range(N_COLUMNS)
]

In [7]:
# Build the table generator on top of the column generator configured earlier.
table_generator = TableGenerator(column_generator)

In [8]:
# get_table returns a pair for the given table spec. `train` is joined with
# '\n' when printed below, so it is an iterable of text lines; `target` is
# printed as-is. Presumably these are the rendered table and its flattened
# ground truth — confirm against TableGenerator.
# NOTE(review): no random seed is set in the visible cells; if the generators
# are randomized (the sample outputs suggest so), a re-run will not reproduce
# the outputs shown — TODO confirm and seed.
train, target = table_generator.get_table(table)

In [9]:
# Show the table one text line per row.
print(*train, sep='\n')

hnfhtrkjyk  1.49       yowijjgg    0.35       mldi zzyhly 1.94
            cm/s                   m/s                    cm/s
vqdy        2.61       gacd        4.68 m


In [10]:
# Show the target that was returned together with the table printed above.
print(target)

hnfhtrkjyk 1.49 cm/s
vqdy vrmljdut lbeet 2.61 m/s
tugetk 1.72 cm/s
yowijjgg 0.35 m/s
gacd htmrqkhh rpo 4.68 m
ebmc aotb zyabgqroobt 3.21 cm
ppnw vu oojie 0.37 cm/s
mldi zzyhly 1.94 cm/s


In [11]:
from core.generators.dataset_generator import DatasetGenerator

In [12]:
# Wrap the table generator so that many samples can be produced in one call.
dataset_generator = DatasetGenerator(table_generator)
# Request 10 instances for the same table spec.
# Presumably each instance is a (table, target) pair, given the two column
# names assigned to the DataFrame later — confirm against DatasetGenerator.
dataset = dataset_generator.get_data(table, n_instances = 10)

In [13]:
import pandas as pd

In [14]:
# Load the generated instances into a DataFrame.
df = pd.DataFrame(dataset)
# Name the columns; this assignment only succeeds if the frame has exactly two columns.
df.columns = ['table', 'target']

# Preview the first rows (head() shows 5 by default).
df.head()

Unnamed: 0,table,target
0,ksnmd fpaykt1.12 mlxrrrb 1.53 m ...,ksnmd fpaykt mqupag 1.12 cm/s\nmlxrrrb uxmwytc...
1,ogz phbtjr 0.34 gcsu 2.94 ...,ogz phbtjr fkcarejyjbr 0.34 cm/s\nvrbgklin blx...
2,gjmres 0.1 m lcjbpftv 1.52 ...,gjmres uzccgn 0.1 m\ndbs 4.57 m\nnwcuonbk oyrk...
3,mxrplgvi 2.73 tlisrcbjdbg 1.25 ...,mxrplgvi 2.73 cm/s\nmhkdocly 2.84 cm/s\nuoorup...
4,pouhmwx wt 2.9 lz yhb 3.63 m ...,pouhmwx wt 2.9 cm/s\nlz yhb pmhkajfvfzu 3.63 m...


In [15]:
# Print the table text of the first row (index label 0) so its line breaks render.
print(df.loc[0, 'table'])

ksnmd fpaykt1.12       mlxrrrb     1.53 m     lmdryqztnm  2.69
mqupag      cm/s       uxmwytc                elyjc       cm/s
                       gbqdxrasv              futhgqj
                                              tuysdsw     4.85
                                              wakvltyhtn  cm
                                              ufjbzer
                                              mgldypmtbvl 0.75
                                              vwm         m/s
                                              wbvuxrhlel



In [16]:
# Print the target text of the first row (index label 0) so its line breaks render.
print(df.loc[0, 'target'])

ksnmd fpaykt mqupag 1.12 cm/s
mlxrrrb uxmwytc gbqdxrasv 1.53 m
lmdryqztnm elyjc futhgqj 2.69 cm/s
tuysdsw wakvltyhtn ufjbzer 4.85 cm
mgldypmtbvl vwm wbvuxrhlel 0.75 m/s
