In [1]:
import sys

# Make the project's `core` package importable by putting the directory two
# levels up at the front of the module search path.
# NOTE(review): the path is relative, so this presumably assumes the kernel's
# working directory is the directory containing this notebook — confirm.
sys.path.insert(0, '../..')

# Show the interpreter version used for this run.
sys.version

'3.7.3 (default, Mar 27 2019, 09:23:15) \n[Clang 10.0.1 (clang-1001.0.46.3)]'

In [2]:
from core.models import unit
from core.models.column import Column
from core.models.table import Table
from core.factories import column_factory

In [3]:
from core.generators.label_generator import LabelGenerator
from core.generators.unit_generator import UnitGenerator
from core.generators.column_generator import ColumnGenerator
from core.generators.table_generator import TableGenerator

In [4]:
# Character vocabulary: the 26 lowercase ASCII letters in alphabetical order.
vocab = list('abcdefghijklmnopqrstuvwxyz')

# Two-way lookup between a character and its position in `vocab`.
index_to_char = dict(enumerate(vocab))
char_to_index = {char: idx for idx, char in index_to_char.items()}

# Label generator, constructed from the index -> character mapping.
label_generator = LabelGenerator(index_to_char)
# Unit generator, constructed from the lookup returned by `unit.get_unit_lookup()`.
unit_generator = UnitGenerator(unit.get_unit_lookup())

# The column generator is composed from the label and unit generators.
column_generator = ColumnGenerator(label_generator, unit_generator)

In [5]:
# Keyword arguments for each column of the table, in column order.
# Columns 0 and 2 share the same limits; column 1 uses longer labels with
# fewer label parts.
column_specs = [
    dict(order=0, min_label_length=2, max_label_length=12,
         max_unit_length=6, max_number_of_label_parts=4),
    dict(order=1, min_label_length=4, max_label_length=15,
         max_unit_length=6, max_number_of_label_parts=3),
    dict(order=2, min_label_length=2, max_label_length=12,
         max_unit_length=6, max_number_of_label_parts=4),
]

# Table-level settings, followed by one column object per spec.
table = Table()
table.n_spaces_between_columns = 5
table.max_number_of_labels = 5
table.columns = [column_factory.create_column(**spec) for spec in column_specs]

In [6]:
# Table generator, constructed from the column generator.
table_generator = TableGenerator(column_generator)

In [7]:
# Generate one example for the configured table; the result unpacks into two values.
# `train` is joined with newlines when it is printed, so it is presumably a
# sequence of text lines; `target` is printed as-is.
# NOTE(review): no random seed is set in the visible cells, so the generated
# text presumably changes on every run — confirm, and seed if reproducibility matters.
train, target = table_generator.get_table(table)

In [8]:
# Show the generated table, one element of `train` per output line.
print(*train, sep='\n')

tkrvgislxc  0.27 m     hpimrahwrxuajn 2.05       sapmll      2.21
puth                                  m/s        vyxcbqm qwklcm/s
diyc        2.55       ruvhzafygqft   1.38       znihzn pjbfg4.8 m
            m/s        vlivaw         cm         xdgivkkkbli
                       xwuhov         0.01       kiwkwznnsj  0.2
                                      cm/s                   m/s
                       lklazxdfugje   1.15 m


In [9]:
# Show the target text produced together with `train` by `get_table`.
print(target)

tkrvgislxc puth 0.27 m
diyc 2.55 m/s
hpimrahwrxuajn 2.05 m/s
ruvhzafygqft vlivaw 1.38 cm
xwuhov 0.01 cm/s
lklazxdfugje 1.15 m
sapmll vyxcbqm qwkl 2.21 cm/s
znihzn pjbfg xdgivkkkbli 4.8 m
kiwkwznnsj 0.2 m/s


In [10]:
from core.generators.dataset_generator import DatasetGenerator
from core.generators.mashup_dataset_generator import MashupDatasetGenerator

In [11]:
# Generation options: a list with one entry, pairing the table layout with
# the number of instances to produce and the dataset generator to use.
options = [
    dict(
        table=table,
        n_instances=10,
        dataset_generator=DatasetGenerator(table_generator),
    ),
]

# Build the dataset from the options list.
mashup_generator = MashupDatasetGenerator()
dataset = mashup_generator.get_data(options)

In [12]:
import pandas as pd

In [13]:
# Wrap the generated dataset in a DataFrame for inspection.
# Assigning two names to `columns` requires the frame to have exactly two columns.
# NOTE(review): presumably each dataset item is a (table text, target text) pair — confirm.
df = pd.DataFrame(dataset)
df.columns = ['table', 'target']

# Preview the first rows as the cell output.
df.head()

Unnamed: 0,table,target
0,llvscpbqhd 1.54 cqrhlqpwgtffxn 4.05 m ...,llvscpbqhd bkkmwnw 1.54 m/s\ncbimylc adatq hhm...
1,hqegaat 0.19 zrfdshumn 1.13 ...,hqegaat peplrxwdwxj njdpqgraj 0.19 cm/s\njeexq...
2,ikavotqk 0.73 tfgosxkmuzuwpn 0.34 ...,ikavotqk zcxsm 0.73 cm/s\ntfgosxkmuzuwpn 0.34 ...
3,midmqygjcpd 0.48 ijqcmg 1.48 ...,midmqygjcpd rsyr vifwfp 0.48 cm\nbctxcuibjo gw...
4,mrg 3.11 m uxnfrk 3.66 ...,mrg grydxtxgvus 3.11 m\nnzvtmdu lldckto 3.13 m...


In [14]:
# Show the full table text of the row labelled 0 (the preview truncates it).
print(df.loc[0, 'table'])

llvscpbqhd  1.54       cqrhlqpwgtffxn 4.05 m     nxjfch dc   0.47
bkkmwnw     m/s                                  ehqna       cm
cbimylc     1.42                                 qfpth       4.7 cm
adatq       cm/s


In [15]:
# Show the full target text of the row labelled 0 (the preview truncates it).
print(df.loc[0, 'target'])

llvscpbqhd bkkmwnw 1.54 m/s
cbimylc adatq hhmrnyfhim 1.42 cm/s
nthmiouk wisyhik 1.58 cm/s
cqrhlqpwgtffxn 4.05 m
nxjfch dc ehqna 0.47 cm
qfpth 4.7 cm


In [16]:
## Split the dataset into train, dev and test sets.

In [20]:
from core.utils.modeling_utils import train_test_dev_split

In [21]:
# Split the dataset; 0.70 is presumably the fraction assigned to the training set.
# NOTE(review): the helper's name lists "test" before "dev", while its result
# is unpacked here as (train, dev, test) — confirm the return order matches.
train_data, dev_data, test_data = train_test_dev_split(dataset, 0.70)

# Report the size of each split.
print(*(len(split) for split in (train_data, dev_data, test_data)))

7 1 2
