In [1]:
import os
import sys

# Make the `core` package (two directories above this notebook's working
# directory) importable.
# The relative path is resolved to an absolute one up front so the entry keeps
# pointing at the same directory even if the working directory changes later,
# and the membership check keeps re-runs of this cell from stacking duplicate
# entries on sys.path.
PACKAGE_ROOT = os.path.abspath('../..')
if PACKAGE_ROOT not in sys.path:
    sys.path.insert(0, PACKAGE_ROOT)

# Last expression of the cell: display the interpreter version for this run.
sys.version

'3.7.3 (default, Mar 27 2019, 09:23:15) \n[Clang 10.0.1 (clang-1001.0.46.3)]'

In [2]:
from core.models import unit
from core.models.column import Column
from core.models.table import Table
from core.factories import column_factory

In [3]:
from core.generators.label_generator import LabelGenerator
from core.generators.unit_generator import UnitGenerator
from core.generators.column_generator import ColumnGenerator
from core.generators.table_generator import TableGenerator

In [4]:
# Character vocabulary: the 26 lowercase ASCII letters in alphabetical order.
vocab = list('abcdefghijklmnopqrstuvwxyz')

# Two-way lookups between a character and its position in `vocab`.
index_to_char = dict(enumerate(vocab))
char_to_index = {char: position for position, char in enumerate(vocab)}

# Wire up the generators. LabelGenerator is given the index -> character
# lookup, and UnitGenerator is given the lookup returned by
# unit.get_unit_lookup().
label_generator = LabelGenerator(index_to_char)
unit_generator = UnitGenerator(unit.get_unit_lookup())

# ColumnGenerator is composed from the label and unit generators.
column_generator = ColumnGenerator(label_generator, unit_generator)

In [5]:
# Describe the table to generate: table-wide settings first, then one column
# definition per entry in `columns`.
table = Table()
table.n_spaces_between_columns = 5
table.max_number_of_labels = 5

# Each dict holds the limits for one column; `order` is the entry's position
# in the sequence (0, 1, 2). The first and last columns share the same limits,
# while the middle column has higher label-length bounds and a lower
# max_number_of_label_parts.
table.columns = [
    column_factory.create_column(order=position, **limits)
    for position, limits in enumerate((
        dict(min_label_length=2, max_label_length=12,
             max_unit_length=6, max_number_of_label_parts=4),
        dict(min_label_length=4, max_label_length=15,
             max_unit_length=6, max_number_of_label_parts=3),
        dict(min_label_length=2, max_label_length=12,
             max_unit_length=6, max_number_of_label_parts=4),
    ))
]

In [6]:
# The table generator is built on top of the column generator.
table_generator = TableGenerator(column_generator)

In [7]:
# get_table returns a pair for the given table description. `train` is joined
# with newlines when printed, so it is an iterable of strings (presumably the
# rendered table rows; confirm against TableGenerator). `target` is printed
# as-is.
train, target = table_generator.get_table(table)

In [8]:
# Show the generated table input, one entry of `train` per output line.
print(*train, sep='\n')

wkosmrp     2.25       jwcdaiprciiqqm 2.3        hutjlaw     2.12
oagqqj      cm                        cm/s       gpaaa       cm
nzpxxdeqs   3.42       ywydxl ealfzw  1.5


In [9]:
# Show the target text that was generated together with `train`.
print(target)

wkosmrp oagqqj 2.25 cm
nzpxxdeqs hkaxayaub ytijrnhkkfv 3.42 cm
aalybhg 3.19 cm
jwcdaiprciiqqm 2.3 cm/s
ywydxl ealfzw 1.5 m/s
fnmdmoj 0.41 m/s
qleddvxsykpcu 0.61 cm
hutjlaw gpaaa 2.12 cm


In [10]:
from core.generators.dataset_generator import DatasetGenerator
from core.generators.mashup_dataset_generator import MashupDatasetGenerator

In [11]:
# One entry per table description to draw from. Here there is a single entry:
# the table configured earlier, 10 instances, and a DatasetGenerator wrapping
# the table generator.
options = [
    dict(
        table=table,
        n_instances=10,
        dataset_generator=DatasetGenerator(table_generator),
    ),
]

# The mashup generator builds the dataset from the list of options.
mashup_generator = MashupDatasetGenerator()
dataset = mashup_generator.get_data(options)

In [12]:
import pandas as pd

In [13]:
# Load the generated dataset into a DataFrame and name its two columns:
# 'table' and 'target'.
# NOTE(review): the container type of `dataset` is not visible here; the
# column assignment requires it to produce exactly two columns.
df = pd.DataFrame(dataset)
df.columns = ['table', 'target']

# Last expression of the cell: preview the first rows.
df.head()

Unnamed: 0,table,target
0,fxhptebwrxo 2.7 m rbxfpfs jusz 1.11 ...,fxhptebwrxo 2.7 m\nqcgsigbeiw pwit 0.8 cm/s\nq...
1,ae dhcaqcz 2.23 ilmoceupobmp 2.23 ...,ae dhcaqcz 2.23 cm/s\nlktfavqtgh mfdldpcuufo 2...
2,iawuubj 0.28 djudenaab tbawo0.92 m ...,iawuubj wnjtyfumhsg 0.28 m/s\nslxiclve jgyhz 1...
3,gzjae xf po 3.46 gnyomjbrwikl 1.39 ...,gzjae xf po 3.46 cm\ngnyomjbrwikl nekeiqsje 1....
4,ieqkdlzjkcd 0.06 jzgxw 0.92 m ...,ieqkdlzjkcd qodnluzqdb 0.06 cm/s\nqqivirstws x...


In [14]:
# Print the 'table' text of the row labelled 0 in full (head() truncates it).
print(df.loc[0, 'table'])

fxhptebwrxo 2.7 m      rbxfpfs jusz   1.11       zvboiggittj 0.85
qcgsigbeiw  0.8                       cm         ourzj       m/s
pwit        cm/s                                 wubzzaqk
qtycf       1.53                                 xci sh      2.87
            cm/s                                             m/s
igqtts eucnx0.28


In [15]:
# Print the 'target' text of the row labelled 0 in full (head() truncates it).
print(df.loc[0, 'target'])

fxhptebwrxo 2.7 m
qcgsigbeiw pwit 0.8 cm/s
qtycf 1.53 cm/s
igqtts eucnx qubcc 0.28 cm/s
rbxfpfs jusz 1.11 cm
zvboiggittj ourzj wubzzaqk 0.85 m/s
xci sh 2.87 m/s


In [16]:
## Split the dataset into train, dev, and test subsets.

In [17]:
from core.utils.modeling_utils import train_test_dev_split

In [18]:
# Split the dataset three ways; 0.7 is the ratio handed to the splitter
# (the recorded run on 10 instances produced 7 / 1 / 2).
train_data, dev_data, test_data = train_test_dev_split(dataset, 0.7)

# Report the size of each subset on one line: train, dev, test.
print(*(len(subset) for subset in (train_data, dev_data, test_data)))

7 1 2
