In [1]:
import sys

## find 'core' package
## The relative entry is resolved against the notebook's working directory.
## Any existing copy is removed before inserting, so re-running this cell
## keeps exactly one entry at the front instead of stacking duplicates.
project_root = '../..'
if project_root in sys.path:
    sys.path.remove(project_root)
sys.path.insert(0, project_root)

## show the interpreter version this notebook is running on
sys.version

'3.7.3 (default, Mar 27 2019, 09:23:15) \n[Clang 10.0.1 (clang-1001.0.46.3)]'

In [2]:
from core.models import unit
from core.models.column import Column
from core.models.table import Table
from core.factories import column_factory

In [3]:
from core.generators.label_generator import LabelGenerator
from core.generators.unit_generator import UnitGenerator
from core.generators.column_generator import ColumnGenerator
from core.generators.table_generator import TableGenerator

In [4]:
## character vocabulary: the 26 lowercase ASCII letters, in alphabetical order
vocab = list('abcdefghijklmnopqrstuvwxyz')

## two-way lookups between a character and its position in `vocab`
index_to_char = dict(enumerate(vocab))
char_to_index = { char: position for position, char in index_to_char.items() }

## the label generator receives the index -> character lookup,
## the unit generator receives whatever unit.get_unit_lookup() returns
label_generator = LabelGenerator(index_to_char)
unit_generator = UnitGenerator(unit.get_unit_lookup())

## a column generator is assembled from the two generators above
column_generator = ColumnGenerator(label_generator, unit_generator)

In [5]:
## one spec per column, in display order; each spec is forwarded unchanged
## as keyword arguments to column_factory.create_column
column_specs = [
    {
        'order': 0,
        'min_label_length': 2,
        'max_label_length': 12,
        'max_unit_length': 6,
        'max_number_of_label_parts': 4
    },
    {
        'order': 1,
        'min_label_length': 4,
        'max_label_length': 15,
        'max_unit_length': 6,
        'max_number_of_label_parts': 3
    },
    {
        'order': 2,
        'min_label_length': 2,
        'max_label_length': 12,
        'max_unit_length': 6,
        'max_number_of_label_parts': 4
    }
]

## table-level settings, followed by the three columns built from the specs
table = Table()
table.n_spaces_between_columns = 5
table.max_number_of_labels = 10
table.columns = [ column_factory.create_column(**spec) for spec in column_specs ]

In [6]:
## build the table generator on top of the column generator configured above
table_generator = TableGenerator(column_generator)

In [7]:
## generate one sample for the configured table layout; get_table returns a pair.
## NOTE(review): `train` is later joined with newlines and `target` is printed as-is,
## so presumably `train` is a sequence of row strings and `target` a single string -- confirm
train, target = table_generator.get_table(table)

In [8]:
## stitch the entries of `train` together, one per output line, and show the result
rendered_table = '\n'.join(train)
print(rendered_table)

fb mk       0.63 m     nzdcsebwkbj    0.22 m     bbx         1.88
xsmzpn      1.4 m                                tzhitujial  cm
kmn pfniwql 2.02                                 wqrwrxjgihn 4.51
            m/s                                  gz          cm
                                                 ucsrnvupps
                                                 svxqfaj     0.13
                                                 xrmdxw dizq cm/s
                                                 itvdbpnubg  4.68 m
                                                 mwmrwx
                                                 xywcdrj     0.65
                                                 lyrzpmlofs  cm
                                                 lhszrek
                                                 xjqd        1.12
                                                             m/s



In [9]:
## show the target that was generated together with the table printed above
print(target)

fb mk 0.63 m
xsmzpn 1.4 m
kmn pfniwql 2.02 m/s
nzdcsebwkbj 0.22 m
bbx tzhitujial 1.88 cm
wqrwrxjgihn gz ucsrnvupps 4.51 cm
svxqfaj xrmdxw dizq 0.13 cm/s
itvdbpnubg mwmrwx 4.68 m
xywcdrj lyrzpmlofs lhszrek 0.65 cm
xjqd 1.12 m/s


In [10]:
from core.generators.dataset_generator import DatasetGenerator
from core.generators.mashup_dataset_generator import MashupDatasetGenerator

In [11]:
## a single generation option: which table layout to use, how many instances
## to request, and the dataset generator that goes with it
single_table_option = {
    'table': table,
    'n_instances': 10000,
    'dataset_generator': DatasetGenerator(table_generator)
}

## get_data takes a list of such options; this run supplies exactly one
options = [ single_table_option ]

mashup_generator = MashupDatasetGenerator()
dataset = mashup_generator.get_data(options)

In [12]:
import pandas as pd

In [13]:
## load the generated dataset into a frame and name its two columns
column_names = ['table', 'target']

df = pd.DataFrame(dataset)
df.columns = column_names

## preview the first rows
df.head()

Unnamed: 0,table,target
0,wnfyifor 1.82 dtowmcqjuvr 2.37 ...,wnfyifor vgyhtlifz srlzx 1.82 cm/s\nrbbhgmlohc...
1,rt ofdicswc 0.51 oyqosarekwdjiw 2.57 ...,rt ofdicswc sr 0.51 cm/s\nsu chgqjhok ngcfszck...
2,egfknamd 1.93 m lfaxteuav 1.35 ...,egfknamd hdfzu vlplid 1.93 m\nocye yota kheuhs...
3,ydlrayesqc 4.67 m sxiwiyvodii 1.82 ...,ydlrayesqc udhricnjo gkhlddnfom 4.67 m\nfankyo...
4,hpg exfjynja3.15 yrupxpvrqh 1.19 ...,hpg exfjynja jqrjl 3.15 cm\nntzvhymj skrpaokb ...


In [14]:
## show the 'table' value of the row labelled 0
print(df.loc[0, 'table'])

wnfyifor    1.82       dtowmcqjuvr    2.37       ptktd       0.96
vgyhtlifz   cm/s                      cm         pieesyzus   m/s
srlzx                  epnaepqs       0.89       ae          0.74
rbbhgmlohcc 0.87                      m/s        ndgjxmpqlpm cm/s
pgel        cm                                   qvuztlrl qej2.96
cvob        4.45                                 xvdj        cm/s
mzklmpsh    cm                                   qxp         1.22 m
mnnzpzth                                         kygngtgxja
beq lre     3.49                                 boqycqjima  1.67
            cm                                   cr ni       m/s
ebpyuf      0.56                                 wupik       2.05
            cm/s                                 xqauucshxz  m/s
hmoirt bv kc2.81


In [15]:
## show the 'target' value of the row labelled 0
print(df.loc[0, 'target'])

wnfyifor vgyhtlifz srlzx 1.82 cm/s
rbbhgmlohcc pgel 0.87 cm
cvob mzklmpsh mnnzpzth 4.45 cm
beq lre 3.49 cm
ebpyuf 0.56 cm/s
hmoirt bv kc 2.81 cm/s
lljkv 1.98 cm
bekozduw 2.64 m
dtowmcqjuvr 2.37 cm
epnaepqs 0.89 m/s
ptktd pieesyzus 0.96 m/s
ae ndgjxmpqlpm 0.74 cm/s
qvuztlrl qej xvdj 2.96 cm/s
qxp kygngtgxja 1.22 m
boqycqjima cr ni 1.67 m/s
wupik xqauucshxz 2.05 m/s


In [16]:
## Split the dataset into train, dev, and test sets.

In [17]:
from core.utils.modeling_utils import train_test_dev_split

In [18]:
## carve the generated instances into three partitions; 0.70 is passed as the
## second argument of train_test_dev_split (presumably the training share -- confirm)
train_data, dev_data, test_data = train_test_dev_split(dataset, 0.70)

## report how many instances each partition received
partition_sizes = [ len(part) for part in (train_data, dev_data, test_data) ]
print(*partition_sizes)

7000 1500 1500


In [19]:
## save off the dataset
import os

output_path = '../../data/3-columns/v0.0.1.csv'

## to_csv does not create missing parent directories, so make sure the
## destination folder exists before writing (a no-op when it is already there)
os.makedirs(os.path.dirname(output_path), exist_ok = True)

## write the full frame without the index column
df.to_csv(output_path, index = False)