In [1]:
import sys

## Make the project's 'core' package importable from this notebook.
## The entry is relative, so it is resolved against the kernel's working directory.
CORE_PARENT_DIR = '../..'

## Remove a previous entry first: re-running this cell then keeps the path at
## the front of sys.path instead of stacking up duplicate entries.
if CORE_PARENT_DIR in sys.path:
    sys.path.remove(CORE_PARENT_DIR)
sys.path.insert(0, CORE_PARENT_DIR)

## Last expression of the cell: display the interpreter version.
sys.version

'3.7.3 (default, Mar 27 2019, 09:23:15) \n[Clang 10.0.1 (clang-1001.0.46.3)]'

In [2]:
from core.models import unit
from core.models.column import Column
from core.models.table import Table
from core.factories import column_factory

In [3]:
from core.generators.label_generator import LabelGenerator
from core.generators.unit_generator import UnitGenerator
from core.generators.column_generator import ColumnGenerator
from core.generators.table_generator import TableGenerator

In [4]:
## Character vocabulary: the 26 lowercase ASCII letters in alphabetical order.
vocab = list('abcdefghijklmnopqrstuvwxyz')

## Two-way lookup between a character and its position in `vocab`.
index_to_char = dict(enumerate(vocab))
char_to_index = {char: index for index, char in index_to_char.items()}

## LabelGenerator receives the index -> character lookup built above.
label_generator = LabelGenerator(index_to_char)

## UnitGenerator receives the lookup returned by core.models.unit.
unit_lookup = unit.get_unit_lookup()
unit_generator = UnitGenerator(unit_lookup)

## ColumnGenerator is composed from the label and unit generators.
column_generator = ColumnGenerator(
    label_generator,
    unit_generator,
)

In [5]:
## Keyword arguments for each column, in list order. The `order` argument of
## create_column is the position of the spec in this list (0, 1, 2).
column_specs = [
    {
        'min_label_length': 2,
        'max_label_length': 12,
        'max_unit_length': 6,
        'max_number_of_label_parts': 4,
    },
    {
        'min_label_length': 4,
        'max_label_length': 15,
        'max_unit_length': 6,
        'max_number_of_label_parts': 3,
    },
    {
        'min_label_length': 2,
        'max_label_length': 12,
        'max_unit_length': 6,
        'max_number_of_label_parts': 4,
    },
]

## Table-level settings are assigned first, then the columns are created.
table = Table()
table.n_spaces_between_columns = 5
table.max_number_of_labels = 10
table.columns = [
    column_factory.create_column(order = position, **spec)
    for position, spec in enumerate(column_specs)
]

In [6]:
## Build the table generator from the configured table and the column generator.
table_generator = TableGenerator(table, column_generator)

In [7]:
## get_table() is unpacked into three values: an id, `train` (joined with
## newlines and printed in the next cell, so presumably a sequence of text
## lines) and `target` (printed as-is two cells below).
table_id, train, target = table_generator.get_table()

In [8]:
## Join the elements of `train` with newlines and show the result.
rendered_table = '\n'.join(train)
print(rendered_table)

kbfcguqqnv  4.29 m     gmylpyvfzfka   2.23       oezbz       0.89
wfzvibezkk  2.12       pngwmvujqkeqi  cm         ajpunhu     cm/s
            cm         dpdyoekpu      1.04 m     ssozpbacj
ldhpe       3.05       tvkjt          4.54 m     uug         0.2
onlpdhf     cm         coynroeaexqaz  1.12                   cm/s
iwtog                                 cm         btdz        3.4 m
qlybvbiblp  2.68       osvrzfmkl      3.8 cm     qntilxqmd   2.14
            m/s        dlzljzhyyifckc 3.8 cm                 cm
popqlwrd    0.95       zuydsxiyvprxrb


In [9]:
## Show the target text that belongs to the table printed above.
print(target)

kbfcguqqnv 4.29 m
wfzvibezkk 2.12 cm
ldhpe onlpdhf iwtog 3.05 cm
qlybvbiblp 2.68 m/s
popqlwrd ikxhg okfora 0.95 m/s
rpwpe idv 3.23 m
xe gg 1.19 cm/s
vvbj yb zxnghelthwi 1.03 cm
kmdktoa jsbff 0.32 m
gmylpyvfzfka pngwmvujqkeqi 2.23 cm
dpdyoekpu 1.04 m
tvkjt 4.54 m
coynroeaexqaz 1.12 cm
osvrzfmkl 3.8 cm
dlzljzhyyifckc zuydsxiyvprxrb 3.8 cm
aion 1.92 m/s
qdptdlnn bbxbqsbi 3.52 cm
oezbz ajpunhu ssozpbacj 0.89 cm/s
uug 0.2 cm/s
btdz 3.4 m
qntilxqmd 2.14 cm


In [10]:
from core.generators.dataset_generator import DatasetGenerator
from core.generators.mashup_dataset_generator import MashupDatasetGenerator

In [11]:
## A single generation request: which table generator to use and the value
## passed as 'n_instances'.
generation_request = {
    'table_generator': table_generator,
    'n_instances': 10,
}
options = [generation_request]

## Build the dataset frame from the list of requests.
mashup_generator = MashupDatasetGenerator()
df = mashup_generator.get_data(options)

## Last expression of the cell: preview the first rows.
df.head()

Unnamed: 0,table_id,table,target
0,90de3aa1202b4cc7acd2a1961163a785,kbfcguqqnv 1.65 m gmylpyvfzfka 0.7 cm ...,kbfcguqqnv 1.65 m\nqlybvbiblp 1.83 m/s\nxe gg ...
1,90de3aa1202b4cc7acd2a1961163a785,qlybvbiblp 2.98 sxkhnwjn zepgrh4.81 ...,qlybvbiblp 2.98 m/s\nvvbj yb zxnghelthwi 0.84 ...
2,90de3aa1202b4cc7acd2a1961163a785,qlybvbiblp 0.84 sxkhnwjn zepgrh0.27 ...,qlybvbiblp 0.84 m/s\ngqzmgd pxeic tvwofktgz 2....
3,90de3aa1202b4cc7acd2a1961163a785,wfzvibezkk 0.68 tvkjt 2.58 m ...,wfzvibezkk 0.68 cm\ngqzmgd pxeic tvwofktgz 0.0...
4,90de3aa1202b4cc7acd2a1961163a785,gqzmgd pxeic2.61 aion 0.48 ...,gqzmgd pxeic tvwofktgz 2.61 cm\nqlybvbiblp 1.8...


In [12]:
## Show the 'table' value stored under index label 0.
first_table = df['table'][0]
print(first_table)

kbfcguqqnv  1.65 m     gmylpyvfzfka   0.7 cm     tvcn        0.44 m
qlybvbiblp  1.83       pngwmvujqkeqi             mgfuzmuodew 1.84
            m/s                                  phfrsdxczjz cm/s
xe gg       2.55                                 xwun
            cm/s                                 oezbz       2.33
kmdktoa     3.41 m                               ajpunhu     cm/s
jsbff                                            ssozpbacj
                                                 ccxbeiffmd  1.02
                                                             cm
                                                 uug         0.12
                                                             cm/s
                                                 btdz        1.02 m
                                                 hkpaicel    3.12 m
                                                 ipqortdbhj  1.31
                                                 av          m/s


In [13]:
## Show the 'target' value stored under index label 0.
first_target = df['target'][0]
print(first_target)

kbfcguqqnv 1.65 m
qlybvbiblp 1.83 m/s
xe gg 2.55 cm/s
kmdktoa jsbff 3.41 m
gmylpyvfzfka pngwmvujqkeqi 0.7 cm
tvcn 0.44 m
mgfuzmuodew phfrsdxczjz xwun 1.84 cm/s
oezbz ajpunhu ssozpbacj 2.33 cm/s
ccxbeiffmd 1.02 cm
uug 0.12 cm/s
btdz 1.02 m
hkpaicel 3.12 m
ipqortdbhj av 1.31 m/s


In [14]:
## Split the generated dataset into train, dev and test sets.

In [15]:
from core.utils.modeling_utils import train_test_dev_split

In [16]:
## Split the frame three ways. 0.70 is the second argument to the splitter,
## presumably the training share -- TODO confirm against train_test_dev_split.
train_data, dev_data, test_data = train_test_dev_split(df, 0.70)

## Print the number of rows in each split, space-separated.
print(*map(len, (train_data, dev_data, test_data)))

7 1 2
