In [1]:
import sys

## make the 'core' package importable: put the directory two levels up
## at the very front of the module search path
sys.path[:0] = ['../..']

## display the interpreter version this notebook was executed with
sys.version

'3.7.3 (default, Mar 27 2019, 09:23:15) \n[Clang 10.0.1 (clang-1001.0.46.3)]'

In [2]:
from core.models import unit

from core.models.table import Table
from core.factories import column_factory

from core.orchestrators import ManyToManySameInputAndOutputVocabOrchestrator

In [3]:
## characters that labels may be built from
vocab = 'abcdefghijklmnopqrstuvwxyz'

## the orchestrator is constructed from the vocabulary plus the unit lookup
## provided by core.models.unit
unit_lookup = unit.get_unit_lookup()
vocab_orchestrator = ManyToManySameInputAndOutputVocabOrchestrator(
    vocab,
    unit_lookup
)

In [4]:
## settings shared by every column of this table; only 'order' differs
column_settings = dict(
    min_label_length = 2,
    max_label_length = 12,
    max_unit_length = 10,
    max_number_of_label_parts = 2
)

## table '0': two identically configured columns (order 0 and order 1)
table_1 = Table()
table_1.set_id('0')
table_1.n_spaces_between_columns = 5
table_1.max_number_of_labels = 3
table_1.columns = [
    column_factory.create_column(order = column_order, **column_settings)
    for column_order in range(2)
]

In [5]:
## registry of tables, keyed by table id
table_lookup = {table_1.get_id(): table_1}

In [6]:
## number of instances to generate for each registered table
n_instances = 300

## one generation option per registered table.
## Each generator is built from the table taken out of the lookup (not from a
## hard-coded table), so adding more tables to `table_lookup` produces data
## for every one of them instead of repeating the first table.
options = [
    {
        'table_generator': vocab_orchestrator.create_table_generator(table),
        'n_instances': n_instances
    }
    for table in table_lookup.values()
]

## NOTE(review): the generated rows look random and no seed is set in this
## notebook -- confirm whether the orchestrator needs seeding for reproducibility.
dataset = vocab_orchestrator.build_dataset(options)

## sanity check: every table must contribute exactly n_instances rows
expected_size = len(table_lookup) * n_instances
assert len(dataset) == expected_size, f'{len(dataset)} <> {expected_size}'

## preview the first rows
dataset.head(n=5)

Unnamed: 0,table_id,table,target
0,0,stj 3.91 m zzealae 1.5 m/s...,stj 3.91 m\nzzealae 1.5 m/s\nbqziqiopzxt 0.13 m/s
1,0,stj 2.34 m bqziqiopzxt 0.7 m/s...,stj 2.34 m\njhwu 2.57 m\nbqziqiopzxt 0.7 m/s\n...
2,0,bnxov 1.31 m zzealae 1.6 m/s...,bnxov 1.31 m\njhwu 4.29 m\nzzealae 1.6 m/s\njd...
3,0,jhwu 0.32 m bqziqiopzxt 1.99 m/...,jhwu 0.32 m\nbnxov 4.87 m\nbqziqiopzxt 1.99 m/...
4,0,stj 0.83 m jdlpbbk 0.53 m\...,stj 0.83 m\nbnxov 3.95 m\njdlpbbk 0.53 m\nbqzi...


In [7]:
## breakdown: per table_id, count the non-null entries in each remaining column
grouped_by_table = dataset.groupby('table_id')
grouped_by_table.count()

Unnamed: 0_level_0,table,target
table_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,300,300


In [8]:
## save the generated dataset as CSV; index = False keeps the row index out of the file
actual_csv_path = '../../data/table/v0.0.1_actual.csv'
dataset.to_csv(actual_csv_path, index = False)

In [9]:
## translate the generated dataset via the orchestrator; the table lookup is
## passed along with it (the preview below shows integer lists per row)
translated_df = vocab_orchestrator.translate_to_integer_array(
    dataset,
    table_lookup
)

## preview the first rows of the translated frame
translated_df.head()

Unnamed: 0,input,output,n_columns
0,"[34, 35, 25, 2, 2, 2, 2, 2, 2, 2, 2, 2, 8, 3, ...","[34, 35, 25, 2, 8, 3, 14, 6, 2, 28, 1, 41, 41,...",2
1,"[34, 35, 25, 2, 2, 2, 2, 2, 2, 2, 2, 2, 7, 3, ...","[34, 35, 25, 2, 7, 3, 8, 9, 2, 28, 1, 25, 23, ...",2
2,"[17, 29, 39, 30, 37, 2, 2, 2, 2, 2, 2, 2, 6, 3...","[17, 29, 39, 30, 37, 2, 6, 3, 8, 6, 2, 28, 1, ...",2
3,"[25, 23, 38, 36, 2, 2, 2, 2, 2, 2, 2, 2, 5, 3,...","[25, 23, 38, 36, 2, 5, 3, 8, 7, 2, 28, 1, 17, ...",2
4,"[34, 35, 25, 2, 2, 2, 2, 2, 2, 2, 2, 2, 5, 3, ...","[34, 35, 25, 2, 5, 3, 13, 8, 2, 28, 1, 17, 29,...",2


In [10]:
import numpy as np

## look at the first translated input sequence as a numpy array
first_input = translated_df['input'][0]
np.array(first_input)

array([34, 35, 25,  2,  2,  2,  2,  2,  2,  2,  2,  2,  8,  3, 14,  6,  2,
       28,  2,  2,  2,  2,  2,  2,  2,  2,  2, 41, 41, 20, 16, 27, 16, 20,
        2,  2,  2,  2,  2,  6,  3, 10,  2, 28,  4, 34,  2,  2,  2,  2,  2,
        2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
        2,  2,  2,  2,  2,  2,  2,  2,  2, 17, 32, 41, 24, 32, 24, 30, 31,
       41, 39, 35,  2,  5,  3,  6,  8,  2, 28,  4, 34,  2,  2,  2,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0

In [11]:
## save the translated dataset as CSV; index = False keeps the row index out of the file
translated_csv_path = '../../data/table/v0.0.1.csv'
translated_df.to_csv(translated_csv_path, index = False)