In [1]:
import random
import pandas as pd
import numpy as np
from pprint import pprint
import sys
sys.path.append('..')
import doctable as dt

## ```tokens``` and ```subdocs``` Custom Types

```tokens``` is used to store a sequence of tokens, and ```subdocs``` is used to store a sequence of token sequences (think of a list of tokenized sentences). Simply declare each column with the appropriate type, and DocTable2 will automatically convert the tokenized objects to formatted strings for database storage.

In [2]:
# Each column is declared as (name, type[, column options]).
# 'tokens' and 'subdocs' are the custom column types this notebook demonstrates.
schema = (
    ('id', 'integer', {'primary_key': True, 'autoincrement': True}),
    ('title', 'string', {'nullable': False, 'unique': True}),
    ('bag_of_words', 'tokens'),
    ('tokenized_sentences', 'subdocs'),
)

# Only the schema is passed to the constructor; printing the table shows its
# name and current row count.
db = dt.DocTable2(schema)
print(db)

<DocTable2::_documents_ ct: 0>


In [3]:
# A flat list of string tokens is what goes into the 'tokens' column.
tokens = 'this is the happiest day of my life .'.split()
db.insert({'title': 'Happy sentence', 'bag_of_words': tokens})

# Read the row back; the value of this last expression is the cell's output.
db.select(['title', 'bag_of_words'])

[('Happy sentence', ['this', 'is', 'the', 'happiest', 'day', 'of', 'my', 'life', '.'])]

Under the hood, DocTable2 stores a ```tokens``` column as a single string in which the tokens are separated by newlines. This is more efficient than storing a pickled list or other sequence, because plain text data compresses efficiently in large files.

In [4]:
# One tuple of tokens per sentence; this nested sequence goes into the
# 'subdocs' column.
sentences = (
    ('i', 'am', 'happy', '.'),
    ('the', 'sky', 'is', 'blue', '.'),
    ('sun', 'is', 'shining', '.'),
    ('what', 'more', 'can', 'i', 'ask', 'for', '?'),
)
db.insert({'title': 'Happy sentences', 'tokenized_sentences': sentences})

# Print each row's title followed by its tokenized sentences, leaving a
# blank line after every row.
selected_rows = db.select(['title', 'tokenized_sentences'])
for title, sents in selected_rows:
    print(title)
    pprint(sents)
    print()

Happy sentence
None

Happy sentences
[['i', 'am', 'happy', '.'],
 ['the', 'sky', 'is', 'blue', '.'],
 ['sun', 'is', 'shining', '.'],
 ['what', 'more', 'can', 'i', 'ask', 'for', '?']]



In [5]:
# Show every row and column of the table in tabular form; columns that were
# not supplied on insert show up empty in the rendered output.
# NOTE(review): select_df presumably returns a pandas DataFrame — confirm
# against the doctable documentation.
db.select_df()

Unnamed: 0,id,title,bag_of_words,tokenized_sentences
0,1,Happy sentence,"[this, is, the, happiest, day, of, my, life, .]",
1,2,Happy sentences,,"[[i, am, happy, .], [the, sky, is, blue, .], [..."
