In [1]:
# Enable IPython's autoreload extension in mode 2: modules are re-imported
# before each cell runs, so edits to the imported modules (presumably the local
# sct_dataset / tokenizer helpers) are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [14]:
import pandas as pd
from sct_dataset import read_sct_stories
from sct_dataset import SCTCachedReader
from tokenizer import Tokenizer

In [15]:
DATA_DIR = 'data'
CACHE_DIR = 'cache'

# download the data
! test -d $DATA_DIR || mkdir $DATA_DIR
! test -f "$DATA_DIR/sct_train.csv" || curl "http://n.ethz.ch/~thomasdi/download/sct_train.csv" --output "$DATA_DIR/sct_train.csv"
! test -f "$DATA_DIR/sct_val.csv" || curl "http://n.ethz.ch/~thomasdi/download/sct_val.csv" --output "$DATA_DIR/sct_val.csv"

# initialize cache dir
! test -d $CACHE_DIR && rm -rf $CACHE_DIR

In [16]:
# Load the raw stories for the training and evaluation splits.
# Paths are derived from DATA_DIR so they always point at the directory the
# download step populated, rather than repeating the literal 'data'.
texts_train = read_sct_stories(DATA_DIR + '/sct_train.csv')
texts_eval = read_sct_stories(DATA_DIR + '/sct_val.csv')

In [17]:
# Fit the tokenizer on the training split only: the `begin` entries combined
# (via `+`) with the real endings.
fit_corpus = texts_train.begin + texts_train.end_real
tok = Tokenizer().fit(fit_corpus)

In [18]:
# Display the vocabulary size reported by the fitted tokenizer.
tok.vocabulary_size

34961

In [19]:
# Print summary statistics of len() over the `begin` entries of each split.
# (What len() measures depends on what read_sct_stories returns.)
for header, split in (
    ('# STATS FOR TRAINING DATA', texts_train),
    ('# STATS FOR EVAL DATA', texts_eval),
):
    print(header)
    lengths = pd.Series([len(entry) for entry in split.begin])
    print(lengths.describe())

# STATS FOR TRAINING DATA
count    352644.000000
mean         44.065451
std          13.230812
min           9.000000
25%          34.000000
50%          44.000000
75%          54.000000
max          86.000000
dtype: float64
# STATS FOR EVAL DATA
count    7484.000000
mean       45.646045
std        12.919189
min        11.000000
25%        36.000000
50%        46.000000
75%        56.000000
max        72.000000
dtype: float64


In [20]:
# prepare tokenizer: re-read the training split and fit on its `begin` entries
# combined with the real endings (keeps this cell runnable on its own)
texts_train = read_sct_stories(DATA_DIR + '/sct_train.csv')
tok = Tokenizer().fit(texts_train.begin + texts_train.end_real)

# prepare SCT reader: build it from CACHE_DIR and the fitted tokenizer
# (presumably `create` persists its state in that directory; confirm in sct_dataset)
sctreader = SCTCachedReader.create(CACHE_DIR, tok)

In [21]:
# Rebuild the reader from CACHE_DIR alone (no tokenizer is passed), replacing
# the previous `sctreader` binding. Presumably this checks that the directory
# contents are sufficient to restore the reader; confirm against sct_dataset.
sctreader = SCTCachedReader.from_directory(CACHE_DIR)

In [30]:
# Read the training stories through the reader and display the result.
# The path is built from DATA_DIR instead of repeating the literal 'data'.
sctreader.read_stories(DATA_DIR + '/sct_train.csv')

SCTSequences(begin=array([[[    0,     0,     0, ...,  2457,     4,     2],
        [    0,     0,     0, ...,   399,     4,     2],
        [    0,     0,     0, ...,   399,     4,     2],
        [    0,     0,     0, ...,   119,     4,     2]],

       [[    0,     0,     0, ...,    15,     4,     2],
        [    0,     0,     0, ...,   318,     4,     2],
        [    0,     0,     0, ...,  2008,     4,     2],
        [    0,     0,     0, ...,  9822,    42,     2]],

       [[    0,     0,     0, ...,  1673,     4,     2],
        [    0,     0,     0, ...,  1673,     4,     2],
        [    0,     0,     0, ...,  2564,    13,     2],
        [    0,     0,     0, ...,  5308,     4,     2]],

       ...,

       [[    0,     0,     0, ...,    99,     4,     2],
        [    0,     0,     0, ...,    73,     4,     2],
        [    0,     0,     0, ...,  3955,     4,     2],
        [    0,     0,     0, ...,  1018,     4,     2]],

       [[    0,     0,     0, ...,   320,     4,

In [31]:
# Sanity check: read the training stories through the reader, then map the
# first entry of `begin` back to text with the tokenizer.
# The path is built from DATA_DIR instead of repeating the literal 'data'.
stories = sctreader.read_stories(DATA_DIR + '/sct_train.csv')
tok.sequences_to_texts(stories.begin[0])

["<bos> kelly found her grandmother 's pizza recipe in a shoebox of memories . <eos>",
 "<bos> kelly reminisced about how much she loved her grandmother 's pizza . <eos>",
 '<bos> kelly decided that she was going to try to make pizza . <eos>',
 '<bos> kelly studied the recipe and gathered everything she needed . <eos>']