In [None]:
# export
import os

from fastai2.basics import *
from fastai2.text.all import *
from pycocotools.coco import COCO

In [None]:
# default_exp data.coco

# Data Coco
> Helpers for building fastai language-model datasets from COCO caption annotations.

In [None]:
# export
_train_img_path = '/root/data/coco/train2014'
_train_anno_path = '/root/data/coco/annotations/captions_train2014.json'
_val_img_path = '/root/data/coco/val2014'
_val_anno_path = '/root/data/coco/annotations/captions_val2014.json'
_tiny_img_path = './tiny_data/tiny_imgs'
_tiny_anno_path = './tiny_data/captions_tiny.json'

In [None]:
# Load the tiny caption annotation file; the cells below use this `coco` object.
coco = COCO(_tiny_anno_path)

loading annotations into memory...
Done (t=0.00s)
creating index...
index created!


## LM Datasets

In [None]:
# export
def _get_captions(coco):
    anns = coco.dataset['annotations']
    captions = [ann['caption'] for ann in anns]
    return captions

In [None]:
# Collect every caption string from the tiny annotation set and peek at the first three.
captions = _get_captions(coco)
captions[:3]

['A small closed toilet in a cramped space.',
 'A tan toilet and sink combination in a small room.',
 'This is an advanced toilet with a sink and control panel.']

In [None]:
# export
def _get_lm_dsets(captions, pct=1, valid_pct=0.2):
    """Build language-model `Datasets` from a list of caption strings.

    Only the first `int(len(captions) * pct)` captions are kept. They are put
    in a single-column ('caption') DataFrame, split into train/valid with a
    `RandomSplitter` seeded with 42 (`valid_pct` is the validation share),
    and passed through one transform pipeline:
    `attrgetter('text')` -> `Tokenizer.from_df('caption')` -> `Numericalize()`.

    Returns the resulting `Datasets`, created with `dl_type=LMDataLoader`.
    """
    n_keep = int(len(captions) * pct)
    caption_df = pd.DataFrame({'caption': captions[:n_keep]})
    # Split before building the transforms, with a fixed seed.
    split_fn = RandomSplitter(seed=42, valid_pct=valid_pct)
    train_valid_idxs = split_fn(caption_df)
    text_pipeline = [attrgetter('text'), Tokenizer.from_df('caption'), Numericalize()]
    return Datasets(caption_df, [text_pipeline], splits=train_valid_idxs, dl_type=LMDataLoader)

In [None]:
# Build the datasets from all tiny captions with the default arguments
# (pct=1, valid_pct=0.2) and display the first item.
lm_dsets = _get_lm_dsets(captions)
lm_dsets[0]

(TensorText([  2,   9,  30, 179,  19,  13,   9,   0, 230,  10]),)

In [None]:
# export
def get_tiny_lm_dsets(pct=1, valid_pct=0.2):
    """Language-model `Datasets` built from the tiny caption annotation file.

    Loads `_tiny_anno_path` with `COCO`, extracts the captions and forwards
    `pct` and `valid_pct` unchanged to `_get_lm_dsets`.
    """
    tiny_coco = COCO(_tiny_anno_path)
    return _get_lm_dsets(_get_captions(tiny_coco), pct, valid_pct)
# export
def get_small_lm_dsets(pct=1, valid_pct=0.2):
    """Language-model `Datasets` built from the val2014 caption annotations.

    Loads `_val_anno_path` with `COCO`, extracts the captions and forwards
    `pct` and `valid_pct` unchanged to `_get_lm_dsets`.
    """
    val_coco = COCO(_val_anno_path)
    return _get_lm_dsets(_get_captions(val_coco), pct, valid_pct)

In [None]:
# Tiny dataset: create dataloaders (batch size 16, sequence length 72) and
# display two rows of a batch.
lm_dsets = get_tiny_lm_dsets()
lm_dls = lm_dsets.dataloaders(bs=16, seq_len=72)
lm_dls.show_batch(max_n=2)

loading annotations into memory...
Done (t=0.01s)
creating index...
index created!


Unnamed: 0,text,text_
0,xxbos a kitchen that has a bowl of xxunk on the table . xxbos a bathroom is shown with a shower and a toilet . xxbos a cat xxunk in a xxunk xxunk of a car . xxbos xxmaj wooden spoons xxunk out xxunk a kitchen table . xxbos a young woman standing in a kitchen xxunk a plate of vegetables . xxbos a woman eating xxunk vegetables from a bowl .,a kitchen that has a bowl of xxunk on the table . xxbos a bathroom is shown with a shower and a toilet . xxbos a cat xxunk in a xxunk xxunk of a car . xxbos xxmaj wooden spoons xxunk out xxunk a kitchen table . xxbos a young woman standing in a kitchen xxunk a plate of vegetables . xxbos a woman eating xxunk vegetables from a bowl . xxbos
1,xxunk xxunk . xxbos a boy holding an umbrella while standing next to xxunk . xxbos a table xxunk by xxunk and filled with cooking utensils . xxbos a white toilet next to a walk in shower and a sink . xxbos a white bathroom with white fixtures and tile floor xxbos xxmaj girl in a xxunk top holding a xxunk in her back xxunk xxbos xxmaj woman eating an xxunk of,xxunk . xxbos a boy holding an umbrella while standing next to xxunk . xxbos a table xxunk by xxunk and filled with cooking utensils . xxbos a white toilet next to a walk in shower and a sink . xxbos a white bathroom with white fixtures and tile floor xxbos xxmaj girl in a xxunk top holding a xxunk in her back xxunk xxbos xxmaj woman eating an xxunk of xxunk


In [None]:
# skip
# Same check on the val2014 captions; this needs the annotation file at
# `_val_anno_path` to be present.
lm_dsets = get_small_lm_dsets()
lm_dls = lm_dsets.dataloaders(bs=16, seq_len=72)
lm_dls.show_batch(max_n=2)

loading annotations into memory...
Done (t=0.31s)
creating index...
index created!


Unnamed: 0,text,text_
0,xxbos a bench is in an xxunk of manicured bushes . xxbos xxmaj the bathroom sink counter has drawers beneath it and a mirror above . xxbos a parked gray and black motorcycle and a backpack xxbos an old photo of a castle at the end of the street xxbos a person that is cooking some food in a kitchen . xxbos a young girl taking a swing at a tennis ball,a bench is in an xxunk of manicured bushes . xxbos xxmaj the bathroom sink counter has drawers beneath it and a mirror above . xxbos a parked gray and black motorcycle and a backpack xxbos an old photo of a castle at the end of the street xxbos a person that is cooking some food in a kitchen . xxbos a young girl taking a swing at a tennis ball xxbos
1,"xxup are xxup standing xxup beside a xxup car xxbos a desk with a monitor , laptop , mouse and books on it . xxbos a man that is leaning against a cart . xxbos xxmaj three xxmaj giraffes are standing in a row and they are all different sizes . xxbos xxmaj there are several cows behind a fence , they are looking at the camera . xxbos a man wearing","are xxup standing xxup beside a xxup car xxbos a desk with a monitor , laptop , mouse and books on it . xxbos a man that is leaning against a cart . xxbos xxmaj three xxmaj giraffes are standing in a row and they are all different sizes . xxbos xxmaj there are several cows behind a fence , they are looking at the camera . xxbos a man wearing a"


## export -

In [None]:
# hide
# Run nbdev's notebook-to-script export; the output below lists the converted notebooks.
from nbdev.export import notebook2script
notebook2script()

Converted 02_data.coco.ipynb.
Converted 90a_gen_tiny_data.ipynb.
Converted 90b_fulltest_train_lm.ipynb.
Converted index.ipynb.
