In [1]:
import os
from pathlib import Path

from pycocotools.coco import COCO

In [2]:
# Directory that holds the COCO 2014 caption annotation JSON files.
# The location can be overridden through the COCO_ANN_DIR environment variable so the
# notebook is not tied to one machine; without the override the original path is used.
coco_ann_dir = Path(
    os.environ.get(
        "COCO_ANN_DIR",
        "/home/p0w3r/gitrepos/TERAN_fork/data/coco/annotations/",
    )
)
# Fail early, with a readable message, when the annotations are not where we expect them.
assert coco_ann_dir.exists(), f"COCO annotation directory not found: {coco_ann_dir}"

In [3]:
# Caption annotation files for the train2014 / val2014 splits inside coco_ann_dir.
train_caps_json = coco_ann_dir / 'captions_train2014.json'
val_caps_json = coco_ann_dir / 'captions_val2014.json'
# Both files have to be present before they are handed to the COCO API.
assert train_caps_json.exists()
assert val_caps_json.exists()

In [4]:
# Load the train2014 caption annotations; the COCO constructor prints its
# loading / index-creation progress messages.
train_coco = COCO(annotation_file=train_caps_json)

loading annotations into memory...
Done (t=0.58s)
creating index...
index created!


In [5]:
# Load the val2014 caption annotations; the COCO constructor prints its
# loading / index-creation progress messages.
val_coco = COCO(annotation_file=val_caps_json)

loading annotations into memory...
Done (t=0.34s)
creating index...
index created!


In [6]:
# Number of caption annotations in the validation split.
len(val_coco.anns)

202654

In [7]:
# Number of caption annotations in the training split.
len(train_coco.anns)

414113

In [8]:
# Caption text of every validation annotation, in the iteration order of val_coco.anns.
val_caps = [annotation['caption'] for annotation in val_coco.anns.values()]

In [9]:
# Spot-check one extracted validation caption (the second entry of val_caps).
print(val_caps[1])

A black Honda motorcycle parked in front of a garage.


In [10]:
# Show the caption annotations stored for a single image id.
val_coco.imgToAnns[328838]

[{'image_id': 328838,
  'id': 762586,
  'caption': 'a short stack of pancakes sitting on a black plate '},
 {'image_id': 328838,
  'id': 764020,
  'caption': 'A black plate with pancakes and syrup on it.'},
 {'image_id': 328838,
  'id': 765907,
  'caption': 'A stack of pan cakes sitting on top of a black table.'},
 {'image_id': 328838,
  'id': 768103,
  'caption': 'pancakes sitting on a pizza stone, covered in butter. '},
 {'image_id': 328838,
  'id': 769801,
  'caption': 'A stack of four pancakes on a skillet.'}]

# Create a DataFrame from the JSON files
##### Columns: image_id, caption_id, caption, origin (train_2014 or val_2014)

In [11]:
import pandas as pd

In [12]:
# Peek at the annotation list of the first image in imgToAnns.
# next(iter(...)) fetches only that first entry instead of first copying the
# annotation lists of every image into a temporary list.
next(iter(val_coco.imgToAnns.values()))

[{'image_id': 203564,
  'id': 37,
  'caption': 'A bicycle replica with a clock as the front wheel.'},
 {'image_id': 203564, 'id': 181, 'caption': 'The bike has a clock as a tire.'},
 {'image_id': 203564,
  'id': 478,
  'caption': 'A black metal bicycle with a clock inside the front wheel.'},
 {'image_id': 203564,
  'id': 6637,
  'caption': 'A bicycle figurine in which the front wheel is replaced with a clock\n'},
 {'image_id': 203564,
  'id': 6802,
  'caption': 'A clock with the appearance of the wheel of a bicycle '}]

In [13]:
# Flatten the per-image annotation lists into one flat list of annotation dicts.
val_data = [
    annotation
    for image_annotations in val_coco.imgToAnns.values()
    for annotation in image_annotations
]

In [14]:
# One row per annotation dict; the dict keys become the columns.
val_df = pd.DataFrame(val_data)

In [15]:
# Give the annotation id column an explicit name and tag every row with its split.
val_df = (
    val_df
    .rename(columns={'id': 'caption_id'})
    .assign(origin='val_2014')
)
val_df.head()

Unnamed: 0,image_id,caption_id,caption,origin
0,203564,37,A bicycle replica with a clock as the front wh...,val_2014
1,203564,181,The bike has a clock as a tire.,val_2014
2,203564,478,A black metal bicycle with a clock inside the ...,val_2014
3,203564,6637,A bicycle figurine in which the front wheel is...,val_2014
4,203564,6802,A clock with the appearance of the wheel of a ...,val_2014


In [16]:
# Flatten the per-image annotation lists of the training split into one flat list.
train_data = [
    annotation
    for image_annotations in train_coco.imgToAnns.values()
    for annotation in image_annotations
]

# Total number of training annotations collected.
len(train_data)

414113

In [17]:
# Build the training frame: one row per annotation, the annotation id exposed as
# caption_id, and every row tagged with the split it came from.
train_df = (
    pd.DataFrame(train_data)
    .rename(columns={'id': 'caption_id'})
    .assign(origin='train_2014')
)
train_df.head()

Unnamed: 0,image_id,caption_id,caption,origin
0,318556,48,A very clean and well decorated empty bathroom,train_2014
1,318556,126,A blue and white bathroom with butterfly theme...,train_2014
2,318556,219,A bathroom with a border of butterflies and bl...,train_2014
3,318556,255,An angled view of a beautifully decorated bath...,train_2014
4,318556,3555,A clock that blends in with the wall hangs in ...,train_2014


# Persist the DataFrame

In [19]:
# Stack the train and val captions into a single frame.
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0, so pd.concat
# is used instead; concat returns a new frame, so the inputs are left untouched.
# ignore_index=True produces a fresh 0..n-1 index, which to_feather expects and which
# matches the frame that is read back from this feather file later in the notebook.
coco_df = pd.concat([train_df, val_df], ignore_index=True)
coco_df.to_feather('../data/coco_raw.df.feather')

In [20]:
# Rough image count, assuming five captions per image.
# NOTE: the caption total (414113 train + 202654 val) is not a multiple of five, so this
# is only an approximation; coco_df['image_id'].nunique() would give the exact count.
coco_df.shape[0] // 5

123353

##### Why are there only ~123353 images (estimated as number of captions // 5) in the dataset when the website and paper say 328k?!

In [21]:
# Reload the combined caption frame from the feather file written earlier in this notebook.
coco_df = pd.read_feather('../data/coco_raw.df.feather')

# Generate metadata for the captions with spaCy

In [21]:
import sys

# Make the parent directory importable so that `utils` can be found.
# Guarded so that re-running this cell does not keep appending duplicate entries.
if '..' not in sys.path:
    sys.path.append('..')
from utils import generate_caption_stats

# Options passed positionally to generate_caption_stats below.
pos_tags_stats = True
readability_scores = False
n_spacy_workers = 8

# Compute the per-caption statistics with the spaCy backend.
coco_metadata_spacy = generate_caption_stats(coco_df, pos_tags_stats, readability_scores, n_spacy_workers, backend='spacy')

print(coco_metadata_spacy.head())
# Reset to a default index before persisting the result as feather.
coco_metadata_spacy.reset_index(drop=True).to_feather('../data/coco_metadata_spacy.df.feather')

2021-01-20 19:30:12.831 | INFO     | utils:generate_caption_stats:135 - Generating caption statistics...
100%|██████████| 616767/616767 [04:26<00:00, 2317.34it/s]
2021-01-20 19:34:45.959 | INFO     | utils:generate_caption_stats:380 - Finished adding caption statistics in 273.1276879310608 seconds!


   image_id  caption_id                                            caption  \
0    318556          48     A very clean and well decorated empty bathroom   
1    318556         126  A blue and white bathroom with butterfly theme...   
2    318556         219  A bathroom with a border of butterflies and bl...   
3    318556         255  An angled view of a beautifully decorated bath...   
4    318556        3555  A clock that blends in with the wall hangs in ...   

       origin  num_tok  num_sent  min_sent_len  max_sent_len  num_ne ne_types  \
0  train_2014        8         1             8             8       0       []   
1  train_2014       11         1            11            11       0       []   
2  train_2014       16         1            16            16       0       []   
3  train_2014        9         1             9             9       0       []   
4  train_2014       13         1            13            13       0       []   

   ... num_conj  num_verb  num_sym  num_num 

# Generate metadata for the captions with NLTK

In [22]:
# Options passed positionally to generate_caption_stats below.
pos_tags_stats = True
readability_scores = False
n_spacy_workers = 8

# Compute the per-caption statistics again, this time with the NLTK backend.
coco_metadata_nltk = generate_caption_stats(
    coco_df,
    pos_tags_stats,
    readability_scores,
    n_spacy_workers,
    backend='nltk',
)

print(coco_metadata_nltk.head())
# Reset to a default index before persisting the result as feather.
(
    coco_metadata_nltk
    .reset_index(drop=True)
    .to_feather('../data/coco_metadata_nltk.df.feather')
)

2021-01-20 19:34:46.953 | INFO     | utils:generate_caption_stats:135 - Generating caption statistics...
100%|██████████| 616767/616767 [34:25<00:00, 298.60it/s]
2021-01-20 20:09:15.748 | INFO     | utils:generate_caption_stats:380 - Finished adding caption statistics in 2068.794457912445 seconds!


   image_id  caption_id                                            caption  \
0    318556          48     A very clean and well decorated empty bathroom   
1    318556         126  A blue and white bathroom with butterfly theme...   
2    318556         219  A bathroom with a border of butterflies and bl...   
3    318556         255  An angled view of a beautifully decorated bath...   
4    318556        3555  A clock that blends in with the wall hangs in ...   

       origin  num_tok  num_sent  min_sent_len  max_sent_len  num_ne ne_types  \
0  train_2014        8         1             8             8       0       []   
1  train_2014       11         1            11            11       0       []   
2  train_2014       16         1            16            16       0       []   
3  train_2014        9         1             9             9       0       []   
4  train_2014       13         1            13            13       0       []   

   ... num_conj  num_verb  num_sym  num_num 

# Generate metadata for the captions with polyglot

In [23]:
import sys

# Make the parent directory importable so that `utils` can be found.
# Guarded so that re-running this cell does not keep appending duplicate entries.
if '..' not in sys.path:
    sys.path.append('..')
from utils import generate_caption_stats

# Options passed positionally to generate_caption_stats below.
pos_tags_stats = True
readability_scores = False
n_spacy_workers = 8

# Compute the per-caption statistics with the polyglot backend.
coco_metadata_polyglot = generate_caption_stats(coco_df, pos_tags_stats, readability_scores, n_spacy_workers, backend='polyglot')

print(coco_metadata_polyglot.head())
# Reset to a default index before persisting the result as feather.
coco_metadata_polyglot.reset_index(drop=True).to_feather('../data/coco_metadata_polyglot.df.feather')

2021-01-25 10:41:36.031 | INFO     | utils:generate_caption_stats:146 - Generating caption statistics using POLYGLOT...
  0%|          | 0/616767 [00:00<?, ?it/s]

[polyglot_data] Downloading package embeddings2.en to
[polyglot_data]     /home/p0w3r/polyglot_data...
[polyglot_data]   Package embeddings2.en is already up-to-date!
[polyglot_data] Downloading package ner2.en to
[polyglot_data]     /home/p0w3r/polyglot_data...
[polyglot_data]   Package ner2.en is already up-to-date!
[polyglot_data] Downloading package pos2.en to
[polyglot_data]     /home/p0w3r/polyglot_data...
[polyglot_data]   Package pos2.en is already up-to-date!


  1%|          | 4591/616767 [00:32<42:24, 240.61it/s]  Detector is not able to detect the language reliably.
  1%|          | 5043/616767 [00:34<41:37, 244.94it/s]Detector is not able to detect the language reliably.
  1%|▏         | 8554/616767 [00:49<56:20, 179.94it/s]Detector is not able to detect the language reliably.
  2%|▏         | 9785/616767 [00:55<47:48, 211.60it/s]  Detector is not able to detect the language reliably.
  2%|▏         | 11804/616767 [01:08<1:41:32, 99.29it/s] Detector is not able to detect the language reliably.
  3%|▎         | 16821/616767 [01:52<42:43, 234.01it/s]  Detector is not able to detect the language reliably.
  3%|▎         | 17303/616767 [01:54<44:40, 223.63it/s]Detector is not able to detect the language reliably.
  3%|▎         | 18391/616767 [01:59<51:01, 195.48it/s]Detector is not able to detect the language reliably.
  4%|▍         | 24364/616767 [02:25<42:19, 233.32it/s]Detector is not able to detect the language reliably.
  4%|▍         

 14%|█▎        | 83780/616767 [07:11<42:11, 210.51it/s]Detector is not able to detect the language reliably.
 14%|█▎        | 83899/616767 [07:12<38:04, 233.30it/s]Detector is not able to detect the language reliably.
 14%|█▍        | 84996/616767 [07:19<37:08, 238.65it/s]  Detector is not able to detect the language reliably.
 14%|█▍        | 85522/616767 [07:21<37:45, 234.50it/s]Detector is not able to detect the language reliably.
 14%|█▍        | 86387/616767 [07:25<39:55, 221.40it/s]Detector is not able to detect the language reliably.
 14%|█▍        | 87320/616767 [07:29<45:02, 195.88it/s]Detector is not able to detect the language reliably.
 14%|█▍        | 89211/616767 [07:38<42:00, 209.34it/s]Detector is not able to detect the language reliably.
 15%|█▍        | 89729/616767 [07:41<42:58, 204.39it/s]  Detector is not able to detect the language reliably.
 15%|█▍        | 90224/616767 [07:43<36:53, 237.93it/s]Detector is not able to detect the language reliably.
 15%|█▍        

 25%|██▌       | 155066/616767 [14:00<44:35, 172.59it/s]  Detector is not able to detect the language reliably.
 25%|██▌       | 156440/616767 [14:11<45:17, 169.42it/s]  Detector is not able to detect the language reliably.
 25%|██▌       | 156738/616767 [14:13<38:42, 198.07it/s]Detector is not able to detect the language reliably.
 25%|██▌       | 156824/616767 [14:13<47:25, 161.62it/s]Detector is not able to detect the language reliably.
 25%|██▌       | 157016/616767 [14:14<37:39, 203.47it/s]Detector is not able to detect the language reliably.
 26%|██▌       | 157813/616767 [14:20<1:56:22, 65.73it/s] Detector is not able to detect the language reliably.
 26%|██▌       | 158253/616767 [14:24<42:46, 178.64it/s]  Detector is not able to detect the language reliably.
 26%|██▌       | 158446/616767 [14:26<1:03:43, 119.87it/s]Detector is not able to detect the language reliably.
 26%|██▌       | 158617/616767 [14:27<57:07, 133.69it/s]  Detector is not able to detect the language reliably

 41%|████▏     | 255190/616767 [25:07<35:57, 167.61it/s] Detector is not able to detect the language reliably.
 42%|████▏     | 261574/616767 [25:50<27:52, 212.34it/s] Detector is not able to detect the language reliably.
 43%|████▎     | 262151/616767 [25:52<33:27, 176.61it/s]Detector is not able to detect the language reliably.
 43%|████▎     | 262441/616767 [25:54<30:40, 192.49it/s]Detector is not able to detect the language reliably.
 43%|████▎     | 262749/616767 [25:56<31:54, 184.95it/s]Detector is not able to detect the language reliably.
 43%|████▎     | 262852/616767 [25:56<32:05, 183.77it/s]Detector is not able to detect the language reliably.
 43%|████▎     | 263958/616767 [26:03<30:28, 192.96it/s] Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
 43%|████▎     | 264219/616767 [26:05<49:18, 119.18it/s] Detector is not able to detect the language reliably.
 43%|████▎     | 264717/616767 [26:08<30:54, 189.86it/s]Detect

 52%|█████▏    | 317720/616767 [32:15<1:42:59, 48.39it/s]Detector is not able to detect the language reliably.
 52%|█████▏    | 317981/616767 [32:17<25:24, 196.01it/s] Detector is not able to detect the language reliably.
 52%|█████▏    | 320163/616767 [32:29<22:56, 215.53it/s]Detector is not able to detect the language reliably.
 52%|█████▏    | 320522/616767 [32:31<21:39, 227.97it/s]Detector is not able to detect the language reliably.
 52%|█████▏    | 323115/616767 [32:45<28:27, 171.99it/s] Detector is not able to detect the language reliably.
 53%|█████▎    | 324692/616767 [32:56<27:35, 176.45it/s]Detector is not able to detect the language reliably.
 53%|█████▎    | 328462/616767 [33:19<22:54, 209.81it/s] Detector is not able to detect the language reliably.
 53%|█████▎    | 328978/616767 [33:22<32:41, 146.69it/s]Detector is not able to detect the language reliably.
 53%|█████▎    | 329023/616767 [33:23<27:01, 177.50it/s]Detector is not able to detect the language reliably.
 53%|█

 57%|█████▋    | 349441/616767 [36:06<23:27, 189.96it/s]Detector is not able to detect the language reliably.
 57%|█████▋    | 349968/616767 [36:08<23:21, 190.42it/s]Detector is not able to detect the language reliably.
 57%|█████▋    | 350332/616767 [36:10<20:49, 213.23it/s]Detector is not able to detect the language reliably.
 57%|█████▋    | 351211/616767 [36:14<20:08, 219.73it/s]Detector is not able to detect the language reliably.
 57%|█████▋    | 351349/616767 [36:15<20:51, 212.14it/s]Detector is not able to detect the language reliably.
 57%|█████▋    | 351962/616767 [36:18<22:46, 193.74it/s]Detector is not able to detect the language reliably.
 57%|█████▋    | 352538/616767 [36:22<23:22, 188.40it/s]Detector is not able to detect the language reliably.
 57%|█████▋    | 353059/616767 [36:25<39:56, 110.02it/s]Detector is not able to detect the language reliably.
 57%|█████▋    | 353135/616767 [36:26<41:17, 106.43it/s]Detector is not able to detect the language reliably.
 57%|█████

 82%|████████▏ | 506098/616767 [54:13<09:24, 195.96it/s]Detector is not able to detect the language reliably.
 82%|████████▏ | 506119/616767 [54:13<09:16, 198.70it/s]Detector is not able to detect the language reliably.
 82%|████████▏ | 507487/616767 [54:20<09:42, 187.56it/s]Detector is not able to detect the language reliably.
 83%|████████▎ | 509992/616767 [54:33<08:52, 200.45it/s]Detector is not able to detect the language reliably.
 83%|████████▎ | 510725/616767 [54:37<09:54, 178.24it/s]Detector is not able to detect the language reliably.
 83%|████████▎ | 511287/616767 [54:40<07:58, 220.49it/s]Detector is not able to detect the language reliably.
 83%|████████▎ | 511400/616767 [54:41<12:46, 137.42it/s]Detector is not able to detect the language reliably.
 83%|████████▎ | 511720/616767 [54:43<16:18, 107.37it/s]Detector is not able to detect the language reliably.
 83%|████████▎ | 511973/616767 [54:45<12:13, 142.85it/s]Detector is not able to detect the language reliably.
 83%|█████

 90%|█████████ | 557684/616767 [59:53<06:42, 146.84it/s]Detector is not able to detect the language reliably.
 91%|█████████ | 558425/616767 [59:57<04:56, 196.95it/s]Detector is not able to detect the language reliably.
 91%|█████████ | 558524/616767 [59:58<05:19, 182.20it/s]Detector is not able to detect the language reliably.
 91%|█████████ | 558590/616767 [59:58<04:46, 203.34it/s]Detector is not able to detect the language reliably.
 91%|█████████ | 559774/616767 [1:00:07<10:32, 90.16it/s] Detector is not able to detect the language reliably.
 91%|█████████ | 559884/616767 [1:00:07<06:25, 147.42it/s]Detector is not able to detect the language reliably.
 91%|█████████ | 560743/616767 [1:00:13<04:48, 194.31it/s]Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
 91%|█████████ | 561121/616767 [1:00:15<05:59, 154.86it/s]Detector is not able to detect the language reliably.
 91%|█████████▏| 562873/616767 [1:00:25<06:06, 147.07it/s]

100%|█████████▉| 614099/616767 [1:05:58<00:28, 94.10it/s] Detector is not able to detect the language reliably.
100%|█████████▉| 616361/616767 [1:06:14<00:02, 156.77it/s]Detector is not able to detect the language reliably.
100%|██████████| 616767/616767 [1:06:33<00:00, 154.00it/s]2021-01-25 11:48:37.844 | INFO     | utils:generate_caption_stats:472 - Finished adding caption statistics in 4021.809257507324 seconds!
100%|██████████| 616767/616767 [1:07:01<00:00, 153.36it/s]


   image_id  caption_id                                            caption  \
0    318556          48     A very clean and well decorated empty bathroom   
1    318556         126  A blue and white bathroom with butterfly theme...   
2    318556         219  A bathroom with a border of butterflies and bl...   
3    318556         255  An angled view of a beautifully decorated bath...   
4    318556        3555  A clock that blends in with the wall hangs in ...   

       origin  num_tok  num_sent  min_sent_len  max_sent_len  num_ne ne_types  \
0  train_2014        8         1             8             8       0       []   
1  train_2014       11         1            11            11       0       []   
2  train_2014       16         1            16            16       0       []   
3  train_2014        9         1             9             9       0       []   
4  train_2014       13         1            13            13       0       []   

   ... num_conj  num_verb  num_sym  num_num 