In [2]:
import logging
from pathlib import Path

import pandas as pd

In [2]:
# Raw Flickr30k captions file.
# NOTE(review): this is an absolute, machine-specific path — adjust it when running
# the notebook on another host.
flickr30k_caps = Path("/home/p0w3r/dataHDD/datasets/flickr30k/captions.txt")
# Fail early and name the missing path instead of raising a bare AssertionError.
assert flickr30k_caps.exists(), f"Flickr30k captions file not found: {flickr30k_caps}"

# Create a DataFrame
##### Columns: image_id, caption_id, caption

In [3]:
# Parse the pipe-delimited captions file; the header row provides the column names.
f30k = pd.read_csv(flickr30k_caps, sep='|')

In [4]:
# Display the column names exactly as they were parsed from the file.
f30k.columns

Index(['image_name', 'caption_number', 'caption_text'], dtype='object')

In [13]:
# Map the raw column names onto the image_id / caption_id / caption schema.
# rename() returns a new frame, which is bound back to the same name here.
f30k = f30k.rename(
    columns={
        'image_name': 'image_id',
        'caption_number': 'caption_id',
        'caption_text': 'caption',
    }
)

# Persist the DataFrame

In [14]:
# Write the renamed frame to a Feather file. reset_index(drop=True) discards the
# current index and replaces it with a fresh 0..n-1 range before writing.
f30k.reset_index(drop=True).to_feather('../data/f30k_raw.df.feather')

In [3]:
# Reload the captions frame from the Feather file.
f30k = pd.read_feather('../data/f30k_raw.df.feather')
# Row count divided by 5 — presumably the number of images at five captions per
# image (TODO confirm); the recorded run prints 31783.0.
print(f"len(f30k) / 5 --> {len(f30k) / 5}")
# Confirm the renamed columns are present after the reload.
print(f30k.columns)

len(f30k) / 5 --> 31783.0
Index(['image_id', 'caption_id', 'caption'], dtype='object')


# Generate metadata for the captions with spaCy

In [4]:
import sys
# Add the parent directory to the module search path so the import below can find
# `utils` — presumably utils.py lives one level up (TODO confirm). The path is
# relative, so it depends on the kernel's working directory.
sys.path.append('..')
from utils import generate_caption_stats

In [5]:
# Options passed positionally to generate_caption_stats. Their exact effect is
# defined in utils and not visible here — presumably they toggle POS-tag statistics
# and readability scores and set the worker count (TODO confirm).
pos_tags_stats, readability_scores = True, False
n_spacy_workers = 8

# Compute per-caption statistics with the spaCy backend. The recorded run reports
# about 94 seconds for 158915 captions.
f30k_metadata_spacy = generate_caption_stats(
    f30k,
    pos_tags_stats,
    readability_scores,
    n_spacy_workers,
    backend='spacy',
)

# Preview the first rows, then persist the result with a fresh 0..n-1 index.
print(f30k_metadata_spacy.head())
f30k_metadata_spacy.reset_index(drop=True).to_feather(
    '../data/f30k_metadata_spacy.df.feather'
)

2021-03-20 12:42:31.738 | INFO     | utils:generate_caption_stats:146 - Generating caption statistics using SPACY...
100%|██████████| 158915/158915 [01:32<00:00, 1708.89it/s]
2021-03-20 12:44:05.434 | INFO     | utils:generate_caption_stats:521 - Finished adding caption statistics in 93.69280004501343 seconds!


         image_id  caption_id  \
0  1000092795.jpg           0   
1  1000092795.jpg           1   
2  1000092795.jpg           2   
3  1000092795.jpg           3   
4  1000092795.jpg           4   

                                             caption  num_tok  num_sent  \
0  Two young guys with shaggy hair look at their ...       17         1   
1  Two young , White males are outside near many ...       11         1   
2   Two men in green shirts are standing in a yard .       11         1   
3       A man in a blue shirt standing in a garden .       11         1   
4            Two friends enjoy time spent together .        7         1   

   min_sent_len  max_sent_len  num_ne    ne_types ne_texts  ...  num_conj  \
0            17            17       1  [CARDINAL]    [Two]  ...         0   
1            11            11       1  [CARDINAL]    [Two]  ...         0   
2            11            11       1  [CARDINAL]    [Two]  ...         0   
3            11            11       0     

# Generate metadata for the captions with NLTK

In [6]:
# Options passed positionally to generate_caption_stats. Their exact effect is
# defined in utils and not visible here — presumably they toggle POS-tag statistics
# and readability scores (TODO confirm).
# NOTE(review): n_spacy_workers is also passed for the NLTK backend; whether that
# backend uses it cannot be told from this notebook.
pos_tags_stats, readability_scores = True, False
n_spacy_workers = 8

# Compute per-caption statistics with the NLTK backend.
f30k_metadata_nltk = generate_caption_stats(
    f30k,
    pos_tags_stats,
    readability_scores,
    n_spacy_workers,
    backend='nltk',
)

# Preview the first rows, then persist the result with a fresh 0..n-1 index.
print(f30k_metadata_nltk.head())
f30k_metadata_nltk.reset_index(drop=True).to_feather(
    '../data/f30k_metadata_nltk.df.feather'
)

2021-03-20 12:44:06.088 | INFO     | utils:generate_caption_stats:146 - Generating caption statistics using NLTK...
  0%|          | 0/158915 [00:00<?, ?it/s][nltk_data] Downloading package punkt to /home/p0w3r/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package words to /home/p0w3r/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/p0w3r/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     /home/p0w3r/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!
[nltk_data] Downloading package universal_treebanks_v20 to
[nltk_data]     /home/p0w3r/nltk_data...
[nltk_data]   Package universal_treebanks_v20 is already up-to-date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /home/p0w3r/nltk_data

         image_id  caption_id  \
0  1000092795.jpg           0   
1  1000092795.jpg           1   
2  1000092795.jpg           2   
3  1000092795.jpg           3   
4  1000092795.jpg           4   

                                             caption  num_tok  num_sent  \
0  Two young guys with shaggy hair look at their ...       17         1   
1  Two young , White males are outside near many ...       11         1   
2   Two men in green shirts are standing in a yard .       11         1   
3       A man in a blue shirt standing in a garden .       11         1   
4            Two friends enjoy time spent together .        7         1   

   min_sent_len  max_sent_len  num_ne ne_types ne_texts  ...  num_conj  \
0            17            17       0       []       []  ...         0   
1            11            11       1    [GPE]  [White]  ...         0   
2            11            11       0       []       []  ...         0   
3            11            11       0       []       [

# Generate metadata for the captions with Polyglot

In [7]:
# Options passed positionally to generate_caption_stats. Their exact effect is
# defined in utils and not visible here — presumably they toggle POS-tag statistics
# and readability scores (TODO confirm).
# NOTE(review): n_spacy_workers is also passed for the polyglot backend; whether
# that backend uses it cannot be told from this notebook.
pos_tags_stats = True
readability_scores = False
n_spacy_workers = 8

# In the recorded run, polyglot's language detector printed "Detector is not able to
# detect the language reliably." thousands of times, burying the progress bar. The
# run still completed, so the message is non-fatal. Raise that logger's threshold
# for the duration of the call and restore it afterwards, so later cells see the
# original logging configuration.
# NOTE(review): assumes the warning comes from the "polyglot.detect.base" logger in
# this process — if utils runs polyglot in worker processes that do not inherit the
# level, the messages may still appear. Verify against the installed polyglot.
polyglot_detect_logger = logging.getLogger("polyglot.detect.base")
previous_log_level = polyglot_detect_logger.level
polyglot_detect_logger.setLevel(logging.ERROR)
try:
    # Compute per-caption statistics with the polyglot backend. The recorded run
    # took roughly six minutes for 158915 captions.
    f30k_metadata_polyglot = generate_caption_stats(
        f30k, pos_tags_stats, readability_scores, n_spacy_workers, backend='polyglot'
    )
finally:
    # Restore the previous threshold even if the computation fails.
    polyglot_detect_logger.setLevel(previous_log_level)

# Preview the first rows, then persist the result with a fresh 0..n-1 index.
print(f30k_metadata_polyglot.head())
f30k_metadata_polyglot.reset_index(drop=True).to_feather('../data/f30k_metadata_polyglot.df.feather')

2021-03-20 12:55:22.002 | INFO     | utils:generate_caption_stats:146 - Generating caption statistics using POLYGLOT...
  0%|          | 0/158915 [00:00<?, ?it/s]

[polyglot_data] Downloading package embeddings2.en to
[polyglot_data]     /home/p0w3r/polyglot_data...
[polyglot_data]   Package embeddings2.en is already up-to-date!
[polyglot_data] Downloading package ner2.en to
[polyglot_data]     /home/p0w3r/polyglot_data...
[polyglot_data]   Package ner2.en is already up-to-date!
[polyglot_data] Downloading package pos2.en to
[polyglot_data]     /home/p0w3r/polyglot_data...
[polyglot_data]   Package pos2.en is already up-to-date!


  0%|          | 752/158915 [00:07<06:20, 416.02it/s] Detector is not able to detect the language reliably.
  1%|          | 1660/158915 [00:09<05:30, 475.21it/s]Detector is not able to detect the language reliably.
  1%|          | 1756/158915 [00:09<05:33, 471.31it/s]Detector is not able to detect the language reliably.
  1%|          | 1866/158915 [00:09<05:09, 506.64it/s]Detector is not able to detect the language reliably.
  2%|▏         | 2828/158915 [00:11<05:13, 498.18it/s]Detector is not able to detect the language reliably.
  2%|▏         | 3432/158915 [00:13<05:29, 472.37it/s]Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
  3%|▎         | 4936/158915 [00:16<05:15, 488.45it/s]Detector is not able to detect the language reliably.
  3%|▎         | 5386/158915 [00:17<05:20, 479.43it/s]Detector is not able to detect the language reliably.
  4%|▎         | 5636/158915 [00:17<05:15, 485.83it/s]Detector is not able to dete

 19%|█▉        | 30925/158915 [01:10<04:21, 489.03it/s]Detector is not able to detect the language reliably.
 20%|█▉        | 31131/158915 [01:11<04:18, 493.52it/s]Detector is not able to detect the language reliably.
 20%|█▉        | 31238/158915 [01:11<04:10, 509.38it/s]Detector is not able to detect the language reliably.
 20%|█▉        | 31706/158915 [01:12<04:19, 489.98it/s]Detector is not able to detect the language reliably.
 20%|██        | 32480/158915 [01:13<04:23, 480.53it/s]Detector is not able to detect the language reliably.
 21%|██        | 32725/158915 [01:14<04:39, 451.06it/s]Detector is not able to detect the language reliably.
 21%|██        | 32820/158915 [01:14<04:32, 463.56it/s]Detector is not able to detect the language reliably.
 21%|██        | 33476/158915 [01:16<05:25, 385.08it/s]Detector is not able to detect the language reliably.
 21%|██▏       | 33934/158915 [01:17<04:42, 442.27it/s]Detector is not able to detect the language reliably.
 22%|██▏       | 34

 38%|███▊      | 60595/158915 [02:13<03:19, 492.98it/s]Detector is not able to detect the language reliably.
 38%|███▊      | 60648/158915 [02:13<03:15, 503.58it/s]Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
 38%|███▊      | 60753/158915 [02:13<03:17, 496.19it/s]Detector is not able to detect the language reliably.
 39%|███▉      | 61607/158915 [02:15<03:16, 496.30it/s]Detector is not able to detect the language reliably.
 39%|███▉      | 61657/158915 [02:15<03:20, 483.88it/s]Detector is not able to detect the language reliably.
 39%|███▉      | 61706/158915 [02:15<03:24, 474.60it/s]Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
 39%|███▉      | 61809/158915 [02:15<03:15, 495.91it/s]Detector is not able to detect the language reliably.
 39%|███▉      | 61964/158915 [02:16<03:15, 495.42it/s]Detector is not able

 48%|████▊     | 76622/158915 [02:49<03:23, 404.24it/s]Detector is not able to detect the language reliably.
 48%|████▊     | 76817/158915 [02:49<02:56, 463.94it/s]Detector is not able to detect the language reliably.
 48%|████▊     | 76971/158915 [02:49<02:47, 489.49it/s]Detector is not able to detect the language reliably.
 49%|████▊     | 77265/158915 [02:50<02:58, 458.13it/s]Detector is not able to detect the language reliably.
 49%|████▉     | 77511/158915 [02:50<02:50, 477.71it/s]Detector is not able to detect the language reliably.
 49%|████▉     | 77561/158915 [02:51<02:48, 483.88it/s]Detector is not able to detect the language reliably.
 49%|████▉     | 77717/158915 [02:51<02:43, 496.26it/s]Detector is not able to detect the language reliably.
 49%|████▉     | 78067/158915 [02:52<02:43, 495.29it/s]Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
 49%|████▉     | 78232/158915 [02:52<02:41, 498.18it/s]Detector is not abl

Detector is not able to detect the language reliably.
 69%|██████▉   | 110348/158915 [04:04<01:43, 469.59it/s]Detector is not able to detect the language reliably.
 70%|██████▉   | 110934/158915 [04:05<01:40, 475.36it/s]Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
 70%|██████▉   | 111187/158915 [04:06<01:38, 483.04it/s]Detector is not able to detect the language reliably.
 70%|███████   | 111620/158915 [04:07<01:48, 436.68it/s]Detector is not able to detect the language reliably.
 70%|███████   | 111766/158915 [04:07<01:40, 469.85it/s]Detector is not able to detect the language reliably.
 71%|███████   | 112766/158915 [04:09<01:38, 470.71it/s]Detector is not able to detect the language reliably.
 71%|███████   | 112864/158915 [04:10<01:37, 470.49it/s]Detector is not able to detect the language reliably.
 71%|███████   | 113114/158915 [04:10<01:34, 484.56it/s]Detector is not able to detect the language reliably.
 71%|███████

100%|██████████| 158915/158915 [06:07<00:00, 432.60it/s]

         image_id  caption_id  \
0  1000092795.jpg           0   
1  1000092795.jpg           1   
2  1000092795.jpg           2   
3  1000092795.jpg           3   
4  1000092795.jpg           4   

                                             caption  num_tok  num_sent  \
0  Two young guys with shaggy hair look at their ...       17         1   
1  Two young , White males are outside near many ...       11         1   
2   Two men in green shirts are standing in a yard .       11         1   
3       A man in a blue shirt standing in a garden .       11         1   
4            Two friends enjoy time spent together .        7         1   

   min_sent_len  max_sent_len  num_ne ne_types ne_texts  ...  num_conj  \
0            17            17       0       []       []  ...         0   
1            11            11       0       []       []  ...         0   
2            11            11       0       []       []  ...         0   
3            11            11       0       []       [


