<a href="https://colab.research.google.com/github/davinhill/Convolution_Captioning/blob/master/Preprocess_Captions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Mount Google Drive into the Colab filesystem at /content/drive
# (the data and output paths used further down live under this mount point).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
import os

# Path where the data should be located
# (the annotation file names used when loading captions are joined onto this directory)
path = '/content/drive/My Drive/Colab Notebooks/IE534_ImageCaptioning/Data'
# Make the data directory the current working directory.
os.chdir(path)

In [0]:
from pycocotools.coco import COCO 

import torch
from torchvision import datasets, transforms
import torch.nn as nn
import numpy as np
import matplotlib.pyplot as plt
import io
import itertools
import nltk
# Fetch the 'punkt' tokenizer data; nltk.word_tokenize is called on every caption later on.
# NOTE(review): newer NLTK releases reportedly look up 'punkt_tab' instead — confirm
# against the installed NLTK version if word_tokenize raises a LookupError.
nltk.download('punkt')

import os
from datetime import datetime


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [0]:
# Load Captions from Annotation File

def _load_annotations(annotation_file):
    """Open a COCO captions annotation file and fetch its annotation records.

    Args:
        annotation_file: Annotation JSON file name, relative to `path`.

    Returns:
        Tuple `(coco, ann_ids, ann_list)`: the COCO handle, the annotation ids
        returned by `getAnnIds` when queried with an empty `imgIds` filter, and
        the annotation records loaded for those ids.
    """
    coco = COCO(os.path.join(path, annotation_file))
    ids = coco.getAnnIds(imgIds = [])
    return coco, ids, coco.loadAnns(ids = ids)


##################
# Validation Set
##################
cap, ann_ids, ann_list = _load_annotations('annotations/captions_val2017.json')

# Create list of captions.
# A comprehension keeps the loop variable local; the previous `for dict in ...`
# loop rebound the builtin `dict` for the rest of the kernel session.
cap_val_raw = [ann['caption'] for ann in ann_list]

##################
# Train Set
##################
# `cap`, `ann_ids` and `ann_list` are rebound here, so after this cell they
# refer to the training split.
cap, ann_ids, ann_list = _load_annotations('annotations/captions_train2017.json')

# Create list of captions
cap_train_raw = [ann['caption'] for ann in ann_list]

loading annotations into memory...
Done (t=0.06s)
creating index...
index created!
loading annotations into memory...
Done (t=0.97s)
creating index...
index created!


In [0]:
# Number of Captions:
# (training split only; cap_train_raw holds one caption string per annotation record)
len(cap_train_raw)

591753

In [0]:
# Tokenize Caption List

def tokenize_captions(captions):
    """Split every caption into a list of lower-cased tokens.

    Args:
        captions: Iterable of caption strings.

    Returns:
        List with one entry per caption; each entry is the list of tokens
        produced by `nltk.word_tokenize`, lower-cased after tokenizing.
    """
    return [[w.lower() for w in nltk.word_tokenize(caption)] for caption in captions]


cap_val = tokenize_captions(cap_val_raw)
cap_train = tokenize_captions(cap_train_raw)


# Number of Tokens (per training caption), followed by summary statistics
no_of_tokens = np.asarray([len(tokens) for tokens in cap_train])
print('Total: ', np.sum(no_of_tokens), ' Min: ', np.min(no_of_tokens), ' Max: ', np.max(no_of_tokens), ' Mean: ', np.mean(no_of_tokens), ' Std: ', np.std(no_of_tokens), ' Med: ', np.median(no_of_tokens))



Total:  6687792  Min:  6  Max:  57  Mean:  11.301661335050266  Std:  2.596305429474608  Med:  11.0


In [0]:
# create word_to_id and id_to_word translations
# word_to_id is a dictionary (token -> integer id)
# id_to_word is a np array (id_to_word[id] is the token)

# Collect the distinct training tokens exactly once and derive BOTH lookups
# from that single list, so they agree by construction instead of relying on
# two separately built sets iterating in the same order.
# Sorting also makes the ids reproducible: plain set iteration order for
# strings changes between interpreter sessions (hash randomization).
vocab = sorted(set(itertools.chain.from_iterable(cap_train)))

word_to_id = {token: idx for idx, token in enumerate(vocab)}
id_to_word = np.asarray(vocab)


print(len(id_to_word), "unique words / tokens")

29556 unique words / tokens


In [0]:
## let's sort the indices by word frequency instead of random

# Guard against running this cell twice: a second run would prepend the
# special tokens again and leave duplicate entries in id_to_word.
if '<S>' in word_to_id:
    raise RuntimeError('Special tokens already present in word_to_id; '
                       're-run the cell that builds word_to_id / id_to_word first.')

# Encode the training captions with the current (pre-sort) ids and count how
# often each id occurs. NOTE: x_train_token_ids keeps these pre-sort ids; it is
# NOT re-encoded after word_to_id is rebuilt below.
x_train_token_ids = [[word_to_id[token] for token in x] for x in cap_train]
count = np.zeros(id_to_word.shape)
for x in x_train_token_ids:
    for token in x:
        count[token] += 1

# Most frequent token first. A stable sort keeps equally frequent tokens in
# their existing relative order, so the result is deterministic for a given input.
indices = np.argsort(-count, kind='stable')
id_to_word = id_to_word[indices]
count = count[indices]

# How many tokens occur 1-9, 10-99, 100-999 and 1000-10000 times
hist = np.histogram(count,bins=[1,10,100,1000,10000])
print(hist)
# Show the most frequent tokens (at most 10, fewer for a tiny vocabulary)
for i in range(min(10, len(id_to_word))):
    print(id_to_word[i],count[i])

## recreate word_to_id based on sorted list
# ids 0-3 are reserved for the special tokens, so regular tokens start at 4
word_to_id = {token: (idx+4) for idx, token in enumerate(id_to_word)}

# add start/end/unknown token
# '<MASK>' (id 0) gets no word_to_id entry — NOTE(review): confirm this is intended.
word_to_id['<S>'] = 1
word_to_id['</S>'] = 2
word_to_id['UNK'] = 3

# add mask/start/end/unknown token at ids 0-3
# np.concatenate widens the string dtype when needed; np.insert would cast the
# special tokens to the array's existing width and could truncate them.
# After this line `count` is no longer index-aligned with id_to_word (offset by 4).
id_to_word = np.concatenate([np.asarray(['<MASK>', '<S>', '</S>', 'UNK']), id_to_word])

# The string literal below is disabled legacy code, kept as-is. Its -1/+1 id
# scheme (id 0 = unknown) does not match the special-token layout built above.
'''
## assign -1 if token doesn't appear in our dictionary
## add +1 to all token ids, we went to reserve id=0 for an unknown token
cap_val_token_ids = [[word_to_id.get(token,-1)+1 for token in x] for x in cap_val]
cap_train_token_ids = [[word_to_id.get(token,-1)+1 for token in x] for x in cap_train]
'''

(array([22103,  5084,  1760,   524]), array([    1,    10,   100,  1000, 10000]))
a 978638.0
. 444424.0
on 215658.0
of 204059.0
the 197760.0
in 184003.0
with 154800.0
and 140762.0
is 98209.0
man 73322.0


"\n## assign -1 if token doesn't appear in our dictionary\n## add +1 to all token ids, we went to reserve id=0 for an unknown token\ncap_val_token_ids = [[word_to_id.get(token,-1)+1 for token in x] for x in cap_val]\ncap_train_token_ids = [[word_to_id.get(token,-1)+1 for token in x] for x in cap_train]\n"

In [0]:
# Round-trip sanity check: token -> id -> token should give back 'dog'.
# Index the dict directly so a missing token raises KeyError; word_to_id.get()
# would return None, which NumPy treats as np.newaxis and silently returns the
# whole array instead of failing.
id_to_word[word_to_id['dog']]

'dog'

In [0]:
# Spot-check a single id: regular tokens start at id 4 in frequency order,
# so id 18 is the 15th most frequent training token.
id_to_word[18]

','

In [0]:

import pickle

# Write the vocabulary artifacts into the project folder (the parent of the
# data directory) by making it the working directory first.
project_dir = '/content/drive/My Drive/Colab Notebooks/IE534_ImageCaptioning/'
os.chdir(project_dir)

# token -> id dictionary, pickled with protocol 4
with open('word_to_id.p', 'wb') as vocab_file:
    pickle.dump(word_to_id, vocab_file, protocol=4)

# id -> token array, stored in NumPy's .npy format
vocab_array = np.asarray(id_to_word)
np.save('id_to_word.npy', vocab_array)

# Disabled legacy export of the encoded captions, kept unchanged below.
'''
## save training data to single text file
with io.open('val_captions.txt','w',encoding='utf-8') as f:
    for tokens in cap_val_token_ids:
        for token in tokens:
            f.write("%i " % token)
        f.write("\n")

## save test data to single text file
with io.open('train_captions.txt','w',encoding='utf-8') as f:
    for tokens in cap_train_token_ids:
        for token in tokens:
            f.write("%i " % token)
        f.write("\n")
'''


'\n## save training data to single text file\nwith io.open(\'val_captions.txt\',\'w\',encoding=\'utf-8\') as f:\n    for tokens in cap_val_token_ids:\n        for token in tokens:\n            f.write("%i " % token)\n        f.write("\n")\n\n## save test data to single text file\nwith io.open(\'train_captions.txt\',\'w\',encoding=\'utf-8\') as f:\n    for tokens in cap_train_token_ids:\n        for token in tokens:\n            f.write("%i " % token)\n        f.write("\n")\n'