## Import Dependencies

In [4]:
import os
import nltk  # 자연어 툴킷, 사전구축용
import pickle
import numpy as np
from PIL import Image
from collections import Counter
from pycocotools.coco import COCO  # COCO데이터셋을 다루고, 확인하기위해 사용
import matplotlib.pyplot as plt
 
import torch
import torch.nn as nn
import torch.utils.data as data
from torchvision import transforms
import torchvision.models as models
import torchvision.transforms as transforms
from torch.nn.utils.rnn import pack_padded_sequence

In [5]:
# Download the punkt tokenizer package
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/easttuna/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

## build vocab
- 자연어 토큰을 수치로 매핑하기 위한 사전을 구축

In [9]:
class Vocab(object):
    """
    Simple vocabulary wrapper.

    Stores two mirrored lookup tables: token -> index (``w2i``) and
    index -> token (``i2w``). ``index`` is the next id that will be assigned.
    """
    def __init__(self):
        self.w2i = {}
        self.i2w = {}
        self.index = 0

    def __call__(self, token):
        """Return the index stored for ``token``.

        A token that is not in the vocabulary resolves to the index of
        '<unk>' (a KeyError is raised if '<unk>' itself was never added).
        """
        if token in self.w2i:
            return self.w2i[token]
        return self.w2i['<unk>']

    def __len__(self):
        """Return the number of tokens in the vocabulary."""
        return len(self.w2i)

    def add_token(self, token):
        """Assign the next free index to ``token``; known tokens are left untouched."""
        if token in self.w2i:
            return
        assigned = self.index
        self.w2i[token] = assigned
        self.i2w[assigned] = token
        self.index = assigned + 1

def build_vocabulary(json, threshold):
    """Build a Vocab from the captions of a COCO annotation file.

    Args:
        json: Annotation file location, passed straight to ``COCO``.
        threshold: Minimum number of occurrences a token needs to be kept.

    Returns:
        A ``Vocab`` containing '<pad>', '<start>', '<end>' and '<unk>'
        (added in that order) followed by every token whose count is at
        least ``threshold``.
    """
    coco = COCO(json)
    word_counts = Counter()
    ann_ids = coco.anns.keys()
    total = len(ann_ids)

    for done, ann_id in enumerate(ann_ids, start=1):
        # Lower-case the caption, tokenize it, and add the tokens to the running counts.
        caption = str(coco.anns[ann_id]['caption'])
        word_counts.update(nltk.tokenize.word_tokenize(caption.lower()))

        # Progress report every 1000 captions.
        if done % 1000 == 0:
            print("[{}/{}] Tokenized the captions.".format(done, total))

    # Special tokens go in first so they receive the lowest indices.
    vocab = Vocab()
    for special in ('<pad>', '<start>', '<end>', '<unk>'):
        vocab.add_token(special)

    # Tokens seen fewer than 'threshold' times are discarded.
    for token, cnt in word_counts.items():
        if cnt >= threshold:
            vocab.add_token(token)
    return vocab

# Build the vocabulary from the training-set annotations only
vocab = build_vocabulary(json='data_dir/annotations/captions_train2014.json', threshold=4)
# Save the vocabulary object to disk with pickle
vocab_path = './data_dir/vocabulary.pkl'
with open(vocab_path, 'wb') as f:
    pickle.dump(vocab, f)
print("Total vocabulary size: {}".format(len(vocab)))
print("Saved the vocabulary wrapper to '{}'".format(vocab_path))

loading annotations into memory...
Done (t=0.53s)
creating index...
index created!
[10000/414113] Tokenized the captions.
[20000/414113] Tokenized the captions.
[30000/414113] Tokenized the captions.
[40000/414113] Tokenized the captions.
[50000/414113] Tokenized the captions.
[60000/414113] Tokenized the captions.
[70000/414113] Tokenized the captions.
[80000/414113] Tokenized the captions.
[90000/414113] Tokenized the captions.
[100000/414113] Tokenized the captions.
[110000/414113] Tokenized the captions.
[120000/414113] Tokenized the captions.
[130000/414113] Tokenized the captions.
[140000/414113] Tokenized the captions.
[150000/414113] Tokenized the captions.
[160000/414113] Tokenized the captions.
[170000/414113] Tokenized the captions.
[180000/414113] Tokenized the captions.
[190000/414113] Tokenized the captions.
[200000/414113] Tokenized the captions.
[210000/414113] Tokenized the captions.
[220000/414113] Tokenized the captions.
[230000/414113] Tokenized the captions.
[24000

### 추가) COCO 클래스의 동작 확인

In [12]:
# Load the train2014 caption annotations so the COCO helper object can be inspected
coco = COCO('data_dir/annotations/captions_train2014.json')

loading annotations into memory...
Done (t=0.62s)
creating index...
index created!


In [20]:
# Print the built-in help text for the COCO object to list its methods
help(coco)

Help on COCO in module pycocotools.coco object:

class COCO(builtins.object)
 |  COCO(annotation_file=None)
 |  
 |  Methods defined here:
 |  
 |  __init__(self, annotation_file=None)
 |      Constructor of Microsoft COCO helper class for reading and visualizing annotations.
 |      :param annotation_file (str): location of annotation file
 |      :param image_folder (str): location to the folder that hosts images.
 |      :return:
 |  
 |  annToMask(self, ann)
 |      Convert annotation which can be polygons, uncompressed RLE, or RLE to binary mask.
 |      :return: binary mask (numpy 2D array)
 |  
 |  annToRLE(self, ann)
 |      Convert annotation which can be polygons, uncompressed RLE to RLE.
 |      :return: binary mask (numpy 2D array)
 |  
 |  createIndex(self)
 |  
 |  download(self, tarDir=None, imgIds=[])
 |      Download COCO images from mscoco.org server.
 |      :param tarDir (str): COCO results directory name
 |             imgIds (list): images to be downloaded
 |     

In [22]:
# coco객체에 인자로 annotation json 파일을 넣어 init하면, .anns 속성이 어노테이션 정보 (image_id, id, caption)을 dictionary형태로 가지고 있음
print(type(coco.anns))
coco.anns

<class 'dict'>


{48: {'image_id': 318556,
  'id': 48,
  'caption': 'A very clean and well decorated empty bathroom'},
 67: {'image_id': 116100,
  'id': 67,
  'caption': 'A panoramic view of a kitchen and all of its appliances.'},
 126: {'image_id': 318556,
  'id': 126,
  'caption': 'A blue and white bathroom with butterfly themed wall tiles.'},
 148: {'image_id': 116100,
  'id': 148,
  'caption': 'A panoramic photo of a kitchen and dining room'},
 173: {'image_id': 379340,
  'id': 173,
  'caption': 'A graffiti-ed stop sign across the street from a red car '},
 188: {'image_id': 379340,
  'id': 188,
  'caption': 'A vandalized stop sign and a red beetle on the road'},
 219: {'image_id': 318556,
  'id': 219,
  'caption': 'A bathroom with a border of butterflies and blue paint on the walls above it.'},
 255: {'image_id': 318556,
  'id': 255,
  'caption': 'An angled view of a beautifully decorated bathroom.'},
 272: {'image_id': 134754,
  'id': 272,
  'caption': 'The two people are walking down the beach.'

## resize images

In [25]:
# 데이터셋의 이미지 크기가 통일되어있지 않음
# 입력 크기가 고정되어야 하는 모델이므로, 이미지를 동일 사이즈로 전처리
# 교재 코드에는 Image.ANTIALIAS를 인자로 전달하나 현 버전의 pillow에서는 deprecated되어 LANCZOS로 대체함

def reshape_image(image, shape):
    """Return ``image`` resized to ``shape`` with the Lanczos resampling filter."""
    # The second positional argument of resize() selects the interpolation method.
    resample_filter = Image.Resampling.LANCZOS
    return image.resize(shape, resample_filter)

def reshape_images(image_path, output_path, shape):
    """Resize every image in 'image_path' and save it under the same name in 'output_path'.

    Args:
        image_path: Directory holding the source images. Every entry returned
            by ``os.listdir`` is opened as an image.
        output_path: Directory the resized images are written to; created
            (including parents) when it does not exist yet.
        shape: Target size, forwarded unchanged to ``reshape_image``.
    """
    # exist_ok=True replaces the separate exists() check and its check-then-create race.
    os.makedirs(output_path, exist_ok=True)

    images = os.listdir(image_path)
    num_im = len(images)
    for i, im in enumerate(images):
        # The sources are only read, so open them read-only; 'r+b' would
        # needlessly require write permission on the dataset files.
        with open(os.path.join(image_path, im), 'rb') as f:
            with Image.open(f) as image:
                # Capture the source format before resizing: the resized copy is
                # a new image object whose .format is None, so reading .format
                # from it would silently drop the original format.
                source_format = image.format
                resized = reshape_image(image, shape)
                resized.save(os.path.join(output_path, im), source_format)
        # Progress report every 100 images.
        if (i+1) % 100 == 0:
            print("[{}/{}] Resized the images and saved into '{}'."
                  .format(i+1, num_im, output_path))

# Resize every image under train2014/ to 256x256 and write the results to resized_images/
image_path = './data_dir/train2014/'
output_path = './data_dir/resized_images/'
image_shape = [256, 256]  # target size handed to Image.resize via reshape_image
reshape_images(image_path, output_path, image_shape)

[100/82783] Resized the images and saved into './data_dir/resized_images/'.
[200/82783] Resized the images and saved into './data_dir/resized_images/'.
[300/82783] Resized the images and saved into './data_dir/resized_images/'.
[400/82783] Resized the images and saved into './data_dir/resized_images/'.
[500/82783] Resized the images and saved into './data_dir/resized_images/'.
[600/82783] Resized the images and saved into './data_dir/resized_images/'.
[700/82783] Resized the images and saved into './data_dir/resized_images/'.
[800/82783] Resized the images and saved into './data_dir/resized_images/'.
[900/82783] Resized the images and saved into './data_dir/resized_images/'.
[1000/82783] Resized the images and saved into './data_dir/resized_images/'.
[1100/82783] Resized the images and saved into './data_dir/resized_images/'.
[1200/82783] Resized the images and saved into './data_dir/resized_images/'.
[1300/82783] Resized the images and saved into './data_dir/resized_images/'.
[1400/82

In [23]:
# Evaluate the deprecated Image.ANTIALIAS name; the recorded output below shows it
# resolving to Resampling.LANCZOS.
# NOTE(review): presumably raises AttributeError on Pillow releases that removed the
# alias — confirm before re-running this cell.
Image.ANTIALIAS

  """Entry point for launching an IPython kernel.


<Resampling.LANCZOS: 1>