In [1]:
import torch
import torch.nn as nn
import torch.optim as optim


import torchvision
import torchvision.transforms as transforms
import torchvision.models as models
from torchvision.datasets import CocoCaptions

import pandas as pd
import os
import sys
import random

from torch.utils.data import Dataset, DataLoader
from transformers.optimization import AdamW

from PIL import Image
import requests
from io import BytesIO
from IPython.display import display
from multiprocessing.pool import ThreadPool

from IPython.display import display
from collections import namedtuple
from time import perf_counter
from tqdm.autonotebook import tqdm
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

In [2]:
# Install the Hugging Face `transformers` package into the runtime (-q: quiet output).
# NOTE(review): the import cell above already does `from transformers.optimization import AdamW`,
# so on a runtime without the package that import runs before this install — consider
# moving this cell to the very top and pinning a version.
!pip install -q transformers

In [3]:
# Run on the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(device)

cuda


In [4]:
# Mount Google Drive inside the Colab runtime; PATH_TO_SAVE / PATH_TO_MODELS below
# point into this mount.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [22]:
# Side length, in pixels, that every image is resized to by the transforms below.
IMG_SIZE = 299
# Drive folder (relative to the working directory) handed to train() and used
# when loading weights for validation.
PATH_TO_SAVE = 'drive/My Drive/'
# Folder holding the project modules imported below; derived from PATH_TO_SAVE
# so the Drive location is spelled out only once (same value as before).
PATH_TO_MODELS = os.path.join(PATH_TO_SAVE, 'models/')

# Make the project modules importable. Guarded so that re-running this cell
# does not keep appending duplicate entries to sys.path.
if PATH_TO_MODELS not in sys.path:
    sys.path.append(PATH_TO_MODELS)

from image2text import Image2TextDescriptor
from sampler import Sampler
from train_model import train

# COCO Dataset

## Загрузка данных

In [7]:
# Create the local COCO data folder (-p: no error if it already exists).
!mkdir -p data/COCO

In [8]:
# Download the COCO 2014 train/val annotations (~241 MB per the recorded wget output),
# unpack them under data/COCO (-q: quiet, -n: never overwrite existing files),
# then delete the archive to free disk space.
!wget http://images.cocodataset.org/annotations/annotations_trainval2014.zip
!unzip -q -n annotations_trainval2014.zip -d data/COCO
!rm annotations_trainval2014.zip

--2020-06-26 05:48:29--  http://images.cocodataset.org/annotations/annotations_trainval2014.zip
Resolving images.cocodataset.org (images.cocodataset.org)... 52.216.146.67
Connecting to images.cocodataset.org (images.cocodataset.org)|52.216.146.67|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 252872794 (241M) [application/zip]
Saving to: ‘annotations_trainval2014.zip’


2020-06-26 05:48:38 (30.0 MB/s) - ‘annotations_trainval2014.zip’ saved [252872794/252872794]



In [9]:
# Download the COCO 2014 training images (~13 GB per the recorded wget output),
# unpack them under data/COCO (-q: quiet, -n: never overwrite existing files),
# then delete the archive to free disk space.
!wget http://images.cocodataset.org/zips/train2014.zip
!unzip -q -n train2014.zip -d data/COCO
!rm train2014.zip

--2020-06-26 05:48:53--  http://images.cocodataset.org/zips/train2014.zip
Resolving images.cocodataset.org (images.cocodataset.org)... 52.216.92.11
Connecting to images.cocodataset.org (images.cocodataset.org)|52.216.92.11|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 13510573713 (13G) [application/zip]
Saving to: ‘train2014.zip’


2020-06-26 05:55:24 (33.0 MB/s) - ‘train2014.zip’ saved [13510573713/13510573713]



In [10]:
# Download the COCO 2014 validation images (~6.2 GB per the recorded wget output),
# unpack them under data/COCO (-q: quiet, -n: never overwrite existing files),
# then delete the archive to free disk space.
!wget http://images.cocodataset.org/zips/val2014.zip
!unzip -q -n val2014.zip -d data/COCO
!rm val2014.zip

--2020-06-26 06:01:54--  http://images.cocodataset.org/zips/val2014.zip
Resolving images.cocodataset.org (images.cocodataset.org)... 52.216.233.35
Connecting to images.cocodataset.org (images.cocodataset.org)|52.216.233.35|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 6645013297 (6.2G) [application/zip]
Saving to: ‘val2014.zip’


2020-06-26 06:05:08 (32.6 MB/s) - ‘val2014.zip’ saved [6645013297/6645013297]



## Создание даталодера

In [12]:
# Folder the COCO archives were unpacked into.
path2data = os.path.abspath('/content/data/COCO')

# Image pipeline: resize to IMG_SIZE x IMG_SIZE, convert to a tensor,
# then normalise each channel with fixed mean / std values.
transform = transforms.Compose(
    [
        transforms.Resize((IMG_SIZE, IMG_SIZE)),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ]
)

In [13]:
def target_transform(target):
    """Pick one caption at random and tokenise it.

    Args:
        target: non-empty sequence of caption strings for a single image.

    Returns:
        The chosen caption, lower-cased and split on whitespace, as a list of
        strings (punctuation stays attached to the neighbouring word).
    """
    chosen_caption = random.choice(target)
    lowered = chosen_caption.lower()
    return lowered.split()

In [14]:
# Training split whose targets are plain word lists (see target_transform above);
# used below only to inspect a sample.
coco_train = CocoCaptions(
    root=f'{path2data}/train2014',
    annFile=f'{path2data}/annotations/captions_train2014.json',
    transform=transform,
    target_transform=target_transform,
)

loading annotations into memory...
Done (t=0.78s)
creating index...
index created!


In [16]:
# Dataset size, followed by a look at one example (index 3, i.e. the fourth sample).
print('Number of samples: ', len(coco_train))

img, target = coco_train[3]
print("Image Size: ", img.shape)
print(target)

Number of samples:  82783
Image Size:  torch.Size([3, 299, 299])
['a', 'lone', 'zebra', 'grazing', 'in', 'some', 'green', 'grass.']


In [17]:
def get_coco_dataloader(dataset, batch_size, pad_elem, shuffle=True):
    """Wrap ``dataset`` in a DataLoader that pads id sequences batch by batch.

    Args:
        dataset: map-style dataset yielding ``(image_tensor, ids)`` pairs, where
            ``ids`` is a list of integer token ids.
        batch_size: number of samples per batch.
        pad_elem: id appended to shorter sequences so that every row in a batch
            has the length of the longest one.
        shuffle: whether the loader reshuffles the samples.

    Returns:
        A DataLoader whose batches are ``(batch_img, batch_ids, batch_mask)``:
        the stacked images, a long tensor of padded ids, and an int tensor that
        is 1 where the id differs from ``pad_elem`` and 0 elsewhere.
    """

    def collate_fn(batch_data):
        # Transpose the list of (image, ids) pairs into two parallel tuples.
        images, id_seqs = tuple(zip(*batch_data))

        batch_img = torch.stack(images, 0)

        longest = max(len(seq) for seq in id_seqs)
        padded_rows = []
        for seq in id_seqs:
            n_missing = max(0, longest - len(seq))
            padded_rows.append(seq + [pad_elem] * n_missing)

        batch_ids = torch.tensor(padded_rows).long()
        # Any position holding pad_elem is masked out, the rest is kept.
        batch_mask = batch_ids.ne(pad_elem).int()

        return batch_img, batch_ids, batch_mask

    loader_options = dict(
        dataset=dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        collate_fn=collate_fn,
        pin_memory=False,
        num_workers=0,
    )
    return DataLoader(**loader_options)

# Train

In [20]:
# Build the image-to-text model on the selected device. The rest of the notebook
# uses its `target_transform`, `special_ids.pad`, `tokenizer`, `device`, `load` and `eval`.
# NOTE(review): SPECIAL_TOKENS presumably names the image / description / padding
# marker tokens — confirm against image2text.py.
descriptor = Image2TextDescriptor(device, SPECIAL_TOKENS=('img', 'desc', 'pad'))

In [21]:
####### COCO DATASET #######
batch_size = 16

transform = transforms.Compose([
     transforms.Resize((IMG_SIZE, IMG_SIZE)),
     transforms.ToTensor(),
     transforms.Normalize(mean=[0.485, 0.456, 0.406],
                          std=[0.229, 0.224, 0.225])
])

path2data = os.path.abspath('/content/data/COCO')

coco_train = CocoCaptions(root = path2data + '/train2014',
                          annFile = path2data + '/annotations/captions_train2014.json',
                          transform=transform,
                          target_transform=descriptor.target_transform)

coco_trainloader = get_coco_dataloader(coco_train, batch_size, 
                                       descriptor.special_ids.pad)

loading annotations into memory...
Done (t=0.73s)
creating index...
index created!


In [23]:
# Step counter handed to train() as `timestep`; 0 is the value used for this run.
# NOTE(review): the sums below look like step counts of earlier training sessions,
# kept so a run can be resumed from the right step — confirm before reusing them.
#   inception: 3775 + 3800 + 3763 + 2841 + 1600 + 10000
#   resnet:    219 + 1020 + 903 + 1000 + 500 + 1471 + 313 + 642
LAST_TIMESTEP = 0

In [25]:
# Train for one epoch on the COCO training loader, passing PATH_TO_SAVE as the save location.
# NOTE(review): the recorded output of this cell ends in FileNotFoundError — presumably a
# folder under PATH_TO_SAVE is missing; confirm the expected layout in train_model.py.
train(descriptor, coco_trainloader, PATH_TO_SAVE, lr=1e-3, accum_interval=10, timestep=LAST_TIMESTEP, n_epochs=1)

HBox(children=(FloatProgress(value=0.0, max=5174.0), HTML(value='')))



FileNotFoundError: ignored

# Валидация

In [26]:
# Reuse the `descriptor` built in the Train section and load saved weights into it
# from <PATH_TO_SAVE>/weights/temp_inception.
descriptor.load(os.path.join(PATH_TO_SAVE, 'weights/temp_inception'))

In [27]:
# Decoder wrapper around the loaded descriptor; the inference loop below drives it
# with sampler.initialize(imgs) followed by sampler.run().
sampler = Sampler(descriptor)

In [28]:
####### COCO DATASET #######
val_transform = transforms.Compose([
                                    transforms.Resize((IMG_SIZE, IMG_SIZE)),
                                    transforms.ToTensor(),
                                    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                                         std=[0.229, 0.224, 0.225])
])
'''normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225])
to_img = transforms.ToPILImage()'''

batch_size = 20

path2data = os.path.abspath('/content/data/COCO')

coco_val = CocoCaptions(root = path2data + '/val2014',
                        annFile = path2data + '/annotations/captions_val2014.json',
                        transform=val_transform,
                        target_transform=descriptor.target_transform)

coco_valloader = get_coco_dataloader(coco_val, batch_size, 
                                     descriptor.special_ids.pad, shuffle=False)

loading annotations into memory...
Done (t=1.20s)
creating index...
index created!


In [29]:
# Accumulators for the validation pass: `predictions` is filled by the inference
# loop below; `captions` starts out empty as well.
predictions, captions = [], []

In [30]:
# Start from an empty list so that re-running this cell replaces the earlier
# results instead of appending a second copy of every prediction.
predictions = []

descriptor.eval()
pbar = tqdm(coco_valloader, total=len(coco_valloader), leave=False)
with torch.no_grad():
    # Each batch is (images, gold ids, mask); only the images are needed to
    # generate, so only they are moved to the model's device.
    for imgs, _gold_ids, _mask in pbar:
        imgs = imgs.to(descriptor.device)

        sampler.initialize(imgs)
        # Presumably one generated id sequence per image in the batch — verify in sampler.py.
        sys_desc_ids = sampler.run()
        predictions.extend(
            descriptor.tokenizer.convert_ids_to_tokens(sys_ids)
            for sys_ids in sys_desc_ids
        )

HBox(children=(FloatProgress(value=0.0, max=2026.0), HTML(value='')))

In [35]:
# Sanity check on the number of generated descriptions (40504 in the recorded run).
len(predictions)

40504

In [36]:
# Reload the validation split without any transforms so that each sample's target
# is the raw caption strings, which get_bleus uses as references.
# Note: this rebinds `coco_val`, replacing the transformed dataset created earlier.
coco_val = CocoCaptions(
    root=f'{path2data}/val2014',
    annFile=f'{path2data}/annotations/captions_val2014.json',
)

loading annotations into memory...
Done (t=1.22s)
creating index...
index created!


In [38]:
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.bleu_score import SmoothingFunction
import numpy as np

In [39]:
def get_bleus(predictions, dataset):
    """Score generated descriptions against a dataset's reference captions.

    Predictions and dataset samples are paired by position. Every reference
    caption is tokenised with the global ``descriptor.tokenizer`` so that
    references and candidates are compared in the same token space.

    Args:
        predictions: sequence of candidate token lists, one per sample.
        dataset: sized iterable of ``(image, captions)`` pairs, where
            ``captions`` is an iterable of raw caption strings.

    Returns:
        ``(bleu1, bleu4)`` as percentages, averaged over the pairs that were
        actually scored. ``(0.0, 0.0)`` when there is nothing to score.

    Notes:
        * ``zip`` stops at the shorter input, so the average is taken over the
          number of scored pairs rather than ``len(dataset)``; with inputs of
          equal length the result is the same as before.
        * NOTE(review): ``weights=(0, 0, 0, 1)`` scores 4-grams only; the usual
          cumulative BLEU-4 uses ``(0.25, 0.25, 0.25, 0.25)``. Left unchanged so
          that reported numbers stay comparable — confirm which one is wanted.
    """
    tokenizer = descriptor.tokenizer
    smoother = SmoothingFunction()
    score1 = 0
    score4 = 0
    n_scored = 0

    n_pairs = min(len(predictions), len(dataset))
    for candidate, (_, captions) in tqdm(zip(predictions, dataset), total=n_pairs):
        references = [
            tokenizer.convert_ids_to_tokens(tokenizer.encode(caption))
            for caption in captions
        ]
        score1 += sentence_bleu(references, candidate, weights=(1, 0, 0, 0),
                                smoothing_function=smoother.method1)
        score4 += sentence_bleu(references, candidate, weights=(0, 0, 0, 1),
                                smoothing_function=smoother.method1)
        n_scored += 1

    if n_scored == 0:
        # Nothing was scored: report zeros instead of dividing by zero.
        bleu1, bleu4 = 0.0, 0.0
    else:
        bleu1 = 100 * score1 / n_scored
        bleu4 = 100 * score4 / n_scored
    print("BLEU 1:", np.round(bleu1, 2),
          "BLEU 4:", np.round(bleu4, 2))
    return bleu1, bleu4

# Score the generated descriptions against the raw validation captions
# (pairs are matched by position, so `predictions` must be in dataset order).
bleu1, bleu4 = get_bleus(predictions, coco_val)

HBox(children=(FloatProgress(value=0.0, max=40504.0), HTML(value='')))


BLEU 1: 9.81 BLEU 4: 0.89
