In [7]:
import os
# from utils.zsg_data import FlickrDataset
# from models.slic_vit import SLICViT
# from models.resnet_high_res import ResNetHighRes
import torch
import torch.nn as nn
from torch.nn.parallel import DistributedDataParallel as DDP
import torch.optim as optim
import torch.multiprocessing as mp
import torch.distributed as dist
# from torchvision import models, transforms
from torch.utils.data import DataLoader
from utils.vgp_data import FlickrVGPsDataset
from models.vgp_vit import VGPViT
import argparse
from tqdm import tqdm
from datetime import datetime


In [8]:
class ScoreMapComparator(nn.Module):
    def __init__(self, pretrained_model, pretrained_model_args):
        super(ScoreMapComparator, self).__init__(pretrained_model)
        self.conv1 = nn.Conv2d(1, 64, kernel_size=3, stride=1, padding=1)
        self.relu = nn.ReLU()
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.fc1 = nn.Linear(64 * 112 * 112, 128)
        self.fc2 = nn.Linear(128, 1)
        self.sigmoid = nn.Sigmoid()
        self.map_model = pretrained_model(**pretrained_model_args)

    def forward(self, img, phrases):
        x1,x2 = self.map_model(img, phrases)
        x1 = self.pool(self.relu(self.conv1(x1)))
        x2 = self.pool(self.relu(self.conv1(x2)))
        x1 = x1.view(-1, 64 * 112 * 112)
        x2 = x2.view(-1, 64 * 112 * 112)
        x = torch.cat((x1, x2), dim=1)
        x = self.relu(self.fc1(x))
        x = self.sigmoid(self.fc2(x))
        return x

In [9]:
def setup_gpu(gpu, args):
    torch.manual_seed(0)
    torch.cuda.set_device(gpu)
    rank = args.nr * args.gpus + gpu	                          
    dist.init_process_group(                                   
    	backend='nccl',                                         
   		init_method='env://',                                   
    	world_size=args.world_size,                              
    	rank=rank                                               
    )      
    return rank


def load_dataset(rank, batch_size, args):
    train_dataset = FlickrVGPsDataset(data_type='train')
    val_dataset = FlickrVGPsDataset(data_type='val')
    if args.num_samples > 0:
        train_dataset.image_paths = train_dataset.image_paths[:args.num_samples]
    # train_sampler = torch.utils.data.distributed.DistributedSampler(
    # 	train_dataset,
    # 	num_replicas=args.world_size,
    # 	rank=rank
    # )
    train_loader = DataLoader(
        dataset=train_dataset,
        batch_size=batch_size,
        shuffle=True,
        num_workers=0,
        pin_memory=True,
        # sampler=train_sampler
    )

    # val_sampler = torch.utils.data.distributed.DistributedSampler(
    # 	val_dataset,
    # 	num_replicas=args.world_size,
    # 	rank=rank
    # )
    val_loader = DataLoader(
        dataset=val_dataset,
        batch_size=batch_size,
        shuffle=True,
        num_workers=0,
        pin_memory=True,
        # sampler=val_sampler
    )
    return train_loader, val_loader

def setup_model(gpu, args):
    # Score map model
    if args.map_model == 'vgp_vit':
        map_model = VGPViT
        map_model_args = {
            'model': 'vit14',
            'alpha': 0.75,
            'aggregation': 'mean',
            'n_segments': list(range(100, 601, 50)),
            'temperature': 0.02,
            'upsample': 2,
            'start_block': 0,
            'compactness': 50,
            'sigma': 0,
        }
    # TODO: other baseline models
    else:
        assert False
    
    # Train Similarity CNN
    sim_net = ScoreMapComparator(map_model, map_model_args)
    model = DDP(sim_net, device_ids=[gpu])


def train(gpu, args):
    rank = setup_gpu(gpu, args)
    train_loader, val_loader = load_dataset(rank=rank,
                                            batch_size=100, 
                                            args=args)
    model = setup_model(gpu, args)

    # Train
    train_loss = []
    train_acc = []
    valid_loss = []
    valid_acc = []

    # 損失関数の定義
    criterion = nn.CrossEntropyLoss()

    # 最適化手法の定義
    optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

    start = datetime.now()
    for epoch in range(args.epochs):
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()
                dataloader = train_loader
            else:
                model.eval()
                dataloader = val_loader

            epoch_loss = 0.0
            epoch_corrects = 0

            for batch_idx, (idx, img, phrases, isVGPs) in tqdm(enumerate(dataloader)):
                img, phrases = img.cuda(non_blocking=True), phrases.cuda(non_blocking=True)
                isVGPs = isVGPs=='True'
                isVGPs = isVGPs.cuda(non_blocking=True)
                optimizer.zero_grad() # optimizerを初期化

                with torch.set_grad_enabled(phase=='train'):
                    outputs = model(img, phrases)
                    loss = criterion(outputs, isVGPs)
                    _, preds = torch.max(outputs, 1)

                    if phase == 'train':
                        loss.backward()
                        optimizer.step()
                    
                    epoch_loss += loss.item()
                    epoch_corrects += torch.sum(preds==isVGPs.data)
        
            epoch_loss = epoch_loss / len(dataloader.dataset)
            epoch_acc = epoch_corrects.double() / len(dataloader.dataset)

            if phase == 'train':
                train_loss.append(epoch_loss)
                train_acc.append(epoch_acc.cpu())
            else:
                valid_loss.append(epoch_loss)
                valid_acc.append(epoch_acc.cpu())

            print('{} Loss: {:.4f}, Acc: {:.4f}'.format(phase, epoch_loss, epoch_acc))


        # チェックポイントの保存
        torch.save({'epoch': epoch,
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict()
                   },
                   'checkpoints/checkpoint{}.pt'.format(epoch + 1))

    if gpu == 0:
        print("Training complete in: " + str(datetime.now() - start))
        
    return train_loss, train_acc, valid_loss, valid_acc

In [10]:
gpu = 0
args = {
    'map_model': 'vgp_vit',
    'task': 'train',
    'num_samples': 80000,
    'nodes': 1,
    'gpus': 3,
    'nr': 0,
    'epochs': 100
}

class dotdict(dict):
    """dot.notation access to dictionary attributes"""
    __getattr__ = dict.get
    __setattr__ = dict.__setitem__
    __delattr__ = dict.__delitem__

args = dotdict(args)
args.world_size = args.gpus * args.nodes                #
# os.environ['MASTER_ADDR'] = 'localhost'              #
# os.environ['MASTER_PORT'] = '12345'                
# mp.spawn(train, nprocs=args.gpus, args=(args,))  
# rank = setup_gpu(gpu, args)
torch.manual_seed(0)
torch.cuda.set_device(gpu)
train_loader, val_loader = load_dataset(rank=0,
                                        batch_size=100, 
                                        args=args)

In [24]:
from torchtext.data.utils import get_tokenizer
from torchvision import transforms


for batch in tqdm(train_loader):
    idices, images, phrase_pairs, labels = [d for d in batch.values()]
    print(images, type(images), len(images))
    print( len(phrase_pairs[0]))
    images_tensor = [image.cuda(non_blocking=True) for image in images]
    # print(images_tensor)
    # Convert phrases list to a PyTorch tensor
    tokenizer = get_tokenizer('basic_english')
    tokenized_phrases = [[tokenizer(phrase) for phrase in phrase_list] for phrase_list in phrase_pairs]
    print(tokenized_phrases[0])
    # phrases_tensor = [[torch.tensor(tokenized_phrase) for tokenized_phrase in phrase_list] for phrase_list in tokenized_phrases]

    labels = [label=='True' for label in labels]
    labels_tensor = torch.tensor(labels)
    labels_tensor = labels_tensor.cuda(non_blocking=True)


    print(phrases_tensor)
    print(labels_tensor)
    print(len(labels))
    break

  0%|          | 0/800 [00:00<?, ?it/s]

tensor([[[[255, 190,   3],
          [253, 184,   2],
          [254, 181,   1],
          ...,
          [  0,   0,   0],
          [  0,   0,   0],
          [  0,   0,   0]],

         [[237, 172,  11],
          [251, 185,  12],
          [255, 191,  10],
          ...,
          [  0,   0,   0],
          [  0,   0,   0],
          [  0,   0,   0]],

         [[128,  31,   5],
          [140,  48,   2],
          [157,  72,   6],
          ...,
          [  0,   0,   0],
          [  0,   0,   0],
          [  0,   0,   0]],

         ...,

         [[ 79,  50,  19],
          [ 80,  51,  19],
          [ 79,  51,  20],
          ...,
          [ 84,  50,  20],
          [ 82,  50,  18],
          [ 80,  50,  16]],

         [[ 71,  46,   8],
          [ 72,  47,  10],
          [ 75,  49,  11],
          ...,
          [ 83,  47,  20],
          [ 84,  48,  22],
          [ 83,  50,  19]],

         [[ 79,  51,   9],
          [ 75,  48,  12],
          [ 74,  45,  10],
         




NameError: name 'phrases_tensor' is not defined

In [20]:
tokenizer = get_tokenizer('basic_english')
print(tokenizer('his teammates'))

['his', 'teammates']
