In [1]:
import torch
import torch.nn as nn
import torch.nn.init
import torchvision.models as models
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms

from torchtext.models import ROBERTA_BASE_ENCODER
from torchtext.functional import to_tensor

import numpy as np
import pandas as pd
import glob
import os
from PIL import Image

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def normalization(X):
    """L2-normalization of features columns"""
    norm = torch.pow(X, 2).sum(dim=1, keepdim=True).sqrt()
    X = torch.div(X, norm)
    
    return X

In [3]:
class ImageEncoder(nn.Module):

  def __init__(self, embedding_size, cnn_type):
    """Initializing parameters"""
    super(ImageEncoder).__init__()
    self.embedding_size = embedding_size # Size of projected image
    self.cnn = self.load_cnn(cnn_type)

    # No need to finetune parameters = frozen layers
    for param in self.cnn.parameters():
        param.requires_grad = False

    # Replacing last fully connected layer with new one
    self.fc = nn.Linear(self.cnn.classifier._modules['6'].in_features, embedding_size)
    self.cnn.classifier = nn.Sequential(*list(self.cnn.classifier.children())[:-1])

    # Initializing the weights of fully-connected layer, which makes projection to new space
    self.initialization_weights()
  
  def load_cnn(self, cnn_type):
    """Loading pretrained model"""
    model = models.__dict__[cnn_type](pretrained=True)

    return model

  def initialization_weights(self):
    """Xavier initialization"""
    r = np.sqrt(6.) / np.sqrt(self.fc.in_features + self.fc.out_features)
    self.fc.weight.data.uniform_(-r, r)
    self.fc.bias.data.fill_(0)

  def forward(self, X):
    """Creation of features"""
    # Creation of embeddings
    features = self.cnn(X)

    # Normalization of embeddings
    features = normalization(features)

    # Projection to new space
    features = self.fc(features)

    return features

In [4]:
ROBERTA_OUT_DIM = 768

class TextEncoder(nn.Module):

    def __init__(self, embedding_size):
        """Initializing parameters"""
        super(TextEncoder).__init__()
        self.embedding_size = embedding_size # Size of projected text
        self.roberta = ROBERTA_BASE_ENCODER.get_model()
        self.transform = ROBERTA_BASE_ENCODER.transform()

        # Linear layer
        self.fc = nn.Linear(ROBERTA_OUT_DIM, embedding_size)

        # Initializing the weights of fully-connected layer, which makes projection to new space
        self.initialization_weights()
        
    def _roberta_encode(self, batch):
        transformed = self.transform(batch)
        model_input = to_tensor(transformed, padding_value=1)
        return self.roberta(model_input)

    def initialization_weights(self):
        """Xavier initialization"""
        r = np.sqrt(6.) / np.sqrt(self.fc.in_features + self.fc.out_features)
        self.fc.weight.data.uniform_(-r, r)
        self.fc.bias.data.fill_(0)

    def forward(self, X, lengths=None):
        """Creation of features"""
        # Creation of embeddings
        features = self._roberta_encode(X)

        # Normalization of embeddings
        features = normalization(features)

        # Projection to new space
        features = self.fc(features)

        return features

In [5]:
class FullEncoder(nn.Module):

    def __init__(self, embedding_size, cnn_type):
        """Initializing parameters"""
        super(FullEncoder).__init__()
        self.embedding_size = embedding_size # Size of projected text
        self.image_encoder = ImageEncoder(embedding_size)
        self.text_encoder = TextEncoder(embedding_size)

    def forward(self, X):
        """Creation of features"""
        img_feas = self.image_encoder(X)
        txt_feas = self.text_encoder(X)
        return img_feas, txt_feas

In [10]:
ex = '18 Kt Rose Gold Supreme Swan Charm Bracelet The Supreme Swan Charm Bracelet is the symbol of love, peace and grace. Gift this to a loved one that adds these elements to your life. Made in Sterling Silver with a rose gold polish.'
roberta = ROBERTA_BASE_ENCODER.get_model()
transform = ROBERTA_BASE_ENCODER.transform()
transformed = transform([ex])
model_input = to_tensor(transformed, padding_value=1)
out = roberta(model_input)
print(out.shape)
out

torch.Size([1, 54, 768])


tensor([[[-0.0762,  0.1758,  0.0180,  ..., -0.0412, -0.0501,  0.1010],
         [ 0.1527,  0.1693,  0.0524,  ..., -0.2322,  0.1401, -0.0715],
         [ 0.0464,  0.2457, -0.1961,  ..., -0.1256, -0.0039, -0.2478],
         ...,
         [ 0.0140,  0.3970,  0.0445,  ...,  0.4153, -0.0614, -0.0562],
         [-0.4992,  0.5983, -0.2194,  ...,  0.0704, -0.4470, -0.5318],
         [ 0.1097,  0.3402,  0.0988,  ...,  0.6461, -0.3220,  0.4176]]],
       grad_fn=<TransposeBackward0>)