In [1]:
import torch
import torchvision.transforms as transforms
from PIL import Image
import torch.nn as nn
import torchvision.models as models
import torch.optim as optim
from torchvision.utils import save_image
import torch.nn.functional as F
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Prefer the GPU whenever torch can see one; otherwise everything runs on the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
# last expression of the cell: displays which backend was selected
device.type

'cuda'

In [3]:
# loading the model vgg19 that will serve as the base model
# (pretrained weights are requested and only the `.features` sub-module is kept)
# NOTE(review): `vgg` is not referenced by any other cell visible here; the
# StyleTransferModel class builds its own copy — confirm this cell is still needed.
vgg = models.vgg19(pretrained=True).features

In [4]:
def imageLoader(imagePath):
    """Load an image file as a batched tensor on the global `device`.

    The image is forced to 3-channel RGB, resized to 512x512, converted to a
    tensor and given a leading batch dimension.

    Args:
        imagePath: path of the image file to open.

    Returns:
        Tensor of shape (1, 3, 512, 512) located on `device`.
    """
    # resizing the image and converting to tensor
    loader = transforms.Compose([transforms.Resize((512, 512)), transforms.ToTensor()])
    # `with` releases the file handle once the pixels are read. convert("RGB")
    # guarantees exactly three channels, so grayscale / RGBA / palette files do
    # not break the 3-channel mean/std normalisation applied by the model.
    with Image.open(imagePath) as image:
        image = image.convert("RGB")
    return loader(image).unsqueeze(0).to(device)

# Read the three numbered content/style image pairs from disk into two parallel lists.
content_images = []
style_images = []
for i in (1, 2, 3):
    content_tensor = imageLoader(f"images/Content images/content_image{i}.jpg")
    content_images.append(content_tensor)
    style_tensor = imageLoader(f"images/Style images/style_image{i}.jpg")
    style_images.append(style_tensor)

In [5]:
# [0, 5, 10, 19, 28] are the indices of the layers we will use to calculate the loss, as per the NST paper
# Defining a class that wraps the model

class StyleTransferModel(nn.Module):
    """Frozen VGG19 feature extractor for neural style transfer.

    `forward` normalises the input image and returns the activations produced
    by the layers whose indices are listed in `self.layers`.
    """

    def __init__(self):
        super(StyleTransferModel, self).__init__()
        # indices (as strings) of the layers whose activations are returned
        self.layers = ['0', '5', '10', '19', '28']
        # Registered as buffers so that `.to(...)` / `.cuda()` on the module moves
        # them together with the weights, instead of pinning them to a global
        # device at construction time. persistent=False keeps them out of the
        # state_dict, so the set of saved keys is unchanged.
        self.register_buffer(
            "mean", torch.tensor([0.485, 0.456, 0.406]).view(-1, 1, 1), persistent=False
        )
        self.register_buffer(
            "std", torch.tensor([0.229, 0.224, 0.225]).view(-1, 1, 1), persistent=False
        )
        # only layers 0..28 are needed, so the rest of the features are dropped
        self.model = models.vgg19(pretrained=True).features[:29]
        # The network is used purely as a fixed feature extractor: freezing the
        # weights stops backward() from computing and accumulating gradients
        # for them. Gradients still flow to the input image.
        for param in self.model.parameters():
            param.requires_grad_(False)

    def forward(self, img):
        """Return the list of selected layer activations for `img`.

        Args:
            img: image batch with 3 channels (the mean/std buffers broadcast
                over a channel dimension of size 3).

        Returns:
            List with one activation tensor per entry of `self.layers`, in
            network order.
        """
        # VGG networks are trained on images with each channel normalized by
        # mean=[0.485, 0.456, 0.406] and std=[0.229, 0.224, 0.225].
        img = (img - self.mean) / self.std
        # holds the activations from the chosen layers
        features = []
        for layer_num, layer in enumerate(self.model):
            img = layer(img)
            # NOTE(review): torchvision's VGG is believed to use in-place ReLU, in
            # which case a tensor stored here may be overwritten by the following
            # activation layer — confirm whether pre- or post-ReLU features are intended.
            if str(layer_num) in self.layers:
                features.append(img)

        return features

In [6]:
def calc_content_loss(generated, original):
    """Mean squared error between the generated and the original content activations."""
    return F.mse_loss(input=generated, target=original)

def _gram_matrix(feature_maps):
    """Return the (unnormalised) Gram matrix of a 4-D activation tensor."""
    batch_size, n_feature_maps, height, width = feature_maps.shape
    # one row per (batch, feature map) pair; reshape (rather than view) so that
    # non-contiguous activations are accepted as well
    flat = feature_maps.reshape(batch_size * n_feature_maps, height * width)
    return torch.mm(flat, flat.t())


def calc_style_loss(generated, style):
    """Style loss: MSE between the Gram matrices of two activation tensors.

    Args:
        generated: 4-D activations (batch, feature maps, height, width) of the
            generated image.
        style: 4-D activations of the style image; its Gram matrix must have
            the same shape as the one of `generated`.

    Returns:
        Scalar tensor with the mean squared difference of the Gram matrices.
        The Gram matrices are not divided by the feature-map size, so the
        magnitude of the loss grows with the spatial resolution.
    """
    return F.mse_loss(_gram_matrix(generated), _gram_matrix(style))

def calculate_loss(gen_features, orig_feautes, style_featues, alpha, beta):
    """Combine the per-layer content and style losses into one weighted total.

    The three feature sequences are walked in lockstep; for every layer the
    content loss (generated vs. original) and the style loss (generated vs.
    style) are accumulated. The result is
    `alpha * content_sum + beta * style_sum`.
    """
    content_sum = 0
    style_sum = 0
    per_layer = zip(gen_features, orig_feautes, style_featues)
    for gen_act, content_act, style_act in per_layer:
        content_sum += calc_content_loss(gen_act, content_act)
        style_sum += calc_style_loss(gen_act, style_act)

    # weighted total used as the optimisation objective
    return alpha * content_sum + beta * style_sum

In [7]:
# initialize the parameters required for fitting the model
epoch = 1000            # optimisation steps per (content, style) pair
learning_rate = 0.005
alpha = 8               # weight of the content loss
beta = 100              # weight of the style loss
counter = 0             # running index of the pair, used in the output file name

# The feature extractor is never trained (the optimizer only receives the
# generated image), so it is built once instead of reloading the pretrained
# weights for every content/style pair.
model = StyleTransferModel().to(device).eval()
# no gradients are needed for the network weights, only for the image
for param in model.parameters():
    param.requires_grad_(False)

# iterating over all the content and style images
for i, content_image in enumerate(content_images):
    # the content targets are constant during optimisation: compute them once,
    # without building an autograd graph
    with torch.no_grad():
        orig_feautes = model(content_image)

    for j, style_image in enumerate(style_images):
        print(f"CONTENT IMAGE {i+1} STYLE IMAGE {j+1}")
        counter += 1
        result_path = f"images/Results/generated{counter}.png"

        # the style targets are constant for this pair as well
        with torch.no_grad():
            style_featues = model(style_image)

        # the generated image starts as a copy of the content image
        generated_image = content_image.clone().requires_grad_(True)

        # adam updates the pixels of the generated image, not the model parameters
        optimizer = optim.Adam([generated_image], lr=learning_rate)

        for e in range(epoch):
            # only the generated image changes between iterations
            gen_features = model(generated_image)

            # weighted sum of the content and style losses over the selected layers
            total_loss = calculate_loss(gen_features, orig_feautes, style_featues, alpha, beta)
            # backpropagate the loss and update the pixel values of the generated image
            optimizer.zero_grad()
            total_loss.backward()
            optimizer.step()

            # report the loss and save a checkpoint every 100 iterations
            if e % 100 == 0:
                print(f"episode {e}, loss = {total_loss}")
                save_image(generated_image, result_path)

        # the last checkpoint above is written 100 iterations before the end,
        # so store the fully optimised image too
        save_image(generated_image, result_path)

CONTENT IMAGE 1 STYLE IMAGE 1
episode 0, loss = 726358622208.0
episode 100, loss = 9133766656.0
episode 200, loss = 4170377216.0
episode 300, loss = 2415852544.0
episode 400, loss = 1666021248.0
episode 500, loss = 1271853952.0
episode 600, loss = 1027066688.0
episode 700, loss = 857640448.0
episode 800, loss = 732144704.0
episode 900, loss = 635751232.0
CONTENT IMAGE 1 STYLE IMAGE 2
episode 0, loss = 1076838531072.0
episode 100, loss = 24068179968.0
episode 200, loss = 8712312832.0
episode 300, loss = 4135428864.0
episode 400, loss = 2486430720.0
episode 500, loss = 1744142848.0
episode 600, loss = 1337708800.0
episode 700, loss = 1084915200.0
episode 800, loss = 915507648.0
episode 900, loss = 791683904.0
CONTENT IMAGE 1 STYLE IMAGE 3
episode 0, loss = 1053593108480.0
episode 100, loss = 41669963776.0
episode 200, loss = 15588938752.0
episode 300, loss = 8232599040.0
episode 400, loss = 5958195200.0
episode 500, loss = 4788913152.0
episode 600, loss = 4014006016.0
episode 700, loss =