In [None]:
# deep learning with PyTorch: Neural Style Transfer
# set Google Colab runtime

# !pip install torch torchvision
# !git clone https://github.com/parth1620/Project-NST.git

# load the pretrained VGG-19 model from the torchvision models subpackage
import torch
from torchvision import models

# load VGG-19 together with its pretrained weights
vgg = models.vgg19(pretrained = True)

# the full model prints a `features` part and a `classifier` part
# the classifier is not needed here: content and style are read from layers inside `features`
print(vgg)

# keep only the feature layers and print them again to confirm
vgg = vgg.features
print(vgg)

# freeze the pretrained weights so they are used as-is
# no gradients are computed for them in the training loop
vgg.requires_grad_(False)

# run on the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# show which device was selected
print(device)
vgg.to(device)

# preprocess image
from PIL import Image
from torchvision import transforms as T

# create preprocess function
# used to resize the image if it is larger than max_size
def preprocess(img_path, max_size = 500):
  """Load an image file and turn it into a normalized, batched tensor.

  Args:
    img_path: path of the image file to open.
    max_size: upper bound, in pixels, for the longer side of the image.
      Larger images are scaled down with their aspect ratio preserved;
      images that already fit keep their original size.

  Returns:
    A tensor of shape (1, 3, height, width): the RGB image scaled to [0, 1]
    by ToTensor and then normalized with the mean and std values copied from
    the torchvision.models documentation.
  """
  # convert() returns an independent in-memory image, so the file can be
  # released as soon as the with-block ends
  with Image.open(img_path) as img_file:
    image = img_file.convert('RGB')

  # all pre-trained models expect input images normalized in the same way:
  # loaded into a range of [0, 1] and then normalized using mean and std values
  steps = [
      T.ToTensor(),
      T.Normalize(mean = [0.485, 0.456, 0.406],
                  std = [0.229, 0.224, 0.225]),
  ]

  # PIL reports the size as (width, height)
  width, height = image.size
  longest_side = max(width, height)

  if longest_side > max_size:
    # T.Resize(int) matches the *shorter* side to the given value, which would
    # leave the longer side above max_size; pass an explicit (height, width)
    # so the longer side lands exactly on max_size
    scale = max_size / longest_side
    target_hw = (max(1, round(height * scale)), max(1, round(width * scale)))
    steps.insert(0, T.Resize(target_hw))

  image = T.Compose(steps)(image)

  # (channel, height, width) -> (batch size, channel, height, width)
  return image.unsqueeze(0)

# preprocess the content and style images and place them on the selected device
content_p = preprocess('/content/Project-NST/content11.jpg').to(device)
style_p = preprocess('/content/Project-NST/style12.jpg').to(device)

# report the resulting tensor shapes
print("Content Shape:", content_p.shape)
print("Style Shape:", style_p.shape)

# deprocess image
import numpy as np
import matplotlib.pyplot as plt

# deprocess the image so it can be plotted
# this is the opposite of preprocess

def deprocess(tensor):
  """Undo the preprocessing so an image tensor can be plotted.

  Args:
    tensor: image tensor of shape (1, channel, height, width), on any device.
      It may require grad (e.g. the target image being optimized).

  Returns:
    A numpy array of shape (height, width, channel), denormalized with the
    same mean and std values used for normalization and clipped to [0, 1].
  """
  # detach first: numpy() refuses tensors that require grad
  # then copy to the cpu so the caller's tensor is left untouched
  image = tensor.detach().to('cpu').clone().numpy()

  # (batch size, channel, height, width) -> (channel, height, width)
  image = image.squeeze(0)

  # (channel, height, width) -> (height, width, channel)
  image = image.transpose(1, 2, 0)

  # denormalize image with std value and mean value
  image = image * np.array([0.229, 0.224, 0.225]) + np.array([0.485, 0.456, 0.406])

  # keep the values inside the displayable [0, 1] range
  return image.clip(0, 1)

# turn both preprocessed tensors back into displayable arrays
content_d, style_d = deprocess(content_p), deprocess(style_p)

# report the array shapes
print("Deprocess Content Shape:", content_d.shape)
print("Deprocess Style Shape:", style_d.shape)

# show the two images side by side with matplotlib
fig, (ax1, ax2) = plt.subplots(nrows = 1, ncols = 2, figsize = (20, 10))

ax1.imshow(content_d)
ax2.imshow(style_d)

# get content, style features and create gram matrix
# extract style and content features from the vgg model that we have previously loaded and defined
def get_features(image, model, layers = None):
  """Run `image` through `model` layer by layer and collect selected outputs.

  Args:
    image: input tensor fed to the first child module of `model`.
    model: module whose child modules are applied one after another, in
      registration order (here the `features` part of VGG-19).
    layers: optional dict mapping a child-module name to the key under which
      that module's output is stored. When omitted, the VGG-19 layers used in
      this project are selected.

  Returns:
    A dict mapping each feature name to the output tensor of its layer.
  """
  if layers is None:
    # map names
    layers = {
        '0' : 'conv1_1', # style feature
        '5' : 'conv2_1', # style feature
        '10' : 'conv3_1', # style feature
        '19' : 'conv4_1', # style feature
        '21' : 'conv4_2', # content feature
        '28' : 'conv5_1' # style feature
    }

  # store content and style features
  features = {}

  # layer names that still have to be captured
  pending = set(layers)

  x = image

  # iterate the registered children directly so that every entry is applied,
  # including a module instance that is registered more than once
  for name, layer in model._modules.items():

    # output of one layer is the input for the next layer
    x = layer(x)

    if name in layers:
      features[layers[name]] = x
      pending.discard(name)

      # nothing after the last requested layer is needed, so stop early
      if not pending:
        break

  return features

# run both preprocessed images through the network and keep the selected layer outputs
content_f, style_f = get_features(content_p, vgg), get_features(style_p, vgg)

# create the gram matrix function
# gram matrix is the correlation between the filters (channels)
# the (channel, height, width) feature map gets unrolled into a C by (H x W) matrix
# take the unrolled matrix and multiply it by its own transpose
# C by (H x W) multiplied by (H x W) by C = C by C gram matrix

def gram_matrix(tensor):
  """Compute the gram matrix of a feature map.

  Args:
    tensor: feature map of shape (1, channel, height, width). The batch size
      must be 1; any other batch size makes the reshape below fail.

  Returns:
    A (channel, channel) tensor: the unrolled feature map multiplied by its
    own transpose.
  """
  _, c, h, w = tensor.size()

  # unroll every channel into one row
  # reshape (unlike view) also accepts non-contiguous inputs such as a
  # transposed or permuted feature map
  features = tensor.reshape(c, h * w)

  return torch.mm(features, features.t())

# precompute the gram matrix of every layer output stored for the style image
style_grams = {name : gram_matrix(feature_map) for name, feature_map in style_f.items()}

# create style and content loss function to apply for the optimization of the target image
def content_loss(target_conv4_2, content_conv4_2):
  """Return the mean squared difference between two feature tensors.

  Args:
    target_conv4_2: content-layer features of the target image.
    content_conv4_2: content-layer features of the content image.

  Returns:
    A scalar tensor holding the mean of the element-wise squared difference.
  """
  difference = target_conv4_2 - content_conv4_2
  return torch.mean(difference.pow(2))

# weight of each style layer's contribution to the style loss
style_weights = dict(
    conv1_1 = 1.0,
    conv2_1 = 0.75,
    conv3_1 = 0.2,
    conv4_1 = 0.2,
    conv5_1 = 0.2,
)

def style_loss(style_weights, target_features, style_grams):
  """Return the weighted style loss summed over the style layers.

  Args:
    style_weights: dict mapping a layer name to the weight of that layer.
    target_features: dict mapping a layer name to the target image's
      features at that layer, shaped (batch size, channel, height, width).
    style_grams: dict mapping a layer name to the style image's gram matrix
      at that layer.

  Returns:
    The sum, over the layers in `style_weights`, of
    weight * mean((target gram - style gram)**2) / (channel * height * width).
  """
  def weighted_layer_term(name):
    # gram matrix of the target image at this layer
    feature_map = target_features[name]
    gram_of_target = gram_matrix(feature_map)

    gram_of_style = style_grams[name]

    _, channels, height, width = feature_map.shape

    weighted = style_weights[name] * torch.mean((gram_of_target - gram_of_style)**2)

    # scale by the size of the feature map
    return weighted / (channels * height * width)

  return sum(weighted_layer_term(name) for name in style_weights)

# initialize the target image (the image we want to generate)
# it could start from random noise; here it starts as a copy of the content image
# move the copy to the device *before* enabling gradients: calling .to(device)
# on a tensor that already requires grad can return a non-leaf copy, and
# gradients are not accumulated on a non-leaf tensor
target = content_p.clone().to(device).requires_grad_(True)

# initial losses of the target against the content and style images
target_f = get_features(target, vgg)
print("Content Loss:", content_loss(target_f['conv4_2'], content_f['conv4_2']))
print("Style Loss:", style_loss(style_weights, target_f, style_grams))