# Task-1

## Complete the missing code for the GEOMETRY model below according to the paper. 

Paper: "LipSync3D: Data-Efficient Learning of Personalized 3D Talking Faces from Video using Pose and Lighting Normalization" (https://arxiv.org/abs/2106.04185)

In [1]:
import os
import torch
import torch.nn as nn

# Referred https://github.com/longredzhong/LipSync3D/blob/master/lipsync3d/model.py
class View(nn.Module):
    """Reshape the incoming tensor to a fixed target shape.

    Lets a reshape step be placed inside an ``nn.Sequential`` pipeline.

    Args:
        shape: Sequence of target dimensions; it is unpacked into the
            reshape call, so one entry may be ``-1`` to have that dimension
            inferred from the remaining element count.
    """

    def __init__(self, shape):
        super(View, self).__init__()
        self.shape = shape

    def forward(self, x):
        """Return ``x`` rearranged to ``self.shape``."""
        # ``reshape`` hands back a view whenever the memory layout allows it
        # (same result as ``view``) and only copies when it does not, so
        # non-contiguous inputs no longer raise a RuntimeError.
        return x.reshape(*self.shape)

class Geometry_Model(nn.Module):
    """Model consisting of an audio encoder and a geometry decoder.

    Input:  audio spectrogram batch of shape B x 2 x 256 x 24.
    Output: vertex tensor of shape B x 1404 (468 vertices * 3 coordinates).

    Args:
        Ns: Length of the audio latent code. For a 2 x 256 x 24 spectrogram
            the convolutional encoder below always emits 4 * 4 * 2 = 32
            values per sample, so 32 is the only length it can produce.

    Raises:
        ValueError: If ``Ns`` does not equal the 32 features per sample that
            the encoder emits.
    """

    # Features per sample leaving the last convolution: 4 channels x 4 x 2.
    _ENCODER_FEATURES = 4 * 4 * 2

    def __init__(self, Ns):
        super(Geometry_Model, self).__init__()
        # Fail early: with any other Ns the latent code could not be laid out
        # as one row of Ns values per sample, so the batch size would change
        # or the decoder would receive the wrong number of features.
        if Ns != self._ENCODER_FEATURES:
            raise ValueError(
                "Ns must be %d (the audio encoder emits 4 x 4 x 2 features "
                "per sample), got %r" % (self._ENCODER_FEATURES, Ns)
            )
        self.Ns = Ns

        # Audio encoder
        # We directly use complex spectrograms St.
        # Each St tensor is passed through a 12 layer deep encoder network where
        #   - the first 6 layers apply 1D convolutions over frequencies
        #     (kernel 3 x 1, stride 2 x 1)
        #   - the subsequent 6 layers apply 1D convolutions over time
        #     (kernel 1 x 3, stride 1 x 2)
        # all with leaky ReLU activation.
        #
        # We used hyperparameter search to determine the latent code
        # length, Ns = 32.
        #
        # Input: audio spectrogram of shape B x 2 x 256 x 24
        # Output: latent code of shape B x Ns
        #
        # Referred to https://github.com/leventt/surat/blob/master/surat.py for the parent paper model.
        # We basically adopt the output channels at each layer from that paper. Kernel and stride are
        # the values mentioned in our paper.
        self.audio_encoder = nn.Sequential(
            # Convolution over frequencies
            # 2 x 256 x 24 -> 72 x 128 x 24
            nn.Conv2d(2, 72, (3, 1), (2, 1), (1, 0)),
            nn.LeakyReLU(),

            # 72 x 128 x 24 -> 108 x 64 x 24
            nn.Conv2d(72, 108, (3, 1), (2, 1), (1, 0)),
            nn.LeakyReLU(),

            # 108 x 64 x 24 -> 162 x 32 x 24
            nn.Conv2d(108, 162, (3, 1), (2, 1), (1, 0)),
            nn.LeakyReLU(),

            # 162 x 32 x 24 -> 243 x 16 x 24
            nn.Conv2d(162, 243, (3, 1), (2, 1), (1, 0)),
            nn.LeakyReLU(),

            # 243 x 16 x 24 -> 256 x 8 x 24
            nn.Conv2d(243, 256, (3, 1), (2, 1), (1, 0)),
            nn.LeakyReLU(),

            # 256 x 8 x 24 -> 256 x 4 x 24
            nn.Conv2d(256, 256, (3, 1), (2, 1), (1, 0)),
            nn.LeakyReLU(),

            # Convolution over time
            # 256 x 4 x 24 -> 128 x 4 x 13
            nn.Conv2d(256, 128, (1, 3), (1, 2), (0, 2)),
            nn.LeakyReLU(),

            # 128 x 4 x 13 -> 64 x 4 x 8
            nn.Conv2d(128, 64, (1, 3), (1, 2), (0, 2)),
            nn.LeakyReLU(),

            # 64 x 4 x 8 -> 32 x 4 x 5
            nn.Conv2d(64, 32, (1, 3), (1, 2), (0, 2)),
            nn.LeakyReLU(),

            # 32 x 4 x 5 -> 16 x 4 x 4
            nn.Conv2d(32, 16, (1, 3), (1, 2), (0, 2)),
            nn.LeakyReLU(),

            # 16 x 4 x 4 -> 8 x 4 x 3
            nn.Conv2d(16, 8, (1, 3), (1, 2), (0, 2)),
            nn.LeakyReLU(),

            # 8 x 4 x 3 -> 4 x 4 x 2 (note the smaller time padding of 1)
            nn.Conv2d(8, 4, (1, 3), (1, 2), (0, 1)),
            nn.LeakyReLU(),

            # 4 x 4 x 2 -> Ns. Everything except the batch axis is
            # flattened, so the batch size is never altered by this step.
            nn.Flatten(start_dim=1),
        )

        # Geometry decoder
        # Maps the latent code from the audio encoder to the vertex deformations.
        # It consists of two fully connected layers with 150 and 1404 units
        # and linear activations, with a dropout layer in the middle.
        # The resulting output is 468 vertices (1404 = 468 * 3).
        #
        # Input: latent code from the audio encoder, shape B x Ns
        # Output: tensor of shape B x 1404
        self.geometry_decoder = nn.Sequential(
            nn.Linear(self.Ns, 150),
            nn.Dropout(0.5),
            nn.Linear(150, 1404),
        )

    def forward(self, audio_spectogram):
        """Encode the spectrogram into a latent code and decode it to vertices.

        Args:
            audio_spectogram: Spectrogram batch of shape B x 2 x 256 x 24.

        Returns:
            Tensor of shape B x 1404 produced by the geometry decoder.
        """
        # spectrogram : B x 2 x 256 x 24 -> latent : B x Ns
        latent = self.audio_encoder(audio_spectogram)
        # latent : B x Ns -> vertices : B x 1404
        vertices = self.geometry_decoder(latent)

        return vertices

In [2]:
# torchinfo is a third-party package; install it first if it is missing:
#!pip install torchinfo
from torchinfo import summary

# Number of spectrograms in the dummy batch fed to the summary.
batch_size = 16

# We used hyperparameter search to determine the latent code length, Ns = 32.
model = Geometry_Model(Ns=32)

# Per-layer output shapes and parameter counts for a
# batch_size x 2 x 256 x 24 spectrogram input.
summary(model, input_size=(batch_size, 2, 256, 24))

Layer (type:depth-idx)                   Output Shape              Param #
Geometry_Model                           [16, 1404]                --
├─Sequential: 1-1                        [16, 32]                  --
│    └─Conv2d: 2-1                       [16, 72, 128, 24]         504
│    └─LeakyReLU: 2-2                    [16, 72, 128, 24]         --
│    └─Conv2d: 2-3                       [16, 108, 64, 24]         23,436
│    └─LeakyReLU: 2-4                    [16, 108, 64, 24]         --
│    └─Conv2d: 2-5                       [16, 162, 32, 24]         52,650
│    └─LeakyReLU: 2-6                    [16, 162, 32, 24]         --
│    └─Conv2d: 2-7                       [16, 243, 16, 24]         118,341
│    └─LeakyReLU: 2-8                    [16, 243, 16, 24]         --
│    └─Conv2d: 2-9                       [16, 256, 8, 24]          186,880
│    └─LeakyReLU: 2-10                   [16, 256, 8, 24]          --
│    └─Conv2d: 2-11                      [16, 256, 4, 24]         