<a href="https://colab.research.google.com/github/eyaler/avatars4all/blob/master/fomm_bibi.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Demo for paper "First Order Motion Model for Image Animation"

## **Talking head (VoxCeleb) model**

### Made just a little bit more accessible by Eyal Gruss (https://eyalgruss.com, eyalgruss@gmail.com)


##### Original project: https://aliaksandrsiarohin.github.io/first-order-model-website

##### Original notebook: https://colab.research.google.com/github/AliaksandrSiarohin/first-order-model/blob/master/demo.ipynb

##### Faceswap notebook: https://colab.research.google.com/github/AliaksandrSiarohin/motion-cosegmentation/blob/master/part_swap.ipynb

##### Notebook with video enhancement: https://colab.research.google.com/github/tg-bomze/Face-Image-Motion-Model/blob/master/Face_Image_Motion_Model_(Photo_2_Video)_Eng.ipynb

##### Avatarify - a live version (requires local installation): https://github.com/alievk/avatarify

##### Wav2Lip - a different model for lip syncing from audio: http://bhaasha.iiit.ac.in/lipsync

#### **Stuff I made**:
##### Avatars4all repository: https://github.com/eyaler/avatars4all
##### Notebook for live webcam in the browser: https://colab.research.google.com/github/eyaler/avatars4all/blob/master/fomm_live.ipynb
##### Notebook for talking head model: https://colab.research.google.com/github/eyaler/avatars4all/blob/master/fomm_bibi.ipynb
##### Notebook for full body models (FOMM): https://colab.research.google.com/github/eyaler/avatars4all/blob/master/fomm_fufu.ipynb
##### Notebook for full body models (impersonator): https://colab.research.google.com/github/eyaler/avatars4all/blob/master/ganozli.ipynb
##### Notebook for full body models (impersonator++): https://colab.research.google.com/github/eyaler/avatars4all/blob/master/ganivut.ipynb
##### Notebook for Wav2Lip audio based lip syncing: https://colab.research.google.com/github/eyaler/avatars4all/blob/master/melaflefon.ipynb
##### List of more generative tools: https://j.mp/generativetools

In [None]:
#@title Setup

# Fetch the model code and weights into /content, then install the Python packages
# the later cells import.
%cd /content
!git clone --depth 1 https://github.com/eyaler/first-order-model
# Two sources for the same checkpoint file: wget's -nc flag skips a download when the
# target already exists, so the second URL only matters if the first left no file behind.
!wget --no-check-certificate -nc https://openavatarify.s3.amazonaws.com/weights/vox-adv-cpk.pth.tar
!wget --no-check-certificate -nc https://eyalgruss.com/fomm/vox-adv-cpk.pth.tar

# Pre-populate the torch hub checkpoint directory with the s3fd and 2DFAN4 weights
# (presumably the files face-alignment would otherwise fetch itself - TODO confirm).
!mkdir -p /root/.cache/torch/hub/checkpoints
%cd /root/.cache/torch/hub/checkpoints
!wget --no-check-certificate -nc https://eyalgruss.com/fomm/s3fd-619a316812.pth
!wget --no-check-certificate -nc https://eyalgruss.com/fomm/2DFAN4-11f355bf06.pth.tar
%cd /content

# Pinned versions; youtube-dl is installed from the repository head instead of a release.
!pip install -U git+https://github.com/ytdl-org/youtube-dl
!pip install imageio==2.9.0
!pip install imageio-ffmpeg==0.4.5
!pip install git+https://github.com/1adrianb/face-alignment@v1.0.1
!pip install pyyaml==5.4.1

In [None]:
#@title Get the Driver video and Avatar image from the web
#@markdown 1. You can change the URLs to your **own** stuff!
#@markdown 2. Alternatively, you can upload **local** files in the next cells

video_url = 'https://www.youtube.com/watch?v=OziXYniB5x4' #@param {type:"string"}
image_url = 'https://www.srugim.co.il/wp-content/uploads/2010/10/%D7%90%D7%97%D7%9E%D7%93-%D7%98%D7%99%D7%91%D7%99.jpg' #@param {type:"string"}

# An empty URL skips the corresponding download, leaving any existing file in place.
if video_url:
  # Remove a stale merged file so the rename below cannot pick up an old download.
  !rm -f /content/video.mp4
  # The format selector asks for mp4 at up to 360p and excludes AV1 streams, with progressively looser fallbacks.
  !youtube-dl -f "bestvideo[ext=mp4][vcodec!*=av01][height<=360]+bestaudio[ext=m4a]/mp4[height<=360][vcodec!*=av01]/mp4[vcodec!*=av01]/mp4" "$video_url" --merge-output-format mp4 -o /content/video
  # Later cells read the extension-less path /content/video; presumably the merge step
  # can leave the result at video.mp4 instead, hence the rename - TODO confirm.
  !mv /content/video.mp4 /content/video 

if image_url:
  # Saved without an extension; later cells read /content/image.
  !wget "$image_url" -O /content/image

In [None]:
#@title Optionally upload local Driver video { run: "auto" }
manually_upload_video = False #@param {type:"boolean"}
if manually_upload_video:
  from google.colab import files
  import shutil

  %cd /content/sample_data
  try:
    uploaded = files.upload()
  except Exception as e:
    %cd /content
    raise e

  for fn in uploaded:
    shutil.move('/content/sample_data/'+fn, '/content/video')
    break
  %cd /content

In [None]:
#@title Optionally upload local Avatar image { run: "auto" }
manually_upload_image = False #@param {type:"boolean"}
if manually_upload_image:
  from google.colab import files
  import shutil

  %cd /content/sample_data
  try:
    uploaded = files.upload()
  except Exception as e:
    %cd /content
    raise e

  for fn in uploaded:
    shutil.move('/content/sample_data/'+fn, '/content/image')
    break
  %cd /content

In [None]:
#@title Optionally shorten Driver video
start_seconds = 0 #@param {type:"number"}
duration_seconds =  60#@param {type:"number"}
# Clamp negative form input to zero.
start_seconds = max(start_seconds,0)
duration_seconds = max(duration_seconds,0)

# A duration of 0 disables trimming altogether.
if duration_seconds:
  # Keep the untrimmed file as full_video and write the trimmed clip back to /content/video.
  # NOTE(review): running this cell twice moves the already-trimmed clip over full_video,
  # so the second trim is applied to the first trim's output.
  !mv /content/video /content/full_video
  !ffmpeg -ss $start_seconds -t $duration_seconds -i /content/full_video -f mp4 /content/video -y

In [None]:
#@title Prepare assets
#@markdown If you ran out of RAM this means that the video is too large. You can shorten it above.

# Form parameters: the center_*/crop_* flags are passed to get_crop() as
# center_face/crop_face, and the expansion factors as expansion_factor.
center_video_to_head = True #@param {type:"boolean"}
crop_video_to_head = True #@param {type:"boolean"}
video_crop_expansion_factor = 2.5 #@param {type:"number"}
center_image_to_head = True #@param {type:"boolean"}
crop_image_to_head = False #@param {type:"boolean"}
image_crop_expansion_factor = 2.5 #@param {type:"number"}
# Clamp to at least 1; a factor of 1 leaves the landmark bounding box unexpanded.
video_crop_expansion_factor = max(video_crop_expansion_factor, 1)
image_crop_expansion_factor = max(image_crop_expansion_factor, 1)

import imageio
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.animation as animation
from skimage.transform import resize
from IPython.display import HTML, clear_output
import warnings
# Silence all warnings for the rest of the session.
warnings.filterwarnings("ignore")

import face_alignment
# Shared 2D landmark detector, used by the cropping helpers and the alignment search.
fa = face_alignment.FaceAlignment(face_alignment.LandmarksType._2D, flip_input=True,
                                      device='cuda')

def create_bounding_box(target_landmarks, expansion_factor=1):
    """Return one [x, y, width, height] row per 68-point landmark set.

    The input is reshaped to (-1, 68, 2). Each box is the extent of its
    landmarks, grown symmetrically so that width and height are multiplied
    by ``expansion_factor`` (1 leaves the box unchanged).
    """
    points = np.array(target_landmarks).reshape(-1, 68, 2)
    lower = points.min(axis=1)
    upper = points.max(axis=1)

    # Half of the extra size is added on each side of the box.
    margin = (upper - lower) * ((expansion_factor-1)/2)
    lower -= margin
    upper += margin
    return np.hstack((lower, upper - lower))

def fix_dims(im):
    """Return ``im`` with three channels on its last axis.

    A 2-D image is copied into three identical channels; for any other
    input, only the first three entries of the last axis are kept.
    """
    if im.ndim == 2:
        im = np.stack((im, im, im), axis=-1)
    return im[..., :3]

def get_crop(im, center_face=True, crop_face=True, expansion_factor=1, landmarks=None):
    """Pick a square crop window for ``im`` and return it as (x0, x1, y0, y1) ints.

    When either flag is set and no ``landmarks`` are supplied, they are detected
    with the module-level ``fa``. With landmarks available, the largest-area
    bounding box is used:

    * ``crop_face`` true: a square with side max(width, height) of that box,
      centered on it (it may extend past the image borders).
    * otherwise: the largest square that fits the image, centered on the box
      and clamped to stay inside the image.

    With both flags off, or when no face is found, the result is the centered
    largest square of the image.
    """
    im = fix_dims(im)
    wants_face = center_face or crop_face
    if wants_face and not landmarks:
        landmarks = fa.get_landmarks_from_image(im)

    if not (wants_face and landmarks):
        # No face information to use: centered square of the whole image.
        img_h, img_w = im.shape[:2]
        side = min(img_h, img_w)
        left = (img_w-side)//2
        top = (img_h-side)//2
        return int(left), int(left + side), int(top), int(top + side)

    boxes = create_bounding_box(landmarks, expansion_factor=expansion_factor)
    # Largest-area box wins; sorting keeps the last one among equal areas.
    x0, y0, w, h = sorted(boxes, key=lambda box: box[2]*box[3])[-1]
    if crop_face:
        side = max(h, w)
        x0 += (w-side)//2
        y0 += (h-side)//2
    else:
        img_h, img_w = im.shape[:2]
        side = min(img_h, img_w)
        x0 = min(max(0, x0+(w-side)//2), img_w-side)
        y0 = min(max(0, y0+(h-side)//2), img_h-side)
    return int(x0), int(x0 + side), int(y0), int(y0 + side)

def pad_crop_resize(im, x0=None, x1=None, y0=None, y1=None, new_h=256, new_w=256):
    """Crop ``im`` to columns [x0, x1) and rows [y0, y1), then resize to (new_h, new_w).

    Bounds left as None default to the image edges. Where the window reaches
    outside the image, the image is first padded by repeating its edge pixels.
    """
    im = fix_dims(im)
    h, w = im.shape[:2]
    x0 = 0 if x0 is None else x0
    x1 = w if x1 is None else x1
    y0 = 0 if y0 is None else y0
    y1 = h if y1 is None else y1

    # Amount by which the window overshoots each side of the image.
    pad_top, pad_bottom = max(-y0, 0), max(y1-h, 0)
    pad_left, pad_right = max(-x0, 0), max(x1-w, 0)
    if pad_top or pad_bottom or pad_left or pad_right:
        im = np.pad(im, pad_width=[(pad_top, pad_bottom), (pad_left, pad_right), (0, 0)], mode='edge')

    # After padding on the top/left, the window is shifted by the padding amount.
    rows = slice(max(y0, 0), y1-min(y0, 0))
    cols = slice(max(x0, 0), x1-min(x0, 0))
    return resize(im[rows, cols], (new_h, new_w))

# Load the avatar image and reduce it to a 256x256 square according to the form settings.
source_image = imageio.imread('/content/image')
source_image = pad_crop_resize(source_image, *get_crop(source_image, center_face=center_image_to_head, crop_face=crop_image_to_head, expansion_factor=image_crop_expansion_factor))

with imageio.get_reader('/content/video', format='mp4') as reader:
  fps = reader.get_meta_data()['fps']

  driving_video = []
  landmarks = None
  try:
      # First pass: when cropping to the head, scan forward to the first frame in which
      # landmarks are detected; otherwise stop at the very first frame.
      # `i` and `im` are left pointing at the frame where the loop stopped.
      for i,im in enumerate(reader):
          if not crop_video_to_head:
              break
          landmarks = fa.get_landmarks_from_image(im)
          if landmarks:
              break
      # A single crop window, computed from that frame, is applied to every frame of the video.
      x0,x1,y0,y1 = get_crop(im, center_face=center_video_to_head, crop_face=crop_video_to_head, expansion_factor=video_crop_expansion_factor, landmarks=landmarks)
      # Second pass: rewind and convert all frames.
      reader.set_image_index(0)
      for im in reader:
          driving_video.append(pad_crop_resize(im,x0,x1,y0,y1))
  except RuntimeError:
      # NOTE(review): any RuntimeError raised above is dropped silently and the frames
      # collected so far are kept - presumably to tolerate reader errors near the end of
      # the stream; confirm this is intended.
      pass

def vid_display(source, driving, generated=None):
    """Build a matplotlib animation showing the frames side by side.

    Each animation frame is ``source``, the current ``driving`` frame and, when
    ``generated`` is given, the matching generated frame, concatenated
    horizontally. The figure is closed before returning, so only the
    returned animation object is meant to be rendered.
    """
    has_generated = generated is not None
    # Widen the figure when a third panel is shown.
    fig = plt.figure(figsize=(8 + 4 * has_generated, 6))

    frames = []
    for idx, driving_frame in enumerate(driving):
        panels = [source, driving_frame]
        if has_generated:
            panels.append(generated[idx])
        artist = plt.imshow(np.concatenate(panels, axis=1), animated=True)
        plt.axis('off')
        frames.append([artist])

    ani = animation.ArtistAnimation(fig, frames, interval=50, repeat_delay=1000)
    plt.close()
    return ani

# Clear the cell's output before showing the preview.
clear_output()
if landmarks:
  # `i` is the index at which the landmark scan over the video stopped.
  print('first found head in frame %d'%i)
# Last expression of the cell: the avatar next to the prepared driving video.
HTML(vid_display(source_image, driving_video).to_html5_video())

In [None]:
#@title Find best alignment

# Change into the cloned repository so its modules (demo here, logger further down) can be imported.
%cd /content/first-order-model
from demo import load_checkpoints
# Build the generator and keypoint detector from the vox-adv config and the checkpoint downloaded in Setup.
generator, kp_detector = load_checkpoints(config_path='/content/first-order-model/config/vox-adv-256.yaml', 
                            checkpoint_path='/content/vox-adv-cpk.pth.tar')

from scipy.spatial import ConvexHull
def normalize_kp(kps):
    """Return the normalized landmarks of the largest face in ``kps``.

    Each landmark array is centered on its mean, and its first two columns
    are divided by the square root of their convex-hull area. The array with
    the largest such scale is returned; None if ``kps`` is empty.
    """
    largest_scale = 0
    largest_kp = None
    for face_kp in kps:
        centered = face_kp - face_kp.mean(axis=0, keepdims=True)
        # For 2-D points, ConvexHull.volume is the enclosed area.
        scale = np.sqrt(ConvexHull(centered[:, :2]).volume)
        centered[:, :2] = centered[:, :2] / scale
        if scale > largest_scale:
            largest_scale, largest_kp = scale, centered
    return largest_kp

from tqdm import tqdm

# Landmarks of the prepared avatar image. The image is scaled by 255 before detection,
# presumably because resize() returned floats in [0, 1] - TODO confirm.
kp_source = fa.get_landmarks_from_image(255 * source_image)
if kp_source:
  norm_kp_source = normalize_kp(kp_source)

# Search for the driving frame whose normalized landmarks are closest (smallest sum of
# squared differences) to the avatar's. That frame is the reference for the Animate cell.
norm  = float('inf')
best = 0
best_kp_driving = None
for i, image in enumerate(tqdm(driving_video)):
  kp_driving = fa.get_landmarks_from_image(255 * image)
  if not kp_driving:
    continue
  if not kp_source:
    # Nothing to align against: settle for the first driving frame with a detected face.
    # `best` must point at that same frame, so the keypoints drawn below match the image.
    best = i
    best_kp_driving = kp_driving
    break
  norm_kp_driving = normalize_kp(kp_driving)
  new_norm = (np.abs(norm_kp_source - norm_kp_driving) ** 2).sum()
  if new_norm < norm:
    norm = new_norm
    best = i
    best_kp_driving = kp_driving

from logger import Visualizer
vis = Visualizer(kp_size=3, colormap='gist_rainbow')
# Landmark pixel coordinates are rescaled to [-1, 1] by the image (width, height) before drawing.
source_with_kp = vis.draw_image_with_kp(source_image, kp_source[0]*2/np.array(source_image.shape[:2][::-1])[np.newaxis]-1) if kp_source else source_image
driving_with_kp = vis.draw_image_with_kp(driving_video[best], best_kp_driving[0]*2/np.array(driving_video[best].shape[:2][::-1])[np.newaxis]-1) if best_kp_driving else driving_video[best]

clear_output()

# see: https://github.com/googlecolab/colabtools/issues/3541
# Earlier cells only bind `plt` and `animation`, so the top-level package must be imported here.
import matplotlib
matplotlib.use('module://matplotlib_inline.backend_inline')

print('\nbest frame=%d'%best)
plt.figure(figsize=(8, 6))
plt.imshow(np.concatenate([source_with_kp, driving_with_kp], axis=1))
plt.axis('off')

In [None]:
#@title Animate

# Form parameters, forwarded unchanged to make_animation() / full_normalize_kp() below.
exaggerate_factor = 1 #@param {type:"slider", min:0.1, max:5, step:0.1}
adapt_movement_scale = True #@param {type:"boolean"}
use_relative_movement = True #@param {type:"boolean"}
use_relative_jacobian = True #@param {type:"boolean"}

import torch
from skimage import img_as_ubyte

def full_normalize_kp(kp_source, kp_driving, kp_driving_initial, adapt_movement_scale=False,
                 use_relative_movement=False, use_relative_jacobian=False, exaggerate_factor=1):
    """Return a copy of ``kp_driving`` adjusted for driving the source.

    The arguments are dicts holding 'value' and 'jacobian' tensors.

    * ``adapt_movement_scale``: scale the motion by sqrt(source hull area) /
      sqrt(initial driving hull area), computed on the first batch item.
    * ``use_relative_movement``: replace 'value' by the source value plus the
      scaled (and ``exaggerate_factor``-multiplied) displacement of the
      driving keypoints from their initial position.
    * ``use_relative_jacobian``: only honored together with relative movement;
      replace 'jacobian' by driving @ inverse(initial) @ source.

    The returned dict is new; the tensors in ``kp_driving`` are not modified.
    """
    scale = 1
    if adapt_movement_scale:
        source_hull = ConvexHull(kp_source['value'][0].data.cpu().numpy())
        initial_hull = ConvexHull(kp_driving_initial['value'][0].data.cpu().numpy())
        scale = np.sqrt(source_hull.volume) / np.sqrt(initial_hull.volume)

    result = dict(kp_driving)
    if not use_relative_movement:
        return result

    # The subtraction yields a fresh tensor, so the in-place scaling leaves the inputs alone.
    displacement = kp_driving['value'] - kp_driving_initial['value']
    displacement *= scale * exaggerate_factor
    result['value'] = displacement + kp_source['value']

    if use_relative_jacobian:
        relative = torch.matmul(kp_driving['jacobian'], torch.inverse(kp_driving_initial['jacobian']))
        result['jacobian'] = torch.matmul(relative, kp_source['jacobian'])

    return result

def make_animation(source_image, driving_video, generator, kp_detector, adapt_movement_scale=False,
                 use_relative_movement=False, use_relative_jacobian=False, cpu=False, exaggerate_factor=1):
    """Animate ``source_image`` with the motion of ``driving_video``.

    Keypoints are detected once for the source and once for the first driving
    frame, which serves as the motion reference. Each driving frame is then
    run through ``kp_detector``, ``full_normalize_kp`` and ``generator``.
    Everything runs under ``torch.no_grad()``. Unless ``cpu`` is true, the
    source and each per-frame tensor are moved to CUDA.

    Returns a list with one channels-last numpy array per driving frame.
    """
    frames_out = []
    with torch.no_grad():
        # Channels-last image -> (1, C, H, W).
        source = torch.tensor(source_image[np.newaxis].astype(np.float32)).permute(0, 3, 1, 2)
        if not cpu:
            source = source.cuda()
        # Channels-last frame sequence -> (1, C, T, H, W).
        clip = torch.tensor(np.array(driving_video)[np.newaxis].astype(np.float32)).permute(0, 4, 1, 2, 3)

        kp_source = kp_detector(source)
        # The reference frame is passed as built, without the explicit .cuda() move used per frame below.
        kp_reference = kp_detector(clip[:, :, 0])

        for t in tqdm(range(clip.shape[2])):
            frame = clip[:, :, t]
            if not cpu:
                frame = frame.cuda()
            kp_frame = kp_detector(frame)
            kp_norm = full_normalize_kp(
                kp_source=kp_source,
                kp_driving=kp_frame,
                kp_driving_initial=kp_reference,
                adapt_movement_scale=adapt_movement_scale,
                use_relative_movement=use_relative_movement,
                use_relative_jacobian=use_relative_jacobian,
                exaggerate_factor=exaggerate_factor,
            )
            out = generator(source, kp_source=kp_source, kp_driving=kp_norm)

            # (1, C, H, W) -> (H, W, C) for the single batch item.
            prediction = out['prediction'].data.cpu().numpy()
            frames_out.append(np.transpose(prediction, [0, 2, 3, 1])[0])
    return frames_out

predictions_forward = make_animation(source_image, driving_video[best:], generator, kp_detector, adapt_movement_scale=adapt_movement_scale, use_relative_movement=use_relative_movement,
                                   use_relative_jacobian=use_relative_jacobian, exaggerate_factor=exaggerate_factor)
predictions_backward = make_animation(source_image, driving_video[:(best+1)][::-1], generator, kp_detector, adapt_movement_scale=adapt_movement_scale, use_relative_movement=use_relative_movement,
                                   use_relative_jacobian=use_relative_jacobian, exaggerate_factor=exaggerate_factor)

imageio.mimsave('/content/generated.mp4', [img_as_ubyte(frame) for frame in predictions_backward[::-1] + predictions_forward[1:]], fps=fps)
!ffmpeg -i /content/generated.mp4 -i /content/video -c:v libx264 -c:a aac -map 0:v -map 1:a? -pix_fmt yuv420p /content/final.mp4 -profile:v baseline -movflags +faststart -y
#video can be downloaded from /content/final.mp4

clear_output()
HTML(vid_display(source_image, driving_video, predictions_backward[::-1] + predictions_forward[1:]).to_html5_video())


In [None]:
#@title Download
#@markdown 1. If it fails try running this cell again.
#@markdown 2. Alternatively, you can manually download "final.mp4" from the folder on the left (click "Refresh" if missing).

# The empty print is a workaround for the linked colabtools issue.
print() #see https://github.com/googlecolab/colabtools/issues/468
from google.colab import files
# Hand the muxed result to the browser as a download.
files.download('/content/final.mp4') #fails for Firefox private window

In [None]:
#@title Optional apply Wav2Lip post processing and download
wav2lip_post_processing = True #@param {type: "boolean"}
smooth_face_detection = True #@param {type: "boolean"}
if wav2lip_post_processing:
  !pip install librosa==0.9.2
  !pip install -U gdown
  !rm -rf /content/final_wav2lip.mp4
  %cd /content
  !git clone --depth 1 https://github.com/eyaler/Wav2Lip.git
  import os
  if not os.path.exists('/content/Wav2Lip/checkpoints/wav2lip_gan.pth'):
    !gdown https://drive.google.com/uc?id=1dwHujX7RVNCvdR1RR93z0FS2T2yzqup9 -O /content/Wav2Lip/checkpoints/wav2lip_gan.pth
  !wget --no-check-certificate -nc https://eyalgruss.com/fomm/wav2lip_gan.pth -O /content/Wav2Lip/checkpoints/wav2lip_gan.pth
  #!wget --no-check-certificate -nc https://eyalgruss.com/fomm/wav2lip.pth -O /content/Wav2Lip/checkpoints/wav2lip.pth
  !cp /root/.cache/torch/hub/checkpoints/s3fd-619a316812.pth /content/Wav2Lip/face_detection/detection/sfd/s3fd.pth
  %cd /content/Wav2Lip
  !rm -rf /content/Wav2Lip/temp/*
  nosmooth = '' if smooth_face_detection else '--nosmooth'
  !python inference.py --checkpoint_path checkpoints/wav2lip_gan.pth --face /content/final.mp4 --audio /content/final.mp4 --pads 0 20 0 0 $nosmooth --outfile /content/final_wav2lip.mp4
  if os.path.exists('/content/Wav2Lip/temp/faulty_frame.jpg'):
    import cv2
    print('\nFace not detected - will use whole frame')
    video_stream = cv2.VideoCapture(infile)
    still_reading, frame = video_stream.read()
    x1 = y1 = 0
    y2,x2 = frame.shape[:2]
    if x2>h:
      x1 = (x2-h)//2
      x2 = x1+y2
    !python inference.py --checkpoint_path checkpoints/wav2lip_gan.pth /content/final.mp4 --audio /content/final.mp4 --box $y1 $y2 $x1 $x2 --pads 0 20 0 0 --outfile /content/final_wav2lip.mp4
  from google.colab import files
  files.download('/content/final_wav2lip.mp4')