# Illustra: Multi-text to Image

Part of [Aphantasia](https://github.com/eps696/aphantasia) suite, made by Vadim Epstein [[eps696](https://github.com/eps696)]  
Based on [CLIP](https://github.com/openai/CLIP) + FFT from [Lucent](https://github.com/greentfrapp/lucent).  
Thanks to [Ryan Murdock](https://twitter.com/advadnoun), [Jonathan Fly](https://twitter.com/jonathanfly), [@eduwatch2](https://twitter.com/eduwatch2) for ideas.

## Features 
* **continuously processes phrase lists** (e.g. illustrating lyrics)
* generates massive detailed high res imagery, a la deepdream
* directly parameterized with [FFT](https://github.com/greentfrapp/lucent/blob/master/lucent/optvis/param/spatial.py) (no pretrained GANs)
* various CLIP models (including multi-language from [SBERT](https://sbert.net))
* saving/loading FFT snapshots to resume processing
* separate text prompt for image style


**Run the cell below after each session restart**

Tick `resume` and upload a `.pt` file if you are resuming from previously saved params.

In [None]:
#@title General setup

# Python and system dependencies used by the notebook
!pip install ftfy==5.8 transformers==4.6.0
!pip install gputil ffpb 

!apt-get -qq install ffmpeg
# Mount Google Drive at /G; the working directory is created on it below
from google.colab import drive
drive.mount('/G', force_remount=True)
# gdir = !ls /G/
# gdir = '/G/%s/' % str(gdir[0])
gdir = '/G/MyDrive/'
%cd $gdir
# Create the 'illustra' working directory inside the drive and switch into it
work_dir = 'illustra'
import os
work_dir = os.path.join(gdir, work_dir)
os.makedirs(work_dir, exist_ok=True)
%cd $work_dir

import os
import io
import time
import math
import random
import imageio
import numpy as np
import PIL
from base64 import b64encode
import shutil
# import moviepy, moviepy.editor

import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
from torch.autograd import Variable

from IPython.display import HTML, Image, display, clear_output
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import ipywidgets as ipy
from google.colab import output, files
output.enable_custom_widget_manager()

import warnings
warnings.filterwarnings("ignore")

!pip install git+https://github.com/openai/CLIP.git --no-deps
import clip
!pip install sentence_transformers
from sentence_transformers import SentenceTransformer
!pip install kornia
import kornia
!pip install lpips
import lpips
!pip install PyWavelets==1.1.1
!pip install git+https://github.com/fbcotter/pytorch_wavelets

%cd /content
!pip install git+https://github.com/eps696/aphantasia
from aphantasia.image import to_valid_rgb, fft_image
from aphantasia.utils import slice_imgs, derivat, pad_up_to, basename, file_list, img_list, img_read, txt_clean, plot_text, checkout, old_torch 
from aphantasia import transforms
from aphantasia.progress_bar import ProgressIPy as ProgressBar

clear_output()

# Optional resume: upload a saved parameter snapshot and keep it in `params_pt`
resume = False #@param {type:"boolean"}
if resume:
  resumed = files.upload()
  # only the first uploaded file is used
  params_pt = list(resumed.values())[0]
  params_pt = torch.load(io.BytesIO(params_pt))
  # a snapshot may be stored as a list; its first item is taken
  if isinstance(params_pt, list): params_pt = params_pt[0]

def read_pt(file):
  """Load a saved parameter tensor from `file` and place it on the compute device.

  A snapshot may hold either a bare tensor or a list whose first item is the
  tensor; lists are unwrapped here, the same way the resume and re-render
  code of this notebook unwrap them.

  Args:
    file: path (or file-like object) accepted by `torch.load`.

  Returns:
    The tensor on the GPU when CUDA is available, otherwise on the CPU.
  """
  device = 'cuda' if torch.cuda.is_available() else 'cpu'
  # map_location lets a snapshot saved on one device be opened on another
  loaded = torch.load(file, map_location=device)
  if isinstance(loaded, list):
    loaded = loaded[0]
  return loaded.to(device)

def ema(base, next, step):
  """Blend `next` into the running value `base`.

  The new value gets the weight 1 / (step + 1), the running value gets the
  remainder, so `step` = 0 returns `next` unchanged.
  """
  weight_new = 1. / (step + 1)
  weight_old = 1. - weight_new
  return next * weight_new + base * weight_old

def save_img(img, fname=None):
  """Write a channel-first image with values in 0..1 as an 8-bit picture.

  The array is reordered from (channel, row, column) to (row, column, channel),
  scaled by 255 and clipped to 0..255. When `fname` is given, the picture is
  saved there and also to 'result.jpg' in the current directory; without
  `fname` nothing is written.
  """
  channels_first = np.array(img)[:, :, :]
  channels_last = np.transpose(channels_first, (1, 2, 0))
  pixels = np.clip(channels_last * 255, 0, 255).astype(np.uint8)
  if fname is None:
    return
  # 'result.jpg' is the fixed file read back for the live preview
  for target in (fname, 'result.jpg'):
    imageio.imsave(target, np.array(pixels))

def makevid(seq_dir, size=None):
  out_sequence = seq_dir + '/%05d.jpg'
  out_video = seq_dir + '.mp4'
  print('.. generating video ..')
  !ffmpeg -y -v warning -i $out_sequence -crf 20 $out_video
  # moviepy.editor.ImageSequenceClip(img_list(seq_dir), fps=25).write_videofile(out_video, verbose=False)
  data_url = "data:video/mp4;base64," + b64encode(open(out_video,'rb').read()).decode()
  wh = '' if size is None else 'width=%d height=%d' % (size, size)
  return """<video %s controls><source src="%s" type="video/mp4"></video>""" % (wh, data_url)

# Hardware check
!ln -sf /opt/bin/nvidia-smi /usr/bin/nvidia-smi
import GPUtil as GPU
gpu = GPU.getGPUs()[0] # XXX: only one GPU on Colab and isn’t guaranteed
!nvidia-smi -L
print("GPU RAM {0:.0f}MB | Free {1:.0f}MB)".format(gpu.memoryTotal, gpu.memoryFree))
print('\nDone!')

In [None]:
#@title Upload text file

#@markdown For non-English languages mark one of:

multilang = False #@param {type:"boolean"}
translate = False #@param {type:"boolean"}

# The translator is installed and created only when translation is requested
if translate:
  !pip3 install googletrans==3.1.0a0
  clear_output()
  from googletrans import Translator
  translator = Translator()

# The uploaded files are kept in `uploaded` and read by the Generate cell
uploaded = files.upload()

#@markdown (`multilang` = multi-language CLIP model, trained with ViT, 
#@markdown `translate` = Google translation, compatible with any visual model)


### Settings
Set the desired video resolution and `duration` (in sec).  
Describe `style`, which you'd like to apply to the imagery.  
Select CLIP visual `model` (results do vary!). I prefer ViT for consistency (and it's the only native multi-language option).  

`align` option is about composition. `uniform` looks most adequate, `overscan` can make semi-seamless tileable texture.  
`aug_transform` applies some augmentations, inhibiting image fragmentation & "graffiti" printing (slower, yet recommended).  
Decrease `samples` if you face OOM (it's the main RAM eater).  
Increasing `steps` will elaborate details and make tones smoother, but may start throwing texts like graffiti (and will obviously take more time).  
`show_freq` controls preview frequency (doesn't affect the results; one can set it higher to speed up the process).  
Tune `decay` (compositional softness) and `sharpness`, `colors` (saturation) and `contrast` as needed.  

Experimental tricks:  
`aug_noise` augmentation, `macro` (from 0 to 1) and `progressive_grow` (read more [here](https://github.com/eps696/aphantasia/issues/2)) may boost bigger forms, making the composition less dispersed.  
`no_text` tries to remove "graffiti" by subtracting the plotted text prompt. A good start is \~0.1.   
`enhance` boosts training consistency (of simultaneous samples) and steps progress. A good start is 0.1~0.2.  

NB: the `keep` parameter controls how closely each next line/image generation follows the previous ones. By default `keep` = 0, and every frame is produced independently (i.e. randomly initialized). 
Setting it higher starts each generation closer to the average of the previous runs, effectively keeping macro compositions more similar and the transitions smoother. Safe values are < 0.5 (higher numbers may cause the imagery to get stuck). This behaviour depends on the input, so test with your prompts and see what works better in your case.

In [None]:
#@title Generate

# from google.colab import drive
# drive.mount('/content/GDrive')
# clipsDir = '/content/GDrive/MyDrive/T2I ' + dtNow.strftime("%Y-%m-%d %H%M")

# Form fields: output size/duration, model and look/training options
style = "" #@param {type:"string"}
sideX = 1280 #@param {type:"integer"}
sideY = 720 #@param {type:"integer"}
duration =  60#@param {type:"integer"}
#@markdown > Config
model = 'ViT-B/32' #@param ['ViT-B/16', 'ViT-B/32', 'RN101', 'RN50x16', 'RN50x4', 'RN50']
align = 'uniform' #@param ['central', 'uniform', 'overscan']
aug_transform = True #@param {type:"boolean"}
keep = 0. #@param {type:"number"}
#@markdown > Look
decay = 1.5 #@param {type:"number"}
colors = 1.5 #@param {type:"number"}
contrast =  0.9#@param {type:"number"}
sharpness = 0.3 #@param {type:"number"}
#@markdown > Training
steps = 200 #@param {type:"integer"}
samples = 200 #@param {type:"integer"}
learning_rate = .05 #@param {type:"number"}
show_freq = 10 #@param {type:"integer"}
#@markdown > Tricks
aug_noise = 0.2 #@param {type:"number"}
no_text = 0. #@param {type:"number"}
enhance = 0. #@param {type:"number"}
macro = 0.4 #@param {type:"number"}
progressive_grow = False #@param {type:"boolean"}
# both loss weights below are derived from the single `enhance` setting
diverse = -enhance
expand = abs(enhance)
fps = 25
if multilang: model = 'ViT-B/32' # sbert model is trained with ViT

# Load the selected CLIP model; `modsize` is the input resolution of its visual part
model_clip, _ = clip.load(model, jit=old_torch())
modsize = model_clip.visual.input_resolution
# Per-model factors that reduce the number of samples (presumably to fit GPU memory - confirm)
xmem = {'ViT-B/16':0.25, 'RN50':0.5, 'RN50x4':0.16, 'RN50x16':0.06, 'RN101':0.33}
if model in xmem.keys():
  samples = int(samples * xmem[model])

def enc_text(txt):
  """Return a detached copy of the text embedding for `txt`.

  With `multilang` set, the multilingual sentence-transformers model is
  created for this call only and dropped afterwards; otherwise the loaded
  CLIP model encodes the tokenized text.
  """
  if not multilang:
    tokens = clip.tokenize(txt).cuda()
    return model_clip.encode_text(tokens).detach().clone()

  model_lang = SentenceTransformer('clip-ViT-B-32-multilingual-v1').cuda()
  encoded = model_lang.encode([txt], convert_to_tensor=True, show_progress_bar=False)
  emb = encoded.detach().clone()
  del model_lang
  return emb.detach().clone()

# The `diverse` term encodes a second batch per step, so the batch size is halved
if diverse != 0:
  samples = int(samples * 0.5)

if aug_transform is True:
  trform_f = transforms.transforms_fast
  samples = int(samples * 0.95)
else:
  trform_f = transforms.normalize()

# Read the prompts from the first uploaded file: one prompt per line,
# blank lines and lines starting with '#' (after trimming) are skipped
text_file = list(uploaded)[0]
texts = list(uploaded.values())[0].decode().split('\n')
texts = [tt.strip() for tt in texts]
texts = [tt for tt in texts if tt and not tt.startswith('#')]
print(' text file:', text_file)
print(' total lines:', len(texts))

# Optional style prompt, encoded once and reused for every line
if len(style) > 0:
  print(' style:', style)
  if translate:
    style = translator.translate(style, dest='en').text
    print(' translated to:', style) 
  txt_enc2 = enc_text(style)

# Fresh output directory named after the text file (plus the model name for RN models)
workdir = os.path.join(work_dir, basename(text_file))
workdir += '-%s' % model if 'RN' in model.upper() else ''
# shutil instead of a shell `rm -rf`, so that names with spaces are removed correctly
shutil.rmtree(workdir, ignore_errors=True)
os.makedirs(workdir, exist_ok=True)

outpic = ipy.Output()
outpic
  
# make init
params_shape = [1, 3, sideY, sideX//2+1, 2]
params_start = torch.randn(*params_shape).cuda() # random init
params_ema = 0.

if resume is True:
  # print(' resuming from', resumed)
  params_start = params_pt.cuda()
  if keep > 0:
    # start the running average from the resumed parameters
    # (the former `params[0]` referred to a name that is not defined at this point)
    params_ema = params_start.detach()
  torch.save(params_pt, os.path.join(workdir, '000-start.pt'))
else:
  torch.save(params_start, os.path.join(workdir, '000-start.pt'))

torch.save(params_start, 'init.pt') # final init

prev_enc = 0 # image encoding of the previous step, used by the `expand` loss term
def process(txt, num):
  """Optimize one image for the prompt `txt` and save its parameters.

  The image parameters are initialised from 'init.pt' and optimized for
  `steps` iterations against the CLIP encoding of `txt` (plus the optional
  style, no-text, sharpness, diversity and expansion terms selected in the
  form). Every `show_freq` steps a preview frame is written and displayed.

  Args:
    txt: text prompt; translated to English first when `translate` is set.
    num: zero-based index of the prompt; used for the output name and as
      the step of the running average of parameters.

  Side effects:
    - recreates `<workdir>/<NNN>-<cleaned prompt>/` and fills it with preview frames;
    - writes `<workdir>/<NNN>-<cleaned prompt>.pt` with the final parameters;
    - when `keep` > 0, updates `params_ema` and rewrites 'init.pt' as a mix
      of `params_start` and that running average.
  """
  global params_start, params_ema, prev_enc

  sd = 0.01
  # with `keep` the value passed as `sd` moves from 0.01 towards 1
  if keep > 0: sd = keep + (1-keep) * sd
  params, image_f, _ = fft_image([1, 3, sideY, sideX], resume='init.pt', sd=sd, decay_power=decay)
  image_f = to_valid_rgb(image_f, colors = colors)

  if progressive_grow is True:
    # learning rate ramps linearly from lr0 up to lr1 over the run
    lr1 = learning_rate * 2
    lr0 = lr1 * 0.01
  else:
    lr0 = learning_rate
  optimizer = torch.optim.AdamW(params, lr0, weight_decay=0.01, amsgrad=True)

  print(' topic: ', txt)
  if translate:
    txt = translator.translate(txt, dest='en').text
    print(' translated to:', txt)
  txt_enc = enc_text(txt)
  if no_text > 0:
      # encode a picture of the written prompt, to be pushed away from in the loss
      txt_plot = torch.from_numpy(plot_text(txt, modsize)/255.).unsqueeze(0).permute(0,3,1,2).cuda()
      txt_plot_enc = model_clip.encode_image(txt_plot).detach().clone()
  else: txt_plot_enc = None

  out_name = '%03d-%s' % (num+1, txt_clean(txt))
  tempdir = os.path.join(workdir, out_name)
  # shutil instead of a shell `rm -rf`, so that paths with spaces are removed correctly
  shutil.rmtree(tempdir, ignore_errors=True)
  os.makedirs(tempdir, exist_ok=True)

  pbar = ProgressBar(steps) #  // save_freq
  for i in range(steps):
    loss = 0
    noise = aug_noise * torch.randn(1, 1, *params[0].shape[2:4], 1).cuda() if aug_noise > 0 else 0.
    img_out = image_f(noise)
    imgs_sliced = slice_imgs([img_out], samples, modsize, trform_f, align, macro=macro)
    out_enc = model_clip.encode_image(imgs_sliced[-1])

    # main term: similarity between the prompt and the image samples
    loss -= torch.cosine_similarity(txt_enc, out_enc, dim=-1).mean()
    if len(style) > 0:
      loss -= 0.5 * torch.cosine_similarity(txt_enc2, out_enc, dim=-1).mean()
    if no_text > 0:
      loss += no_text * torch.cosine_similarity(txt_plot_enc, out_enc, dim=-1).mean()
    if sharpness != 0: # mode = scharr|sobel|default
      loss -= sharpness * derivat(img_out, mode='sobel')
    if diverse != 0:
      # compare against a second, independently sliced batch of the same image
      imgs_sliced = slice_imgs([image_f(noise)], samples, modsize, trform_f, align, macro=macro)
      out_enc2 = model_clip.encode_image(imgs_sliced[-1])
      loss += diverse * torch.cosine_similarity(out_enc, out_enc2, dim=-1).mean()
      del out_enc2; torch.cuda.empty_cache()
    if expand > 0:
      # penalize similarity to the encoding of the previous step
      if i > 0:
        loss += expand * torch.cosine_similarity(out_enc, prev_enc, dim=-1).mean()
      prev_enc = out_enc.detach()
    del img_out, imgs_sliced, out_enc; torch.cuda.empty_cache()

    if progressive_grow is True:
      lr_cur = lr0 + (i / steps) * (lr1 - lr0)
      for g in optimizer.param_groups: 
        g['lr'] = lr_cur

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if i % show_freq == 0:
      with torch.no_grad():
        img = image_f(contrast=contrast).cpu().numpy()[0]
      if sharpness != 0:
        img = img ** (1 + sharpness/2.) # empirical tone mapping
      save_img(img, os.path.join(tempdir, '%05d.jpg' % (i // show_freq)))
      outpic.clear_output()
      with outpic:
        display(Image('result.jpg'))
      del img

    pbar.upd()

  if keep > 0:
    # the next prompt starts from a mix of the initial state and the running average
    params_ema = ema(params_ema, params[0].detach(), num+1)
    torch.save((1-keep) * params_start + keep * params_ema, 'init.pt')

  torch.save(params[0], '%s.pt' % os.path.join(workdir, out_name))

# Fail early with a clear message instead of a ZeroDivisionError further down
if not texts:
  raise ValueError('the uploaded text file contains no lines to process')

# Optimize one image per text line
for i, txt in enumerate(texts):
    process(txt, i)

# Frames per transition; at least one, so that very short durations still produce output
vsteps = max(1, int(duration * fps / len(texts)))
tempdir = os.path.join(workdir, '_final')
# shutil instead of a shell `rm -rf`, so that paths with spaces are removed correctly
shutil.rmtree(tempdir, ignore_errors=True)
os.makedirs(tempdir, exist_ok=True)

print(' rendering complete piece')
ptfiles = file_list(workdir, 'pt')
pbar = ProgressBar(vsteps * len(ptfiles))
for px, ptfile in enumerate(ptfiles):
  # interpolate from this snapshot to the next one, wrapping from the last back to the first
  params1 = read_pt(ptfile)
  params2 = read_pt(ptfiles[(px+1) % len(ptfiles)])

  params, image_f, _ = fft_image([1, 3, sideY, sideX], resume=params1, sd=1., decay_power=decay)
  image_f = to_valid_rgb(image_f, colors = colors)

  for i in range(vsteps):
    with torch.no_grad():
      # sin^2 easing of the offset towards the next snapshot (1.5708 ~ pi/2)
      img = image_f((params2 - params1) * math.sin(1.5708 * i/vsteps)**2)[0].permute(1,2,0)
      img = torch.clip(img*255, 0, 255).cpu().numpy().astype(np.uint8)
    imageio.imsave(os.path.join(tempdir, '%05d.jpg' % (px * vsteps + i)), img)
    _ = pbar.upd()

HTML(makevid(tempdir))


In [None]:
#@markdown Run this, if you need to make the same video of another length (model must be the same)

duration =  12#@param {type:"integer"}
model = 'ViT-B/32' #@param ['ViT-B/16', 'ViT-B/32', 'RN101', 'RN50x16', 'RN50x4', 'RN50']
colors = 1.5 #@param {type:"number"}
fps = 25

# Locate the working directory of the earlier run (same naming rule as in the Generate cell)
text_file = list(uploaded)[0]
workdir = os.path.join(work_dir, basename(text_file))
workdir += '-%s' % model if 'RN' in model.upper() else ''
tempdir = os.path.join(workdir, '_final')
# shutil instead of a shell `rm -rf`, so that paths with spaces are removed correctly
shutil.rmtree(tempdir, ignore_errors=True)
os.makedirs(tempdir, exist_ok=True)

print(' re-rendering final piece')
ptfiles = file_list(workdir, 'pt')
if len(ptfiles) < 2:
  # fewer than two snapshots would divide by zero below
  raise ValueError('need at least two .pt snapshots in %s to render a video' % workdir)
# Frames per transition; at least one, so that very short durations still produce output
vsteps = max(1, int(duration * fps / (len(ptfiles)-1)))

# Recover the image shape from the first snapshot: the last-but-one axis stores width//2+1 values
ptest = torch.load(ptfiles[0])
if isinstance(ptest, list): ptest = ptest[0]
shape = [*ptest.shape[:3], (ptest.shape[3]-1)*2]

pbar = ProgressBar(vsteps * len(ptfiles))
for px, ptfile in enumerate(ptfiles):
  # interpolate from this snapshot to the next one, wrapping from the last back to the first
  params1 = read_pt(ptfile)
  params2 = read_pt(ptfiles[(px+1) % len(ptfiles)])

  # sd=1. matches the render loop of the Generate cell, so both cells treat the snapshots alike
  params, image_f, _ = fft_image(shape, resume=params1, sd=1., decay_power=decay)
  image_f = to_valid_rgb(image_f, colors = colors)

  for i in range(vsteps):
    with torch.no_grad():
      # sin^2 easing of the offset towards the next snapshot (1.5708 ~ pi/2)
      img = image_f((params2 - params1) * math.sin(1.5708 * i/vsteps)**2)[0].permute(1,2,0)
      img = torch.clip(img*255, 0, 255).cpu().numpy().astype(np.uint8)
    imageio.imsave(os.path.join(tempdir, '%05d.jpg' % (px * vsteps + i)), img)
    _ = pbar.upd()

HTML(makevid(tempdir))
