# 1. Text2Image 
> 우리가 생성한 이야기를 담고있는 이미지 생성

* 후보 알고리즘: StackGAN, Text2Scene, DALL-E 등 
* 선정 알고리즘: __DALL-E__

### 0. DALL-E
 * 코드 깃헙: https://github.com/ai-coodinator/DALL-E/blob/main/DALL_E.ipynb
 * 코드 블로그: https://stevenhickson.blogspot.com/2021/03/test.html

In [1]:
# List the GPUs visible to this runtime, one line per device.
!nvidia-smi -L

GPU 0: Tesla K80 (UUID: GPU-8b70f006-3812-dfc4-c640-c7cbdd9d4459)


In [None]:
# Install pinned CUDA 10.1 builds of torch/torchvision from the PyTorch wheel index,
# together with ftfy and regex.
!pip install torch==1.7.1+cu101 torchvision==0.8.2+cu101 -f https://download.pytorch.org/whl/torch_stable.html ftfy regex
# Install the DALL-E package (imported as `dall_e` later in this notebook).
!pip install DALL-E
# NOTE(review): ftfy is already requested by the first pip command; this line looks redundant.
!pip install ftfy
# Clone the CLIP sources and switch into the checkout; presumably so that
# `import clip` resolves from the working directory -- confirm.
!git clone https://github.com/openai/CLIP.git
%cd /content/CLIP/

Looking in links: https://download.pytorch.org/whl/torch_stable.html
Collecting torch==1.7.1+cu101
  Downloading https://download.pytorch.org/whl/cu101/torch-1.7.1%2Bcu101-cp37-cp37m-linux_x86_64.whl (735.4 MB)
[K     |████████████████████████████████| 735.4 MB 15 kB/s 
[?25hCollecting torchvision==0.8.2+cu101
  Downloading https://download.pytorch.org/whl/cu101/torchvision-0.8.2%2Bcu101-cp37-cp37m-linux_x86_64.whl (12.8 MB)
[K     |████████████████████████████████| 12.8 MB 80 kB/s 
[?25hCollecting ftfy
  Downloading ftfy-6.0.3.tar.gz (64 kB)
[K     |████████████████████████████████| 64 kB 1.7 MB/s 
Building wheels for collected packages: ftfy
  Building wheel for ftfy (setup.py) ... [?25l[?25hdone
  Created wheel for ftfy: filename=ftfy-6.0.3-py3-none-any.whl size=41933 sha256=9db4025c1da440692dc938e9214f1b4f71a10084aa30b6263e79654efd1cc8f4
  Stored in directory: /root/.cache/pip/wheels/19/f5/38/273eb3b5e76dfd850619312f693716ac4518b498f5ffb6f56d
Successfully built ftfy
Installi

In [None]:
# Fix every random-number source to one seed so that runs are repeatable.
import numpy as np
import torch
import random
import torch.backends.cudnn as cudnn

# Single source of truth for the seed; change it here to draw a different sample.
SEED = 2021

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

# Turn off cuDNN autotuning and request deterministic cuDNN kernels.
cudnn.benchmark = False
cudnn.deterministic = True

### 1. Preparing
* 이미지 스케일 조정

In [None]:
import torch
import numpy as np
import torchvision
import torchvision.transforms.functional as TF
import PIL
import matplotlib.pyplot as plt
import os
import random
import imageio
from IPython import display
from IPython.core.interactiveshell import InteractiveShell
import glob
from google.colab import output
# Have IPython display the value of every top-level expression in a cell,
# not only the last one.
InteractiveShell.ast_node_interactivity = "all"

# Shape of the generated image: two side lengths followed by the channel
# count. Leave these alone unless the generator size changes as well.
im_shape = [512, 512, 3]
sideX = im_shape[0]
sideY = im_shape[1]
channels = im_shape[2]

def displ(img, pre_scaled=True):
  """Write a channel-first image to ``3.png`` and return it for display.

  Args:
    img: Array-like with three axes; axis 0 is moved to the end before
      the image is written.
    pre_scaled: When false, the image is first passed through ``scale``.

  Returns:
    A ``display.Image`` that points at the written ``3.png``.
  """
  out_path = '3.png'
  pixels = np.array(img)[:, :, :]
  pixels = np.transpose(pixels, (1, 2, 0))
  if not pre_scaled:
    # NOTE(review): `scale` is not defined in the visible part of this file;
    # this branch raises NameError unless it is provided elsewhere.
    pixels = scale(pixels, 192, 128)
  imageio.imwrite(out_path, np.array(pixels))
  return display.Image(out_path)

def gallery(array, ncols=2):
    """Tile a batch of equally sized images into a single grid image.

    Args:
        array: Array of shape (count, height, width, channels).
        ncols: Number of tiles per grid row; must divide ``count`` evenly.

    Returns:
        Array of shape (height * rows, width * ncols, channels) with the
        tiles laid out row by row in batch order.
    """
    count, tile_h, tile_w, depth = array.shape
    nrows, leftover = divmod(count, ncols)
    assert leftover == 0
    # (row, col, h, w, c) -> (row, h, col, w, c): merging the first two and
    # the next two axes then stitches the tiles together.
    grid = array.reshape(nrows, ncols, tile_h, tile_w, depth)
    grid = grid.swapaxes(1, 2)
    return grid.reshape(tile_h * nrows, tile_w * ncols, depth)

def card_padded(im, to_pad=3):
  """Surround an image with three nested constant borders.

  From the inside out: a 1-pixel border of 0, a 2-pixel border of 1, and a
  ``to_pad``-pixel border of 0. The last axis is never padded.

  Args:
    im: Array with three axes; the first two are padded.
    to_pad: Width of the outermost zero border.

  Returns:
    The padded array, ``2 * (3 + to_pad)`` larger along each of the first
    two axes.
  """
  framed = im
  for width, fill in ((1, 0), (2, 1), (to_pad, 0)):
    framed = np.pad(framed, [[width, width], [width, width], [0, 0]], constant_values=fill)
  return framed

def get_all(img):
  """Save a batch of images as one framed grid in ``3.png``.

  Each image is framed with ``card_padded`` and the frames are tiled with
  ``gallery`` using its default column count, so the batch size must be a
  multiple of that count.

  Args:
    img: Batch laid out as (count, channels, height, width).

  Returns:
    A ``display.Image`` that points at the written ``3.png``.
  """
  img = np.transpose(img, (0, 2, 3, 1))
  # Let the padded cards define the buffer shape instead of hard-coding
  # (sideX + 12, sideY + 12, 3), so other image sizes and border widths work.
  # float64 matches the dtype of the zero buffer used previously.
  cards = np.array([card_padded(one) for one in img], dtype=np.float64)
  print(img.shape)
  cards = gallery(cards)
  imageio.imwrite(str(3) + '.png', np.array(cards))
  return display.Image(str(3) + '.png')

### 2. Perceptor
* CLIP 모델 이용
* 사전 훈련된 모델을 기반으로 제로샷 전이 학습 -> 생성된 이미지와 입력 텍스트의 유사도를 계산해 이미지 생성을 가이드

In [None]:
import clip
# Show which pretrained CLIP variants this install can load.
clip.available_models()

# Load the ViT-B/32 CLIP model (requested as a JIT module) and its preprocessing transform.
# NOTE(review): the name `preprocess` is rebound by a `def preprocess` further down in this file.
perceptor, preprocess = clip.load('ViT-B/32', jit=True)
perceptor = perceptor.eval() # switch to evaluation mode so that train-only layers are turned off

### 3. Generator

In [None]:
import io
import os, sys
import requests
import PIL
import torch
import torchvision.transforms as T
import torchvision.transforms.functional as TF
from dall_e import map_pixels, unmap_pixels, load_model

target_image_size = sideX

def preprocess(img):
    """Resize, center-crop and pixel-map an image for the DALL-E model.

    Args:
        img: Image whose ``size`` attribute holds its two side lengths
            (presumably a PIL image, i.e. (width, height) -- confirm).

    Returns:
        The result of ``map_pixels`` applied to the cropped image as a
        tensor with a leading batch axis of length 1.

    Raises:
        ValueError: If the shorter side is smaller than
            ``target_image_size``.
    """
    shortest = min(img.size)
    if shortest < target_image_size:
        raise ValueError(f'min dim for image {shortest} < {target_image_size}')

    # Scale so that the shorter side equals the target, then crop a square.
    ratio = target_image_size / shortest
    new_size = (round(ratio * img.size[1]), round(ratio * img.size[0]))
    resized = TF.resize(img, new_size, interpolation=PIL.Image.LANCZOS)
    cropped = TF.center_crop(resized, output_size=[target_image_size, target_image_size])
    batch = T.ToTensor()(cropped).unsqueeze(0)
    return map_pixels(batch)

# Download the released DALL-E decoder weights and load them on the GPU.
# NOTE(review): the original note called this "the transformer's decoder"; the
# openai/DALL-E README describes the package as the discrete VAE -- confirm.
model = load_model("https://cdn.openai.com/dall-e/decoder.pkl", 'cuda')

### 4. Text input
* 기본/A fairy tale of/An illustration of 등 text input에 워딩을 바꿔보면서 학습
* 뭐가 더 '동화책스럽다'는 딱히 없는 듯(운빨임)
* dall-e mini를 버리는 상황이라면 문장 워딩을 조금씩 바꾼 걸 후보군으로 생성해도 되지 않을까? 
* 혹은 seed 빨도 많이 받으니까 seed에 따른 생성결과를 후보로 줄 수도 있을 듯

In [None]:
# Prompt that is tokenized and encoded with CLIP as the optimisation target.
text_input = "golden ring was hanging on the wall of room, and it glittered with jewels in his pocket. when he came to the door they saw beautiful princess who wear gold dress as well!"
tau_value = 1.0 # non-negative scalar temperature, passed to gumbel_softmax as `tau`

### 5. Latent coordinate

In [None]:
class Pars(torch.nn.Module):
    """Learnable latent grid that is fed to the decoder.

    Holds a single parameter, ``normu``, of shape (1, 8192, 64, 64), drawn
    from a standard normal and moved to the GPU.
    """

    def __init__(self):
        super().__init__()
        initial = torch.randn(1, 8192, 64, 64).cuda()
        self.normu = torch.nn.Parameter(initial)

    def forward(self):
        """Return a Gumbel-softmax sample of ``normu`` with the same shape.

        The temperature is read from the module-level ``tau_value``.
        """
        # NOTE(review): the softmax runs over the flattened 64*64 axis
        # (dim=-1), not over the 8192 axis -- confirm this is intended.
        flat = self.normu.view(1, 8192, -1)
        sample = torch.nn.functional.gumbel_softmax(flat, tau=tau_value, dim=-1)
        return sample.view(1, 8192, 64, 64)

# Latent parameters to optimise, and an Adam optimiser over them (lr 0.1).
lats = Pars().cuda()
mapper = [lats.normu]
optimizer = torch.optim.Adam([{'params': mapper, 'lr': .1}]) 
#eps = 0 # epsilon
# Encode the prompt once; the result is detached and cloned so it is a fixed target.
tx = clip.tokenize(text_input)
t = perceptor.encode_text(tx.cuda()).detach().clone()
# Per-channel mean / standard deviation applied to images before they are
# passed to the CLIP image encoder. These values are deliberately left fixed.
nom = torchvision.transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711))

# Decode the initial latents once, without gradients, and write a preview image.
with torch.no_grad():
  mult = 1  # NOTE(review): not used anywhere in the visible source
  al = unmap_pixels(torch.sigmoid(model(lats()).cpu().float())).numpy()
  for allls in al:
    displ(allls[:3])  # only the first three channels are written out
    print('\n')
  # print(torch.topk(lats().view(1, 8192, -1), k=3, dim=-1))

### 6. Train

In [None]:
def checkin(loss):
  """Print progress and show the image decoded from the current latents.

  Args:
    loss: Value printed after the banner, followed by the module-level
      iteration counter ``itt``.
  """
  print('''########################################################## ''', loss, '\n', itt)

  # Decode without tracking gradients; only the first three channels are used.
  with torch.no_grad():
    rgb = model(lats())[:, :3]
    frames = unmap_pixels(torch.sigmoid(rgb).cpu().float()).numpy()

  for frame in frames:
    displ(frame)  # writes 3.png
    display.display(display.Image('3.png'))
    print('\n')

def ascend_txt():
  """Compute the CLIP-guidance loss for the current latents.

  The latents are decoded to an image, ``cutn`` random square crops are
  taken, each crop is resized to 224x224 and normalised with ``nom``, and
  the crops are encoded with the CLIP image encoder. The loss is the
  negated cosine similarity to the prompt embedding ``t``, averaged over
  the crops and scaled by 10.

  Returns:
    A two-element list ``[lat_l, loss]``. ``lat_l`` is a placeholder that
    is always 0; ``loss`` is a one-element tensor.
  """
  cutn = 64  # number of random crops per step; more crops improve quality
  out = unmap_pixels(torch.sigmoid(model(lats())[:, :3].float()))

  crops = []
  for _ in range(cutn):
    # Crop side: a N(0.8, 0.3) fraction of the image side, clamped to [0.5, 0.98].
    size = int(sideX * torch.zeros(1,).normal_(mean=.8, std=.3).clip(.5, .98))
    # Both offsets are bounded by sideX, i.e. the image is treated as square.
    offsetx = torch.randint(0, sideX - size, ())
    offsety = torch.randint(0, sideX - size, ())
    crop = out[:, :, offsetx:offsetx + size, offsety:offsety + size]
    # align_corners=False is the library default; spelling it out avoids the
    # default-behaviour warning without changing the result.
    crop = torch.nn.functional.interpolate(crop, (224, 224), mode='bilinear', align_corners=False)
    crops.append(crop)

  into = nom(torch.cat(crops, 0))
  iii = perceptor.encode_image(into)
  # No extra lats() call here: its result was never used, and each call runs
  # a full Gumbel-softmax pass over the latent grid.
  lat_l = 0
  return [lat_l, 10 * -torch.cosine_similarity(t, iii).view(-1, 1).T.mean(1)]

def train(i):
  """Run one optimisation step and periodically snapshot the image.

  Args:
    i: Zero-based index of the current iteration. Every 50th iteration
      the progress is shown via ``checkin`` and ``3.png`` is copied into
      ``./images`` as the next numbered frame.
  """
  checkpoint_every = 50

  loss1 = ascend_txt()
  loss = (loss1[0] + loss1[1]).mean()
  optimizer.zero_grad()
  loss.backward()
  optimizer.step()

  if i % checkpoint_every == 0:
    checkin(loss1)
    # Number the frames consecutively (0, 1, 2, ...). Dividing by 100 while
    # snapshotting every 50 steps gave each pair of snapshots the same name,
    # so every other frame was overwritten.
    frame = i // checkpoint_every
    shutil.copy('./3.png', './images/%s.png' % str(frame).zfill(6))

import shutil

# Start from an empty `images` directory so that frames from an earlier run
# are not mixed in with the new ones.
if os.path.isdir('images'):
    shutil.rmtree('images')
os.makedirs('images', exist_ok=True)

# Optimisation loop: 2021 steps. `itt` is a module-level counter that
# checkin() also reads when printing progress.
itt = 0
for asatreat in range(2021):
  train(itt)
  itt+=1

### 7. Make movie

In [None]:
# Delete a previous render first; presumably so that ffmpeg does not stop to
# ask whether to overwrite it -- confirm.
if os.path.exists('./output.mp4'):
   os.remove('./output.mp4')

# Stitch images/000000.png, images/000001.png, ... into an H.264 video,
# reading the frames at 2 per second.
!ffmpeg -r 2 -i images/%06d.png -vcodec libx264 -pix_fmt yuv420p output.mp4

# 2. Style Transfer 
> 이미지를 원하는 그림책 분위기에 맞게 변형 <br>
> DALL-E에서 그림책 이미지를 잘 생성한다면 생략 가능한 단계

* 후보 알고리즘: CycleGAN, CartoonGAN, GANILLA 등
* 선정 알고리즘: 미정 -> __안 하는 방향으로 결정__

# 3. Reference
1. Transformer 논문 리뷰
* 올려주신 자료
* 쉽게: https://lv99.tistory.com/26
* 자세히: https://kmhana.tistory.com/28
2. DALL-E 논문 리뷰
* 쉽게: https://jiho-ml.com/weekly-nlp-40/
* 자세히: https://littlefoxdiary.tistory.com/74
3. CartoonGAN 논문 리뷰
* GAN: https://tobigs.gitbook.io/tobigs/deep-learning/computer-vision/gan-generative-adversarial-network
* CartoonGAN: https://blog.diyaml.com/teampost/Improving-CartoonGAN/
4. Code
* DALL-E: https://github.com/ai-coodinator/DALL-E
* CartoonGAN: https://github.com/TobiasSunderdiek/cartoon-gan
* GANILLA: 
https://neurohive.io/en/news/ganilla-gan-network-trained-to-generate-children-s-book-illustrations/