In [13]:
import os, sys

# Bootstrap Django so that project modules and ``django.conf.settings`` can be
# used from the notebook kernel.
# PWD is read from the environment (presumably the directory the kernel was
# launched from). Fall back to the current working directory when it is not
# set: os.chdir(None) raises TypeError and None must not end up on sys.path.
PWD = os.getenv('PWD') or os.getcwd()
os.chdir(PWD)
sys.path.insert(0, PWD)
# DJANGO_SETTINGS_MODULE is imported as a dotted module path, so it must not
# carry a ".py" suffix ("local_settings.py" would be looked up as submodule
# "py" of a package "local_settings"). setdefault() keeps any value that is
# already present in the environment.
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "local_settings")
import django
django.setup()

In [14]:
import subprocess,time
import numpy as np
import cv2,os
from AiHandler.wav2lip import audio
import torch
from tqdm import tqdm
import json
import shutil

from scipy.spatial import ConvexHull
from skimage import img_as_ubyte
from django.conf import settings

In [15]:
from numpy import save

In [16]:
def normalize_kp(kp_source, kp_driving, kp_driving_initial, adapt_movement_scale=False,
                 use_relative_movement=False, use_relative_jacobian=False):
    """Re-express driving keypoints relative to the source keypoints.

    Args:
        kp_source: dict with a 'value' tensor (and 'jacobian' when relative
            jacobians are requested) for the source image.
        kp_driving: dict of keypoint tensors for the current driving frame.
        kp_driving_initial: dict of keypoint tensors for the reference
            driving frame.
        adapt_movement_scale: when true, scale the keypoint displacement by
            sqrt(hull volume of source) / sqrt(hull volume of initial driving),
            computed on the first batch element only.
        use_relative_movement: when true, replace 'value' with the source
            value plus the (scaled) displacement from the initial driving frame.
        use_relative_jacobian: when true (and relative movement is on), replace
            'jacobian' with driving @ inverse(initial driving) @ source.

    Returns:
        A new dict with the same keys as ``kp_driving``; entries that are not
        recomputed are the very same tensor objects as in ``kp_driving``.
    """
    if not adapt_movement_scale:
        movement_scale = 1
    else:
        hull_source = ConvexHull(kp_source['value'][0].data.cpu().numpy())
        hull_initial = ConvexHull(kp_driving_initial['value'][0].data.cpu().numpy())
        movement_scale = np.sqrt(hull_source.volume) / np.sqrt(hull_initial.volume)

    # Shallow copy: the caller's dict is never modified.
    normalized = dict(kp_driving.items())

    if not use_relative_movement:
        return normalized

    # The subtraction yields a fresh tensor, so scaling it in place is safe.
    displacement = kp_driving['value'] - kp_driving_initial['value']
    displacement *= movement_scale
    normalized['value'] = displacement + kp_source['value']

    if use_relative_jacobian:
        driving_jacobian = kp_driving['jacobian']
        initial_inverse = torch.inverse(kp_driving_initial['jacobian'])
        relative_jacobian = torch.matmul(driving_jacobian, initial_inverse)
        normalized['jacobian'] = torch.matmul(relative_jacobian, kp_source['jacobian'])

    return normalized


In [17]:

def _finalize_wav2lip_batch(img_batch, mel_batch, img_size_half):
    """Convert accumulated face crops and mel chunks into model input arrays."""
    img_batch, mel_batch = np.asarray(img_batch), np.asarray(mel_batch)

    # Blank every row from img_size_half downwards in a copy of the crops.
    img_masked = img_batch.copy()
    img_masked[:, img_size_half:] = 0

    # Masked and unmasked crops are stacked on the last (channel) axis and
    # divided by 255; mel chunks get a trailing singleton axis.
    img_batch = np.concatenate((img_masked, img_batch), axis=3) / 255.
    mel_batch = np.reshape(mel_batch, [len(mel_batch), mel_batch.shape[1], mel_batch.shape[2], 1])
    return img_batch, mel_batch


def wav2lip_datagen(mels,video_path,face_cord_path,start_frame=0,totalFrames=3000,img_size=settings.WAVLIPIMAGESIZE,wav2lip_batch_size=settings.WAVLIPBATCHSIZE):
    """Yield Wav2Lip input batches, pairing each mel chunk with one video frame.

    Frames are read sequentially from ``video_path`` starting at
    ``start_frame``; when the running frame index reaches a multiple of
    ``totalFrames`` the video is rewound to frame 0. If a read fails, the most
    recently decoded frame is reused (the same array object is appended again).

    Args:
        mels: sequence of mel-spectrogram chunks, one per output frame.
        video_path: path of the video opened with ``cv2.VideoCapture``.
        face_cord_path: path of a ``.npy`` file holding one ``(x1, y1, x2, y2)``
            row per video frame, indexed by absolute frame number.
        start_frame: index of the first video frame to use.
        totalFrames: loop length; frame indices wrap modulo this value.
        img_size: side length the face crop is resized to.
        wav2lip_batch_size: number of samples per yielded batch.

    Yields:
        ``(img_batch, mel_batch, frame_batch, coords_batch)`` where the first
        two are numpy arrays ready for the model, ``frame_batch`` is the list of
        full frames and ``coords_batch`` the list of ``(y1, y2, x1, x2)`` boxes.

    Raises:
        RuntimeError: if no frame could be decoded before the first one is needed.
    """
    img_batch, mel_batch, frame_batch, coords_batch = [], [], [], []
    # Load the whole table: rows are looked up with the absolute frame index
    # below, so pre-slicing from start_frame would shift every lookup (and run
    # out of rows) whenever start_frame > 0. Passing the path lets numpy manage
    # the file handle instead of leaking an open() object.
    face_cord = np.load(face_cord_path)
    video = cv2.VideoCapture(video_path)
    try:
        if start_frame>0:
            video.set(cv2.CAP_PROP_POS_FRAMES, start_frame)

        tempFrame=None
        img_size_half = img_size//2
        currentFrame = start_frame - 1
        for m in mels:
            currentFrame +=1
            currentIndex = currentFrame%totalFrames
            # Rewind once the loop length has been consumed.
            if currentIndex == 0 and currentFrame!=0:
                video.set(cv2.CAP_PROP_POS_FRAMES, currentIndex)
            is_frame, frame = video.read()
            if is_frame:
                tempFrame = frame
            if tempFrame is None:
                raise RuntimeError(f'Unable to read frame {currentIndex} from {video_path}')
            x1,y1,x2,y2 = face_cord[currentIndex]

            img_batch.append(cv2.resize(tempFrame[y1:y2,x1:x2],(img_size,img_size)))
            mel_batch.append(m)
            frame_batch.append(tempFrame)
            coords_batch.append((y1, y2, x1, x2))

            if len(img_batch) >= wav2lip_batch_size:
                img_batch, mel_batch = _finalize_wav2lip_batch(img_batch, mel_batch, img_size_half)
                yield img_batch, mel_batch, frame_batch, coords_batch
                img_batch, mel_batch, frame_batch, coords_batch = [], [], [], []

        # Emit the trailing, partially filled batch.
        if len(img_batch) > 0:
            img_batch, mel_batch = _finalize_wav2lip_batch(img_batch, mel_batch, img_size_half)
            yield img_batch, mel_batch, frame_batch, coords_batch
    finally:
        # Runs on exhaustion, on error, and when the generator is closed early.
        video.release()



In [18]:
# Device the input tensors are moved to before inference (from Django settings).
DEVICE=settings.DEVICE
# Width, in mel-spectrogram columns, of each chunk cut from the spectrogram.
melstepsize = settings.WAVLIPMELSTEPSIZE
# Mel columns advanced per video frame when chunking.
# NOTE(review): 80 is presumably the number of mel columns per second of audio — confirm.
mel_idx_multiplier = 80./settings.VIDEO_DEFAULT_FPS 

In [19]:
# Inputs handed to wav2lip_datagen in the driver cell: the avatar video, the
# per-frame face coordinates (.npy) and the first frame to read.
# NOTE(review): absolute, machine-specific paths — this cell only runs on the
# original author's machine; consider deriving them from a configurable base dir.
wav2lipVideo = "/home/govind/VideoAutomation/src/private_data/avatars/4/wav2lip/video.mp4"
wav2lipFaceCord = "/home/govind/VideoAutomation/src/private_data/avatars/4/wav2lip/face_coordinate.npy"
startFrame = 0

In [20]:
# Directory holding the generated audio files; the trailing "/" matters because
# later cells build file paths by plain string concatenation.
# NOTE(review): absolute, machine-specific path.
audioPath = "/home/govind/VideoAutomation/src/uploads/usersound/generated/"
# Names (not full paths) of every entry in that directory.
allFiles = os.listdir(audioPath)

In [21]:
# Build [size_in_bytes, full_path] pairs, one per entry of allFiles.
fileS = [[os.path.getsize(fullPath), fullPath] for fullPath in (audioPath + fileName for fileName in allFiles)]

In [22]:
# Ascending order by size; entries of equal size are ordered by path because
# the [size, path] lists are compared element by element.
fileS = sorted(fileS)

In [23]:
# Paths of the (up to) ten largest files, smallest of them first.
_topTenSize = [fullPath for _sizeBytes, fullPath in fileS[-10:]]

In [24]:
def writeVideoFileFromFrame(n,pred, frames, coords, output_dir='/home/govind/test/npa', fps=30, frame_size=(512,512)):
    """Paste predicted crops into their frames and write them to ``<output_dir>/<n>.avi``.

    Args:
        n: value used as the output file name (``{n}.avi``).
        pred: iterable of predicted crops; each is cast to uint8 and resized to
            the size of its target box.
        frames: iterable of full frames. They are modified in place: the box
            region of each frame is overwritten with the resized prediction.
        coords: iterable of ``(y1, y2, x1, x2)`` boxes, one per frame.
        output_dir: directory for the MJPG-encoded AVI file.
        fps: frame rate passed to the writer.
        frame_size: ``(width, height)`` every frame is resized to before writing.

    Raises:
        IOError: if the video writer cannot be opened (previously the function
            silently produced no usable file).
    """
    outputPath = os.path.join(output_dir, f'{n}.avi')
    videoWriter = cv2.VideoWriter(outputPath,cv2.VideoWriter_fourcc('M','J','P','G'), fps, frame_size)
    if not videoWriter.isOpened():
        videoWriter.release()
        raise IOError(f'Unable to open video writer for {outputPath}')
    try:
        for p, f, c in zip(pred, frames, coords):
            y1, y2, x1, x2 = c
            p = cv2.resize(p.astype(np.uint8), (x2 - x1, y2 - y1))
            f[y1:y2, x1:x2] = p
            # NOTE(review): the frame is converted BGR->RGB before being handed
            # to cv2.VideoWriter, which normally expects BGR — confirm that the
            # swapped channel order in the file is intended.
            videoWriter.write(cv2.resize(cv2.cvtColor(f,cv2.COLOR_BGR2RGB), frame_size)[..., :3])
    finally:
        # Release even when a frame fails, so the file is closed properly.
        videoWriter.release()

In [25]:
from threading import Thread

In [26]:

# Run Wav2Lip on the largest generated audio file ([::-1] puts the largest
# first, [:1] keeps only that one) and write every predicted batch to its own
# video file from a background thread.
for n,_audioP in enumerate(_topTenSize[::-1][:1]):

    print(n,_audioP)
    wav = audio.load_wav(_audioP, 16000)
    mel = audio.melspectrogram(wav)

    if np.isnan(mel.reshape(-1)).sum() > 0:
        print('Unable to generate')
        break

    # Cut the spectrogram into windows of melstepsize columns, one window per
    # video frame; the last window is aligned to the end of the spectrogram.
    mel_chunks = []
    i = 0
    while True:
        start_idx = int(i * mel_idx_multiplier)
        if start_idx + melstepsize > len(mel[0]):
            mel_chunks.append(mel[:, len(mel[0]) - melstepsize:])
            break
        mel_chunks.append(mel[:, start_idx : start_idx + melstepsize])
        i += 1
    mel_chunk_len = len(mel_chunks)
    print("Mel Chunks Len: ",mel_chunk_len)

    wav2lip_data_generator = wav2lip_datagen(mel_chunks,wav2lipVideo,wav2lipFaceCord,startFrame,3020)
    wav2lipOutput = []
    currentMaxBatch = 3000
    prevTh = None
    for i, (img_batch, mel_batch, frames, coords) in enumerate(tqdm(wav2lip_data_generator,total=int(np.ceil(float(mel_chunk_len)/settings.WAVLIPBATCHSIZE)))):
        with torch.no_grad():
            img_batch = torch.FloatTensor(np.transpose(img_batch, (0, 3, 1, 2))).to(DEVICE)
            mel_batch = torch.FloatTensor(np.transpose(mel_batch, (0, 3, 1, 2))).to(DEVICE)
            pred = settings.WAVLIPMODEL(mel_batch, img_batch)
            pred = pred.cpu().numpy().transpose(0, 2, 3, 1) * 255.
        # Encoding runs in a background thread so the next batch can start.
        prevTh = Thread(target=writeVideoFileFromFrame,args=(i,pred,frames,coords,))
        prevTh.start()
        wav2lipOutput.append(prevTh)
        if len(wav2lipOutput)>currentMaxBatch:
            # Wait for the pending writers before dropping their references.
            for _writerThread in wav2lipOutput:
                _writerThread.join()
            wav2lipOutput = []
            print('Resetting List')

    # Make sure every video file is completely written before the cell ends.
    for _writerThread in wav2lipOutput:
        _writerThread.join()

0 /home/govind/VideoAutomation/src/uploads/usersound/generated/566350c5-6dd2-46db-8a43-8cb0bc2427d9.mp3


  0%|          | 0/54 [00:00<?, ?it/s]

Mel Chunks Len:  6827


100%|██████████| 54/54 [00:23<00:00,  2.35it/s]


In [None]:
# Inspect the shape of `pred`, which only exists as leftover kernel state from
# the last iteration of the driver loop in the previous code cell.
pred.shape

In [13]:
# Scratch arithmetic.
# NOTE(review): presumably a size estimate for buffered predictions
# (count x height x width x channels x batches) — confirm the meaning of each
# factor or delete this cell.
63*96*96*3*3000

5225472000

In [None]:

_audioPath = ""

# Load the queue item's audio (16000 is passed as the second argument of
# load_wav) and compute its mel spectrogram.
# NOTE(review): `queue_inst` is not defined in any visible cell of this notebook.
wav = audio.load_wav(os.path.join(queue_inst.getcwd(),'sound.wav'), 16000)
mel = audio.melspectrogram(wav)

if np.isnan(mel.reshape(-1)).sum() > 0:
    print('Unable to generate')
    # This code is no longer inside a loop, so `break` here is a SyntaxError
    # that stops the whole cell from compiling; raise to abort the cell instead.
    raise ValueError('Unable to generate')

# Cut the spectrogram into windows of melstepsize columns, one window per video
# frame; the last window is aligned to the end of the spectrogram.
mel_chunks = []

i = 0
while True:
    start_idx = int(i * mel_idx_multiplier)
    if start_idx + melstepsize > len(mel[0]):
        mel_chunks.append(mel[:, len(mel[0]) - melstepsize:])
        break
    mel_chunks.append(mel[:, start_idx : start_idx + melstepsize])
    i += 1
mel_chunk_len = len(mel_chunks)


## load first and last frame
# Cache the sorted frame file names of this avatar on first use.
# NOTE(review): `allAvatarImgSeq`, `avatar_inst` and `queue_inst` are not
# defined in any visible cell of this notebook.
try:
    _ = allAvatarImgSeq[avatar_inst.id][0]
except Exception:  # missing or empty entry; unlike a bare except, Ctrl-C still propagates
    allAvatarImgSeq[avatar_inst.id] = sorted(os.listdir(os.path.join(avatar_inst.getcwd(),'fullbody/without_swap/')))


# Reference frames are loaded from .npy files and permuted from
# (N, H, W, C) to (N, C, H, W) before being moved to the device.
drivingInit = torch.tensor(np.load(avatar_inst.getFirstInitFrame())).permute(0, 3, 1, 2).to(DEVICE)
sourceFrame = torch.tensor(np.load(avatar_inst.getSourceFrame())).permute(0, 3, 1, 2).to(DEVICE)
# Inference only: without no_grad the keypoints would keep an autograd graph
# alive for as long as they are reused.
with torch.no_grad():
    kpSourceFrame = settings.FIRSTORDERKPDETECTOR(sourceFrame)
    kpDrivingInit = settings.FIRSTORDERKPDETECTOR(drivingInit)

startFrameIndx = 0
aiStartFrame = queue_inst.start_frame
aiOutputFolder = queue_inst.getFaceSwapDir()
avatarMainFolder = os.path.join(avatar_inst.getcwd(),'fullbody/without_swap/')

# remove existing avatar data
# Same targets as the former shell call `rm -rf {aiOutputFolder}*`, but without
# a shell: an empty or space-containing path can no longer expand into an
# unintended `rm -rf`. Failures are ignored, as `rm -f` did.
import glob  # local import: only needed for this cleanup step
for _stalePath in glob.glob(glob.escape(aiOutputFolder) + '*'):
    if os.path.isdir(_stalePath) and not os.path.islink(_stalePath):
        shutil.rmtree(_stalePath, ignore_errors=True)
    else:
        try:
            os.remove(_stalePath)
        except OSError:
            pass


currentFrame = queue_inst.start_frame - 1

# Stage 1 (Wav2Lip) feeds stage 2 (first-order model): lip-synced frames are
# accumulated in `wav2lipOutput` and, once more than `currentMaxBatch` of them
# are buffered, animated, masked and handed to background threads for saving.
# NOTE(review): `aiVideoMask`, `avatarPosSize`, `avatarPosIX/IY/FX/FY`,
# `saveAiSwapFrame`, `currentProcessingFrame` and `firstQueueData` are not
# defined in any visible cell of this notebook.
wav2lip_data_generator = wav2lip_datagen(mel_chunks,avatar_inst.getWav2lipVideo(),avatar_inst.getFaceCordinate(),queue_inst.start_frame,queue_inst.avatar_image.totalFrames)
wav2lipOutput = []
# Buffered-frame threshold that triggers a flush through the first-order model.
currentMaxBatch = 3000
for i, (img_batch, mel_batch, frames, coords) in enumerate(tqdm(wav2lip_data_generator,total=int(np.ceil(float(mel_chunk_len)/settings.WAVLIPBATCHSIZE)))):
    with torch.no_grad():
        # Generator output is channels-last; move channels to axis 1 for the model.
        img_batch = torch.FloatTensor(np.transpose(img_batch, (0, 3, 1, 2))).to(DEVICE)
        mel_batch = torch.FloatTensor(np.transpose(mel_batch, (0, 3, 1, 2))).to(DEVICE)
        pred = settings.WAVLIPMODEL(mel_batch, img_batch)
        # Back to channels-last numpy, multiplied by 255.
        pred = pred.cpu().numpy().transpose(0, 2, 3, 1) * 255.
        for p, f, c in zip(pred, frames, coords):
            y1, y2, x1, x2 = c
            # Resize the predicted crop to its box and paste it into the frame (in place).
            p = cv2.resize(p.astype(np.uint8), (x2 - x1, y2 - y1))
            f[y1:y2, x1:x2] = p

            # Driving input: BGR->RGB, resized to 512x512, float32 divided by 255.
            drivingData = cv2.resize(cv2.cvtColor(f,cv2.COLOR_BGR2RGB), (512,512))[..., :3].astype('float32')/255
            wav2lipOutput.append(drivingData)

    # Flush the buffer through the first-order model once it is large enough.
    if len(wav2lipOutput)>currentMaxBatch:
        for drivingData in tqdm(wav2lipOutput,total=len(wav2lipOutput)):
            currentFrame += 1
            # Position inside the avatar loop (wraps at totalFrames).
            currentIndex = currentFrame%queue_inst.avatar_image.totalFrames
            with torch.no_grad():
                # Add a batch axis and move channels to axis 1.
                drivingDataT = torch.tensor(drivingData[np.newaxis]).permute(0,3,1,2).to(DEVICE)
                kpDriving = settings.FIRSTORDERKPDETECTOR(drivingDataT)
                kpNorm = normalize_kp(kp_source=kpSourceFrame, kp_driving=kpDriving,
                                kp_driving_initial=kpDrivingInit, use_relative_movement=True,
                                use_relative_jacobian=True, adapt_movement_scale=True)

                fout = settings.FIRSTORDERGENERATOR(sourceFrame, kp_source=kpSourceFrame, kp_driving=kpNorm)
                # First item of the prediction batch, channels-last, converted to uint8.
                aiRGBFrame = img_as_ubyte(np.transpose(fout['prediction'].data.cpu().numpy(), [0, 2, 3, 1])[0])

            aiRGBAFrame = cv2.cvtColor(aiRGBFrame,cv2.COLOR_RGB2RGBA)

            # Rewind the mask video when the avatar loop wraps around.
            if currentIndex == 0 and currentFrame!=0:
                aiVideoMask.set(cv2.CAP_PROP_POS_FRAMES, currentIndex)
            # NOTE(review): `ret` is never checked; a failed read leaves
            # aiVideoMaskFrame as None and the next line raises.
            ret,aiVideoMaskFrame = aiVideoMask.read()
            # Channel 1 of the mask frame becomes the alpha channel.
            aiRGBAFrame[:,:,3] = aiVideoMaskFrame[:,:,1]
            aiRGBAFrame = cv2.resize(aiRGBAFrame,(avatarPosSize,avatarPosSize))
            # The crop is applied only when avatarPosIY is zero or negative.
            if avatarPosIY<=0:
                aiRGBAFrame = aiRGBAFrame[avatarPosIY:avatarPosFY,avatarPosIX:avatarPosFX]
            # Saving is delegated to a thread; the output name is the zero-padded running index.
            ctAiProcess = Thread(target=saveAiSwapFrame, args=(os.path.join(avatarMainFolder,allAvatarImgSeq[avatar_inst.id][currentIndex]),aiRGBAFrame,(avatarPosIX,avatarPosIY,avatarPosFX,avatarPosFY),os.path.join(aiOutputFolder,f'{str(startFrameIndx).zfill(5)}.png'),))
            ctAiProcess.start()
            startFrameIndx+=1

            ## update progress in db
            if (currentProcessingFrame+startFrameIndx)%settings.VIDEO_PROGRESS_UPDATE_FRAME==0:
                firstQueueData.updateProgress(currentProcessingFrame+startFrameIndx)

        wav2lipOutput = []

# Wav2Lip stage bookkeeping. Note that the "wav2lip" status is stored before
# the frames still buffered in `wav2lipOutput` are rendered below.
# NOTE(review): `gc` and `datetime` are not imported in any visible cell.
gc.collect()
queue_inst.output = json.dumps({"wav2lip": {"status": True}})
queue_inst.save()
print(f'Wav2Lip Completed: {datetime.now()} {queue_inst.id} {firstQueueData.id}')

# Render whatever is left in the buffer.
# NOTE(review): this body duplicates the flush inside the batch loop of this
# cell; a shared helper would keep the two copies from drifting apart.
if len(wav2lipOutput)>0:
    for drivingData in tqdm(wav2lipOutput,total=len(wav2lipOutput)):
        currentFrame += 1
        # Position inside the avatar loop (wraps at totalFrames).
        currentIndex = currentFrame%queue_inst.avatar_image.totalFrames
        with torch.no_grad():
            # Add a batch axis and move channels to axis 1.
            drivingDataT = torch.tensor(drivingData[np.newaxis]).permute(0,3,1,2).to(DEVICE)
            kpDriving = settings.FIRSTORDERKPDETECTOR(drivingDataT)
            kpNorm = normalize_kp(kp_source=kpSourceFrame, kp_driving=kpDriving,
                            kp_driving_initial=kpDrivingInit, use_relative_movement=True,
                            use_relative_jacobian=True, adapt_movement_scale=True)

            fout = settings.FIRSTORDERGENERATOR(sourceFrame, kp_source=kpSourceFrame, kp_driving=kpNorm)
            # First item of the prediction batch, channels-last, converted to uint8.
            aiRGBFrame = img_as_ubyte(np.transpose(fout['prediction'].data.cpu().numpy(), [0, 2, 3, 1])[0])

        aiRGBAFrame = cv2.cvtColor(aiRGBFrame,cv2.COLOR_RGB2RGBA)
        # Rewind the mask video when the avatar loop wraps around.
        if currentIndex == 0 and currentFrame!=0:
            aiVideoMask.set(cv2.CAP_PROP_POS_FRAMES, currentIndex)
        # NOTE(review): `ret` is never checked before the mask frame is used.
        ret,aiVideoMaskFrame = aiVideoMask.read()
        # Channel 1 of the mask frame becomes the alpha channel.
        aiRGBAFrame[:,:,3] = aiVideoMaskFrame[:,:,1]
        aiRGBAFrame = cv2.resize(aiRGBAFrame,(avatarPosSize,avatarPosSize))
        # The crop is applied only when avatarPosIY is zero or negative.
        if avatarPosIY<=0:
            aiRGBAFrame = aiRGBAFrame[avatarPosIY:avatarPosFY,avatarPosIX:avatarPosFX]
        # Saving is delegated to a thread; the output name is the zero-padded running index.
        ctAiProcess = Thread(target=saveAiSwapFrame, args=(os.path.join(avatarMainFolder,allAvatarImgSeq[avatar_inst.id][currentIndex]),aiRGBAFrame,(avatarPosIX,avatarPosIY,avatarPosFX,avatarPosFY),os.path.join(aiOutputFolder,f'{str(startFrameIndx).zfill(5)}.png'),))
        ctAiProcess.start()
        startFrameIndx+=1

        ## update progress in db
        if (currentProcessingFrame+startFrameIndx)%settings.VIDEO_PROGRESS_UPDATE_FRAME==0:
            firstQueueData.updateProgress(currentProcessingFrame+startFrameIndx)


        #if (min(currentProcessingFrame/firstQueueData.totalFrames,1)*100)%
    gc.collect()
aiVideoMask.release()
# NOTE(review): only the most recently started save thread is recorded, and
# `ctAiProcess` is undefined (NameError) if no frame was rendered at all.
allThread.append(ctAiProcess)
# Mark the queue item and store the first-order stage result.
queue_inst.status = 1
queue_inst.output = json.dumps({"first_order": {"status": True}})
queue_inst.save()

# Advance the overall progress counter by this item's output frame count.
currentProcessingFrame += queue_inst.totalOutputFrame
firstQueueData.updateProgress(currentProcessingFrame)


