In [2]:
"""! pip install ffmpeg-python
! pip install facenet-pytorch
! pip install mmcv
! apt install -y ffmpeg"""

import mmcv
import ffmpeg
from facenet_pytorch import MTCNN
import numpy as np
import os
import glob
import time

import PIL
from scipy import signal
from scipy.io import wavfile
from tqdm import tqdm_notebook as tqdm
import cv2

import pandas as pd
import numpy as np
import fastai
from fastai.text.models.transformer import MultiHeadAttention
from fastai.vision import *
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
import os
import glob
import json
from skimage import io, transform

# Name of the JSON label file that DeepFakeDF._get_labels opens inside each data directory.
LABEL_FILE = "metadata.json"

# Root directory holding the video folders. The trailing "/" matters: every path in
# this notebook is built by plain string concatenation with this prefix.
# NOTE(review): this is an absolute, machine-specific path (the trailing comment keeps
# what looks like the Kaggle input path) -- consider making it configurable.
PATH_ROOT_VID = "/media/dlo/New Volume/DeepFake/"#"../input/deepfake-detection-challenge/"
# Root directory under which the extracted frame/STFT images are written and later read.
PATH_OUT = PATH_ROOT_VID #""

# File extensions used to derive the temporary audio file name from a video file name.
VIDEO_EXT = '.mp4'
AUDIO_EXT = '.wav'


#######################################
########### PREPROCESSING #############
#######################################
def video_audio_text_files(video_name, path_curr_vid, path_current_frames):
    """Build the two file paths used when processing one video.

    video_name: file name of the video (expected to contain VIDEO_EXT).
    path_curr_vid: directory prefix of the video; concatenated as-is.
    path_current_frames: directory prefix of the per-video output folder;
        concatenated as-is.

    Returns a tuple (video_file, audio_file) where audio_file is the video
    name with VIDEO_EXT replaced by AUDIO_EXT, placed under path_current_frames.
    """
    wav_name = video_name.replace(VIDEO_EXT, AUDIO_EXT)
    video_file = "{}{}".format(path_curr_vid, video_name)
    audio_file = "{}{}".format(path_current_frames, wav_name)
    return video_file, audio_file

def numpy_from_audio(audio_file, downsample_factor = None):
    """
    Reads an audio .wav file and returns its samples in a numpy array,
    and the sampling rate [Hz].

    audio_file: path to the .wav file. Any apostrophes in the path are
        stripped before the file is opened.
    downsample_factor: optional integer; when given, the signal is resampled
        to len(samples) // downsample_factor points and the sampling rate is
        integer-divided by the same factor.

    Returns (samples, sample_rate). The samples are truncated so that their
    count is a multiple of sample_rate, i.e. trailing samples that do not fill
    a whole second are dropped.
    """
    sample_rate, samples = wavfile.read(audio_file.replace('\'',''))
    if downsample_factor is not None:
        samples = signal.resample(samples, len(samples) // downsample_factor)
        sample_rate //= downsample_factor
    # Use an explicit end index: the former `samples[:-remainder]` returned an
    # EMPTY array whenever the remainder was 0 (because `x[:-0]` is `x[:0]`).
    n_keep = len(samples) - (len(samples) % sample_rate)
    return samples[:n_keep], sample_rate

def get_stft_db(samples, sample_rate, nperseg = 128):
    """Computes the log-magnitude STFT of an audio signal.

    samples: the audio samples.
    sample_rate: sampling rate [Hz], forwarded to scipy.signal.stft as `fs`.
    nperseg: STFT segment (window) length in samples. Defaults to 128, the
        value this function has always used.

    Returns (f, t, log_magnitude): the frequency and time axes returned by
    scipy.signal.stft, and the NATURAL logarithm of the STFT magnitude.
    Note that, despite the function name, the values are not scaled to
    decibels, and bins whose magnitude is exactly zero come out as -inf.
    """
    f, t, Zxx = signal.stft(samples, fs = sample_rate, nperseg = nperseg)
    return f, t, np.log(np.abs(Zxx))

def write_stfts(video_name, path_curr_vid, path_current_frames, n_chunks = 300):
    """Creates .jpg files of an audio signal's STFT.

    A video 'video_name' is taken as input, a temporary .wav file holding its
    audio is created with ffmpeg, the log-magnitude STFT of that audio is
    normalised to [0, 1] and cut along the time axis into `n_chunks`
    equally-sized pieces, and each piece is written to
    '{path_current_frames}audio_{index}.jpg' (existing files are kept).
    The temporary .wav file is then deleted, whether or not processing succeeded.

    video_name: file name of the video inside path_curr_vid.
    path_curr_vid: directory prefix of the video (with trailing separator).
    path_current_frames: output directory prefix (with trailing separator);
        created if missing.
    n_chunks: number of STFT images to write. Defaults to 300, meant as one
        per video frame so that they match the extracted faces.

    Raises ValueError when the STFT has fewer than `n_chunks` time steps.
    """
    video_file, audio_file = video_audio_text_files(video_name, path_curr_vid, path_current_frames)

    # Checked on the apostrophe-stripped path, which is the path numpy_from_audio reads.
    audio_exists = os.path.isfile(audio_file.replace('\'',''))

    # makedirs (rather than mkdir) so that a missing parent directory is not an error.
    os.makedirs(path_current_frames, exist_ok = True)

    try:
        # Create audio_file .wav file from video
        if not audio_exists:
            (ffmpeg
            .input(video_file)
            .output(audio_file)
            .run())

        # Read in audio file
        raw_samples = numpy_from_audio(audio_file)

        # Get log-magnitude STFT
        _, _, samples = get_stft_db(*raw_samples)

        # Keep a whole number of chunks. An explicit end index is used because
        # `samples[:, :-remainder]` is empty when the remainder is 0.
        chunk_size = samples.shape[1] // n_chunks
        if chunk_size == 0:
            raise ValueError(
                f"STFT of {audio_file} has {samples.shape[1]} time steps, "
                f"fewer than the {n_chunks} chunks requested")
        samples = samples[:, :chunk_size * n_chunks]

        # log(0) yields -inf, which would turn the normalisation below into NaNs:
        # replace such bins by the smallest finite value (they map to black).
        finite = np.isfinite(samples)
        if not finite.all():
            floor = samples[finite].min() if finite.any() else 0.0
            samples = np.where(finite, samples, floor)

        # Normalise to [0, 1]; a constant spectrogram is left at 0 instead of
        # being divided by zero.
        samples -= np.min(samples)
        peak = np.max(samples)
        if peak > 0:
            samples /= peak

        for chunk_idx in range(n_chunks):
            fname = f"{path_current_frames}audio_{chunk_idx}.jpg"
            if not os.path.isfile(fname):
                PIL.Image.fromarray((samples[:,chunk_idx*chunk_size:(chunk_idx+1)*chunk_size] * 255)
                                .astype(np.uint8)).save(fname)
    finally:
        # Remove the temporary .wav without going through a shell, so quotes or
        # other special characters in the path cannot break (or alter) the command.
        if os.path.isfile(audio_file):
            os.remove(audio_file)

    
def get_model():
    """Return a new MTCNN instance constructed with device='cuda'."""
    return MTCNN(device = 'cuda')
    
def write_face_samples(model, output_path, invid):
    """
    Runs the face model on every frame of a video.

    model: callable invoked as model(pil_image, save_path) for each frame;
        presumably it writes the detected face crop(s) to save_path -- verify
        against the model's API.
    output_path: directory prefix (with trailing separator) for the output
        images; created if it does not exist.
    invid: path of the video to read.

    Frames whose '{output_path}webcam_{frame index}_0.jpg' file already exists
    are skipped, so an interrupted run can be resumed.
    """
    output_dir_missing = not os.path.isdir(output_path)
    if output_dir_missing:
        os.mkdir(output_path)

    for frame_ix, frame in enumerate(mmcv.VideoReader(invid)):
        frame_name = "{}webcam_{}_0.jpg".format(output_path, frame_ix)
        if not os.path.isfile(frame_name):
            # The frame is converted from BGR to RGB channel order before
            # being wrapped in a PIL image.
            rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            model(PIL.Image.fromarray(rgb_frame), frame_name)
    
    
def write_faces_and_stfts(curr_dir, extract_faces = False):
    """
    Process a directory of videos for kaggle's deepfake challenge.
    For each .mp4 video directly inside '{PATH_ROOT_VID}{curr_dir}/':
    - compute the STFT of the audio track and save it as .jpg images;
    - optionally detect the faces in the video and save them as .jpg images.
    Both outputs go to '{PATH_OUT}{curr_dir}/face_frames/{video name without extension}/'.

    curr_dir: name of the video directory, relative to PATH_ROOT_VID.
    extract_faces: when True, a face detector is created with get_model() and
        run on every video through write_face_samples(). Defaults to False,
        which matches the previous behaviour (that step was disabled).

    A failure on one video is reported on stdout and does not stop the loop.
    """

    path_curr_vid = f"{PATH_ROOT_VID}{curr_dir}/"
    path_write_faces = f"{PATH_OUT}{curr_dir}/face_frames/"

    print(f"Reading videos from {path_curr_vid}")
    # Sorted so that the processing order is the same on every run.
    video_names = sorted(os.path.basename(filename)
                         for filename in glob.iglob(path_curr_vid + '*' + VIDEO_EXT))
    os.makedirs(path_write_faces, exist_ok = True)

    print(f"Writing .jpg of audio's STFTs in {path_write_faces}")
    face_model = get_model() if extract_faces else None
    start_time = time.time()
    for ix, video_name in enumerate(video_names):

        output_path = f"{path_write_faces}{video_name.split('.')[0]}/"
        # Defined unconditionally: it is needed by the error messages below.
        invid = f"{path_curr_vid}{video_name}"

        if face_model is not None:
            try:
                write_face_samples(face_model, output_path, invid)
            except Exception as err:
                print(f"Couldnt yolo {invid}: {err!r}")

        # `except Exception` (not a bare except) so that Ctrl-C still
        # interrupts the loop; the error itself is reported.
        try:
            write_stfts(video_name, path_curr_vid, output_path)
        except Exception as err:
            print(f"Couldnt stft {invid}: {err!r}")
        print(f"Average time per video {round((time.time() - start_time)/(ix + 1),2)} s")


In [3]:
### Write faces and STFTs
# Preprocess one training directory. This call only writes the STFT images:
# it does not run the face extraction step.

write_faces_and_stfts('dfdc_train_part_3')
# Uncomment to preprocess the test videos as well.
#write_faces_and_stfts('test_videos')

Reading videos from /media/dlo/New Volume/DeepFake/dfdc_train_part_3/
Writing .jpg of audio's STFTs in /media/dlo/New Volume/DeepFake/dfdc_train_part_3/face_frames/
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.

Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average ti

Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average ti

Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average ti

Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average ti

Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average time per video 0.25 s
Average ti

In [None]:
###################################################################
# Data preparation (indexing of extracted faces & stft per video) #
# Custom pytorch dataset creation                                 #
###################################################################

# Data preparation
class DeepFakeDF():
    """ Indexes the faces & STFTs into a dataframe, to be used
    by the pytorch dataset. Processing is a bit long.

    The resulting dataframe (see get_df) has one row per STFT image, with the
    columns 'audio', 'video_name', 'sample', 'label', 'actor_0', 'actor_1'
    and 'dir'. The three path columns have the PATH_OUT prefix removed.
    """
    def __init__(self, data_dirs, test = False):
        """
        data_dirs: list of directory names (with trailing separator). The
            label files are read from PATH_ROOT_VID + name.
        test: when True no label file is read and every label is "".
        """
        self.data_dirs = [f"{PATH_ROOT_VID}{d}" for d in data_dirs]
        # NOTE(review): unlike data_dirs, the image directories are NOT prefixed,
        # so the globs below are relative to the current working directory --
        # confirm this is intended when PATH_OUT is not empty.
        self.frame_dirs = [f"{d}" for d in data_dirs]
        audio = []
        video = []
        for frame_dir in self.frame_dirs:
            audio.extend(glob.glob(f"{frame_dir}*/audio*"))
            video.extend(glob.glob(f"{frame_dir}*/webcam*"))
        if not test:
            self.labels = self._get_labels()
        else:
            self.labels = {}

        self.video = video
        self.df = self._prep_df_audio(audio)
        self.video_dicts = self._prep_video_dicts(video)
        self._merge_audio_video()

    def get_df(self):
        """Returns the per-frame dataframe built by the constructor."""
        return self.df

    def _get_labels(self):
        """Reads LABEL_FILE in every data directory and returns a dict mapping
        the video name (the JSON key up to '.mp4') to its 'label' entry."""
        labels = {}
        for d in self.data_dirs:
            with open(f"{d}{LABEL_FILE}", "r") as f:
                labels.update({f"{k.split('.mp4')[0]}": v['label']
                             for k, v in json.load(f).items()})

        return labels

    def _merge_audio_video(self):
        """
        Audio has 1 sample per frame in any case. But the face extractor may have
        missed some faces. This function attempts to provide a face for each audio sample.
        """
        self.df['dir'] = self.df['audio'].str.split("/").str[-3]

        # (video_name, sample) of every row, in row order, as plain Python values
        # so that they compare equal to the keys built by _prep_video_dicts.
        frame_keys = list(zip(self.df['video_name'].tolist(),
                              self.df['sample'].tolist()))

        for actor_idx in (0, 1):
            col = f"actor_{actor_idx}"
            detected_frames = self.video_dicts[actor_idx]

            # Flagging frames for which this actor was detected
            detected = np.array([key in detected_frames for key in frame_keys],
                                dtype = bool)

            # Path of the face image where the actor was detected, NaN elsewhere
            paths = (PATH_OUT + self.df['dir'] + "/" + self.df['video_name']
                     + "/" + "webcam_" + self.df['sample'].astype(str)
                     + f"_{actor_idx}" + ".jpg")
            self.df[col] = paths.where(detected)

            # Filling NaNs per video name: a missing face is replaced by the
            # previous detected face, or by the next one at the start of a video.
            self.df[col] = (self.df.groupby('video_name')[col]
                            .transform(lambda faces: faces.ffill().bfill()))

        # As not all videos have two actors, for now, simply copying the 1st actor into the 2nd
        # actor field when there is only 1 actor.
        self.df.loc[self.df['actor_1'].isna(), 'actor_1'] = self.df['actor_0']

        # regex=False: PATH_OUT is a literal path, not a regular expression.
        for col in ['audio', 'actor_0', 'actor_1']:
            self.df[col] = self.df[col].str.replace(PATH_OUT, "", regex = False)


    def _prep_df_audio(self, audio):
        """Returns a dataframe indexed on frames of videos.
        Contains the path to each .jpg of STFTs of video frames, the video name
        (parent directory of the image), the sample index (number after the last
        '_' of the file name) and the label looked up in self.labels ("" when
        unknown). Rows are sorted by video name, then sample index."""
        df = pd.DataFrame(audio, columns = ['audio'])
        path_parts = df['audio'].str.split("/")
        df['video_name'] = path_parts.str[-2]
        df['sample'] = (path_parts.str[-1].str.split(".").str[0]
                        .str.split("_").str[-1].astype(int))
        df['label'] = df['video_name'].apply(lambda x: self.labels.get(x,""))
        df = df.sort_values(by=['video_name','sample'])
        df['actor_0'] = ""
        df['actor_1'] = ""
        return df

    def _prep_video_dicts(self, video):
        """Returns dicts, one that tells if a face was detected in frames of videos,
        and one that tells if a second face was detected in frames of videos.

        video: list of face image paths shaped like '.../{video}/webcam_{sample}_{actor}.jpg'.
        Both dicts are keyed by (video_name, sample) and are empty when no face
        image was found at all."""
        actor_0_present, actor_1_present = {}, {}
        for path in video:
            video_name, frame_name = path.split("/")[-2:]
            sample, actor = frame_name.replace(".jpg","").split("_")[-2:]
            key, actor = (video_name, int(sample)), int(actor)
            if actor == 0:
                actor_0_present[key] = actor
            elif actor == 1:
                actor_1_present[key] = actor
        return actor_0_present, actor_1_present

    
# Custom pytorch dataset
class DeepFakeJPGDataset(Dataset):
    """DeepFakeJPGDataset. Opens .jpgs of either faces or STFTs for each frame of video.
    Returns tensors of a concatenation of the video's frames.

    NOTE(review): the integer targets in `self.y` are the category codes of the
    labels present in `df`, so the label -> code mapping depends on which labels
    a given dataframe contains -- confirm this is acceptable for small splits.
    """

    def __init__(self, df, col_name, transform = None, downsample_factor = 1):
        """df[col_name] has to contain, for each video, the list of paths
        (relative to PATH_OUT) to .jpg files, either of faces or of STFTs.
        transform: resize images - all cropped faces don't have the same
        shape, they won't fit together in a batch. Need to resize them,
        use transforms.Resize((150,100)) for example.
        downsample_factor: use > 1 to not use all the frames of a video
        """
        self.x = df[col_name]
        self.y = df['label'].astype('category').cat.codes.astype(int)
        self.transform = transform
        self.downsample_factor = int(downsample_factor)
        self.col_name = col_name

    def __len__(self): return len(self.y)

    def __getitem__(self, idx):
        """Returns the frames of video `idx` as a float32 tensor of shape
        (n_frames, channels, height, width), with values in [-0.5, 0.5]."""
        if torch.is_tensor(idx):
            idx = idx.tolist()

        # Opening one every 'downsample_factor' image
        frame_paths = self.x.iloc[idx][::self.downsample_factor]

        frames = []
        for rel_path in frame_paths:
            # `with` closes the underlying file as soon as the pixels are read,
            # instead of leaving one open handle per frame.
            with PIL.Image.open(PATH_OUT + rel_path) as im:
                # Resizing
                if self.transform is not None:
                    im = self.transform(im)
                # Normalizing the .jpgs to [-0.5, 0.5] both for
                # faces and sound STFTs.
                # TODO: Normalize faces with imagenet stats, as the
                # model taking faces as input is pretrained on imagenet
                frames.append((np.array(im) / 255.0) - 0.5)

        # Adding channel dimension to STFT images (grayscale)
        if len(frames[0].shape) == 2:
            frames = [frame[...,None] for frame in frames]

        # Stack once in numpy, then wrap: much faster than building a tensor
        # from a Python list of arrays.
        # (n_frames, height, width, channels) -> (n_frames, channels, height, width)
        batch = np.stack(frames).astype(np.float32)
        return torch.from_numpy(batch).permute(0,3,1,2)
    
class DeepFakeDetectionDataset(Dataset):
    """Pairs two input datasets (faces and STFTs) with a shared target.

    Item `i` is ((x1[i], x2[i]), y[i]); the length is that of `y`.
    """

    def __init__(self, x1, x2, y):
        self.x1 = x1
        self.x2 = x2
        self.y = y

    def __len__(self):
        n_items = len(self.y)
        return n_items

    def __getitem__(self, i):
        inputs = (self.x1[i], self.x2[i])
        target = self.y[i]
        return inputs, target
    
##########
# Models #
##########

# Frame embeddings
def fully_connected(layers, dropout, bn = True):
    """Stacks one bn_drop_lin group per consecutive pair of sizes in 'layers'.

    layers: list of layer sizes; group k maps layers[k] to layers[k+1].
    dropout: dropout probability of each group; when shorter than the number
        of groups, the extra groups are not created.
    bn: forwarded to bn_drop_lin.

    Every group, including the last one, is given the same (shared) in-place
    ReLU module as activation. Returns an nn.Sequential.
    """
    shared_relu = nn.ReLU(inplace=True)
    size_pairs = zip(layers, layers[1:])
    stacked = []
    for (n_in, n_out), p in zip(size_pairs, dropout):
        stacked.extend(bn_drop_lin(n_in, n_out, p = p, actn = shared_relu, bn = bn))
    return nn.Sequential(*stacked)

class VideoAnalyzer(nn.Module):
    """Classifies a video from the sequence of its frame embeddings.

    The embeddings go through a MultiHeadAttention layer (constructed with 5
    as first argument), are flattened per video, passed through two
    fully-connected groups (down to 50 features) and a final 2-output linear
    layer.
    """
    def __init__(self, frame_embedding_size, n_frames):
        super().__init__()
        flat_size = n_frames*frame_embedding_size
        sizes = [flat_size, flat_size // 2, 50]
        self.self_attention = MultiHeadAttention(5,frame_embedding_size)
        self.linears = fully_connected(sizes, [0.1]*len(sizes))
        self.classifier = nn.Linear(sizes[-1],2)

    def forward(self, x):
        """x: frame embeddings, flattened after attention to (x.size(0), -1).
        Returns the raw 2-way scores (no softmax is applied here)."""
        attended = self.self_attention(x)
        batch_size = attended.size(0)
        flat = attended.contiguous().view(batch_size, -1)
        hidden = self.linears(flat)
        return self.classifier(hidden)

    
# Global model
class DeepFakeDetector(nn.Module):
    """DeepFakeDetectionModel. Four sub-networks are chained:
    -Face analyzer (model_face): 2d convnet applied to each face image; its
        output is pooled and flattened.
    -STFT analyzer (model_stft): 2d convnet applied to each STFT image.
    -Face & STFT merger (model_merge): takes the concatenation of the two
        outputs above and produces a small vector per frame - the frame embedding.
    -Video analyzer (model_video): receives the embeddings of all the frames
        of a video at once and produces the TRUE/FAKE scores.
    The forward pass returns the log-softmax of those scores.
    """
    def __init__(self, model_face, model_stft, model_merge, model_video, n_frames):
        super().__init__()
        # number of frames read from each input in forward()
        self.n_frames = n_frames

        # the two convolutional heads, and the pooling applied to the face head
        self.model_face = model_face
        self.model_stft = model_stft
        self.poolflat = fastai.layers.PoolFlatten()

        # per-frame embedding network
        self.model_merge = model_merge

        # aggregator of the frame embeddings, and classifier
        self.model_video = model_video

    def forward(self, *x):
        """x[0]: face images, x[1]: STFT images; both are indexed as
        [batch, frame, :, :, :]."""
        faces, stfts = x[0], x[1]

        embeddings = []
        for frame_idx in range(self.n_frames):
            face_features = self.model_face(faces[:,frame_idx,:,:,:])
            face_features = self.poolflat(face_features)
            stft_features = self.model_stft(stfts[:,frame_idx,:,:,:])
            merged = self.model_merge(torch.cat([face_features, stft_features], dim=1))
            # insert a frame axis so the embeddings can be joined along dim 1
            embeddings.append(merged[:,None,:])

        scores = self.model_video(torch.cat(embeddings, dim = 1))
        return F.log_softmax(scores, dim = -1)


In [None]:
### Preparing dataframes of faces and STFTs paths
def _frames_per_video(frame_df):
    """Collapses a per-frame dataframe into one row per video.

    frame_df: dataframe as returned by DeepFakeDF.get_df().
    Returns a dataframe with the columns 'video_name', 'audio' (list of STFT
    image paths), 'actor_0' (list of face image paths) and 'label' (label of
    the first frame of the video).
    """
    grouped = frame_df.groupby('video_name')
    # .first() rather than .nth(0): since pandas 2.0 nth() keeps the original
    # row index, which would no longer line up with the two columns above.
    per_video = pd.concat([grouped['audio'].apply(list),
                           grouped['actor_0'].apply(list),
                           grouped['label'].first()], axis=1)
    return per_video.reset_index()

train_df = _frames_per_video(DeepFakeDF(["train_sample_videos/"]).get_df())

test_df = _frames_per_video(DeepFakeDF(["test_videos/"], test = True).get_df())


In [None]:
# Random train / validation split of the videos.
# NOTE(review): no random seed is set, so the split changes on every run.
val_perc = 0.5
n_val = int(val_perc*len(train_df))
shuffled_idx = np.random.permutation(train_df.index.tolist())
val_idx, train_idx = shuffled_idx[:n_val], shuffled_idx[n_val:]

downsample_factor = 10 # using one every 10 frames only
n_frames = 300 // downsample_factor

def _make_split(videos_df, frame_step):
    """Builds the face dataset, the STFT dataset, their merged dataset and a
    DataLoader (default settings) for the videos of `videos_df`."""
    videos_df = videos_df.reset_index(drop = True)
    images = DeepFakeJPGDataset(videos_df, 'actor_0',
                                transforms.Resize((150,100)), frame_step)
    sound = DeepFakeJPGDataset(videos_df, 'audio',
                               transforms.Resize((65,25)), frame_step)
    merged = DeepFakeDetectionDataset(images, sound, images.y)
    return images, sound, merged, DataLoader(merged)

# train torch datasets & dataloaders
tr_images, tr_sound, train_ds, train_dl = _make_split(train_df.iloc[train_idx],
                                                      downsample_factor)

# val torch datasets & dataloaders
val_images, val_sound, valid_ds, valid_dl = _make_split(train_df.iloc[val_idx],
                                                        downsample_factor)

# test torch datasets & dataloaders
te_images, te_sound, test_ds, test_dl = _make_split(test_df, downsample_factor)

# fastai databunch
db = DataBunch(train_dl, valid_dl)
db.batch_size = 8

In [None]:
# Build the fastai DataBunch from the train / validation / test dataloaders, then set
# its batch size to 8 (the dataloaders themselves were created with default settings).
db = DataBunch(train_dl,valid_dl,test_dl = test_dl)
db.batch_size = 8


In [None]:
# Submodels
# Face analyzer: body of a resnet18 as built by fastai's create_body.
model_faces = create_body(fastai.vision.models.resnet18)

# STFT analyzer: small CNN taking 1-channel images, whose last layer has 64 channels.
model_stfts = simple_cnn(actns = [1,8,16,32,64], strides = [(2,1),(2,2),(2,2),(2,2)],
                         bn = True)

# Frame embedding network: maps the concatenated face and STFT features to a
# vector of size frame_embedding_size.
# 64 is the last entry of `actns` above; 512 is presumably the number of
# features coming out of the resnet18 body -- TODO confirm.
frame_embedding_size = 32
merge_layers = [512 + 64, frame_embedding_size]
model_frame = fully_connected(merge_layers, dropout = [0.1]*len(merge_layers))

# Video-level classifier working on the n_frames embeddings of a video
# (n_frames is defined in the dataloader cell).
model_video = VideoAnalyzer(frame_embedding_size, n_frames)
# Global model
model = DeepFakeDetector(model_faces, model_stfts, model_frame, 
                             model_video, n_frames = n_frames)


In [None]:
# Wrap the data and the model in a fastai Learner that reports accuracy.
# NOTE(review): no loss function is passed explicitly; the model returns
# log-probabilities (log_softmax), so confirm the Learner's default loss matches.
learn = Learner(db, model, metrics = [accuracy])

In [None]:
# Train with fastai's one-cycle policy; the arguments are presumably
# (number of epochs, maximum learning rate) -- confirm against the fastai docs.
learn.fit_one_cycle(1,1e-3)