In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
import numpy as np
import numpy.random as rn
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Binary real-vs-fake image classifier: three Conv2D + MaxPooling2D stages
# (each pooling step halves the spatial size), light dropout, then a dense head.
model_deepfake = keras.Sequential([
  layers.Conv2D(filters = 64, kernel_size = 4, strides = (1, 1), input_shape = (256, 256, 3), padding = 'same', activation = 'relu'),
  layers.MaxPooling2D(),
  layers.Conv2D(filters = 32, kernel_size = 4, strides = (1, 1), padding = 'same', activation = 'relu'),
  layers.MaxPooling2D(),
  layers.Conv2D(filters = 16, kernel_size = 4, strides = (1, 1), padding = 'same', activation = 'relu'),
  layers.MaxPooling2D(),
  layers.Dropout(rate = 0.1),
  layers.Flatten(),
  layers.Dense(units = 4096, activation = 'relu'),
  layers.Dense(units = 256, activation = 'relu'),
  # The sigmoid keeps the single output in [0, 1]. BinaryCrossentropy (whose
  # default is from_logits=False) and BinaryAccuracy (default threshold 0.5)
  # both interpret the prediction as a probability, not as a raw logit.
  layers.Dense(units = 1, activation = 'sigmoid')
])

In [3]:
# Adam optimizer, binary cross-entropy loss, binary accuracy as the metric.
# `metrics` is passed as a list, which is the form Model.compile documents;
# newer Keras releases reject a bare Metric instance.
# BinaryCrossentropy is left at its default from_logits=False, i.e. it expects
# the model's output to be a probability.
model_deepfake.compile(optimizer = 'adam',
                       metrics = [keras.metrics.BinaryAccuracy()],
                       loss = keras.losses.BinaryCrossentropy())

In [4]:
# Print the layer-by-layer output shapes and parameter counts of the model
model_deepfake.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d (Conv2D)              (None, 256, 256, 64)      3136      
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 128, 128, 64)      0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 128, 128, 32)      32800     
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 64, 64, 32)        0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 64, 64, 16)        8208      
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 32, 32, 16)        0         
_________________________________________________________________
dropout (Dropout)            (None, 32, 32, 16)        0

In [5]:
# Create a function to randomly select image paths from the dataset folders
import pathlib
from pathlib import Path

def get_images1(n: int, dtype: str='train', seed: int=None):
    '''
    Returns the paths of n randomly selected testing, training, or validation images.

    The n images are split at random between the fake and the real folder
    (anywhere from 0 to n of them are real). Within each folder the paths are
    drawn without replacement, so no path is returned twice.

    Parameters
    ----------
    n : int
        Total number of image paths to return (fake + real).
    dtype : str
        Which split to sample from: 'train', 'valid', or 'test'.
    seed : int, optional
        When given, NumPy's global random state is seeded with it before any
        random draw, so repeated calls with the same seed select the same
        images (as long as the directory listing order does not change).

    Returns
    -------
    tuple of (np.ndarray, np.ndarray)
        (fake_imgs, real_imgs): object arrays of pathlib.Path entries.

    Raises
    ------
    ValueError
        If dtype is not a known split, if n is negative or larger than the
        expected number of files, or (raised by NumPy) if a folder holds fewer
        images than were requested from it.

    Takes ~29 sec / 100 iter with n = 100
    Takes ~300 sec / 1000 iter with n = 100
    '''
    # Make sure the dtype param is valid
    if dtype not in ['train', 'valid', 'test']:
        raise ValueError("dtype argument must be train, valid, or test.")

    # Expected number of files in the directory of interest
    n_files = {"train": 50000, "valid": 10000, "test": 10000}[dtype]

    # Make sure you don't want more pictures than we have
    if n > n_files:
        raise ValueError(f'There are not {n} files in the {dtype} folder')
    if n < 0:
        raise ValueError(f'n must be non-negative, got {n}')

    # Create the paths to the data
    datapath = Path('.') / 'data' / 'real_vs_fake' / 'real-vs-fake' / dtype
    fakepath = datapath / 'fake'
    realpath = datapath / 'real'

    # Get the paths for the images (dtype=object keeps empty folders well-typed)
    fakepaths = np.array([*fakepath.glob('*.jpg')], dtype = object)
    realpaths = np.array([*realpath.glob('*.jpg')], dtype = object)

    ############# TODO: Optimize path -> list -> array process above ######

    # Set a seed if present
    if seed is not None:
        rn.seed(seed)

    # Decide on how many images will be real or fake.
    # The builtin int is used because np.int was removed in NumPy 1.24.
    n_real = int(rn.randint(low = 0, high = n + 1))
    n_fake = n - n_real

    # Randomly select the images
    fake_imgs = rn.choice(fakepaths, size = n_fake, replace = False)
    real_imgs = rn.choice(realpaths, size = n_real, replace = False)

    return fake_imgs, real_imgs

In [8]:
# Helper for get_images2: pick paths by their position in a folder listing
def _paths_at_positions(folder, positions):
    '''
    Return the *.jpg paths in `folder` whose listing position is in `positions`.

    `positions` is a set of non-negative ints. The result is an object array
    holding only the paths that were actually found; positions beyond the end
    of the listing are ignored.
    '''
    selected = np.empty(len(positions), dtype = object)
    found = 0

    # Nothing requested: don't walk the directory at all
    if positions:
        for position, img in enumerate(folder.glob('*.jpg')):
            # Keep the file if its listing position was drawn
            if position in positions:
                selected[found] = img
                found += 1

                # Stop early once every requested position has been filled
                if found == len(positions):
                    break

    # Drop any slots that were never filled so callers never see placeholders
    return selected[:found]


# Create a second function to randomly select image paths from the dataset folders
def get_images2(n: int, dtype: str='train', seed: int=None):
    '''
    Returns the paths of n randomly selected testing, training, or validation images.

    Instead of materializing every path, this draws random listing positions
    and walks each folder once, keeping the files at those positions. The n
    images are split at random between the fake and the real folder.

    Parameters
    ----------
    n : int
        Total number of image paths to return (fake + real).
    dtype : str
        Which split to sample from: 'train', 'valid', or 'test'.
    seed : int, optional
        When given, NumPy's global random state is seeded with it before any
        random draw.

    Returns
    -------
    tuple of (np.ndarray, np.ndarray)
        (fake_imgs, real_imgs): object arrays of pathlib.Path entries. If a
        folder holds fewer files than the expected count for the split, fewer
        than the requested number of paths may be returned for that folder.

    Raises
    ------
    ValueError
        If dtype is not a known split, or if n is negative or larger than the
        expected number of files.

    Timings recorded for the original version of this function:
    Takes ~13 sec / 100 iter with n = 100
    Takes ~118 sec / 1000 iter with n = 100
    '''
    # Make sure the dtype param is valid
    if dtype not in ['train', 'valid', 'test']:
        raise ValueError("dtype argument must be train, valid, or test.")

    # Expected number of files in the directory of interest
    n_files = {"train": 50000, "valid": 10000, "test": 10000}[dtype]

    # Make sure you don't want more pictures than we have
    if n > n_files:
        raise ValueError(f'There are not {n} files in the {dtype} folder')
    if n < 0:
        raise ValueError(f'n must be non-negative, got {n}')

    # Create the paths to the data
    datapath = Path('.') / 'data' / 'real_vs_fake' / 'real-vs-fake' / dtype
    fakepath = datapath / 'fake'
    realpath = datapath / 'real'

    # Set a seed if present
    if seed is not None:
        rn.seed(seed)

    # Decide on how many images will be real or fake.
    # The builtin int is used because np.int was removed in NumPy 1.24.
    n_real = int(rn.randint(low = 0, high = n + 1))
    n_fake = n - n_real

    # Randomly select indices for the images. Sampling WITHOUT replacement
    # guarantees exactly n_fake / n_real distinct positions; drawing with
    # replacement would collapse duplicates and leave output slots unfilled.
    idx_fake = set(rn.choice(n_files, size = n_fake, replace = False).tolist())
    idx_real = set(rn.choice(n_files, size = n_real, replace = False).tolist())

    # Walk each folder once and keep the files at the drawn positions
    fake_imgs = _paths_at_positions(fakepath, idx_fake)
    real_imgs = _paths_at_positions(realpath, idx_real)

    return fake_imgs, real_imgs

In [10]:
# Quick check: draw 10 random image paths from the default 'train' split
get_images1(10)

(array([PosixPath('data/real_vs_fake/real-vs-fake/train/fake/D42XKJIJCI.jpg'),
        PosixPath('data/real_vs_fake/real-vs-fake/train/fake/8P3SVIZFWB.jpg'),
        PosixPath('data/real_vs_fake/real-vs-fake/train/fake/MWED1XBF3S.jpg'),
        PosixPath('data/real_vs_fake/real-vs-fake/train/fake/BEX6V5S75N.jpg'),
        PosixPath('data/real_vs_fake/real-vs-fake/train/fake/RHR54UO7WT.jpg'),
        PosixPath('data/real_vs_fake/real-vs-fake/train/fake/TKA9FBZ60B.jpg'),
        PosixPath('data/real_vs_fake/real-vs-fake/train/fake/KK13P0GYN8.jpg'),
        PosixPath('data/real_vs_fake/real-vs-fake/train/fake/JJWOORDQ5N.jpg')],
       dtype=object),
 array([PosixPath('data/real_vs_fake/real-vs-fake/train/real/21823.jpg'),
        PosixPath('data/real_vs_fake/real-vs-fake/train/real/57658.jpg')],
       dtype=object))

In [32]:
# Same quick check with the second implementation (default 'train' split)
get_images2(10)

(array([PosixPath('data/real_vs_fake/real-vs-fake/train/fake/YSCARQLWA0.jpg')],
       dtype=object),
 array([PosixPath('data/real_vs_fake/real-vs-fake/train/real/46221.jpg'),
        PosixPath('data/real_vs_fake/real-vs-fake/train/real/46981.jpg'),
        PosixPath('data/real_vs_fake/real-vs-fake/train/real/27557.jpg'),
        PosixPath('data/real_vs_fake/real-vs-fake/train/real/53034.jpg'),
        PosixPath('data/real_vs_fake/real-vs-fake/train/real/53539.jpg'),
        PosixPath('data/real_vs_fake/real-vs-fake/train/real/05368.jpg'),
        PosixPath('data/real_vs_fake/real-vs-fake/train/real/06003.jpg'),
        PosixPath('data/real_vs_fake/real-vs-fake/train/real/46810.jpg'),
        PosixPath('data/real_vs_fake/real-vs-fake/train/real/61717.jpg')],
       dtype=object))