In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import os
import cv2
import json
import random
import shutil
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")

In [2]:
train_data_path = '/kaggle/input/cropped-openforensics/train_cropped_faces_224_rgb/Train'

# Report the number of directory entries in each class folder of the
# training split.
train_real_dir = os.path.join(train_data_path, "Real")
print(f'Number of real images: {len(os.listdir(train_real_dir))}')
train_fake_dir = os.path.join(train_data_path, "Fake")
print(f'Number of fake images: {len(os.listdir(train_fake_dir))}')

Number of real images: 85006
Number of fake images: 65860


In [3]:
# Share of Real images in the training split, in percent. The counts are read
# from the class folders instead of being copied by hand from an earlier
# output, so the figure stays correct if the dataset changes.
train_real_count = len(os.listdir(os.path.join(train_data_path, "Real")))
train_fake_count = len(os.listdir(os.path.join(train_data_path, "Fake")))
train_real_count * 100 / (train_real_count + train_fake_count)

56.345366086460835

In [4]:
val_data_path = '/kaggle/input/cropped-openforensics/val_cropped_faces_224_rgb/Val'

# Report the number of directory entries in each class folder of the
# validation split.
val_real_dir = os.path.join(val_data_path, "Real")
print(f'Number of real images: {len(os.listdir(val_real_dir))}')
val_fake_dir = os.path.join(val_data_path, "Fake")
print(f'Number of fake images: {len(os.listdir(val_fake_dir))}')

Number of real images: 4782
Number of fake images: 10563


In [7]:
# Share of Fake images in the validation split, in percent. The counts are
# read from the class folders instead of being copied by hand from an earlier
# output, so the figure stays correct if the dataset changes.
val_fake_count = len(os.listdir(os.path.join(val_data_path, "Fake")))
val_real_count = len(os.listdir(os.path.join(val_data_path, "Real")))
val_fake_count * 100 / (val_fake_count + val_real_count)

68.83675464320626

In [5]:
test_data_path = '/kaggle/input/cropped-openforensics/test_cropped_faces_224_rgb/Test'

# Report the number of directory entries in each class folder of the
# test split.
test_real_dir = os.path.join(test_data_path, "Real")
print(f'Number of real images: {len(os.listdir(test_real_dir))}')
test_fake_dir = os.path.join(test_data_path, "Fake")
print(f'Number of fake images: {len(os.listdir(test_fake_dir))}')

Number of real images: 21048
Number of fake images: 28670


In [6]:
import os

def remove_all_files(directory):
    """Delete every regular file located directly inside ``directory``.

    Entries that are not files (for example sub-directories) are left in
    place and reported. One line is printed per entry handled.

    Args:
        directory: Path of the folder to empty.

    Returns:
        None. When ``directory`` does not exist, or exists but is not a
        directory, a message is printed and nothing is deleted.
    """
    # Check if directory exists
    if not os.path.exists(directory):
        print(f"The directory {directory} does not exist.")
        return

    # os.listdir() raises NotADirectoryError when handed a plain file, so
    # reject anything that exists but is not a directory before listing it.
    if not os.path.isdir(directory):
        print(f"The path {directory} is not a directory.")
        return

    # Loop through all entries in the directory
    for filename in os.listdir(directory):
        file_path = os.path.join(directory, filename)

        # Only regular files are removed; everything else is skipped
        if os.path.isfile(file_path):
            os.remove(file_path)
            print(f"Removed file: {file_path}")
        else:
            print(f"Skipped non-file item: {file_path}")

# Usage example: empty both class folders of the Train20Percent working copy
stale_dirs = (
    "/kaggle/working/Train20Percent/Train/Real",
    "/kaggle/working/Train20Percent/Train/Fake",
)
for stale_dir in stale_dirs:
    remove_all_files(stale_dir)

The directory /kaggle/working/Train20Percent/Train/Real does not exist.
The directory /kaggle/working/Train20Percent/Train/Fake does not exist.


# Balancing Train Data

In [None]:
# Create directories for real and fake faces.
# The cells that fill these folders write to absolute '/kaggle/working/...'
# paths, so create exactly those paths here instead of paths relative to the
# current working directory (which only match when cwd is /kaggle/working).
os.makedirs('/kaggle/working/Train/Train/Real', exist_ok=True)
os.makedirs('/kaggle/working/Train/Train/Fake', exist_ok=True)

In [None]:
# Relative base folder of the balanced training output.
# NOTE(review): the cells visible below write to hard-coded
# '/kaggle/working/Train/Train/...' paths and do not read this variable;
# confirm whether it is still needed.
output_base_dir = 'Train/Train'

In [None]:
# Balance the training split by shrinking the majority class (Real) to the
# size of the minority class (Fake).
majority_class_data_path = os.path.join(train_data_path, 'Real')
minority_class_data_path = os.path.join(train_data_path, 'Fake')
num_images = len(os.listdir(minority_class_data_path))
print(f'Downsampling Real class to {num_images} images.\n')

# Make sure the destination exists so this cell does not depend on the
# notebook's working directory or on another cell having created it.
real_output_dir = '/kaggle/working/Train/Train/Real'
os.makedirs(real_output_dir, exist_ok=True)

# Random sample the real images so that it is the same with number of fake images.
# os.listdir() returns names in arbitrary order and the global `random` state
# is unseeded, so sort the names and draw from a dedicated seeded generator:
# the same subset is then selected on every run.
sampling_rng = random.Random(42)
sampled_majority_img_filenames = sampling_rng.sample(
    sorted(os.listdir(majority_class_data_path)), num_images
)
for img_filename in tqdm(sampled_majority_img_filenames, desc="Downsampling Real class", unit='image'):
    img_path = os.path.join(majority_class_data_path, img_filename)
    output_img_path = os.path.join(real_output_dir, img_filename)
    # Copy the file byte-for-byte. Decoding with cv2.imread and re-encoding
    # with cv2.imwrite recompresses lossy formats and breaks on files OpenCV
    # cannot decode (imread returns None in that case).
    shutil.copyfile(img_path, output_img_path)

In [None]:
# Take all images from the minority class
print('Copying all Fake images...')

# Resolve the source folder here rather than relying on the value left behind
# by another cell: the validation section reuses the same variable name for a
# different folder, so a stale value would copy the wrong images.
minority_class_data_path = os.path.join(train_data_path, 'Fake')

# Make sure the destination exists regardless of the working directory.
fake_output_dir = '/kaggle/working/Train/Train/Fake'
os.makedirs(fake_output_dir, exist_ok=True)

minority_class_img_filenames = os.listdir(minority_class_data_path)
for img_filename in tqdm(minority_class_img_filenames, desc="Copying Fake images", unit='image'):
    img_path = os.path.join(minority_class_data_path, img_filename)
    output_img_path = os.path.join(fake_output_dir, img_filename)
    # Copy the file byte-for-byte. Decoding with cv2.imread and re-encoding
    # with cv2.imwrite recompresses lossy formats and breaks on files OpenCV
    # cannot decode (imread returns None in that case).
    shutil.copyfile(img_path, output_img_path)

In [None]:
# Zip the balanced training folder; the archive path returned by
# make_archive is the cell's displayed value.
shutil.make_archive(
    base_name='train_cropped_faces_224_rgb_balanced',
    format='zip',
    root_dir='/kaggle/working/Train',
)

# Balancing Validation Data

In [8]:
# Create directories for real and fake faces.
# The cells that fill these folders write to absolute '/kaggle/working/...'
# paths, so create exactly those paths here instead of paths relative to the
# current working directory (which only match when cwd is /kaggle/working).
os.makedirs('/kaggle/working/Val/Val/Real', exist_ok=True)
os.makedirs('/kaggle/working/Val/Val/Fake', exist_ok=True)

In [9]:
# Relative base folder of the balanced validation output.
# NOTE(review): the cells visible below write to hard-coded
# '/kaggle/working/Val/Val/...' paths and do not read this variable;
# confirm whether it is still needed.
output_base_dir = 'Val/Val'

In [10]:
# Balance the validation split by shrinking the majority class (Fake) to the
# size of the minority class (Real).
majority_class_data_path = os.path.join(val_data_path, 'Fake')
minority_class_data_path = os.path.join(val_data_path, 'Real')
num_images = len(os.listdir(minority_class_data_path))
print(f'Downsampling Fake class to {num_images} images.\n')

# Make sure the destination exists so this cell does not depend on the
# notebook's working directory or on another cell having created it.
fake_output_dir = '/kaggle/working/Val/Val/Fake'
os.makedirs(fake_output_dir, exist_ok=True)

# Random sample the fake images so that it is the same with number of real images.
# os.listdir() returns names in arbitrary order and the global `random` state
# is unseeded, so sort the names and draw from a dedicated seeded generator:
# the same subset is then selected on every run.
sampling_rng = random.Random(42)
sampled_majority_img_filenames = sampling_rng.sample(
    sorted(os.listdir(majority_class_data_path)), num_images
)
for img_filename in tqdm(sampled_majority_img_filenames, desc="Downsampling Fake class", unit='image'):
    img_path = os.path.join(majority_class_data_path, img_filename)
    output_img_path = os.path.join(fake_output_dir, img_filename)
    # Copy the file byte-for-byte. Decoding with cv2.imread and re-encoding
    # with cv2.imwrite recompresses lossy formats and breaks on files OpenCV
    # cannot decode (imread returns None in that case).
    shutil.copyfile(img_path, output_img_path)

Downsampling Fake class to 4782 images.



Downsampling Fake class: 100%|██████████| 4782/4782 [00:43<00:00, 108.81image/s]


In [11]:
# Take all images from the minority class
print('Copying all Real images...')

# Resolve the source folder here rather than relying on the value left behind
# by another cell: the training section reuses the same variable name for a
# different folder, so a stale value would copy the wrong images.
minority_class_data_path = os.path.join(val_data_path, 'Real')

# Make sure the destination exists regardless of the working directory.
real_output_dir = '/kaggle/working/Val/Val/Real'
os.makedirs(real_output_dir, exist_ok=True)

minority_class_img_filenames = os.listdir(minority_class_data_path)
for img_filename in tqdm(minority_class_img_filenames, desc="Copying Real images", unit='image'):
    img_path = os.path.join(minority_class_data_path, img_filename)
    output_img_path = os.path.join(real_output_dir, img_filename)
    # Copy the file byte-for-byte. Decoding with cv2.imread and re-encoding
    # with cv2.imwrite recompresses lossy formats and breaks on files OpenCV
    # cannot decode (imread returns None in that case).
    shutil.copyfile(img_path, output_img_path)

Copying all Real images...


Copying Real images: 100%|██████████| 4782/4782 [00:43<00:00, 110.96image/s]


In [12]:
# Zip the balanced validation folder; the archive path returned by
# make_archive is the cell's displayed value.
shutil.make_archive(
    base_name='val_cropped_faces_224_rgb_balanced',
    format='zip',
    root_dir='/kaggle/working/Val',
)

'/kaggle/working/val_cropped_faces_224_rgb_balanced.zip'