In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Root directory of the dataset; the per-split index CSVs sit directly inside it.
filepath = './140K-RealFakeImages'

# Load only the label and path columns from each split's index file.
train, test, val = (
    pd.read_csv(f'{filepath}/{split}.csv', usecols=['label', 'label_str', 'path'])
    for split in ('train', 'test', 'valid')
)

# Preview the first rows of the training index.
train.head()

In [None]:
# Examine the ratio of real to fake images across train, test and validation sets

# Cast the label column of every split to plain integers.
for split_df in (train, test, val):
    split_df['label'] = split_df['label'].astype(int)

In [None]:
# Number of rows per class label in the training split.
train.groupby('label_str').size()

In [None]:
# Number of rows per class label in the test split.
test.groupby('label_str').size()

In [None]:
# Number of rows per class label in the validation split.
val.groupby('label_str').size()

In [None]:
# Tag every row with the split it came from so the three frames can be
# stacked and compared in a single plot.
train['dataset'] = 'Train'
test['dataset'] = 'Test'
val['dataset'] = 'Validation'

# ignore_index gives the stacked frame a unique index; without it the three
# splits would contribute overlapping row labels.
all_splits = pd.concat([train, test, val], ignore_index=True)

# One group of bars per split on the x-axis, one bar per class.
# No `orient` argument: 'h' contradicted the x= mapping and the axis labels.
fig, ax = plt.subplots()
sns.countplot(x='dataset', hue='label_str', data=all_splits, ax=ax)

ax.set_xlabel('Dataset Type')
ax.set_ylabel('Count')
ax.set_title('Count of Real vs. Fake Images')
plt.show()

In [None]:
import os
import cv2
import random

# Plot a sample set of images from training set

# Directories holding the real and fake training images.
real_train_path = filepath + '/real_vs_fake/real-vs-fake/train/real'
fake_train_path = filepath + '/real_vs_fake/real-vs-fake/train/fake'

# os.listdir returns entries in arbitrary order; sorting makes index-based
# (seeded) sampling pick the same files on every machine and run.
real_train_images = sorted(os.listdir(real_train_path))
fake_train_images = sorted(os.listdir(fake_train_path))

# Label convention used throughout this notebook: 1 = real, 0 = fake.
real_train_labels = [1] * len(real_train_images)
fake_train_labels = [0] * len(fake_train_images)

# Parallel lists: train_labels[i] is the label of train_images[i].
train_images = real_train_images + fake_train_images
train_labels = real_train_labels + fake_train_labels

def read_image(path, size=(128, 128)):
    """Load an image from disk as a resized RGB array.

    Parameters
    ----------
    path : str
        Location of the image file.
    size : tuple of int, optional
        Target (width, height) passed to ``cv2.resize``. Defaults to
        (128, 128).

    Returns
    -------
    The image converted from BGR to RGB and resized to ``size``.

    Raises
    ------
    FileNotFoundError
        If ``cv2.imread`` cannot load the file (it returns ``None`` instead
        of raising, which would otherwise fail later inside ``cvtColor``).
    """
    img = cv2.imread(path)
    if img is None:
        raise FileNotFoundError(f'Could not read image: {path}')
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)  # OpenCV loads images as BGR
    return cv2.resize(img, size)

def plot_image(img, label, ax, img_path):
    """Draw one image on ``ax`` with its class as title and its path below.

    Parameters
    ----------
    img : array-like
        Image data accepted by ``ax.imshow``.
    label : int
        Class label; 1 is titled 'Real', anything else 'Fake'.
    ax : matplotlib Axes
        Axes to draw on.
    img_path : str
        Text shown as the x-axis label beneath the image.
    """
    ax.imshow(img)
    ax.set_title('Real' if label == 1 else 'Fake')
    ax.set_xlabel(img_path, fontsize=6)
    # Remove only the ticks: ax.axis('off') would also hide the x-label
    # that was just set, so the path would never be displayed.
    ax.set_xticks([])
    ax.set_yticks([])

# random.seed only affects the stdlib `random` module; the sample below is
# drawn with NumPy, so it needs its own seeded generator to be reproducible.
random.seed(210)
rng = np.random.default_rng(210)

# Draw up to 25 distinct images (replace=False avoids showing duplicates).
n_samples = min(25, len(train_images))
indices = rng.choice(len(train_images), size=n_samples, replace=False)
fig, axes = plt.subplots(5, 5, figsize=(15, 6))
axes = axes.flatten()

for i, ax in zip(indices, axes):
    img_name = train_images[i]
    label = train_labels[i]

    # Real and fake images live in separate directories.
    img_path = os.path.join(real_train_path if label == 1 else fake_train_path, img_name)
    img = read_image(img_path)
    plot_image(img, label, ax, img_path)

# Blank any grid cells left unused when fewer than 25 images are available.
for ax in axes[n_samples:]:
    ax.axis('off')

plt.tight_layout()
plt.show()

In [None]:
from tqdm import tqdm

# Pixel-level summary statistics over the whole training set.

# One row per training image: the resized RGB image flattened to a vector.
pix = np.array([
    read_image(
        os.path.join(real_train_path if lbl == 1 else fake_train_path, name)
    ).flatten()
    for name, lbl in tqdm(list(zip(train_images, train_labels)))
])

# Per-pixel-position describe() table; evaluate `stats` in a cell to view it.
stats = pd.DataFrame(pix).describe()

In [None]:
# Per-class pixel matrices: one flattened image per row.
# The first array was previously bound to `real_px` while `real_pix` was read
# below, which raised NameError. Iterating the per-class file lists directly
# also gives each progress bar the correct total.
real_pix = np.array([read_image(os.path.join(real_train_path, name)).flatten()
                     for name in tqdm(real_train_images, desc='real')])
fake_pix = np.array([read_image(os.path.join(fake_train_path, name)).flatten()
                     for name in tqdm(fake_train_images, desc='fake')])

# Summary statistics per pixel position, separately for each class.
real_stats = pd.DataFrame(real_pix).describe()
fake_stats = pd.DataFrame(fake_pix).describe()

In [None]:
# Examine the various color channels within the images

channels = ['Red', 'Green', 'Blue']

for i, channel in enumerate(channels):
    # Each row of `pix` is a flattened RGB image, so every third column
    # starting at i belongs to channel i. ravel() pools those pixels into one
    # 1-D sample; passing the 2-D slice would make plt.hist treat every
    # column as a separate dataset.
    values = pix[:, i::3].ravel()

    # range=(0, 256) with 256 bins gives exactly one bin per integer value.
    plt.hist(values, bins=256, range=(0, 256), alpha=0.5, color=channel.lower())
    plt.xlabel('Pixel Value')
    plt.ylabel('Frequency')
    plt.title(channel + ' Channel')
    plt.show()