# Preprocessing Pt.1

## Load Necessary Libraries and Packages

In [1]:
import numpy as np
import os, sys
import shutil
import pickle # use this to pickle the completed pipeline and then run on training set

import matplotlib.pyplot as plt
%matplotlib inline

import cv2
import itertools
import time
import random
from PIL import Image

## Load the Data

In [2]:
# Root folder of the raw CEDAR data.
path = "./data/raw/CEDAR/"

# os.walk yields (dirpath, dirnames, filenames) triples; the first triple describes
# `path` itself, so its `dirnames` entry lists the immediate sub-directories.
# key=int orders the names numerically (1, 2, ..., 55) instead of lexicographically,
# which requires every directory name to parse as an integer.
dir_list = sorted(next(os.walk(path))[1], key=int)
print(dir_list)

['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44', '45', '46', '47', '48', '49', '50', '51', '52', '53', '54', '55']


In [3]:
# One entry per folder, in `dir_list` order; each entry is a list of image paths.
forgeries, originals = [], []

for folder in dir_list:
    folder_path = path + folder
    # File names in lexicographic order, each prefixed with its folder path.
    images = [folder_path + '/' + name for name in sorted(os.listdir(folder_path))]
    # The split is purely positional: the first 24 sorted entries are taken as the
    # forgeries and everything after them as the originals.
    forgeries.append(images[:24])
    originals.append(images[24:])

In [4]:
# Spot-check the split: show the forgery paths collected for the third folder (index 2).
forgeries[2]

['./data/raw/CEDAR/3/forgeries_3_1.png',
 './data/raw/CEDAR/3/forgeries_3_10.png',
 './data/raw/CEDAR/3/forgeries_3_11.png',
 './data/raw/CEDAR/3/forgeries_3_12.png',
 './data/raw/CEDAR/3/forgeries_3_13.png',
 './data/raw/CEDAR/3/forgeries_3_14.png',
 './data/raw/CEDAR/3/forgeries_3_15.png',
 './data/raw/CEDAR/3/forgeries_3_16.png',
 './data/raw/CEDAR/3/forgeries_3_17.png',
 './data/raw/CEDAR/3/forgeries_3_18.png',
 './data/raw/CEDAR/3/forgeries_3_19.png',
 './data/raw/CEDAR/3/forgeries_3_2.png',
 './data/raw/CEDAR/3/forgeries_3_20.png',
 './data/raw/CEDAR/3/forgeries_3_21.png',
 './data/raw/CEDAR/3/forgeries_3_22.png',
 './data/raw/CEDAR/3/forgeries_3_23.png',
 './data/raw/CEDAR/3/forgeries_3_24.png',
 './data/raw/CEDAR/3/forgeries_3_3.png',
 './data/raw/CEDAR/3/forgeries_3_4.png',
 './data/raw/CEDAR/3/forgeries_3_5.png',
 './data/raw/CEDAR/3/forgeries_3_6.png',
 './data/raw/CEDAR/3/forgeries_3_7.png',
 './data/raw/CEDAR/3/forgeries_3_8.png',
 './data/raw/CEDAR/3/forgeries_3_9.png']

## Train-Test-Validation Split

The 55 individuals are split into three disjoint sets: training, test, and validation. Each individual's folder is assigned to exactly one set.
The training set has 33 individuals; the test set and the validation set have 11 individuals each.

In [6]:
# Choose the 33 training individuals from IDs 1..55, uniformly and without replacement.
# `sample` already guarantees distinct picks, so no manual duplicate check or retry
# loop is needed.
# A dedicated, seeded generator makes the split identical on every run
# (Restart Kernel -> Run All) without touching the global `random` state.
TRAIN_SPLIT_SEED = 42
train_set = random.Random(TRAIN_SPLIT_SEED).sample(range(1, 56), 33)

In [7]:
# Confirm how many individuals ended up in the training set.
len(train_set)

33

In [8]:
# Sort in place (ascending) so the chosen IDs are easy to scan; the trailing
# expression displays the sorted list.
train_set.sort()
train_set

[1,
 2,
 3,
 6,
 7,
 8,
 9,
 10,
 12,
 14,
 16,
 17,
 18,
 21,
 22,
 24,
 26,
 28,
 31,
 32,
 33,
 34,
 35,
 36,
 40,
 42,
 43,
 44,
 46,
 49,
 50,
 51,
 53]

In [9]:
# Every individual ID, 1 through 55.
total_inds = list(range(1, 56))

# IDs that were not picked for the training set.
# Filtering the ordered ID list (rather than converting a set difference to a list)
# keeps `remainder` in ascending order by construction, so code that later picks
# from it by position does not depend on set iteration order.
train_ids = set(train_set)
remainder = [ind for ind in total_inds if ind not in train_ids]

In [10]:
# Choose the 11 test individuals from the IDs left over after the training draw
# (22 of them when the training set holds 33 of the 55 individuals).
# `sample` returns distinct elements, replacing the manual retry loop, and raises
# ValueError if fewer than 11 IDs are available instead of looping forever.
# Sorting the population and using a seeded generator makes the draw reproducible.
TEST_SPLIT_SEED = 42
test_set = random.Random(TEST_SPLIT_SEED).sample(sorted(remainder), 11)

In [11]:
# Whatever is left of `remainder` after the test draw becomes the validation set;
# sorted so the order does not depend on set iteration order.
validation_set = sorted(set(remainder) - set(test_set))

# Sanity check: together the three splits must cover every individual exactly once.
assert len(train_set) + len(test_set) + len(validation_set) == len(total_inds)
assert set(train_set) | set(test_set) | set(validation_set) == set(total_inds)

In [12]:
### make new folders to store train, test, and validation image sets

parent_dir = "./data/interim/"
train_dir = "train"
test_dir = "test"
val_dir = "validation"

train_path = os.path.join(parent_dir, train_dir)
test_path = os.path.join(parent_dir, test_dir)
val_path = os.path.join(parent_dir, val_dir)

# makedirs(exist_ok=True) creates any missing parent directories and is a no-op when
# the folder already exists, so this cell is safe both on a fresh checkout and on
# re-runs (a plain os.mkdir raises FileExistsError the second time).
for split_path in (train_path, test_path, val_path):
    os.makedirs(split_path, exist_ok=True)

In [14]:
# shutil.move(source, destination, copy_function = copy2)

# sort appropriate images to train, test, and validation sets:
# move each training individual's folder out of the raw CEDAR directory.
path = "./data/raw/CEDAR/"
destination = "./data/interim/train/"

# shutil.move only places a folder *inside* `destination` when `destination` is an
# existing directory; otherwise the folder would be renamed to `destination` itself.
os.makedirs(destination, exist_ok=True)

for folder in train_set:
    source = path + str(folder)
    moved_to = os.path.join(destination, str(folder))
    # Re-run safety: a folder that already sits in this split (and is gone from the
    # raw directory) is left alone; every other inconsistency still raises.
    if os.path.isdir(moved_to) and not os.path.exists(source):
        continue
    shutil.move(source, destination)
    

In [15]:
# Move each test individual's folder out of the raw CEDAR directory.
path = "./data/raw/CEDAR/"
destination = "./data/interim/test/"

# shutil.move only places a folder *inside* `destination` when `destination` is an
# existing directory; otherwise the folder would be renamed to `destination` itself.
os.makedirs(destination, exist_ok=True)

for folder in test_set:
    source = path + str(folder)
    moved_to = os.path.join(destination, str(folder))
    # Re-run safety: a folder that already sits in this split (and is gone from the
    # raw directory) is left alone; every other inconsistency still raises.
    if os.path.isdir(moved_to) and not os.path.exists(source):
        continue
    shutil.move(source, destination)

In [16]:
# Move each validation individual's folder out of the raw CEDAR directory.
path = "./data/raw/CEDAR/"
destination = "./data/interim/validation/"

# shutil.move only places a folder *inside* `destination` when `destination` is an
# existing directory; otherwise the folder would be renamed to `destination` itself.
os.makedirs(destination, exist_ok=True)

for folder in validation_set:
    source = path + str(folder)
    moved_to = os.path.join(destination, str(folder))
    # Re-run safety: a folder that already sits in this split (and is gone from the
    # raw directory) is left alone; every other inconsistency still raises.
    if os.path.isdir(moved_to) and not os.path.exists(source):
        continue
    shutil.move(source, destination)