# Projekt z przedmiotu: "Systemy na bazie sztucznej inteligencji"
Temat własny: **Zaprojektować sieć neuronową do rozpoznawania osoby na podstawie zdjęcia**

Link do dataset: https://vis-www.cs.umass.edu/lfw/

# Przygotowanie danych

## Użyte biblioteki

In [1]:
import os
import shutil
import cv2
import time
import random
import numpy as np

import tensorflow as tf
from tensorflow.keras.applications.inception_v3 import preprocess_input

import seaborn as sns
import matplotlib.pyplot as plt
tf.__version__, np.__version__

import pickle

## Zmienne globalne

In [2]:
DATA_PATH = "Data"
DATASETS = "Datasets"

TRAIN_DATASETS_PATH = os.path.join(DATASETS, "trainDatasets")
TRAIN_DATASET_PATH = os.path.join(TRAIN_DATASETS_PATH, "trainDataset")
TRAIN_TRIPLETS_PATH = os.path.join(TRAIN_DATASETS_PATH, "trainTriplets")
TRAIN_ANCHOR_DATASETS_PATH = os.path.join(TRAIN_DATASETS_PATH, "trainAnchorDataset")
TRAIN_POSITIVE_DATASETS_PATH = os.path.join(TRAIN_DATASETS_PATH, "trainPositiveDataset")
TRAIN_NEGATIVE_DATASETS_PATH = os.path.join(TRAIN_DATASETS_PATH, "trainNegativeDataset")

TEST_DATASETS_PATH = os.path.join(DATASETS, "testDatasets")
TEST_DATASET_PATH = os.path.join(TEST_DATASETS_PATH, "testDataset")
TEST_TRIPLETS_PATH = os.path.join(TEST_DATASETS_PATH, "testTriplets")
TEST_ANCHOR_DATASETS_PATH = os.path.join(TEST_DATASETS_PATH, "testAnchorDataset")
TEST_POSITIVE_DATASETS_PATH = os.path.join(TEST_DATASETS_PATH, "testPositiveDataset")
TEST_NEGATIVE_DATASETS_PATH = os.path.join(TEST_DATASETS_PATH, "testNegativeDataset")

CHECKPOINT_PATH = 'Checkpoints'
ENCODER_SAVE_PATH = 'Encoder'

LFW = "LFW"
LFW_DATASET_CHANGED = os.path.join(LFW, "lfw_changed")
LFW_DATASET = os.path.join(LFW, "lfw")

OUR_PHOTOS_DIR = os.path.join(LFW, "our_photos")
OUR_RAW_PHOTOS = os.path.join(OUR_PHOTOS_DIR, "raw")
OUR_EXTRACTED_FACES_PHOTOS = os.path.join(OUR_PHOTOS_DIR, "extracted")

## Foldery do prywatnych zdjęć

In [8]:
!mkdir $LFW
!mkdir $OUR_PHOTOS_DIR
!mkdir $OUR_RAW_PHOTOS

A subdirectory or file LFW already exists.
A subdirectory or file LFW\our_photos already exists.
A subdirectory or file LFW\our_photos\raw already exists.


## Pobranie i wypakowanie bazy zdjęć

In [4]:
!curl -o LFW/lfw.tgz http://vis-www.cs.umass.edu/lfw/lfw.tgz
!tar -xzvf "./LFW/lfw.tgz" -C "./LFW/"

^C


x lfw/
x lfw/George_HW_Bush/
x lfw/George_HW_Bush/George_HW_Bush_0001.jpg
x lfw/George_HW_Bush/George_HW_Bush_0002.jpg
x lfw/George_HW_Bush/George_HW_Bush_0003.jpg
x lfw/George_HW_Bush/George_HW_Bush_0004.jpg
x lfw/George_HW_Bush/George_HW_Bush_0005.jpg
x lfw/George_HW_Bush/George_HW_Bush_0006.jpg
x lfw/George_HW_Bush/George_HW_Bush_0007.jpg
x lfw/George_HW_Bush/George_HW_Bush_0008.jpg
x lfw/George_HW_Bush/George_HW_Bush_0009.jpg
x lfw/George_HW_Bush/George_HW_Bush_0010.jpg
x lfw/George_HW_Bush/George_HW_Bush_0011.jpg
x lfw/George_HW_Bush/George_HW_Bush_0012.jpg
x lfw/George_HW_Bush/George_HW_Bush_0013.jpg
x lfw/Curtis_Strange/
x lfw/Curtis_Strange/Curtis_Strange_0001.jpg
x lfw/Marc_Grossman/
x lfw/Marc_Grossman/Marc_Grossman_0001.jpg
x lfw/Marc_Grossman/Marc_Grossman_0002.jpg
x lfw/Marc_Grossman/Marc_Grossman_0003.jpg
x lfw/Marc_Grossman/Marc_Grossman_0004.jpg
x lfw/Michael_Schumacher/
x lfw/Michael_Schumacher/Michael_Schumacher_0001.jpg
x lfw/Michael_Schumacher/Michael_Schumacher_000

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0     0    0     0    0     0      0      0 --:--:--  0:00:01 --:--:--     0
  0  172M    0 92610    0     0  36034      0  1:23:31  0:00:02  1:23:29 36049
  0  172M    0  396k    0     0   111k      0  0:26:20  0:00:03  0:26:17  111k
  0  172M    0  782k    0     0   174k      0  0:16:52  0:00:04  0:16:48  174k
  0  172M    0 1203k    0     0   214k      0  0:13:40  0:00:05  0:13:35  241k
  0  172M    0 1623k    0     0   246k      0  0:11:54  0:00:06  0:11:48  321k
  1  172M    1 2074k    0     0   276k      0  0:10:37  0:00:07  0:10:30  402k
  1  172M    1 2600k    0     0   304k      0  0:09:38  0:00:08  0:09:30  442k
  1  172M    1 3116k    0     0   327k      0  0:08

## Wyodrębnienie twarzy ze zdjęć wrzuconych przez nas
Zdjęcia  (Imie_Nazwisko_XXXX.jpg, nieważny rozmiar) wrzucamy tu: ./LFW/our_photos/raw do utworzonego przez nas folderu Imie_Nazwisko.

* najlepiej jakby na zdjęciu znajdowała się tylko nasza twarz.

* co najmniej 4 zdjęcia.

* opcjonalnie sprawdzić w folderze: ./LFW/our_photos/extracted czy algorytm dobrze wyciął naszą twarz (uznał jakiś element zdjęcia za twarz, który twarzą nie jest).

In [9]:
haarCascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')

def extract_faces():
    for dir in os.listdir(OUR_RAW_PHOTOS):
        dir_path = os.path.join(OUR_RAW_PHOTOS, dir)
        count = 0

        extractedDir = os.path.join(OUR_EXTRACTED_FACES_PHOTOS, dir)
        if not os.path.exists(extractedDir):
                os.makedirs(extractedDir)
        else:
          files_in_extracted = os.listdir(extractedDir)
          if files_in_extracted:
              for file in files_in_extracted:
                  os.remove(os.path.join(extractedDir, file))

        for file in os.listdir(dir_path):
            image = cv2.imread(os.path.join(OUR_RAW_PHOTOS, dir, file))
            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
            faces = haarCascade.detectMultiScale(image, scaleFactor=1.24, minNeighbors=4)

            for (x,y,w,h) in faces:
              face = image[y:(y+h), x:(x+w)]
              face = cv2.resize(face, (250, 250), interpolation = cv2.INTER_AREA)
              savePath = os.path.join(extractedDir, (str(count) + ".jpg"))

              cv2.imwrite(savePath, face)
              count += 1
              # zakładamy że jest jedna twarz na zdjęciu, break dlatego że nie wiem jak wyciągnąć tylko pierwszą wartość xd
              break;
extract_faces()

## Przeniesienie zdjęć z datasetu i naszych do innego folderu

* zdjęcia z datasetu są przenoszone pod warunkiem, że w folderze znajdują się co najmniej dwa zdjęcia danej osoby (po wycięciu twarzy).

In [21]:
def adjust_and_move_photos():
  if os.path.exists(LFW_DATASET_CHANGED):
    shutil.rmtree(LFW_DATASET_CHANGED)

  for dir in os.listdir(LFW_DATASET):
    dir_path = os.path.join(LFW_DATASET, dir)

    if(len(os.listdir(dir_path)) > 2):
      destinationDir = os.path.join(LFW_DATASET_CHANGED, dir)
      if not os.path.exists(destinationDir):
        os.makedirs(destinationDir)

      id = 0
      for file in os.listdir(dir_path):
        image = cv2.imread(os.path.join(LFW_DATASET, dir, file))
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        savePath = os.path.join(destinationDir, (str(id) + ".jpg"))
        savingReturn = cv2.imwrite(savePath, image)
        if not savingReturn:
          return
        id += 1

adjust_and_move_photos()

## Podzielenie danych na zbiór uczący i walidacyjny

In [29]:
# to tylko raz na całe uczenie
def split_dataset(directory, split=0.8):
  dirs = os.listdir(directory)
  random.shuffle(dirs)

  nof_train = int(len(dirs)*split)
  train_list, test_list = {}, {}

  # Create train list
  for dir in dirs[:nof_train]:
    nof_files = len(os.listdir(os.path.join(directory, dir)))
    train_list[dir] = nof_files

  # Create test list
  for dir in dirs[nof_train:]:
    nof_files = len(os.listdir(os.path.join(directory, dir)))
    test_list[dir] = nof_files

  return train_list, test_list

In [30]:
# train_list, test list = {folderName: numberOfFilesInFolder, folderName: numberOfFilesInFolder, ...}
[train_list, test_list] = split_dataset(LFW_DATASET_CHANGED)

# save datasets
%mkdir $DATASETS
%mkdir $TRAIN_DATASETS_PATH
%mkdir $TEST_DATASETS_PATH

with open(TRAIN_DATASET_PATH, 'wb') as output:
  pickle.dump(train_list, output)
with open(TEST_DATASET_PATH, 'wb') as output:
  pickle.dump(test_list, output)

In [31]:
# Test
with open(TRAIN_DATASET_PATH, 'rb') as input:
  inTrainSet = pickle.load(input)
with open(TEST_DATASET_PATH, 'rb') as input:
  inTestSet = pickle.load(input)

## Tworzenie zbiorów tripletów (anchor, positive, negative)
anchor - zdjęcie do którego porównujemy, positive - zdjęcie tej samej osoby, negative - zdjęcie losowej innej osoby

In [32]:
#to tylko raz na całe uczenie
def create_triplets(directory, folder_list, max_files=10):
  triplets = []
  dirs = list(folder_list.keys())

  for dir in dirs:
    files = list(os.listdir(os.path.join(directory, dir)))[:max_files]
    num_files = len(files)

    for i in range(num_files - 1):
      for j in range(num_files - 1):
        if(j != i):
          anchor = (dir, f"{i}.jpg")
          positive = (dir, f"{j}.jpg")

          #find directory with photos of any other person
          neg_dir = dir
          while neg_dir == dir:
            neg_dir = random.choice(dirs)

          neg_file = random.randint(0, folder_list[neg_dir] - 1)
          negative = (neg_dir, f"{neg_file}.jpg")

          triplets.append((anchor, positive, negative))

  random.shuffle(triplets)
  return triplets

In [33]:
train_triplet = create_triplets(LFW_DATASET_CHANGED, train_list)
test_triplet  = create_triplets(LFW_DATASET_CHANGED, test_list)

with open(TRAIN_TRIPLETS_PATH, 'wb') as output:
  pickle.dump(train_triplet, output)
with open(TEST_TRIPLETS_PATH, 'wb') as output:
  pickle.dump(test_triplet, output)

print("Number of training triplets:", len(train_triplet))
# print(train_triplet)
print("Number of testing triplets :", len(test_triplet))
# print(test_triplet)

Number of training triplets: 1844
Number of testing triplets : 560


In [38]:
with open(TRAIN_TRIPLETS_PATH, 'rb') as input:
  inTrainTripletsSet = pickle.load(input)
print("Number of training triplets:", len(inTrainTripletsSet))
# print(inTrainTripletsSet)

with open(TEST_TRIPLETS_PATH, 'rb') as input:
  inTestTripletsSet = pickle.load(input)
print("Number of testing triplets :", len(inTestTripletsSet))
# print(inTestTripletsSet)


Number of training triplets: 1844
Number of testing triplets : 560


## Podział danych z Dataset na 3 listy (anchor, positive, negative)

In [35]:
def createAnchorPotsitiveNegativeDataset(triplets_set):
  anchorData = []
  positiveData = []
  negativeData = []
  for triplet in triplets_set:
    a,p,n = triplet
    anchorData.append(os.path.join(LFW_DATASET_CHANGED, a[0], a[1]))
    positiveData.append(os.path.join(LFW_DATASET_CHANGED, p[0], p[1]))
    negativeData.append(os.path.join(LFW_DATASET_CHANGED, n[0], n[1]))
  return (anchorData, positiveData, negativeData)

In [36]:
# train data
(train_anchor_data, train_positive_data, train_negative_data) = createAnchorPotsitiveNegativeDataset(inTrainTripletsSet)
# print(train_anchor_data)
# print(train_positive_data)
# print(train_negative_data)

with open(TRAIN_ANCHOR_DATASETS_PATH, 'wb') as output:
  pickle.dump(train_anchor_data, output)
with open(TRAIN_POSITIVE_DATASETS_PATH, 'wb') as output:
  pickle.dump(train_positive_data, output)
with open(TRAIN_NEGATIVE_DATASETS_PATH, 'wb') as output:
  pickle.dump(train_negative_data, output)

In [37]:
# test data
(test_anchor_data, test_positive_data, test_negative_data) = createAnchorPotsitiveNegativeDataset(inTestTripletsSet)
# print(test_anchor_data)
# print(test_positive_data)
# print(test_negative_data)

with open(TEST_ANCHOR_DATASETS_PATH, 'wb') as output:
  pickle.dump(test_anchor_data, output)
with open(TEST_POSITIVE_DATASETS_PATH, 'wb') as output:
  pickle.dump(test_positive_data, output)
with open(TEST_NEGATIVE_DATASETS_PATH, 'wb') as output:
  pickle.dump(test_negative_data, output)