In [None]:
import os
import pandas as pd
import json
import numpy as np
import matplotlib.pyplot as plt
import shutil

from google.colab import drive
drive.mount('/content/drive')

%cd "drive/MyDrive/biometric"

Using NFIQ2 from the command line, a CSV file "quality_scores.csv" was generated, containing the quality score of each image.

In [None]:
# Show up to 100 rows before pandas truncates a displayed frame.
pd.set_option('display.max_rows', 100)

# Only these two columns of the quality-score file are needed.
cols=["Filename", "QualityScore"]

# The path is relative to the project folder (the current working directory),
# like every other path used in this notebook; a leading "/" would point at
# the filesystem root instead.
df = pd.read_csv("utils/quality_scores.csv", usecols=cols)
df

Unnamed: 0,Filename,QualityScore
0,Livescan_500/Natural/Indoor/10_i_1_n_1.png,59
1,Livescan_500/Natural/Indoor/10_i_1_n_2.png,64
2,Livescan_500/Natural/Indoor/10_i_1_n_3.png,66
3,Livescan_500/Natural/Indoor/10_i_1_n_4.png,58
4,Livescan_500/Natural/Indoor/10_i_1_n_5.png,58
...,...,...
4091,Livescan_500/White/Outdoor/9_o_2_w_4.png,48
4092,Livescan_500/White/Outdoor/9_o_2_w_5.png,49
4093,Livescan_500/White/Outdoor/9_o_2_w_6.png,43
4094,Livescan_500/White/Outdoor/9_o_2_w_7.png,37


# Dataset organisation
In this notebook we prepare the dataset to be used during the experiments:
1. we remove the 6 worst-quality samples of each recorded finger;
2. we split the data into train and test sets;
3. we split both the train and test sets into probe and gallery (for the test set, this split is repeated 5 times in order to obtain statistically reliable scores).

Note that originally we have 64 different identities, and 2 fingers for each identity. Since each finger is independent, we can treat different fingers as different subjects, so we have a total of $64 \cdot 2 = 128$ identities. (In section 2 the middle-finger images are discarded, so the splits built there contain 64 identities.)

## 1. Remove bad samples for each finger

In [None]:
# Plain Python lists of the two columns read from the CSV.
Filenames = list(df['Filename'])
Qualities = list(df['QualityScore'])

# The last path component is split on "_": field 0 is parsed as the identity
# and field 2 as the finger (e.g. ".../10_i_1_n_1.png" -> identity 10, finger 1).
name_fields = [path.split('/')[-1].split("_") for path in Filenames]
identities = [int(fields[0]) for fields in name_fields]
fingers = [int(fields[2]) for fields in name_fields]

# Attach both as new columns of the quality-score frame and preview the result.
df['identity'] = identities
df['finger'] = fingers
df.head(30)

Unnamed: 0,Filename,QualityScore,identity,finger
0,Livescan_500/Natural/Indoor/10_i_1_n_1.png,59,10,1
1,Livescan_500/Natural/Indoor/10_i_1_n_2.png,64,10,1
2,Livescan_500/Natural/Indoor/10_i_1_n_3.png,66,10,1
3,Livescan_500/Natural/Indoor/10_i_1_n_4.png,58,10,1
4,Livescan_500/Natural/Indoor/10_i_1_n_5.png,58,10,1
5,Livescan_500/Natural/Indoor/10_i_1_n_6.png,62,10,1
6,Livescan_500/Natural/Indoor/10_i_1_n_7.png,55,10,1
7,Livescan_500/Natural/Indoor/10_i_1_n_8.png,59,10,1
8,Livescan_500/Natural/Indoor/10_i_2_n_1.png,41,10,2
9,Livescan_500/Natural/Indoor/10_i_2_n_2.png,37,10,2


In [None]:
# One group per (identity, finger) pair, i.e. per recorded finger.
grouped_df = df.groupby(by=["identity", "finger"])

dataframes = []          # samples of each finger, as grouped
sorted_dataframes = []   # same samples, lowest QualityScore first
cleaned_dataframes = []  # samples that are kept
for group_key in grouped_df.groups:
    finger_samples = grouped_df.get_group(group_key)
    ranked_samples = finger_samples.sort_values("QualityScore")

    dataframes.append(finger_samples)
    sorted_dataframes.append(ranked_samples)
    cleaned_dataframes.append(ranked_samples.iloc[6:])  # remove worst 6 samples for each finger

In [None]:
# record paths of images to keep
# NOTE: the loop variable is deliberately not called `df`; reusing that name
# would overwrite the quality-score DataFrame and break any re-run of the
# cells that read it.
valid_samples = []
for finger_df in cleaned_dataframes:
  valid_samples += list(finger_df["Filename"])

# keep only the file name (last "/"-separated component) of every path
cleaned_paths = [path.split("/")[-1] for path in valid_samples]

In [None]:
# write list of paths to json file, stored under the single key "samples_list"
dict_valid_samples = {"samples_list": cleaned_paths}

with open("utils/valid_samples.json", "w") as out_file:
  json.dump(dict_valid_samples, out_file)

In [None]:
# load list of paths from json file
with open("utils/valid_samples.json") as json_file:
    data = json.load(json_file)
valid_list = data["samples_list"]

# Set lookup: testing membership against the list would rescan it for every
# file in the folder.
valid_names = set(valid_list)

# WARNING: this permanently deletes every file of "LivescanUnified" whose name
# is not in the valid list.
for file in os.listdir("LivescanUnified"):
    if file not in valid_names:
        os.remove(os.path.join("LivescanUnified", file))

## 2. Split in train and test set
We decided to keep only the outdoor datasets for computational reasons and because they have better quality on average. We then use about 70% of the data for the train set and 30% for the test set. All identities have the same number of samples in the train and test sets (the split is made per sample rather than per subject).


In [None]:
# read all the files in the folder "LivescanUnified", remove middle fingers and organize the remaining in a dict
ids_to_file = {}
for fname in os.listdir("LivescanUnified"):
    fields = fname.split('_')

    if fields[2] == '2': # remove middle fingers
        os.remove(os.path.join("LivescanUnified", fname))
        continue

    # group the remaining file names by their identity field (kept as a string)
    ids_to_file.setdefault(fields[0], []).append(fname)

ids_to_file

{'10': ['10_o_1_n_1.png',
  '10_o_1_n_6.png',
  '10_o_1_w_6.png',
  '10_o_1_n_2.png',
  '10_o_1_w_3.png',
  '10_o_1_w_7.png',
  '10_o_1_n_3.png',
  '10_o_1_n_7.png',
  '10_o_1_w_8.png',
  '10_o_1_w_2.png',
  '10_o_1_w_5.png',
  '10_o_1_w_4.png',
  '10_o_1_n_4.png',
  '10_o_1_w_1.png',
  '10_o_1_n_5.png',
  '10_o_1_n_8.png'],
 '18': ['18_o_1_w_1.png',
  '18_o_1_n_7.png',
  '18_o_1_w_6.png',
  '18_o_1_w_7.png',
  '18_o_1_w_2.png',
  '18_o_1_n_4.png',
  '18_o_1_n_2.png',
  '18_o_1_w_5.png',
  '18_o_1_w_4.png',
  '18_o_1_w_3.png',
  '18_o_1_n_8.png',
  '18_o_1_w_8.png',
  '18_o_1_n_6.png',
  '18_o_1_n_1.png',
  '18_o_1_n_3.png',
  '18_o_1_n_5.png'],
 '35': ['35_o_1_w_3.png',
  '35_o_1_n_1.png',
  '35_o_1_n_4.png',
  '35_o_1_n_3.png',
  '35_o_1_w_7.png',
  '35_o_1_w_2.png',
  '35_o_1_w_5.png',
  '35_o_1_n_7.png',
  '35_o_1_w_1.png',
  '35_o_1_w_4.png',
  '35_o_1_n_5.png',
  '35_o_1_n_6.png',
  '35_o_1_n_2.png',
  '35_o_1_n_8.png',
  '35_o_1_w_8.png',
  '35_o_1_w_6.png'],
 '1': ['1_o_1_w_4.p

In [None]:
# train (~70%) and test (~30%) split
# With 8 natural + 8 white images per identity, 6 + 5 = 11 of 16 (~69%) go to
# the train set and the remaining 5 to the test set.
N_TRAIN_NATURALS = 6
N_TRAIN_WHITES = 5

# Fixed seed so that "Restart & Run All" reproduces exactly the same split.
rng = np.random.default_rng(42)

train = {}
test = {}

# Identities and file names are visited in sorted order so that the draw does
# not depend on the arbitrary order in which os.listdir returned the files.
for subject_id in sorted(ids_to_file):
    naturals = sorted(x for x in ids_to_file[subject_id] if '_n_' in x)
    whites = sorted(x for x in ids_to_file[subject_id] if '_n_' not in x)

    # sample without replacement; .tolist() gives plain Python strings
    train_naturals = rng.choice(naturals, N_TRAIN_NATURALS, replace=False).tolist()
    train_whites = rng.choice(whites, N_TRAIN_WHITES, replace=False).tolist()

    train[subject_id] = train_naturals + train_whites

    # the remaining images will be used for testing
    test_naturals = [x for x in naturals if x not in train_naturals]
    test_whites = [x for x in whites if x not in train_whites]

    test[subject_id] = test_naturals + test_whites

# total number of train / test samples over all identities
print(sum([len(x) for x in train.values()]))
print(sum([len(x) for x in test.values()]))

704
320


In [None]:
# now create the subfolders "train" and "test" inside "DatasetFinal" and copy the images in the correct folder

# create the folders (os.mkdir raises FileExistsError if one is already there)
for folder in ("DatasetFinal", "DatasetFinal/train", "DatasetFinal/test"):
    os.mkdir(folder)

# copy every image of each split from "LivescanUnified" into its subfolder;
# the originals stay in place
for split_dir, split in (("DatasetFinal/train/", train), ("DatasetFinal/test/", test)):
    for file_names in split.values():
        for img in file_names:
            shutil.copy("LivescanUnified/" + img, split_dir + img)


## 3. Split both train and test sets in probe and gallery
For the test set, this split is repeated 5 times in order to have statistically reliable scores.

In [None]:
# first, rename the images in the Dataset folder

def _short_name(old_name):
    """Return the shortened file name built from fields 0, 3 and 4 of `old_name`.

    "10_o_1_n_1.png" becomes "10_n_1.png". A name that already has only three
    "_"-separated fields is returned unchanged, so the cell can be re-run
    without raising IndexError.
    """
    infos = old_name.split('_')
    if len(infos) == 3:
        return old_name
    return infos[0] + '_' + infos[3] + '_' + infos[4]


def _rename_split(folder, split):
    """Rename the files of `split` inside `folder` and update `split` in place.

    `split` maps an identity to its list of file names; after the call every
    list holds the shortened names.
    """
    for subject_id in split:
        new_names = []
        for old_name in split[subject_id]:
            new_name = _short_name(old_name)
            if new_name != old_name:
                os.rename(os.path.join(folder, old_name), os.path.join(folder, new_name))
            new_names.append(new_name)

        split[subject_id] = new_names


_rename_split("DatasetFinal/train", train)
_rename_split("DatasetFinal/test", test)

train

{'10': ['10_n_1.png',
  '10_n_2.png',
  '10_n_7.png',
  '10_n_6.png',
  '10_n_4.png',
  '10_n_3.png',
  '10_w_1.png',
  '10_w_2.png',
  '10_w_8.png',
  '10_w_5.png',
  '10_w_4.png'],
 '18': ['18_n_7.png',
  '18_n_2.png',
  '18_n_6.png',
  '18_n_1.png',
  '18_n_3.png',
  '18_n_4.png',
  '18_w_7.png',
  '18_w_2.png',
  '18_w_6.png',
  '18_w_4.png',
  '18_w_8.png'],
 '35': ['35_n_2.png',
  '35_n_5.png',
  '35_n_7.png',
  '35_n_4.png',
  '35_n_3.png',
  '35_n_6.png',
  '35_w_7.png',
  '35_w_8.png',
  '35_w_2.png',
  '35_w_3.png',
  '35_w_6.png'],
 '1': ['1_n_2.png',
  '1_n_4.png',
  '1_n_1.png',
  '1_n_5.png',
  '1_n_6.png',
  '1_n_8.png',
  '1_w_3.png',
  '1_w_7.png',
  '1_w_2.png',
  '1_w_8.png',
  '1_w_6.png'],
 '23': ['23_n_5.png',
  '23_n_1.png',
  '23_n_2.png',
  '23_n_8.png',
  '23_n_3.png',
  '23_n_4.png',
  '23_w_2.png',
  '23_w_8.png',
  '23_w_5.png',
  '23_w_3.png',
  '23_w_1.png'],
 '57': ['57_n_3.png',
  '57_n_8.png',
  '57_n_1.png',
  '57_n_4.png',
  '57_n_7.png',
  '57_n_2.p

In [None]:
# split train set into probe and gallery
N_GALLERY_NATURALS = 3  # 3 of the 6 natural train images go to the gallery
N_GALLERY_WHITES = 3    # 3 of the 5 white train images go to the gallery

# Fixed seed so that "Restart & Run All" reproduces exactly the same split.
rng = np.random.default_rng(43)

train_probe = {}
train_gallery = {}

# Sorted iteration keeps the draw independent of the insertion order of `train`.
for subject_id in sorted(train):

    naturals = sorted(x for x in train[subject_id] if '_n_' in x) #6
    whites = sorted(x for x in train[subject_id] if '_n_' not in x) #5

    # sample without replacement; .tolist() gives plain Python strings
    gallery_naturals = rng.choice(naturals, N_GALLERY_NATURALS, replace=False).tolist()
    gallery_whites = rng.choice(whites, N_GALLERY_WHITES, replace=False).tolist()

    train_gallery[subject_id] = gallery_naturals + gallery_whites

    # the remaining images will be used for the probe set
    probe_naturals = [x for x in naturals if x not in gallery_naturals]
    probe_whites = [x for x in whites if x not in gallery_whites]

    train_probe[subject_id] = probe_naturals + probe_whites

    assert len(train_gallery[subject_id]) == 6 and len(train_probe[subject_id]) == 5, f"ERROR IN PROBE VS GALLERY DIVISION FOR SAMPLE {subject_id}"

print(sum([len(x) for x in train_gallery.values()]) + sum([len(x) for x in train_probe.values()]), "total number of samples in train set")

704 total number of samples in train set


In [None]:
# check that there are no duplicates between probe and gallery, for every identity
for subject_id in train_probe:
    shared = set(train_probe[subject_id]) & set(train_gallery[subject_id])
    assert not shared, f"PROBE AND GALLERY OVERLAP FOR SAMPLE {subject_id}: {sorted(shared)}"

# show one identity as an example
print(train_probe["22"])
print(train_gallery["22"])

['22_n_6.png', '22_n_2.png', '22_n_5.png', '22_w_2.png', '22_w_1.png']
['22_n_7.png', '22_n_3.png', '22_n_4.png', '22_w_6.png', '22_w_4.png', '22_w_5.png']


In [None]:
# dump train_probe and train_gallery in a json file, under the keys "probe" and "gallery"
train_split = {"probe": train_probe, "gallery": train_gallery}

with open("DatasetFinal/train/probe_gallery.json", "w") as out_file:
    json.dump(train_split, out_file)


In [None]:
# now split test set into probe and gallery
N_SPLITS = 5            # make 5 different splits for test reliability
N_GALLERY_NATURALS = 1  # 1 of the 2 natural test images goes to the gallery
N_GALLERY_WHITES = 2    # 2 of the 3 white test images go to the gallery

# Fixed seed so that "Restart & Run All" reproduces exactly the same 5 splits.
# One generator is shared by all splits, so each split draws different samples.
rng = np.random.default_rng(44)

to_dump = {}

for i in range(1, N_SPLITS + 1):
    test_probe = {}
    test_gallery = {}

    # Sorted iteration keeps the draw independent of the insertion order of `test`.
    for subject_id in sorted(test):
        naturals = sorted(x for x in test[subject_id] if '_n_' in x) #2
        whites = sorted(x for x in test[subject_id] if '_n_' not in x) #3

        # sample without replacement; .tolist() gives plain Python strings
        gallery_naturals = rng.choice(naturals, N_GALLERY_NATURALS, replace=False).tolist()
        gallery_whites = rng.choice(whites, N_GALLERY_WHITES, replace=False).tolist()

        test_gallery[subject_id] = gallery_naturals + gallery_whites

        # the remaining images will be used for the probe set
        probe_naturals = [x for x in naturals if x not in gallery_naturals]
        probe_whites = [x for x in whites if x not in gallery_whites]

        test_probe[subject_id] = probe_naturals + probe_whites

        assert len(test_gallery[subject_id]) == 3 and len(test_probe[subject_id]) == 2, f"ERROR IN PROBE VS GALLERY DIVISION FOR SAMPLE {subject_id}"

    print(sum([len(x) for x in test_gallery.values()]) + sum([len(x) for x in test_probe.values()]), "total number of samples in test set split", i)

    to_dump[i] = {"probe": test_probe, "gallery": test_gallery}

# now dump the 5 splits in a json file
# (json.dump writes the integer keys 1..5 as the strings "1".."5")
with open("DatasetFinal/test/probe_gallery.json", "w") as f:
    json.dump(to_dump, f)

320 total number of samples in test set split 1
320 total number of samples in test set split 2
320 total number of samples in test set split 3
320 total number of samples in test set split 4
320 total number of samples in test set split 5


=====================================================================================================================================

In [None]:
# EXAMPLE ON HOW TO PARSE THE PROBE VS GALLERY DIVISION FOR TRAIN SET

with open("DatasetFinal/train/probe_gallery.json") as json_file:
    data = json.load(json_file)

probe, gallery = data["probe"], data["gallery"]

# each part maps an identity (string key) to its list of image names;
# identity "38" is printed as an example
for part in (probe, gallery):
    print(part.keys())
    print(part["38"])

dict_keys(['10', '18', '35', '1', '23', '57', '8', '16', '45', '25', '46', '15', '31', '47', '7', '59', '13', '21', '29', '38', '17', '60', '11', '52', '40', '44', '12', '4', '64', '26', '37', '48', '32', '2', '14', '27', '62', '3', '24', '61', '34', '41', '28', '50', '6', '42', '55', '49', '51', '19', '9', '56', '36', '22', '5', '30', '43', '53', '58', '54', '63', '39', '33', '20'])
['38_n_2.png', '38_n_4.png', '38_n_3.png', '38_w_5.png', '38_w_6.png']
dict_keys(['10', '18', '35', '1', '23', '57', '8', '16', '45', '25', '46', '15', '31', '47', '7', '59', '13', '21', '29', '38', '17', '60', '11', '52', '40', '44', '12', '4', '64', '26', '37', '48', '32', '2', '14', '27', '62', '3', '24', '61', '34', '41', '28', '50', '6', '42', '55', '49', '51', '19', '9', '56', '36', '22', '5', '30', '43', '53', '58', '54', '63', '39', '33', '20'])
['38_n_5.png', '38_n_6.png', '38_n_7.png', '38_w_7.png', '38_w_4.png', '38_w_3.png']


In [None]:
# EXAMPLE ON HOW TO PARSE THE PROBE VS GALLERY DIVISION FOR TEST SET

with open("DatasetFinal/test/probe_gallery.json") as json_file:
    data = json.load(json_file)

for i in range(1,6):
    # take probe and gallery of the i-th split (the file stores them under string keys)
    split = data[str(i)]
    probe, gallery = split["probe"], split["gallery"]

    # DO YOUR TESTS HERE
    # ....
    # APPEND YOUR RESULTS TO A LIST

    # identity "38" is printed as an example
    for part in (probe, gallery):
        print(part.keys())
        print(part["38"])
    print()

# PLOT AVERAGE METRICS

dict_keys(['10', '18', '35', '1', '23', '57', '8', '16', '45', '25', '46', '15', '31', '47', '7', '59', '13', '21', '29', '38', '17', '60', '11', '52', '40', '44', '12', '4', '64', '26', '37', '48', '32', '2', '14', '27', '62', '3', '24', '61', '34', '41', '28', '50', '6', '42', '55', '49', '51', '19', '9', '56', '36', '22', '5', '30', '43', '53', '58', '54', '63', '39', '33', '20'])
['38_n_1.png', '38_w_2.png']
dict_keys(['10', '18', '35', '1', '23', '57', '8', '16', '45', '25', '46', '15', '31', '47', '7', '59', '13', '21', '29', '38', '17', '60', '11', '52', '40', '44', '12', '4', '64', '26', '37', '48', '32', '2', '14', '27', '62', '3', '24', '61', '34', '41', '28', '50', '6', '42', '55', '49', '51', '19', '9', '56', '36', '22', '5', '30', '43', '53', '58', '54', '63', '39', '33', '20'])
['38_n_8.png', '38_w_8.png', '38_w_1.png']

dict_keys(['10', '18', '35', '1', '23', '57', '8', '16', '45', '25', '46', '15', '31', '47', '7', '59', '13', '21', '29', '38', '17', '60', '11', '52', '