In [1]:
import cv2
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from src.models.HandcraftedModel import HandcraftedModel
from src.DataManager import DataManager

from src.models.Model import IMAGE_INPUT_SIZE
from src.config import IMDB_CROPPED_PATH, IMBD_CROPPED_METADATA_FILENAME

from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

from tqdm import tqdm
import os

In [2]:

df = pd.DataFrame()

with tqdm(total=len(os.listdir('../dataset/utk/crop_part1'))) as pbar:
    for file in os.listdir('../dataset/utk/crop_part1'):
        d = {}
        d["path"] = file
        attr = file.split('_')
        d["age"] = attr[0]
        # 0 male, 1 female
        d["gender"] = attr[1]
        df = df.append(d, ignore_index=True)
        pbar.update(1)

100%|██████████| 9780/9780 [00:13<00:00, 741.93it/s]


In [3]:
df.describe()

Unnamed: 0,path,age,gender
count,9780,9780,9780
unique,9780,99,3
top,100_1_0_20170110183726390.jpg.chip.jpg,1,1
freq,1,1112,5407


In [4]:
def standardize_age(dataset, scaler):
    x = np.expand_dims(dataset['age'], -1)
    scaler.fit(x)
    new_x = scaler.transform(x)
    dataset['age'] = new_x
    return dataset

scaler = MinMaxScaler()
df = standardize_age(df, scaler)

In [11]:
df = df[:1000]
df.describe()

Unnamed: 0,age
count,1000.0
mean,0.122523
std,0.062354
min,0.082569
25%,0.100917
50%,0.119266
75%,0.137615
max,1.0


In [12]:
def split_dataset(dataset):
    train, test = train_test_split(dataset, test_size=0.3)
    train, validation = train_test_split(dataset, test_size=.15)
    return train, validation, test

train, validation, test = split_dataset(df)

In [13]:
X_train = []
y_train = pd.DataFrame(train[["gender", "age"]])
with tqdm(total=len(train)) as pbar:
    for _,x in train.iterrows():
        img = cv2.imread("../dataset/utk/crop_part1/" + x["path"])
        X_train.append(img)
        pbar.update(1)

X_test = []
y_test = pd.DataFrame(test[["gender", "age"]])
with tqdm(total=len(test)) as pbar:
    for _,x in test.iterrows():
        img = cv2.imread("../dataset/utk/crop_part1/" + x["path"])
        X_test.append(img)
        pbar.update(1)

X_val = []
y_val = pd.DataFrame(validation[["gender", "age"]])
with tqdm(total=len(validation)) as pbar:
    for _,x in validation.iterrows():
        img = cv2.imread("../dataset/utk/crop_part1/" + x["path"])
        X_val.append(img)
        pbar.update(1)

100%|██████████| 850/850 [00:00<00:00, 963.91it/s] 
100%|██████████| 300/300 [00:00<00:00, 920.45it/s] 
100%|██████████| 150/150 [00:00<00:00, 977.00it/s] 


In [14]:
# EXTRACTIONS
n_sift = 150
color_hist_bins = 128
lbp_n_points = 24
lbp_radius = 3
handcrafted_model = HandcraftedModel(n_sift, color_hist_bins, lbp_n_points, lbp_radius)

df_train = handcrafted_model.extract_dataset_features(X_train, y_train)
df_val = handcrafted_model.extract_dataset_features(X_val, y_val)
df_test = handcrafted_model.extract_dataset_features(X_test, y_test)

# CLEANING FROM NANS
data_manager = DataManager(IMDB_CROPPED_PATH, IMBD_CROPPED_METADATA_FILENAME, IMAGE_INPUT_SIZE,
                           n_subset=1, normalize_images=False, normalize_age=True)
data_manager.delete_nan_columns(df_train, df_val, df_test)

Extracting dataset features ...


100%|██████████| 850/850 [06:34<00:00,  2.15it/s]
  df["gender"] = y["gender"].values
  df["age"] = y["age"].values


Extracting dataset features ...


100%|██████████| 150/150 [01:07<00:00,  2.23it/s]


Extracting dataset features ...


100%|██████████| 300/300 [02:21<00:00,  2.12it/s]


Deleted a maximum of 129 columns


In [15]:
df_train.shape

(850, 50)

In [16]:
# srotolo
df_train_new = pd.DataFrame()
for _, row in df_train.iterrows():
    d = {}
    i = 0
    for col in range(26):
        d[i] = row[col]
        i = i + 1
    for col in range(26, len(df_train.columns) - 2):
        for el in row[col]:
            d[i] = el
            i = i + 1
    df_train_new = df_train_new.append(d, ignore_index=True)

df_val_new = pd.DataFrame()
for _, row in df_val.iterrows():
    d = {}
    i = 0
    for col in range(26):
        d[i] = row[col]
        i = i + 1
    for col in range(26, len(df_val.columns) - 2):
        for el in row[col]:
            d[i] = el
            i = i + 1
    df_val_new = df_val_new.append(d, ignore_index=True)

df_test_new = pd.DataFrame()
for _, row in df_test.iterrows():
    d = {}
    i = 0
    for col in range(26):
        d[i] = row[col]
        i = i + 1
    for col in range(26, len(df_test.columns) - 2):
        for el in row[col]:
            d[i] = el
            i = i + 1
    df_test_new = df_test_new.append(d, ignore_index=True)

In [17]:
print(df_train_new.shape)
print(df_val_new.shape)
print(df_test_new.shape)

(850, 2842)
(150, 2842)
(300, 2842)


In [18]:
clf = SVC()
clf.fit(df_train_new, df_train["gender"])

SVC()

In [19]:
preds = clf.predict(df_test_new)
acc_test = accuracy_score(df_test["gender"], preds)
conf_mat_test = confusion_matrix(df_test["gender"], preds)
print(acc_test)
print(conf_mat_test)

0.8566666666666667
[[124  21]
 [ 22 133]]


In [20]:
classifier = KNeighborsClassifier(n_neighbors=3,algorithm='brute')
classifier.fit(df_train_new, df_train["gender"])
preds = classifier.predict(df_test_new)
acc_test = accuracy_score(df_test["gender"], preds)
conf_mat_test = confusion_matrix(df_test["gender"], preds)
print(acc_test)
print(conf_mat_test)

0.7333333333333333
[[ 81  64]
 [ 16 139]]
