In [1]:
import pandas as pd
import numpy as np  
import matplotlib.pyplot as plt
import seaborn as sns
import os
import PIL.Image as Image
from dataclasses import dataclass
import torch 
import cv2
import PIL
from glob import glob
from os import listdir
from os.path import isfile, join
import random
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

from keras.preprocessing.image import ImageDataGenerator
from keras.models import Sequential
from keras.layers import Dense, Conv2D, MaxPool2D , Flatten

import tensorflow as tf

## Data Files 

**· img_align_celeba.zip:** All the face images, cropped and aligned

**· list_attr_celeba.csv:** Attribute labels for each image. There are 40 attributes. "1" represents positive while "-1" represents negative  

**· list_eval_partition.csv:** Recommended partitioning of images into training, validation, testing sets. Images 1-162770 are training, 162771-182637 are validation, 182638-202599 are testing

**· list_bbox_celeba.csv:** Bounding box information for each image. "x_1" and "y_1" represent the upper left point coordinate of bounding box. "width" and "height" represent the width and height of bounding box

**· list_landmarks_align_celeba.csv:** Image landmarks and their respective coordinates. There are 5 landmarks: left eye, right eye, nose, left mouth, right mouth

In [2]:
!pwd

/content


In [3]:
# Load the CelebA metadata tables.
# The dataset root can be overridden with the CELEBA_DATA_DIR environment variable so the
# notebook is not tied to a single machine; the default is the original location.
DATA_DIR = os.environ.get('CELEBA_DATA_DIR', '/nfs/home/ikq2724/deep_learning/project/data')

# Attribute labels: indexed by the first CSV column, with every -1 replaced by 0.
attributes_df = pd.read_csv(os.path.join(DATA_DIR, 'list_attr_celeba.csv'), index_col=0).replace([-1], 0)
# Bounding boxes, landmark coordinates and the recommended train/val/test partition.
bboxes_df = pd.read_csv(os.path.join(DATA_DIR, 'list_bbox_celeba.csv'))
landmarks_df = pd.read_csv(os.path.join(DATA_DIR, 'list_landmarks_align_celeba.csv'))
eval_df = pd.read_csv(os.path.join(DATA_DIR, 'list_eval_partition.csv'))

FileNotFoundError: ignored

## Exploratory Data Analysis

## Attributes Table

In [None]:
attributes_df

In [None]:
attributes_df.columns

In [None]:
# Summarize data

# Record counts
print("Record count: ", len(attributes_df))

In [None]:
# Missing values
print("Missing values: \n", attributes_df.isnull().sum())


In [None]:
# Schema
print("Schema: \n", attributes_df.dtypes)

In [None]:
attributes_df.describe() 

In [None]:
attributes_df['Wearing_Hat'].value_counts()

In [None]:
# Horizontal count plot of the 'Male' attribute column.

fig, ax = plt.subplots(figsize=(10, 10))
sns.countplot(data=attributes_df, y='Male', ax=ax)
ax.set_title('Countplot of Male and Female')
ax.set_xlabel('Count of persons')
ax.set_ylabel('Male VS Female')

In [None]:
# Show attribute distributions as a grid of pie charts.

def show_pies(df = attributes_df):
    """Draw pie charts for a selection of attribute columns of ``df``.

    The charts are placed on a 3x3 subplot grid (slots 1-6 and 8; slot 7 is
    left empty). Two kinds of chart are drawn:

    * a list of columns  -> one wedge per column, sized by the column sum;
    * a single column    -> two wedges: the column sum versus the remaining
      rows (``len(df) - sum``), labelled ``<name>`` and ``non-<name>``.

    Parameters
    ----------
    df : DataFrame
        Frame holding the attribute columns referenced below. Defaults to the
        ``attributes_df`` that exists when this function is defined.
    """
    heading_style = dict(fontsize = 20, fontweight = 20, color = 'gray')
    wedge_text_style = dict(fontsize = 16)

    # (subplot slot, column name or list of column names, chart title, seaborn palette)
    charts = [
        (1, ['Blond_Hair', 'Brown_Hair', 'Gray_Hair', 'Black_Hair'],
         'Hairs Color Distribution', 'twilight'),
        (2, 'Attractive', 'Attractiveness Distribution', 'rocket'),
        (3, 'Male', 'Gender Distribution', 'tab10'),
        (4, 'Smiling', 'Smiles Distribution', 'tab20'),
        (5, 'Young', 'Youngness Distribution', 'viridis'),
        (6, ['Wavy_Hair', 'Straight_Hair', 'Receding_Hairline', 'Bald', 'Bangs'],
         'Hairs Types Distribution', 'vlag'),
        (8, 'Pale_Skin', 'Pale Skin Distribution', 'flare'),
    ]

    plt.figure(figsize = (25, 25))
    plt.subplots_adjust(wspace = 0.4, hspace = 0.2)

    for slot, columns, heading, palette in charts:
        if isinstance(columns, str):
            # Binary chart: positives versus everything else, starting at 0 degrees.
            positives = df[columns].sum()
            wedge_sizes = [positives, len(df) - positives]
            wedge_names = [columns, f'non-{columns}']
            first_angle = 0
        else:
            # Categorical chart: one wedge per listed column, starting at 90 degrees.
            wedge_sizes = [df[column].sum() for column in columns]
            wedge_names = columns
            first_angle = 90

        plt.subplot(3, 3, slot)
        plt.title(heading, fontdict = heading_style)
        plt.pie(
            wedge_sizes,
            labels = wedge_names,
            autopct = '%1.1f%%',
            shadow = True,
            startangle = first_angle,
            textprops = wedge_text_style,
            colors = sns.color_palette(palette),
        )

    plt.show()
show_pies()

## BBoxes

In [None]:
bboxes_df

In [None]:
# Summarize data

# Record counts
print("Record count: ", len(bboxes_df))

In [None]:
# Missing values

print("Missing values: \n", bboxes_df.isnull().sum())

In [None]:
# Schema
print("Schema: \n", bboxes_df.dtypes)

In [None]:
bboxes_df.describe()

In [None]:
# Histogram of the bounding-box 'width' column, x-axis limited to 0-1200.
fig, ax = plt.subplots()
ax.hist(bboxes_df['width'], bins=20)
ax.set_title('Histogram of width')
ax.set_xlabel('width')
ax.set_ylabel('Frequency')
ax.set_xlim(0, 1200)
plt.show()

In [None]:
# Histogram of the bounding-box 'height' column, x-axis limited to 0-1600.
fig, ax = plt.subplots()
ax.hist(bboxes_df['height'], bins=20)
ax.set_title('Histogram of height')
ax.set_xlabel('height')
ax.set_ylabel('Frequency')
ax.set_xlim(0, 1600)
plt.show()

## Landmarks

In [None]:
landmarks_df

In [None]:
landmarks_df.columns

In [None]:
# Summarise the landmarks table: row count, total missing cells, and per-column dtypes.
record_count = landmarks_df.shape[0]
missing_count = landmarks_df.isna().sum().sum()
schema = landmarks_df.dtypes.to_dict()

# Build the report line by line, then emit it in one go.
report_lines = [
    f"Record count: {record_count}",
    f"Missing value count: {missing_count}",
    "Schema:",
]
report_lines.extend(f"\t{col_name}: {dtype}" for col_name, dtype in schema.items())
print("\n".join(report_lines))

In [None]:
landmarks_df.describe() 

In [None]:
# Scatter plot of where each facial landmark falls across all images.

def show_landmarks_distribution(df = landmarks_df):
    """Overlay the five landmark point clouds of ``df`` on one scatter plot.

    For every landmark the ``<prefix>_x`` / ``<prefix>_y`` columns are plotted
    with their own colour and marker. The y-axis is inverted before plotting.

    Parameters
    ----------
    df : DataFrame
        Frame with the ``*_x`` / ``*_y`` landmark columns. Defaults to the
        ``landmarks_df`` that exists when this function is defined.
    """
    # (column prefix, colour, marker, legend label) in drawing order
    layers = (
        ('righteye',   'b', 'o', 'righteye'),
        ('lefteye',    'r', 'o', 'lefteye'),
        ('nose',       'm', 'v', 'nose'),
        ('leftmouth',  'c', 'x', 'left_mouth'),
        ('rightmouth', 'g', 'x', 'right_mouth'),
    )

    sns.set_style("darkgrid")
    plt.figure(figsize = (10, 10))
    plt.gca().invert_yaxis()

    for prefix, colour, glyph, legend_name in layers:
        plt.scatter(
            x = df[f'{prefix}_x'].tolist(),
            y = df[f'{prefix}_y'].tolist(),
            s = 10,
            c = colour,
            marker = glyph,
            label = legend_name,
        )

    plt.legend(loc='best', ncol = 5)
    plt.show()


show_landmarks_distribution()

## Eval

In [None]:
eval_df

In [None]:
# Summarise the partition table: row count, total missing cells, and per-column dtypes.
record_count = eval_df.shape[0]
missing_count = eval_df.isna().sum().sum()
schema = eval_df.dtypes.to_dict()

# Build the report line by line, then emit it in one go.
report_lines = [
    f"Record count: {record_count}",
    f"Missing value count: {missing_count}",
    "Schema:",
]
report_lines.extend(f"\t{col_name}: {dtype}" for col_name, dtype in schema.items())
print("\n".join(report_lines))


In [None]:
# see value counts
eval_df['partition'].value_counts()

In [None]:
# Plot a histogram of the 'partition' column (which split each image is assigned to).
plt.hist(eval_df['partition'], bins=20)
plt.title('Histogram of partition')
plt.xlabel('partition')
plt.ylabel('Frequency')
plt.show()

## Image

### See example Images and their attributes

In [None]:
# Image folders. The dataset root can be overridden with the CELEBA_DATA_DIR
# environment variable; the default is the original location.
# The trailing '' keeps a trailing path separator, because file names are
# appended to `images_folder` by plain string concatenation in later cells.
main_folder = os.path.join(
    os.environ.get('CELEBA_DATA_DIR', '/nfs/home/ikq2724/deep_learning/project/data'),
    'img_align_celeba',
    '',
)
images_folder = os.path.join(main_folder, 'img_align_celeba', '')

In [None]:
# plot image
#IMAGE_SHAPE = (224, 224)
image_example1 = images_folder + '020007.jpg'
#image_example = Image.open(image_example).resize(IMAGE_SHAPE)
see_example1 = Image.open(image_example1)
see_example1

In [None]:
attributes_df.loc[image_example1.split('/')[-1]][['Smiling','Male','Young']]

In [None]:
image_example2 = images_folder + '028809.jpg'
see_example2 = Image.open(image_example2)
see_example2

In [None]:
attributes_df.loc[image_example2.split('/')[-1]][['Smiling','Male','Young']]

In [None]:
# Dataset root (override with the CELEBA_DATA_DIR environment variable; the
# default is the original location), plus the image folder and attribute CSV under it.
BASIC_PATH = os.environ.get('CELEBA_DATA_DIR', "/nfs/home/ikq2724/deep_learning/project/data/")
IMG_PATH = os.path.join(BASIC_PATH,'img_align_celeba/img_align_celeba')
FEATURE_PATH = os.path.join(BASIC_PATH,'list_attr_celeba.csv')

In [None]:
# Read only the image id and the 'Male' attribute from the attributes CSV.
df = pd.read_csv(FEATURE_PATH, usecols=['image_id','Male'])
# Work on a fixed random subset of 2,500 rows (seeded, index renumbered from 0).
df = df.sample(n=2500, random_state = 42).reset_index(drop=True)

# Turn the numeric flag into string class names. `map` builds a new string column
# instead of writing strings into an integer column through `.loc`, which pandas
# deprecates as an incompatible-dtype assignment.
# NOTE(review): assumes 'Male' only holds -1 and 1 (as the data description states);
# any other value would become NaN here.
df['Male'] = df['Male'].map({-1: "Female", 1: "Male"})

# Rename by column name rather than by position, then fix the column order.
df = df.rename(columns={"Male": "Gender"})[["image_id", "Gender"]]

df.head(10)

In [None]:
# Display the first six sampled images on a 2x3 grid, titled with their array shape.
for i in range(0, 6):
    plt.subplot(2, 3, i+1)

    # Positional lookup (.iloc): still selects the i-th row after rows have been
    # dropped from df, when label i may no longer exist.
    img_file = IMG_PATH + '/' + df["image_id"].iloc[i]

    # cv2.imread returns None instead of raising when the file cannot be read;
    # fail with a clear message rather than a cryptic cvtColor error.
    img = cv2.imread(img_file)
    if img is None:
        raise FileNotFoundError(f"Could not read image: {img_file}")
    # OpenCV loads images as BGR; matplotlib expects RGB.
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    # Display Image.
    plt.imshow(img)
    plt.title(img.shape)
    plt.axis('off')

plt.tight_layout()
plt.show()

In [None]:
# Count images per gender; value_counts sorts descending, so the first index
# entry is the most frequent class.
category_count = df["Gender"].value_counts()
print(category_count)

higher_category = category_count.index[0]

In [None]:
# Balance the two gender classes by randomly dropping surplus rows of the majority class.
np.random.seed(42)

# Recompute the counts from the current frame so that re-running this cell is a
# no-op once the classes are balanced (surplus becomes 0 and nothing is dropped).
gender_counts = df["Gender"].value_counts()
majority_class = gender_counts.idxmax()
# max - min instead of `counts[0] - counts[1]`: integer keys on a string-indexed
# Series rely on a deprecated positional fallback.
surplus = int(gender_counts.max() - gender_counts.min())

indices = df[df["Gender"] == majority_class].index

# Drop the extra rows of the majority class to fix the class imbalance problem.
drop_sample = np.random.choice(indices, surplus, replace = False)
df = df.drop(drop_sample, axis = "index")


In [None]:
df["Gender"].value_counts().plot.bar()

In [None]:
# Hold out 30% of the rows, then divide that hold-out into test (67%) and validation (33%).
# A fixed random_state makes the splits reproducible across kernel restarts;
# stratifying on 'Gender' keeps the class ratio the same in every split.
train_df, test_df = train_test_split(
    df, test_size=0.3, random_state=42, stratify=df["Gender"]
)
test_df, validation_df = train_test_split(
    test_df, test_size=0.33, random_state=42, stratify=test_df["Gender"]
)

In [None]:
print("Total Train Sample Images : ", len(train_df))
print("Total Test Sample Images : ", len(test_df))
print("Total Validation Sample Images : ", len(validation_df))

In [None]:
IMAGE_SIZE = (218, 178)
BATCH_SIZE = 15

In [None]:
# Training images: pixel values rescaled by 1/255 and randomly augmented
# (rotation, shear, zoom, shifts, horizontal flip).
train_augmentation = dict(
    rescale=1./255,
    rotation_range=15,
    shear_range=0.1,
    zoom_range=0.2,
    width_shift_range=0.1,
    height_shift_range=0.1,
    horizontal_flip=True,
)
train_datagen = ImageDataGenerator(**train_augmentation)

# Batches of (image, label) pairs read from disk; labels come from the 'Gender' column.
train_generator = train_datagen.flow_from_dataframe(
    dataframe=train_df,
    directory=IMG_PATH + "/",
    x_col='image_id',
    y_col='Gender',
    target_size=IMAGE_SIZE,
    batch_size=BATCH_SIZE,
    class_mode='binary',
)

In [None]:
# Validation images: rescaled by 1/255 only, no augmentation.
validation_datagen = ImageDataGenerator(rescale=1./255)

# Batches of (image, label) pairs; labels come from the 'Gender' column.
validation_generator = validation_datagen.flow_from_dataframe(
    dataframe=validation_df,
    directory=IMG_PATH + "/",
    x_col='image_id',
    y_col='Gender',
    target_size=IMAGE_SIZE,
    batch_size=BATCH_SIZE,
    class_mode='binary',
)

In [None]:
# Test images: rescaled by 1/255 only. This iterator yields images without
# labels (y_col/class_mode are None) and keeps the row order of test_df
# (shuffle=False).
test_gen = ImageDataGenerator(rescale=1./255)
test_generator = test_gen.flow_from_dataframe(
    dataframe=test_df,
    directory=IMG_PATH + "/",
    x_col='image_id',
    y_col=None,
    target_size=IMAGE_SIZE,
    batch_size=BATCH_SIZE,
    class_mode=None,
    shuffle=False,
)

In [None]:
# VGG19 with ImageNet weights and without its top layers; the input is
# IMAGE_SIZE with a trailing dimension of 3 appended.
base_vgg_model = tf.keras.applications.vgg19.VGG19(
    include_top=False,
    weights='imagenet',
    input_shape=(*IMAGE_SIZE, 3),
)

In [None]:
base_vgg_model.trainable = False
base_vgg_model.summary()

In [None]:
# Classifier: the VGG19 base followed by a stack of fully connected layers.
vgg_model = Sequential(
    [
    base_vgg_model,
    Flatten(),
    Dense(256,activation='relu'),
    Dense(256,activation='relu'),
    Dense(64,activation='relu'),
    Dense(32, activation='relu'),
    Dense(32, activation='relu'),
    # Two mutually exclusive classes scored with sparse categorical cross-entropy
    # and decoded with argmax: softmax yields one probability distribution over
    # the two units, whereas sigmoid scored each unit independently.
    Dense(2, activation='softmax')
]
)

In [None]:
vgg_model.summary()

In [None]:
vgg_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [None]:
from keras.callbacks import EarlyStopping, ReduceLROnPlateau

# Stop training when the monitored quantity (Keras default) has not improved for 10 epochs.
# NOTE(review): the fit call in this notebook runs 10 epochs, so a patience of 10
# cannot trigger there - lower it if early stopping is actually wanted.
earlystop = EarlyStopping(patience=10)

# Halve the learning rate when validation accuracy stalls for 4 epochs.
# - The model is compiled with metrics=['accuracy'], so the logged key is
#   'val_accuracy'; with 'val_acc' the callback finds no metric and never fires.
# - min_lr must be below the optimizer's starting rate (Adam defaults to 0.001);
#   a floor of 0.001 would clamp every reduction back to the starting value.
learning_rate_reduction = ReduceLROnPlateau(monitor='val_accuracy',
                                            patience=4,
                                            verbose=1,
                                            factor=0.5,
                                            min_lr=1e-5)
callbacks = [earlystop, learning_rate_reduction]

In [None]:
# Train for 10 epochs; each epoch covers the whole-batch portion of the training
# and validation frames (integer division by BATCH_SIZE).
history = vgg_model.fit(
    train_generator,
    steps_per_epoch=len(train_df) // BATCH_SIZE,
    epochs=10,
    validation_data=validation_generator,
    validation_steps=len(validation_df) // BATCH_SIZE,
    callbacks=callbacks,
    verbose=1,
)

In [None]:
vgg_model.save("celeb_vgg_model.h5")

In [None]:
# `test_generator` was built with class_mode=None and therefore yields no labels,
# so loss/accuracy cannot be computed from it. Evaluate on a labelled, unshuffled
# iterator over the same test frame instead.
test_eval_generator = test_gen.flow_from_dataframe(
    dataframe=test_df,
    directory=IMG_PATH + "/",
    x_col='image_id',
    y_col='Gender',
    # Reuse the training label order so class indices match what the model learned.
    classes=list(train_generator.class_indices),
    class_mode='binary',
    target_size=IMAGE_SIZE,
    batch_size=BATCH_SIZE,
    shuffle=False
)
vgg_model.evaluate(test_eval_generator)

In [None]:
# Rewind the iterator so predictions line up with the rows of test_df even if
# the generator was consumed earlier.
test_generator.reset()
# `steps` must be an integer; np.ceil returns a float. Rounding up includes the
# final partial batch, so every test image gets a prediction.
predict = vgg_model.predict(test_generator, steps=int(np.ceil(len(test_df)/BATCH_SIZE)))

In [None]:
# Index of the highest-scoring output unit for every test image.
prediction = predict.argmax(axis=-1)
# Class index 1 is decoded as "Male", anything else as "Female".
# `assign` returns a new frame rather than adding a column to the object returned
# by train_test_split, which avoids pandas' SettingWithCopyWarning.
test_df = test_df.assign(Prediction=np.where(prediction == 1, "Male", "Female"))

In [None]:
test_df

In [None]:
labels = ['Male','Female']

cm = confusion_matrix(test_df['Gender'], test_df["Prediction"], labels= labels)
cm

In [None]:
disp = ConfusionMatrixDisplay(confusion_matrix=cm,display_labels=labels)
disp.plot()
plt.show()