In [1]:
# System Libs
import multiprocessing as mp
import sys
import os
from glob import glob
from time import time
from pathlib import Path

# Other Libs
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import cv2
import re
from PIL import Image
from tqdm.notebook import tqdm

# Local Libs
# None

In [2]:
# Path Setting
# Project Dir
dir_project = Path('/opt/ml')

# Code Dir
dir_code = dir_project.joinpath('code')

# Code Data
dir_data = dir_project.joinpath('input/data')
dir_eval = dir_data.joinpath('eval')
dir_train = dir_data.joinpath('train')

In [3]:
def get_ext(path, file_name):
    """
    입력받은 경로(파일)의 확장자 반환

    Args:
    
        path: 경로
    Return:
        exts: 확장자
    """ 
    file_list = os.listdir(path)
    
    return path.split('.')[-1]


def get_img_paths(path):
    """
    Get full path list under entered path.

    Args:
        path: (string or path object) path.
    Returns:
        paths: (list) full path of image files under path.
    """
    img_exts = ['jpg', 'jpeg', 'png']
    file_list = os.listdir(path)

    paths = []
    for f in file_list:
        if f[0] == '.':
            continue
        if '.' not in f[1:]:
            continue
        if f.split('.')[-1] not in img_exts:
            continue
        paths.append(Path(path).joinpath(f))

    return paths


In [4]:
def show_from_path(path, n_row, n_col):
    """
    path 내의 이미지 파일을 출력

    Args:
        path
    """
    paths = get_img_paths(path)
    imgs = [np.array(Image.open(p)) for p in paths]
    if n_row*n_col < len(imgs):
        raise ValueError

    figsize = (n_col*5, n_row*6)
    fig, axes = plt.subplots(n_row, n_col, sharex=True, sharey=True, figsize=figsize)
    for i, img in enumerate(imgs):
        idx_col = i%n_col
        idx_row = i//n_col
        title = (str(paths[i]).split('/')[-1]).split('.')[0] # ./title.jpg -> title
        axes[idx_row][idx_col].imshow(img)
        axes[idx_row][idx_col].set_title(title, color='r')
    plt.tight_layout()
    plt.show()
    

<br><br><br><br><br>


## PREPROCESS
---

In [5]:
train_df = pd.read_csv(dir_train.joinpath('train.csv'), header=0)
dir_image = dir_train.joinpath('images')

In [6]:
noise_gender_ids = ['006359', '006360', '006361', '006362', '006363', '006364']
noise_mask_ids = ['000020', '004418', '005227']

noise_gender = train_df[train_df.id.isin(noise_gender_ids)]
noise_mask = train_df[train_df.id.isin(noise_mask_ids)]

In [7]:
from itertools import product

image_list = ['mask1', 'mask2', 'mask3', 'mask4', 'mask5', 'normal', 'incorrect_mask']
images = pd.DataFrame(image_list, columns=['file'])

train_df['key'] = 0
images['key'] = 0

train_df = train_df.merge(images, how='outer').drop(columns=['key'])

In [8]:
def attach_ext(path):
    file_list = os.listdir(path.parent)
    file_list = [f for f in file_list if bool(re.match(f"{path.name}+", f))]

    if len(file_list) == 1:
        return path.parent.joinpath(file_list[0])
    else:
        raise ValueError(f"Matched file exist {len(file_list)}")


def get_full_path(row):
    folder_name = row.path
    file_name = row.file
    full_path = dir_train.joinpath('images').joinpath(folder_name).joinpath(file_name)
    full_path = attach_ext(full_path)

    return full_path

In [9]:
full_path = train_df.T.apply(lambda row: get_full_path(row)).T

train_df['full_path'] = full_path
train_df.head(1)

Unnamed: 0,id,gender,race,age,path,file,full_path
0,1,female,Asian,45,000001_female_Asian_45,mask1,/opt/ml/input/data/train/images/000001_female_...


### NOISE LABEL CHECK
---

In [10]:
## Print
# for idx, row in noise_gender.iterrows():
#     path = dir_image.joinpath(row['path'])
#     show_from_path(path, 2, 4)

In [11]:
# # Print
# for idx, row in noise_mask.iterrows():
#     path = dir_image.joinpath(row['path'])
#     show_from_path(path, 2, 4)

### GENERATE LABEL
---


In [12]:
import re

def label_mask(file_name):
    if bool(re.match("mask[0-9]+", file_name)):
        return ("Wear", 0)
    elif bool(re.match("incorrect_mask+", file_name)):
        return ("Incorrect", 6)
    elif bool(re.match("normal+", file_name)):
        return ("Not Wear", 12)
    else:
        raise ValueError(f"Invalid Mask Value {file_name}")

def label_age(age):
    age = int(age)
    if age < 30:
        return ("< 30", 0)
    elif age < 60:
        return (">= 30 and < 60", 1)
    elif age >= 60:
        return (">= 60", 2)
    else:
        raise ValueError(f"Invalid Age Value {age}")

def label_gender(gender):
    if gender.upper() == "MALE":
        return ("Male", 0)
    elif gender.upper() == "FEMALE":
        return ("Female", 3)
    else:
        raise ValueError(f"Invalid Gender Value {gender}")

def generate_label(row):
    names = ["Mask", "Age", "Gender", "Class"]

    mask = label_mask(row.file)
    age = label_age(row.age)
    gender = label_gender(row.gender)
    label = mask[1]+age[1]+gender[1]

    label = pd.Series([mask[0], age[0], gender[0], label], index=names)
    return label


In [13]:
label = train_df.T.apply(lambda x: generate_label(x)).T
train_df = train_df[['id', 'full_path']]
train_df = pd.concat([train_df, label], axis=1)

### CORRECT LABEL
---


Gender Label

In [14]:
# Female To Male
noise_gender

Unnamed: 0,id,gender,race,age,path
2399,6359,female,Asian,18,006359_female_Asian_18
2400,6360,female,Asian,18,006360_female_Asian_18
2401,6361,female,Asian,18,006361_female_Asian_18
2402,6362,female,Asian,18,006362_female_Asian_18
2403,6363,female,Asian,18,006363_female_Asian_18
2404,6364,female,Asian,18,006364_female_Asian_18


In [15]:
def correct_gender_noise(df, ids, correct_value='Male', inplace=False):
    if not inplace:
        df = df.copy()

    correct_req = train_df.loc[(train_df.id.isin(ids)) & (train_df.Gender != correct_value)]
    correct_n = correct_req.index.size
    print(f"{correct_n} label corrected - from {correct_req.Gender.unique()} to {correct_value}.")

    if correct_n > 0:
        _correct_class = -3 if correct_value == 'Male' else 3
        df.loc[correct_req.index, 'Gender'] = correct_value
        df.loc[correct_req.index, 'Class'] = df.loc[correct_req.index, 'Class'] + _correct_class

    return df


In [16]:
train_df = correct_gender_noise(train_df, noise_gender_ids)

42 label corrected - from ['Female'] to Male.


<br>

Mask Label

In [17]:
# Swap 'noraml' and 'incorrect'
noise_mask

Unnamed: 0,id,gender,race,age,path
15,20,female,Asian,50,000020_female_Asian_50
1899,4418,male,Asian,20,004418_male_Asian_20
2068,5227,male,Asian,22,005227_male_Asian_22


In [18]:
correct_req = train_df.loc[(train_df.id.isin(noise_mask_ids)) & (train_df.Mask.isin(['Not Wear', 'Incorrect']))]
to_incorrect = correct_req[correct_req.Mask == 'Not Wear']
to_notwear = correct_req[correct_req.Mask == 'Incorrect']

# Not Wear → Incorrect
# Label -6
train_df.loc[to_incorrect.index, 'Mask'] = 'Incorrect'
train_df.loc[to_incorrect.index, 'Class'] = train_df.loc[to_incorrect.index, 'Class'] - 6

# Incorrect → Not Wear
# Label +6
train_df.loc[to_notwear.index, 'Mask'] = 'Not Wear'
train_df.loc[to_notwear.index, 'Class'] = train_df.loc[to_notwear.index, 'Class'] + 6

In [19]:
train_df = train_df.rename(columns=dict(full_path='path'))
train_df.to_csv(dir_train.joinpath('train_processed.csv'))
train_df.head(3)

Unnamed: 0,id,path,Mask,Age,Gender,Class
0,1,/opt/ml/input/data/train/images/000001_female_...,Wear,>= 30 and < 60,Female,4
1,1,/opt/ml/input/data/train/images/000001_female_...,Wear,>= 30 and < 60,Female,4
2,1,/opt/ml/input/data/train/images/000001_female_...,Wear,>= 30 and < 60,Female,4


<br>

# <center> - END - </center>

In [32]:
print(train_df[['Mask', 'Age', 'Gender', 'Class']].describe())

print("MASK", train_df['Mask'].value_counts(), sep='\n', end='\n\n')
print("Age", train_df['Age'].value_counts(), sep='\n', end='\n\n')
print("Gender", train_df['Gender'].value_counts(), sep='\n', end='\n\n')
print("Class", train_df['Class'].value_counts(), sep='\n', end='\n\n')


         Mask    Age  Gender  Class
count   18900  18900   18900  18900
unique      3      3       2     18
top      Wear   < 30  Female      4
freq    13500   8967   11564   4085
MASK
Wear         13500
Incorrect     2700
Not Wear      2700
Name: Mask, dtype: int64

Age
< 30              8967
>= 30 and < 60    8589
>= 60             1344
Name: Age, dtype: int64

Gender
Female    11564
Male       7336
Name: Gender, dtype: int64

Class
4     4085
3     3630
0     2775
1     2050
10     817
16     817
15     726
9      726
12     555
6      555
5      545
2      415
13     410
7      410
17     109
11     109
14      83
8       83
Name: Class, dtype: int64

