# Usage Example
---
- DataSet
- processed_train.csv

In [1]:
# System Libs
import multiprocessing as mp
import sys
import os
from glob import glob
from time import time
from pathlib import Path

# Other Libs
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import cv2
from sklearn.metrics import f1_score
from PIL import Image
from tqdm import tqdm

# Torch
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from torchvision.transforms import CenterCrop, Resize, ToTensor, Normalize
# Use the GPU when CUDA is available; otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

# Local Libs
import data_utils
# from data_utils import MaskClassifierDataset

cuda


In [6]:
# Display floating-point values with four decimal places.
pd.set_option('display.float_format', '{:.4f}'.format)

# Load the preprocessed training table; row 0 of the file holds the column names.
data_df = pd.read_csv('processed_train.csv', header=0)

<br>
<br>

# DATASET
---

## Sample Split
---

In [17]:
# Case 1 - split samples per image (criterion column: 'FullPath')
# The mask distribution differs between data, train and valid.
train, valid = data_utils.train_test_split_df(
    data_df,
    crit_col='FullPath',
    test_size=0.1,
)
df_dict = {'data': data_df, 'train': train, 'valid': valid}

# Keep only the 'Ratio' entries (column level 1) of the distribution summary.
data_utils.dist_analysis(df_dict).xs('Ratio', axis=1, level=1)


Unnamed: 0,Unnamed: 1,data,train,valid
Age,< 30,0.4744,0.4726,0.491
Age,>= 30 and < 60,0.4544,0.4566,0.4349
Age,>= 60,0.0711,0.0708,0.0741
Gender,Female,0.6119,0.6138,0.5942
Gender,Male,0.3881,0.3862,0.4058
Mask,Incorrect,0.1429,0.1436,0.1365
Mask,Not Wear,0.1429,0.1434,0.1381
Mask,Wear,0.7143,0.7131,0.7254


In [19]:
# Case 2 - split samples per person, i.e. per folder (criterion column: 'path')
# The mask distribution is identical across data, train and valid.
train, valid = data_utils.train_test_split_df(
    data_df,
    crit_col='path',
    test_size=0.1,
)
df_dict = {'data': data_df, 'train': train, 'valid': valid}

# Keep only the 'Ratio' entries (column level 1) of the distribution summary.
data_utils.dist_analysis(df_dict).xs('Ratio', axis=1, level=1)

Unnamed: 0,Unnamed: 1,data,train,valid
Mask,Wear,0.7143,0.7143,0.7143
Mask,Incorrect,0.1429,0.1429,0.1429
Mask,Not Wear,0.1429,0.1429,0.1429
Age,< 30,0.4744,0.4733,0.4852
Age,>= 30 and < 60,0.4544,0.4523,0.4741
Age,>= 60,0.0711,0.0745,0.0407
Gender,Female,0.6119,0.6074,0.6519
Gender,Male,0.3881,0.3926,0.3481


## Dataset
---

In [22]:
# Train & validation datasets.
# The only preprocessing step is converting each image to a tensor.
transform = transforms.Compose([transforms.ToTensor()])

trainset = data_utils.MaskClassifierDataset(
    train['FullPath'],
    train['Class'],
    transform=transform,
)
validset = data_utils.MaskClassifierDataset(
    valid['FullPath'],
    valid['Class'],
    transform=transform,
)

In [32]:
# Train on the mask-wearing status only.
# Label column = ClassMask
#
# NOTE: this cell used to start by assigning `paths` / `labels` from `mask_df`.
# `mask_df` is only created in a later cell, so the cell failed with NameError
# on a fresh kernel, and neither variable was used here. They are removed.
trainset = data_utils.MaskClassifierDataset(train['FullPath'], train['ClassMask'], transform=transform)
validset = data_utils.MaskClassifierDataset(valid['FullPath'], valid['ClassMask'], transform=transform)


In [34]:
# Extract only the mask-wearing data by dropping the 'Not Wear' rows.
# In this case the label values need to be adjusted appropriately!
mask_df = data_df.loc[~data_df['Mask'].isin(['Not Wear'])]
print(f"Total Size : {len(data_df.index)}")
print(f"Wear Mask : {len(mask_df.index)}")

Total Size : 18900
Wear Mask : 16200


<br><br><br><br><br>

# PROCESSED_TRAIN.CSV
---

## 경로 및 라벨 추출
---

In [7]:
# NOTE(review): `train_df` was never assigned anywhere in this notebook, so this
# cell (and every later one) raised NameError after Restart & Run All.
# It is assumed to be the processed_train.csv table loaded above as `data_df`
# - confirm. Later cells copy the frame before modifying it, so sharing the
# object is safe.
train_df = data_df

# Image paths and the matching class labels.
paths = train_df['FullPath']
labels = train_df['Class']

print(f"label, path example: \nlabel: {labels[0]}\npath: {paths[0]}")

label, path example: 
label: 4
path: /opt/ml/input/data/train/images/000001_female_Asian_45/mask1.jpg


<br><br>
## 경로 변경
---
- 훈련 데이터의 이미지 폴더가 /opt/ml/input/data/train/images가 아닌 경우

In [8]:
from pathlib import Path

In [9]:
def update_full_path(train_df, dir_images):
    """Return a copy of ``train_df`` with 'FullPath' rebuilt under ``dir_images``.

    Each 'FullPath' value becomes ``<dir_images>/<Path>``, where ``<Path>`` is
    the row's value in the 'Path' column.

    Args:
        train_df: DataFrame that has a 'Path' column of string paths.
        dir_images: Folder that contains the images (str or path-like).

    Returns:
        A new DataFrame; the input frame is left unmodified.

    Raises:
        KeyError: If ``train_df`` has no 'Path' column.
    """
    # as_posix() always renders forward slashes. str(Path(...)) renders
    # backslashes on Windows, which produced mixed separators once '/' and the
    # forward-slash 'Path' values were appended.
    prefix = Path(dir_images).as_posix() + '/'

    updated = train_df.copy()
    updated['FullPath'] = prefix + updated['Path']
    return updated

# Example: rebuild 'FullPath' for an image folder stored somewhere else.
dir_images = 'C://user/images'
train_df_updated = update_full_path(train_df=train_df, dir_images=dir_images)

In [10]:
# Image paths (after the folder change) and the matching class labels.
paths, labels = train_df_updated['FullPath'], train_df_updated['Class']

print(f"label, path example: \nlabel: {labels[0]}\npath: {paths[0]}")

label, path example: 
label: 4
path: C:/user/images/000001_female_Asian_45/mask1.jpg


<br><br>
## 칼럼
---
* 기본 데이터
    * 원본 데이터(from train.csv)
        * gender
        * race
        * age
        * path
    * 파일명
        * file - path 내의 파일이름(확장자 포함)
* 추가 데이터
    * Path - 이미지파일 폴더 및 이미지파일 경로(ex. 000002_female_Asian_52/mask.jpg) 
    * FullPath - 이미지 파일 full path
    * Mask 
    * Age 
    * Gender
    * Class (Competition 18개 class) 
    * ClassMask (Mask 독립 class)
    * ClassGender (Gender 독립 class)
    * ClassAge (Age 독립 Class)

In [11]:
# Show the column labels of train_df.
train_df.columns

Index(['id', 'gender', 'race', 'age', 'path', 'file', 'FullPath', 'Path',
       'Mask', 'Age', 'Gender', 'Class', 'ClassMask', 'ClassGender',
       'ClassAge'],
      dtype='object')

In [12]:
# Preview the added category / label columns for the first five rows.
train_df.loc[
    :,
    ['Mask', 'Age', 'Gender', 'Class', 'ClassMask', 'ClassGender', 'ClassAge'],
].head()

Unnamed: 0,Mask,Age,Gender,Class,ClassMask,ClassGender,ClassAge
0,Wear,>= 30 and < 60,Female,4,0,1,1
1,Wear,>= 30 and < 60,Female,4,0,1,1
2,Wear,>= 30 and < 60,Female,4,0,1,1
3,Wear,>= 30 and < 60,Female,4,0,1,1
4,Wear,>= 30 and < 60,Female,4,0,1,1


<br><br>
## Label 변경
---

In [24]:
"""
ClassAge 예시
    변경 전
        < 30 : 0
        >= 30 and < 60 : 1
        >= 60 : 2
    변경 후 
        < 20 : 0
        >= 20 and < 40 : 1
        >= 40 and < 60 : 2
        >= 60 : 3
"""
# Work on a copy so train_df keeps its original ClassAge labels.
age_modified = train_df.copy()
ages = age_modified['age']

# Boolean row selectors for the four new age groups:
# < 20, [20, 40), [40, 60), >= 60.
label_0 = ages.lt(20)
label_1 = ages.ge(20) & ages.lt(40)
label_2 = ages.ge(40) & ages.lt(60)
label_3 = ages.ge(60)

# The position of each selector in the tuple is the new ClassAge value.
for new_label, rows in enumerate((label_0, label_1, label_2, label_3)):
    age_modified.loc[rows, 'ClassAge'] = new_label

In [25]:
"""
변경된 ClassAge 반영하여 전체 Class 업데이트(필요한 경우)
"""
def update_class_value(train_df):
    """Return a copy of ``train_df`` with 'Class' recomputed from the part labels.

    'Class' is a mixed-radix code built from the three independent labels:

        Class = ClassAge
                + age_radix * ClassGender
                + age_radix * gender_radix * ClassMask

    Args:
        train_df: DataFrame with numeric 'ClassAge', 'ClassGender' and
            'ClassMask' columns.

    Returns:
        A new DataFrame; the input frame is left unmodified.
    """
    updated = train_df.copy()

    # Each radix is "largest label + 1", not the number of distinct labels.
    # With the distinct count, a missing label value (e.g. ClassAge values
    # {0, 1, 3}) made the radix too small, so two different combinations
    # collapsed into the same Class. When the labels are contiguous from 0 the
    # two definitions agree.
    age_radix = updated['ClassAge'].max() + 1
    gender_radix = updated['ClassGender'].max() + 1

    updated['Class'] = (
        updated['ClassAge']
        + age_radix * updated['ClassGender']
        + (age_radix * gender_radix) * updated['ClassMask']
    )

    return updated

In [27]:
# Recompute 'Class' from the re-binned ClassAge labels.
age_modified = update_class_value(train_df=age_modified)

# Number of distinct classes: 3 x 2 x 4 = 24
len(age_modified['Class'].unique())

24

<br><br>
## DataSet 예시
---
두 가지 방법을 고려할 수 있습니다.

1. train_df 자체를 넘겨주기
2. path 및 label을 넘겨주기 - 일반적인 경우?

In [16]:
from torch.utils.data import Dataset

- 이 경우, subsample을 Dataset 수준에서 쉽게 뽑아낼 수 있습니다.

# of labels
before : 24
after : 16
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15]
