In [1]:
import os
import torch
import pandas as pd
from skimage import io, transform
import numpy as np
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils
from glob import glob
# Ignore warnings
import warnings
import cv2
#from albumentations import *
import albumentations as A
from matplotlib import pyplot as plt
from PIL import Image
warnings.filterwarnings("ignore")
# from tqdm.notebook import tqdm
import albumentations.pytorch
from IPython.display import Audio

def seed_everything(seed):
    """
    동일한 조건으로 학습을 할 때, 동일한 결과를 얻기 위해 seed를 고정시킵니다.
    
    Args:
        seed: seed 정수값
    """
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # if use multi-GPU
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(seed)
seed_everything(42)

In [2]:
root_dir = '/opt/ml/input/data/train/images'
csv_dir = '/opt/ml/input/data/train/train.csv'
log_dir   = '/opt/ml/input/data/train/log'
test_dir = '/opt/ml/input/data/eval'
test_dir_image='/opt/ml/input/data/eval/images'
for_semi_dir_possi='/opt/ml/input/data/eval/toensemble/submission_second_semi_supervised_learning_wo_argumetation_1.csv'

In [3]:
class maskDataset(Dataset):
    
    def __init__(self, root_dir,csv_dir, is_All=False,is_semi=False,splited_idx=None, transform=None):
        self.images_list = self._load_img_list(root_dir,is_All,is_semi,splited_idx)
        self.dir_without_filename=self._get_only_dir_without_filename()
        self.len=len(self.images_list)
        self.root_dir=root_dir
        self.csv_dir=csv_dir
        self.transform=transform
        self.is_semi=is_semi
        
    def __len__(self):
        return self.len
    
    def __getitem__(self,index):
        
        
        img_path=self.images_list[index]
        img = Image.open(img_path)
        if self.transform:
            img=self.transform(image=np.array(img))["image"]
            
        if not self.is_semi:
            label=self._get_class_idx_from_img_name(img_path)
        else:
            semi_df = pd.read_csv(for_semi_dir_possi)
            condi=semi_df['possi']>6.8
            semi_df = semi_df[condi]
            
            ids=semi_df['ans'].tolist()
            label=ids[index]
        
        return img, label
            
        
    def _get_only_dir_without_filename(self):
        dir_without_filename=[]
        for filename in glob(root_dir+'/*'): 
            dir_without_filename.append(filename)
        
        return dir_without_filename
    
        
    def _load_img_list(self, root_dir,is_All,is_semi,splited_idx):
        full_dir=glob(root_dir+'/*')
        train=[]
        valid=[]
        images_list=[]
        
        dir_without_filename=self._get_only_dir_without_filename()
            
        if is_All :
            for filename in dir_without_filename:
                images_list.extend(glob(filename+'/*')) 
                
        elif is_semi:
            semi_df = pd.read_csv(for_semi_dir_possi)
            condi=semi_df['possi']>6.8
            semi_df = semi_df[condi]
            ids=semi_df['ImageID']
            paths=ids.tolist()
            for each in paths:
                images_list.append(os.path.join(test_dir_image, each))
                
        else:
            for idx in splited_idx:
                images_list.extend(glob(dir_without_filename[idx]+'/*'))   


        return images_list

    def _read_csv(self,csv_dir):
        csv_df = pd.read_csv(csv_dir)
        
        bins_dividers=[0,29,57,61]
        bin_names = ['<30','>=30 and <60','>=60']
        csv_df['age_bin'] = pd.cut(x = csv_df['age'],
                     bins = bins_dividers,   
                     labels = bin_names)

        return csv_df
    
    def _load_img_ID(self, img_path):
        return int(img_path.split('/')[7].split('_')[0])
    
    def _get_class_idx_from_img_name(self,img_name):
        name_list=img_name.split('/')[7:]
        age=int(name_list[0][-2:])
 
        if 'incorrect' in name_list[1]:
            if self.isFemale(name_list[0]):
                if self.under30(age):
                    return 9
                elif self.under60(age):
                    return 10
                else:
                    return 11
            else:
                if self.under30(age):
                    return 6
                elif self.under60(age):
                    return 7
                else:
                    return 8
                
                
        elif 'mask' in name_list[1]:
            
            if self.isFemale(name_list[0]):
                if self.under30(age):
                    return 3
                elif self.under60(age):
                    return 4
                else:
                    return 5
            else:
                if self.under30(age):
                    return 0
                elif self.under60(age):
                    return 1
                else:
                    return 2
            
        elif 'normal' in name_list[1]:
            if self.isFemale(name_list[0]):
                if self.under30(age):
                    return 15
                elif self.under60(age):
                    return 16
                else:
                    return 17
            else:
                if self.under30(age):
                    return 12
                elif self.under60(age):
                    return 13
                else:
                    return 14
        else:
            raise ValueError("%s is not a valid filename. Please change the name of %s." % (img_name, img_path))
            
    def isFemale(self,name_list):
        if 'female' in name_list:
            return True
        else:
            return False

    def under30(self,age):
        if age<30:
            return True
        else:
            return False
        
    def under60(self,age):
        if age<58:
            return True
        else:
            return False
        

In [4]:
def getSplitedLoader(splits):
    train_loaders=[]
    valid_loaders=[]
    for train_idx,valid_idx in splits:
    
        train_dataset = maskDataset(root_dir,csv_dir, splited_idx=train_idx, transform=transform_train)
        train_loader = DataLoader(train_dataset, batch_size=batch_size, pin_memory=True, shuffle=True,num_workers=8)

        valid_dataset = maskDataset(root_dir,csv_dir, splited_idx=valid_idx,transform=transform_valid)
        valid_loader = DataLoader(valid_dataset, batch_size=batch_size, pin_memory=True, shuffle=True,num_workers=8)
        
        train_loaders.append(train_loader)
        valid_loaders.append(valid_loader)
        
    return train_loaders, valid_loaders

    

In [5]:
batch_size = 32
lr = 1e-4
input_size = 224

transform_train = A.Compose([
       A.CenterCrop(300,200,p=0.5),
        A.IAAPerspective(),
        A.Resize(224,224),
        A.Normalize(mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225]),
    
    A.pytorch.ToTensor(),
     
    ])

transform_valid = A.Compose([
    
    A.Resize(224,224),
    A.Normalize(mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225]),
     A.pytorch.ToTensor(),
     
    ])

all_dataset=maskDataset(root_dir,csv_dir,is_All=True, transform=transform_train)



In [6]:
semi_dataset = maskDataset(root_dir,csv_dir, is_semi=True, transform=transform_valid)
semi_loader = DataLoader(semi_dataset, batch_size=batch_size, pin_memory=True, shuffle=True,num_workers=8)

In [7]:
class TestDataset(Dataset):
    def __init__(self, img_paths, transform):
        self.img_paths = img_paths
        self.transform = transform

    def __getitem__(self, index):
        image = Image.open(self.img_paths[index])

        if self.transform:
            image = self.transform(image)
        return image

    def __len__(self):
        return len(self.img_paths)
    
    
   

In [8]:
import torch.nn as nn
import torch.nn.functional as F
class FocalLoss(nn.Module):
    def __init__(self, weight=None,
                 gamma=2., reduction='mean'):
        nn.Module.__init__(self)
        self.weight = weight
        self.gamma = gamma
        self.reduction = reduction

    def forward(self, input_tensor, target_tensor):
        log_prob = F.log_softmax(input_tensor, dim=-1)
        prob = torch.exp(log_prob)
        return F.nll_loss(
            ((1 - prob) ** self.gamma) * log_prob,
            target_tensor,
            weight=self.weight,
            reduction=self.reduction
        )

In [9]:
# Loss function and Optimizer
from efficientnet_pytorch import EfficientNet
from torch.optim import Adam

In [10]:
class AverageMeter(object):
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

In [11]:
from sklearn.metrics import f1_score
from sklearn.model_selection import KFold
from torch.optim.lr_scheduler import CosineAnnealingLR
import time
start=time.time()
print(start)

1617858200.609101


In [12]:
# Main
epochs=5
folds=KFold(n_splits=5,shuffle=False)
splits=[]
for current_fold,(train_idx, valid_idx) in enumerate(folds.split(all_dataset.dir_without_filename)):
    splits.append((train_idx,valid_idx))

train_loaders, valid_loaders= getSplitedLoader(splits)

kfold_num=0
for train_loader, valid_loader in zip(train_loaders, valid_loaders):
    start=time.time()
    kfold_num+=1
    #effiNet모델은 여기에서 선언 변하는 것 외에 차이가 없음
    model = torch.hub.load('pytorch/vision:v0.9.0', 'resnet50', pretrained=True)
    device = torch.device('cuda')
    model.to(device)

    criterion = FocalLoss()
    optimizer = Adam(model.parameters(), lr=lr)
    scheduler = CosineAnnealingLR(optimizer, T_max=2, eta_min=0.)
    
    for epoch in range(epochs):
        print(f"------------------------now {epoch+1}epoch/{kfold_num}kfold------------------------")
        model.train()
        for iter, (img, label) in enumerate(train_loader):

            optimizer.zero_grad()
            img, label = img.float().to(device), label.long().to(device)

            pred_logit = model(img)

            loss = criterion(pred_logit, label)

            loss.backward()
            optimizer.step()

            pred_label = torch.argmax(pred_logit, 1)
            f1=f1_score(label.to('cpu'), pred_label.to('cpu'),average='macro')

            train_loss = loss.item()
            train_f1=f1
            
            #(나름)semi-supervised learning 하는 파트
            #어떻게 하는지 잘 모르겠어서 일단 대충 203번쨰 iter에서 원래 보던 데이터들 말고 eval 데이터를 가져와서 봄
            #valid했더니 성능은 더 떨어져서 그 부분은 없앰 즉 학습만 하고 다시 기존 train dataset으로 넘어감
            if(iter==203):
                for plusiter, (img, label) in enumerate(semi_loader):
                    optimizer.zero_grad()
                    img, label = img.float().to(device), label.long().to(device)

                    pred_logit = model(img)

                    loss = criterion(pred_logit, label)

                    loss.backward()
                    optimizer.step()

                    pred_label = torch.argmax(pred_logit, 1)
                    f1=f1_score(label.to('cpu'), pred_label.to('cpu'),average='macro')

                    train_loss = loss.item()
                    train_f1=f1
                print("Iter [%3d/%3d] | Train Loss %.4f | Train f1 %.4f | Valid Loss %.4f | Valid f1 %.4f" %
                    (plusiter, len(semi_loader), train_loss, train_f1, valid_loss, valid_f1))
                

            if (iter % 20 == 0) or (iter == len(train_loader)-1):
                valid_loss, valid_f1 = AverageMeter(), AverageMeter()

                for img, label in valid_loader:
                    img, label = img.float().to(device), label.long().to(device)

                    with torch.no_grad():
                        pred_logit = model(img)

                    loss = criterion(pred_logit, label)

                    pred_label = torch.argmax(pred_logit, 1)
                    f1=f1_score(label.to('cpu'), pred_label.to('cpu'),average='macro')

                    valid_loss.update(loss.item(), len(img))
                    valid_f1.update(f1,len(img))

                valid_loss = valid_loss.avg
                valid_f1 = valid_f1.avg
            if (iter % 200 == 0) or (iter == len(train_loader)-1): 
                print("Iter [%3d/%3d] | Train Loss %.4f | Train f1 %.4f | Valid Loss %.4f | Valid f1 %.4f" %
                    (iter, len(train_loader), train_loss, train_f1, valid_loss, valid_f1))
                    
    
        scheduler.step(valid_loss)

    submission = pd.read_csv(os.path.join(test_dir, 'info.csv'))
    image_dir = os.path.join(test_dir, 'images')

    image_paths = [os.path.join(image_dir, img_id) for img_id in submission.ImageID]
    transform = transforms.Compose([
        transforms.ToTensor(),
         transforms.Resize((224,224)),
         transforms.Normalize(mean=[0.485, 0.456, 0.406],
                std=[0.229, 0.224, 0.225]),     
    ])

    eval_dataset = TestDataset(image_paths, transform)

    valid_loader = DataLoader(
        eval_dataset,
        shuffle=False,
        num_workers=4

    )

    model.eval()
#매 에폭 끝날 때 마다 csv 파일 하나씩 나옴.
    all_predictions = []
    all_possi=[]
    all_class_possi=[]
    for images in valid_loader:
        with torch.no_grad():
            images = images.to(device)
            pred = model(images)
            all_class_possi.append(pred[0])
            possi=pred.max(dim=-1)[0]
            pred = pred.argmax(dim=-1)
            
            all_predictions.extend(pred.cpu().numpy())
            all_possi.append(possi.cpu().numpy())
    submission['ans'] = all_predictions
    submission['possi'] = all_possi
    submission['allpossi']=all_class_possi
    ffff='submission_forth_resnet50'
    
    newFilename = f'{ffff}_{kfold_num}.csv'
    submission.to_csv(os.path.join(test_dir, newFilename), index=False)
    print('------------------------------------------test inference is done----------------------------------------------')
    print(time.time()-start)

Using cache found in /opt/ml/.cache/torch/hub/pytorch_vision_v0.9.0


------------------------now 1epoch/1kfold------------------------
Iter [  0/473] | Train Loss 8.9825 | Train f1 0.0000 | Valid Loss 8.5655 | Valid f1 0.0003
Iter [200/473] | Train Loss 0.2295 | Train f1 0.6417 | Valid Loss 0.3010 | Valid f1 0.6374
Iter [222/223] | Train Loss 0.0790 | Train f1 0.7833 | Valid Loss 0.3010 | Valid f1 0.6374
Iter [400/473] | Train Loss 0.1254 | Train f1 0.7796 | Valid Loss 0.2548 | Valid f1 0.6533
Iter [472/473] | Train Loss 0.0762 | Train f1 0.5800 | Valid Loss 0.2363 | Valid f1 0.6737
------------------------now 2epoch/1kfold------------------------
Iter [  0/473] | Train Loss 0.1819 | Train f1 0.6962 | Valid Loss 0.2430 | Valid f1 0.6742
Iter [200/473] | Train Loss 0.0753 | Train f1 0.9575 | Valid Loss 0.2143 | Valid f1 0.7143
Iter [222/223] | Train Loss 0.0018 | Train f1 1.0000 | Valid Loss 0.2143 | Valid f1 0.7143
Iter [400/473] | Train Loss 0.0986 | Train f1 0.7134 | Valid Loss 0.2236 | Valid f1 0.6867
Iter [472/473] | Train Loss 0.1017 | Train f1 0.6

Using cache found in /opt/ml/.cache/torch/hub/pytorch_vision_v0.9.0


------------------------now 1epoch/2kfold------------------------
Iter [  0/473] | Train Loss 9.3603 | Train f1 0.0000 | Valid Loss 8.5585 | Valid f1 0.0001
Iter [200/473] | Train Loss 0.0482 | Train f1 0.7316 | Valid Loss 0.3106 | Valid f1 0.6219
Iter [222/223] | Train Loss 0.4222 | Train f1 0.7667 | Valid Loss 0.3106 | Valid f1 0.6219
Iter [400/473] | Train Loss 0.2569 | Train f1 0.4568 | Valid Loss 0.2614 | Valid f1 0.6675
Iter [472/473] | Train Loss 0.0560 | Train f1 1.0000 | Valid Loss 0.2620 | Valid f1 0.6526
------------------------now 2epoch/2kfold------------------------
Iter [  0/473] | Train Loss 0.1365 | Train f1 0.5737 | Valid Loss 0.2595 | Valid f1 0.6451
Iter [200/473] | Train Loss 0.0656 | Train f1 0.9424 | Valid Loss 0.2319 | Valid f1 0.7085
Iter [222/223] | Train Loss 0.1802 | Train f1 0.6687 | Valid Loss 0.2319 | Valid f1 0.7085
Iter [400/473] | Train Loss 0.0412 | Train f1 0.8889 | Valid Loss 0.2395 | Valid f1 0.7039
Iter [472/473] | Train Loss 0.0476 | Train f1 0.8

Using cache found in /opt/ml/.cache/torch/hub/pytorch_vision_v0.9.0


------------------------now 1epoch/3kfold------------------------
Iter [  0/473] | Train Loss 9.0432 | Train f1 0.0000 | Valid Loss 8.6383 | Valid f1 0.0002
Iter [200/473] | Train Loss 0.3349 | Train f1 0.5641 | Valid Loss 0.3192 | Valid f1 0.6143
Iter [222/223] | Train Loss 0.0594 | Train f1 0.7576 | Valid Loss 0.3192 | Valid f1 0.6143
Iter [400/473] | Train Loss 0.1509 | Train f1 0.7428 | Valid Loss 0.2227 | Valid f1 0.6633
Iter [472/473] | Train Loss 0.2511 | Train f1 0.6370 | Valid Loss 0.2744 | Valid f1 0.6286
------------------------now 2epoch/3kfold------------------------
Iter [  0/473] | Train Loss 0.1844 | Train f1 0.5481 | Valid Loss 0.2687 | Valid f1 0.6297
Iter [200/473] | Train Loss 0.1825 | Train f1 0.7598 | Valid Loss 0.2499 | Valid f1 0.6301
Iter [222/223] | Train Loss 0.1386 | Train f1 0.7333 | Valid Loss 0.2499 | Valid f1 0.6301
Iter [400/473] | Train Loss 0.1296 | Train f1 0.8691 | Valid Loss 0.2270 | Valid f1 0.6974
Iter [472/473] | Train Loss 0.0847 | Train f1 0.8

Using cache found in /opt/ml/.cache/torch/hub/pytorch_vision_v0.9.0


------------------------now 1epoch/4kfold------------------------
Iter [  0/473] | Train Loss 9.2502 | Train f1 0.0000 | Valid Loss 8.5600 | Valid f1 0.0002
Iter [200/473] | Train Loss 0.2294 | Train f1 0.6067 | Valid Loss 0.3244 | Valid f1 0.6148
Iter [222/223] | Train Loss 0.0516 | Train f1 0.9394 | Valid Loss 0.3244 | Valid f1 0.6148
Iter [400/473] | Train Loss 0.2354 | Train f1 0.6341 | Valid Loss 0.2422 | Valid f1 0.6706
Iter [472/473] | Train Loss 0.4702 | Train f1 0.3735 | Valid Loss 0.2497 | Valid f1 0.6909
------------------------now 2epoch/4kfold------------------------
Iter [  0/473] | Train Loss 0.2184 | Train f1 0.7772 | Valid Loss 0.2612 | Valid f1 0.7044
Iter [200/473] | Train Loss 0.1467 | Train f1 0.8000 | Valid Loss 0.2110 | Valid f1 0.6881
Iter [222/223] | Train Loss 0.0596 | Train f1 0.8424 | Valid Loss 0.2110 | Valid f1 0.6881
Iter [400/473] | Train Loss 0.0527 | Train f1 0.9694 | Valid Loss 0.2242 | Valid f1 0.6934
Iter [472/473] | Train Loss 0.0278 | Train f1 1.0