<a href="https://colab.research.google.com/github/forminju/youtuber-look-alike-crawler/blob/main/efficientnetb3_look_alike.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 모델 학습을 위한 전처리 과정

In [1]:
# Check the installed PyTorch version
import torch
print(torch.__version__)

1.13.1+cu116


In [2]:
# 라이브러리 세팅

import random
import pandas as pd
import numpy as np
import os
import cv2
from PIL import Image

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import albumentations as A #증강
from albumentations.pytorch.transforms import ToTensorV2
import torchvision.models as models

from tqdm.auto import tqdm
from sklearn.metrics import accuracy_score

import warnings
warnings.filterwarnings(action='ignore')

from sklearn.model_selection import train_test_split

In [3]:
# Report whether a GPU is visible and select the matching compute device
has_gpu = torch.cuda.is_available()
print(has_gpu)
device = torch.device('cuda') if has_gpu else torch.device('cpu')

True


In [4]:
# Turn cuDNN off so that EfficientNet does not raise errors
torch.backends.cudnn.enabled = False

In [5]:
# Hyperparameters read by the rest of the notebook
CFG = dict(
    IMG_SIZE=456,
    EPOCHS=10,
    LEARNING_RATE=3e-4,
    BATCH_SIZE=16,
    SEED=42,
)

# Alternative configuration suggested by ChatGPT (kept for reference, not active)
# CFG = {
#     'embedding_size': 512,
#     'num_classes': num_classes,
#     'margin': 0.5,
#     'scale': 64,
#     'batch_size': 32,
#     'learning_rate': 0.0001,
#     'num_epochs': 50
# }

In [6]:
def seed_everything(seed):
    """Seed every random number generator used by the notebook.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU and every CUDA
    device), exports ``PYTHONHASHSEED`` and puts cuDNN into deterministic mode.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Covers every visible GPU, not only the current one.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # benchmark=True lets cuDNN auto-tune and pick different kernels from run
    # to run, which defeats the deterministic flag above; keep it off.
    torch.backends.cudnn.benchmark = False

seed_everything(CFG['SEED'])

In [7]:
# Mount Google Drive at /content/drive (the data and checkpoint paths below live there)
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [8]:
# Label table loaded from CSV, and the root folder that holds the pre-processed images
df = pd.read_csv('/content/drive/MyDrive/glory_match/look-alike/glory_train_df.csv')
image_root_dir = '/content/drive/MyDrive/glory_match/look-alike/pre-processed-image'

### Train, Valid dataset 분리
항목별로 70%를 train, 30%를 valid로 사용합니다.

In [9]:
# Will hold each distinct class label once; populated by the following cell
class_name_list = []

In [10]:
# Work on a copy so the frame read from CSV stays untouched
tmp = df.copy()

# Register every label once, in the order in which labels first appear
for label in dict.fromkeys(tmp['class']):
    if label not in class_name_list:
        class_name_list.append(label)
class_name_list

['SMO', 'CHJ', 'CDE', 'HAR', 'JJJ', 'JSI', 'OJY', 'SHE', 'SHG', 'IDH', 'IJH']

In [11]:
# Lower-cased copy of every class label, in the same order
class_name_to_lower_case_list = []
for class_name in class_name_list:
    class_name_to_lower_case_list.append(class_name.lower())
class_name_to_lower_case_list

['smo', 'chj', 'cde', 'har', 'jjj', 'jsi', 'ojy', 'she', 'shg', 'idh', 'ijh']

In [12]:
# Turn each bare file name into "<image_root_dir>/<lower-cased class>/<file name>".
# The column is rebuilt in a single assignment: writing cell by cell through
# tmp['file_name'][i] is chained assignment, which pandas warns about and which
# silently does nothing under copy-on-write. It also required a 0..n-1 index.
known_classes = {initial.upper() for initial in class_name_to_lower_case_list}
tmp['file_name'] = [
    # Rows whose class is not one of the known labels are left as they are.
    os.path.join(image_root_dir, class_name.lower(), file_name)
    if class_name in known_classes else file_name
    for class_name, file_name in zip(tmp['class'], tmp['file_name'])
]
tmp['file_name'].head()

0    /content/drive/MyDrive/glory_match/look-alike/...
1    /content/drive/MyDrive/glory_match/look-alike/...
2    /content/drive/MyDrive/glory_match/look-alike/...
3    /content/drive/MyDrive/glory_match/look-alike/...
4    /content/drive/MyDrive/glory_match/look-alike/...
Name: file_name, dtype: object

In [13]:
# Split every class 70/30 on its own so each label is present in both subsets.
# The per-class pieces are collected first and concatenated once: growing a
# frame with pd.concat inside the loop (starting from an empty frame) is
# quadratic and degrades column dtypes.
train_parts = []
valid_parts = []

for class_name in class_name_list:
    rows_of_class = tmp.loc[tmp['class'] == class_name]
    train_part, valid_part = train_test_split(
        rows_of_class, test_size=0.3, random_state=CFG['SEED']
    )
    # No need to re-assign 'class': every row here already carries class_name.
    train_parts.append(train_part)
    valid_parts.append(valid_part)

# Fall back to empty frames with the right columns when there are no classes.
train = pd.concat(train_parts) if train_parts else tmp.iloc[0:0].copy()
valid = pd.concat(valid_parts) if valid_parts else tmp.iloc[0:0].copy()

In [14]:
train

Unnamed: 0,index,file_name,class
41,41,/content/drive/MyDrive/glory_match/look-alike/...,SMO
19,19,/content/drive/MyDrive/glory_match/look-alike/...,SMO
30,30,/content/drive/MyDrive/glory_match/look-alike/...,SMO
49,49,/content/drive/MyDrive/glory_match/look-alike/...,SMO
50,50,/content/drive/MyDrive/glory_match/look-alike/...,SMO
...,...,...,...
1213,1213,/content/drive/MyDrive/glory_match/look-alike/...,IJH
1121,1121,/content/drive/MyDrive/glory_match/look-alike/...,IJH
1199,1199,/content/drive/MyDrive/glory_match/look-alike/...,IJH
1286,1286,/content/drive/MyDrive/glory_match/look-alike/...,IJH


### label을 tensor로 반환하기 위해 one-hot-encoding 적용

In [15]:
# One-hot encode the labels. The column set of the full table is the reference:
# encoding each split on its own would drop (and shift) label columns whenever a
# class happens to be absent from that split. The dtype is pinned to uint8 so
# the result is numeric regardless of the pandas default.
one_hot_encoded = pd.get_dummies(tmp['class'], dtype=np.uint8)
train_one_hot_encoded = pd.get_dummies(train['class'], dtype=np.uint8).reindex(
    columns=one_hot_encoded.columns, fill_value=0
)
valid_one_hot_encoded = pd.get_dummies(valid['class'], dtype=np.uint8).reindex(
    columns=one_hot_encoded.columns, fill_value=0
)

In [16]:
# Append the indicator columns to each frame and drop the string label column
data = pd.concat([tmp, one_hot_encoded], axis=1).drop(columns=['class'])
train = pd.concat([train, train_one_hot_encoded], axis=1).drop(columns=['class'])
valid = pd.concat([valid, valid_one_hot_encoded], axis=1).drop(columns=['class'])

In [17]:
train.head()

Unnamed: 0,index,file_name,CDE,CHJ,HAR,IDH,IJH,JJJ,JSI,OJY,SHE,SHG,SMO
41,41,/content/drive/MyDrive/glory_match/look-alike/...,0,0,0,0,0,0,0,0,0,0,1
19,19,/content/drive/MyDrive/glory_match/look-alike/...,0,0,0,0,0,0,0,0,0,0,1
30,30,/content/drive/MyDrive/glory_match/look-alike/...,0,0,0,0,0,0,0,0,0,0,1
49,49,/content/drive/MyDrive/glory_match/look-alike/...,0,0,0,0,0,0,0,0,0,0,1
50,50,/content/drive/MyDrive/glory_match/look-alike/...,0,0,0,0,0,0,0,0,0,0,1


In [18]:
valid.head()

Unnamed: 0,index,file_name,CDE,CHJ,HAR,IDH,IJH,JJJ,JSI,OJY,SHE,SHG,SMO
0,0,/content/drive/MyDrive/glory_match/look-alike/...,0,0,0,0,0,0,0,0,0,0,1
5,5,/content/drive/MyDrive/glory_match/look-alike/...,0,0,0,0,0,0,0,0,0,0,1
34,34,/content/drive/MyDrive/glory_match/look-alike/...,0,0,0,0,0,0,0,0,0,0,1
13,13,/content/drive/MyDrive/glory_match/look-alike/...,0,0,0,0,0,0,0,0,0,0,1
44,44,/content/drive/MyDrive/glory_match/look-alike/...,0,0,0,0,0,0,0,0,0,0,1


### CustomDataset

In [19]:
class CustomDataset(Dataset):
    """Dataset that reads images from disk with OpenCV and pairs them with labels.

    Args:
        img_path_list: Indexable collection of image file paths.
        label_list: Indexable collection of per-image label vectors, or ``None``
            for unlabeled data.
        transforms: Optional callable invoked as ``transforms(image=image)`` whose
            result is indexed with ``'image'``.
    """

    def __init__(self, img_path_list, label_list, transforms=None):
        self.img_path_list = img_path_list
        self.label_list = label_list
        self.transforms = transforms

    def __getitem__(self, index):
        """Return ``(image, label)``, or just ``image`` when there are no labels.

        Raises:
            FileNotFoundError: If the image at the stored path cannot be read.
        """
        img_path = self.img_path_list[index]

        # cv2.imread does not raise on a missing or corrupt file; it returns None,
        # which would otherwise surface later as an unrelated-looking error.
        # NOTE: OpenCV delivers channels in BGR order and no conversion is done here.
        image = cv2.imread(img_path)
        if image is None:
            raise FileNotFoundError(f'Could not read image: {img_path}')

        if self.transforms is not None:
            image = self.transforms(image=image)['image']

        if self.label_list is not None:
            label = torch.FloatTensor(self.label_list[index])
            return image, label
        else:
            return image

    def __len__(self):
        """Number of image paths held by the dataset."""
        return len(self.img_path_list)

In [20]:
# Side length (in pixels) every image is resized to
_RESIZE_SIDE = 180

# Training pipeline: resize, random augmentations, then conversion to a tensor
_train_steps = [
    A.Resize(_RESIZE_SIDE, _RESIZE_SIDE),
    A.HorizontalFlip(p=0.5),
    A.RandomBrightnessContrast(p=0.5),
    # Second brightness/contrast jitter, always applied, with explicit +/-0.3 limits
    A.RandomBrightnessContrast(brightness_limit=(-0.3, 0.3), contrast_limit=(-0.3, 0.3), p=1),
    A.ChannelShuffle(p=0.2),
    ToTensorV2(),
]
train_transform = A.Compose(_train_steps)

# Validation pipeline: resize and tensor conversion only, no augmentation
_valid_steps = [
    A.Resize(_RESIZE_SIDE, _RESIZE_SIDE),
    ToTensorV2(),
]
valid_transform = A.Compose(_valid_steps)

In [21]:
def get_labels(df):
    """Return every column from the third one onward as a NumPy array."""
    first_label_column = 2
    label_frame = df.iloc[:, first_label_column:]
    return label_frame.values

In [22]:
# Label matrices for the training and validation splits
train_labels, valid_labels = get_labels(train), get_labels(valid)

In [23]:
# Wrap file paths and label matrices in datasets, one per split
train_paths = train['file_name'].values
valid_paths = valid['file_name'].values
train_dataset = CustomDataset(train_paths, train_labels, transforms=train_transform)
valid_dataset = CustomDataset(valid_paths, valid_labels, transforms=valid_transform)

# Both loaders share batch size and worker count; only the training loader shuffles
loader_options = dict(batch_size=CFG['BATCH_SIZE'], num_workers=4)
train_loader = DataLoader(train_dataset, shuffle=True, **loader_options)
valid_loader = DataLoader(valid_dataset, shuffle=False, **loader_options)

In [24]:
# Sanity check: pull a single batch and show the tensor shapes
first_batch = next(iter(train_loader), None)
if first_batch is not None:
    x, y = first_batch
    print(f'Image Shape: {x.shape}')
    print(f'Label Shape: {y.shape}')

Image Shape: torch.Size([16, 3, 180, 180])
Label Shape: torch.Size([16, 11])


# Model Define

In [26]:
!pip install efficientnet_pytorch

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting efficientnet_pytorch
  Downloading efficientnet_pytorch-0.7.1.tar.gz (21 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: efficientnet_pytorch
  Building wheel for efficientnet_pytorch (setup.py) ... [?25l[?25hdone
  Created wheel for efficientnet_pytorch: filename=efficientnet_pytorch-0.7.1-py3-none-any.whl size=16444 sha256=2ea199d58899150e1525d6d830f41d532169465c2085d99fa483396945e875d6
  Stored in directory: /root/.cache/pip/wheels/29/16/24/752e89d88d333af39a288421e64d613b5f652918e39ef1f8e3
Successfully built efficientnet_pytorch
Installing collected packages: efficientnet_pytorch
Successfully installed efficientnet_pytorch-0.7.1


In [27]:
from efficientnet_pytorch import EfficientNet

In [33]:
class BaseModel(nn.Module):
    """EfficientNet-B3 backbone followed by a linear head with sigmoid outputs.

    Args:
        num_classes: Number of output units of the head (default 11).
    """

    def __init__(self, num_classes=11):
        super().__init__()
        # Pretrained torchvision backbone; the head below consumes its 1000 outputs
        self.backbone = models.efficientnet_b3(pretrained=True)
        self.classifier = nn.Linear(1000, num_classes)

    def forward(self, x):
        """Return one sigmoid-activated score per class for each input image."""
        features = self.backbone(x)
        logits = self.classifier(features)
        return torch.sigmoid(logits)

In [34]:
def train(model, optimizer, train_loader, valid_loader, scheduler, device):
    """Train ``model`` for ``CFG['EPOCHS']`` epochs and keep the best weights.

    After every epoch the model is evaluated with ``validation``; the weights of
    the epoch with the highest validation accuracy are remembered and loaded
    back into ``model`` before it is returned.

    Args:
        model: Network to optimise; moved to ``device`` and updated in place.
        optimizer: Optimiser bound to ``model``'s parameters.
        train_loader: Iterable of ``(images, labels)`` training batches.
        valid_loader: Iterable of ``(images, labels)`` validation batches.
        scheduler: Optional scheduler stepped with the validation accuracy, or ``None``.
        device: Device on which batches and the model are placed.

    Returns:
        ``model`` itself, carrying the weights of its best validation epoch.
    """
    import copy  # local import keeps this cell self-contained

    model.to(device)
    criterion = nn.BCELoss().to(device)

    best_valid_acc = 0
    best_state = None

    for epoch in range(1, CFG['EPOCHS']+1):
        model.train()
        train_loss = []
        for imgs, labels in tqdm(iter(train_loader)):
            imgs = imgs.float().to(device)
            labels = labels.to(device)

            optimizer.zero_grad()

            output = model(imgs)
            loss = criterion(output, labels)

            loss.backward()
            optimizer.step()

            train_loss.append(loss.item())

        _valid_loss, _valid_acc = validation(model, criterion, valid_loader, device)
        _train_loss = np.mean(train_loss)
        print(f'Epoch [{epoch}], Train Loss : [{_train_loss:.5f}] Valid Loss : [{_valid_loss:.5f}] Valid ACC : [{_valid_acc:.5f}]')

        if scheduler is not None:
            scheduler.step(_valid_acc)

        if best_state is None or best_valid_acc < _valid_acc:
            best_valid_acc = _valid_acc
            # Snapshot the weights by value. Keeping a reference to `model`
            # would just alias the live network, which later epochs overwrite.
            best_state = copy.deepcopy(model.state_dict())

    if best_state is not None:
        model.load_state_dict(best_state)
    return model

In [35]:
def validation(model, criterion, valid_loader, device):
    """Evaluate ``model`` on ``valid_loader`` and return ``(loss, accuracy)``.

    Accuracy is element-wise: outputs are thresholded at 0.5 and compared with
    the label matrix entry by entry, so it is the fraction of matching entries,
    not the fraction of fully correct samples.

    Both numbers are averaged over samples rather than over batches, so a
    smaller final batch is not over-weighted.

    Args:
        model: Network to evaluate; switched to eval mode.
        criterion: Loss called as ``criterion(probs, labels)``. The per-sample
            weighting assumes it returns a batch mean -- TODO confirm if a
            different reduction is ever passed in.
        valid_loader: Iterable of ``(images, labels)`` batches.
        device: Device on which batches are placed.

    Returns:
        Tuple ``(valid_loss, valid_acc)`` of floats; both are NaN when the
        loader yields no samples.
    """
    model.eval()
    loss_sum = 0.0
    matching_entries = 0
    total_entries = 0
    total_samples = 0
    with torch.no_grad():
        for imgs, labels in tqdm(iter(valid_loader)):
            imgs = imgs.float().to(device)
            labels = labels.to(device)

            probs = model(imgs)
            loss = criterion(probs, labels)

            batch_samples = labels.size(0)
            # Undo the batch mean so every sample counts equally in the total.
            loss_sum += loss.item() * batch_samples
            total_samples += batch_samples

            preds = (probs > 0.5).to(labels.dtype)
            matching_entries += (labels == preds).sum().item()
            total_entries += labels.numel()

    if total_samples == 0 or total_entries == 0:
        return float('nan'), float('nan')

    _valid_loss = loss_sum / total_samples
    _valid_acc = matching_entries / total_entries

    return _valid_loss, _valid_acc

In [36]:
# Build the network and its Adam optimiser
model = BaseModel()
model.eval()
optimizer = torch.optim.Adam(params=model.parameters(), lr=CFG["LEARNING_RATE"])

# Halve the learning rate when the monitored value (validation accuracy,
# mode='max') has stopped improving for more than `patience` epochs
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='max',
    factor=0.5,
    patience=2,
    threshold_mode='abs',
    min_lr=1e-8,
    verbose=True,
)

# Run the training loop and keep the model it hands back
infer_model = train(model, optimizer, train_loader, valid_loader, scheduler, device)

  0%|          | 0/57 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

Epoch [1], Train Loss : [0.27498] Valid Loss : [0.19780] Valid ACC : [0.92695]


  0%|          | 0/57 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

Epoch [2], Train Loss : [0.16443] Valid Loss : [0.13672] Valid ACC : [0.94906]


  0%|          | 0/57 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

Epoch [3], Train Loss : [0.10201] Valid Loss : [0.11859] Valid ACC : [0.96101]


  0%|          | 0/57 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

Epoch [4], Train Loss : [0.06626] Valid Loss : [0.09066] Valid ACC : [0.96890]


  0%|          | 0/57 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

Epoch [5], Train Loss : [0.05344] Valid Loss : [0.08315] Valid ACC : [0.97117]


  0%|          | 0/57 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

Epoch [6], Train Loss : [0.04093] Valid Loss : [0.09390] Valid ACC : [0.97091]


  0%|          | 0/57 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

Epoch [7], Train Loss : [0.03157] Valid Loss : [0.10503] Valid ACC : [0.96994]


  0%|          | 0/57 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

Epoch [8], Train Loss : [0.03433] Valid Loss : [0.08943] Valid ACC : [0.97396]


  0%|          | 0/57 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

Epoch [9], Train Loss : [0.03030] Valid Loss : [0.08662] Valid ACC : [0.97432]


  0%|          | 0/57 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

Epoch [10], Train Loss : [0.02197] Valid Loss : [0.08464] Valid ACC : [0.97653]


In [37]:
# Directory prefix (note the trailing slash) under which the checkpoints are written
PATH = '/content/drive/MyDrive/glory_match/'

In [38]:
# Serialise the whole model object
full_model_file = f'{PATH}efficientb3.pt'
torch.save(model, full_model_file)

In [39]:
# Store only the model's weights (state dict)
weights_file = f'{PATH}efficientb3_dict.pt'
torch.save(model.state_dict(), weights_file)

In [40]:
# Bundle model and optimiser state into a single checkpoint file
checkpoint = {
    'efficientnetb3': model.state_dict(),
    'optimizer': optimizer.state_dict(),
}
torch.save(checkpoint, f'{PATH}efficientnetb3_all.tar')

# TestAccuracy 확인

In [None]:
# Function to test the model with the test dataset and print the accuracy for the test images
def testAccuracy():
    
    model.eval()
    accuracy = 0.0
    total = 0.0
    
    with torch.no_grad():
        for data in test_loader:
            images, labels = data
            # run the model on the test set to predict labels
            outputs = model(images)
            # the label with the highest energy will be our prediction
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            accuracy += (predicted == labels).sum().item()
    
    # compute the accuracy over all test images
    accuracy = (100 * accuracy / total)
    return(accuracy)