# 0. Import

In [None]:
import random
import pandas as pd
import numpy as np
import os
import cv2

from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.multiprocessing as mp
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.distributed import DistributedSampler
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.distributed import init_process_group, destroy_process_group


from transformers import AutoModel,ViTModel,ViTFeatureExtractor


from tqdm.auto import tqdm

import albumentations as A
from albumentations.pytorch.transforms import ToTensorV2
import torchvision.models as models

from sklearn.metrics import f1_score, accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import StratifiedKFold

import warnings
warnings.filterwarnings(action='ignore')
import matplotlib.pyplot as plt
from collections import defaultdict
import matplotlib as mpl

# NanumGothic is a Korean font, so Hangul category labels render in plots;
# disabling unicode_minus makes matplotlib draw an ASCII hyphen for minus signs.
plt.rcParams['font.family'] = 'NanumGothic'
mpl.rcParams['axes.unicode_minus'] = False

# Prefer the GPU, but fall back to the CPU so the notebook still runs
# (slowly) on a machine without CUDA instead of failing on the first .to(device).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Visualization

결과적으로 굉장히 Class Imbalanced한 Dataset임.

Weighted F1 Score로 평가하더라도, Imbalanced Dataset을 위한 추가 작업이 요구됨.

In [None]:
# Load the training table and replace the three category columns with
# integer codes, keeping the raw cat3 strings in 'original'.
df = pd.read_csv('../train.csv')
df['original'] = df['cat3']

# Encode from the finest level (cat3) to the coarsest (cat1). A fresh encoder
# is fitted per column, so `le` ends up holding the cat1 encoder.
for column in ('cat3', 'cat2', 'cat1'):
    le = preprocessing.LabelEncoder()
    df[column] = le.fit_transform(df[column].values)

In [None]:
# Assign every row to one of 5 folds, stratified on the encoded cat3 label.
# 'kfold' holds the index of the fold in which the row is a validation sample.
folds = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)
df['kfold'] = -1
kfold_col = df.columns.get_loc('kfold')

# Walk the split generator once (it yields the 5 folds in order) instead of
# re-materialising all splits on every iteration, and write the fold number
# by row position rather than matching rows back through their ids.
for i, (train_idx, valid_idx) in enumerate(folds.split(df.values, df['cat3'])):
    df.iloc[valid_idx, kfold_col] = i

In [None]:
# Tally how many rows carry each original (un-encoded) cat3 label.
cat3_cnt_dict = defaultdict(int)
for value in df['original']:
    cat3_cnt_dict[value] = cat3_cnt_dict[value] + 1

# Re-key as a plain dict ordered from the most to the least frequent label.
cat3_cnt_dict = dict(sorted(cat3_cnt_dict.items(), key=lambda item: item[1], reverse=True))

# Bar chart of the label frequencies.
fig, axes = plt.subplots(1, figsize=(12, 7))
x = list(cat3_cnt_dict)
y = list(cat3_cnt_dict.values())
axes.bar(x, y)
plt.show()


In [None]:
class Custom_Dataset(Dataset):
    """Map-style dataset pairing a text field with an image and three labels.

    Each item is a dict with the tokenized text (`input_ids`,
    `attention_mask`), the image tensor produced by the feature extractor
    (`pixel_values`) and the three category labels as long tensors
    (`cats1`, `cats2`, `cats3`).

    Args:
        text: indexable collection of texts (one per sample).
        image_path: indexable collection of image path strings.
        cats1, cats2, cats3: indexable collections of integer labels.
        tokenizer: tokenizer exposing `encode_plus`.
        feature_extractor: image feature extractor called as
            `feature_extractor(images=..., return_tensors='pt')`.
        max_len: token length every text is padded / truncated to.
    """

    def __init__(self, text, image_path, cats1, cats2, cats3, tokenizer, feature_extractor, max_len):
        super().__init__()
        self.text = text
        self.image_path = image_path
        self.cats1 = cats1
        self.cats2 = cats2
        self.cats3 = cats3
        self.tokenizer = tokenizer
        self.feature_extractor = feature_extractor  # image preprocessor (not a language model)
        self.max_len = max_len  # should probably be tuned so it covers the overview length

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        """Return the model inputs and labels for sample `index`.

        Raises:
            FileNotFoundError: if the image cannot be read from disk.
        """
        text = str(self.text[index])
        # The first two characters of the stored path are dropped (presumably a
        # leading './' - confirm against the CSV) and the rest is resolved
        # relative to the parent directory.
        image_path = os.path.join('../', str(self.image_path[index][2:]))
        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread signals a missing / unreadable file by returning None.
            raise FileNotFoundError(f"Could not read image: {image_path}")
        # OpenCV decodes to BGR; the ViT feature extractor expects RGB.
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        cat1 = self.cats1[index]  # Label1
        cat2 = self.cats2[index]  # Label2
        cat3 = self.cats3[index]  # Label3
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,  # cut texts longer than max_len
            return_attention_mask=True,
            return_tensors='pt',  # PyTorch tensors ('tf' and 'np' also exist)
        )
        image_feature = self.feature_extractor(images=image, return_tensors='pt')
        return {
            # With return_tensors='pt' the encoding carries a leading batch
            # dimension of 1; flatten it so the DataLoader can stack samples.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'pixel_values': image_feature['pixel_values'][0],
            'cats1': torch.tensor(cat1, dtype=torch.long),
            'cats2': torch.tensor(cat2, dtype=torch.long),
            'cats3': torch.tensor(cat3, dtype=torch.long),
        }

In [None]:
def create_data_loader(df, tokenizer, feature_extractor, max_len, batch_size, shuffle_=False):
    """Wrap the relevant columns of `df` in a Custom_Dataset and a DataLoader.

    Reads the `overview`, `img_path`, `cat1`, `cat2` and `cat3` columns.
    The loader uses 4 worker processes; `shuffle_` is forwarded as `shuffle`.
    """
    dataset_kwargs = dict(
        text=df.overview.to_numpy(),
        image_path=df.img_path.to_numpy(),
        cats1=df.cat1.to_numpy(),
        cats2=df.cat2.to_numpy(),
        cats3=df.cat3.to_numpy(),
        tokenizer=tokenizer,
        feature_extractor=feature_extractor,
        max_len=max_len,
    )
    ds = Custom_Dataset(**dataset_kwargs)
    # TODO: later set shuffle to False and use DistributedSampler to practice a DDP implementation.
    loader = DataLoader(ds, batch_size=batch_size, num_workers=4, shuffle=shuffle_)
    return loader

In [None]:
from transformers import AutoModel,ViTModel,ViTFeatureExtractor
import torch.nn as nn

class TourClassifier(nn.Module):
    """Text + image classifier with three classification heads.

    A pretrained text encoder and a pretrained ViT each produce a token
    sequence; the two sequences are concatenated along the sequence axis,
    passed through a small Transformer encoder, and the first position (the
    text model's first token) is fed to three heads, one per category level.

    The fusion encoder is created once in `__init__`, so its weights are
    registered as parameters: they are optimised, follow `train()` / `eval()`,
    and are stored in `state_dict()` under `transformer_encoder.*`.
    Checkpoints saved before this change do not contain those keys.

    Args:
        n_classes1, n_classes2, n_classes3: output sizes of the three heads.
        text_model_name: name passed to `AutoModel.from_pretrained`.
        image_model_name: name passed to `ViTModel.from_pretrained`.

    Raises:
        ValueError: if the two backbones have different hidden sizes.
    """

    def __init__(self, n_classes1, n_classes2, n_classes3, text_model_name, image_model_name):
        super().__init__()
        self.text_model = AutoModel.from_pretrained(text_model_name).to(device)
        self.image_model = ViTModel.from_pretrained(image_model_name).to(device)

        self.text_model.gradient_checkpointing_enable()
        self.image_model.gradient_checkpointing_enable()

        hidden_size = self.text_model.config.hidden_size
        # The two token sequences are concatenated along the sequence axis,
        # which only works when both backbones share the same hidden size.
        if self.image_model.config.hidden_size != hidden_size:
            raise ValueError(
                "text and image models must share the same hidden size, got "
                f"{hidden_size} and {self.image_model.config.hidden_size}"
            )

        self.drop = nn.Dropout(p=0.1)

        self.cls = self._build_head(hidden_size, n_classes1)
        self.cls2 = self._build_head(hidden_size, n_classes2)
        self.cls3 = self._build_head(hidden_size, n_classes3)

        # Built here (not in forward) so it is trained, saved and switched
        # between train/eval with the rest of the model. It is moved to the
        # target device when the caller moves the whole module.
        self.transformer_encoder = self._build_fusion_encoder(hidden_size)

    @staticmethod
    def _build_head(hidden_size, target_size):
        """Return one classification head mapping hidden_size -> target_size."""
        return nn.Sequential(
            nn.Linear(hidden_size, hidden_size),
            nn.LayerNorm(hidden_size),
            nn.Dropout(p=0.1),
            nn.ReLU(),
            nn.Linear(hidden_size, target_size),
        )

    @staticmethod
    def _build_fusion_encoder(hidden_size, nhead=8, num_layers=2):
        """Return the Transformer encoder that fuses text and image tokens.

        `nhead` is the number of attention heads per layer and `num_layers`
        the number of stacked encoder layers. `batch_first=True` because the
        inputs are laid out as (batch, sequence, hidden).
        """
        layer = nn.TransformerEncoderLayer(d_model=hidden_size, nhead=nhead, batch_first=True)
        return nn.TransformerEncoder(layer, num_layers=num_layers)

    def forward(self, input_ids, attention_mask, pixel_values):
        """Return the logits of the three heads as `(out1, out2, out3)`."""
        text_output = self.text_model(input_ids=input_ids, attention_mask=attention_mask)
        image_output = self.image_model(pixel_values=pixel_values)
        # Join the text tokens and the image tokens along the sequence axis.
        concat_outputs = torch.cat(
            [text_output.last_hidden_state, image_output.last_hidden_state], 1
        )

        # Note: no padding mask is passed, so padded text positions are attended to.
        outputs = self.transformer_encoder(concat_outputs)
        # First sequence position of every sample: (batch, hidden).
        outputs = outputs[:, 0]
        output = self.drop(outputs)

        out1 = self.cls(output)
        out2 = self.cls2(output)
        out3 = self.cls3(output)
        return out1, out2, out3
    

In [None]:
from sklearn.metrics import f1_score
import time
import math
import torch

def calc_tour_acc(pred, label):
    """Return `(accuracy, weighted_f1)` for a batch of predictions.

    The predicted class of each row of `pred` is the index of its maximum
    along dimension 1; `label` holds the true class indices.
    """
    idx = pred.max(1)[1]

    n_correct = torch.eq(idx, label).sum().item()
    acc = n_correct / idx.size(0)

    y_true = label.cpu().numpy()
    y_pred = idx.cpu().numpy()
    f1_acc = f1_score(y_true, y_pred, average='weighted')
    return acc, f1_acc


class AverageMeter(object):
    """Keep the latest value plus the running sum, count and mean of a metric."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Set every tracked statistic back to zero."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record `val` as the latest value, weighting it `n` times in the mean."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count


def asMinutes(s):
    """Format a duration given in seconds as '<minutes>m <seconds>s'."""
    minutes = math.floor(s / 60)
    seconds = s - minutes * 60
    return '%dm %ds' % (minutes, seconds)


def timeSince(since, percent):
    """Return elapsed and estimated remaining time as a formatted string.

    `since` is a `time.time()` timestamp and `percent` the completed fraction
    of the work; the total time is extrapolated as elapsed / percent.
    """
    elapsed = time.time() - since
    remaining = elapsed / percent - elapsed
    return '%s (remain %s)' % (asMinutes(elapsed), asMinutes(remaining))

In [None]:
import torch
import numpy as np
from transformers import AutoTokenizer
import argparse
import random
from sklearn.model_selection import StratifiedKFold
import torch.optim as optim
from transformers.optimization import get_cosine_schedule_with_warmup
from tqdm import tqdm

def train_epoch(model,data_loader,loss_fn,optimizer,device,scheduler,n_examples,epoch):
    """Train `model` for one pass over `data_loader`.

    The loss is a weighted sum of the three heads' losses (0.05 / 0.1 / 0.85
    for cat1 / cat2 / cat3). Gradients are clipped to norm 1.0 and the
    scheduler is stepped after every optimizer step. A progress line is
    printed every 200 steps and on the last step.

    Returns:
        A tuple `(accuracy, mean_loss)`: the number of correct cat3
        predictions divided by `n_examples`, and the sample-weighted mean of
        the combined loss.
    """
    batch_time, data_time, losses = AverageMeter(), AverageMeter(), AverageMeter()
    accuracies, f1_accuracies, sent_count = AverageMeter(), AverageMeter(), AverageMeter()

    start = end = time.time()

    model = model.train()
    correct_predictions = 0
    tensor_keys = ("input_ids", "attention_mask", "pixel_values", "cats1", "cats2", "cats3")
    for step, batch in enumerate(data_loader):
        # Time spent waiting for the loader to deliver this batch.
        data_time.update(time.time() - end)
        batch_size = batch["input_ids"].size(0)

        input_ids, attention_mask, pixel_values, cats1, cats2, cats3 = (
            batch[key].to(device) for key in tensor_keys
        )

        logits1, logits2, logits3 = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            pixel_values=pixel_values,
        )
        preds = torch.max(logits3, dim=1)[1]

        loss = (
            loss_fn(logits1, cats1) * 0.05
            + loss_fn(logits2, cats2) * 0.1
            + loss_fn(logits3, cats3) * 0.85
        )

        correct_predictions += (preds == cats3).sum()
        losses.update(loss.item(), batch_size)

        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

        batch_time.update(time.time() - end)
        end = time.time()

        sent_count.update(batch_size)

        # Only report on every 200th step and on the final step.
        if step % 200 != 0 and step != (len(data_loader) - 1):
            continue

        acc, f1_acc = calc_tour_acc(logits3, cats3)
        accuracies.update(acc, batch_size)
        f1_accuracies.update(f1_acc, batch_size)

        print('Epoch: [{0}][{1}/{2}] '
              'Data {data_time.val:.3f} ({data_time.avg:.3f}) '
              'Elapsed {remain:s} '
              'Loss: {loss.val:.3f}({loss.avg:.3f}) '
              'Acc: {acc.val:.3f}({acc.avg:.3f}) '
              'f1_Acc: {f1_acc.val:.3f}({f1_acc.avg:.3f}) '
              'sent/s {sent_s:.0f} '
              .format(
                  epoch, step + 1, len(data_loader),
                  data_time=data_time,
                  loss=losses,
                  acc=accuracies,
                  f1_acc=f1_accuracies,
                  remain=timeSince(start, float(step + 1) / len(data_loader)),
                  sent_s=sent_count.avg / batch_time.avg,
              ))

    return correct_predictions.double() / n_examples, losses.avg

def validate(model,data_loader,loss_fn,optimizer,device,scheduler,n_examples):
    """Evaluate `model` on `data_loader` without updating any state.

    The loss is the same weighted sum used for training (0.05 / 0.1 / 0.85
    for cat1 / cat2 / cat3).

    Args:
        model: module returning three logit tensors.
        data_loader: iterable of batches with the keys `input_ids`,
            `attention_mask`, `pixel_values`, `cats1`, `cats2`, `cats3`.
        loss_fn: criterion applied to each head.
        optimizer, scheduler, n_examples: unused; kept so existing callers
            keep working.
        device: device the batch tensors are moved to.

    Returns:
        A tuple `(weighted_f1, mean_loss)`: the weighted F1 score of the cat3
        predictions over the whole loader, and the sample-weighted mean loss.

    Raises:
        ValueError: if `data_loader` yields no batches.
    """
    model = model.eval()
    batch_losses, batch_sizes = [], []
    logits_chunks, label_chunks = [], []

    with torch.no_grad():
        for d in tqdm(data_loader):
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            pixel_values = d['pixel_values'].to(device)
            cats1 = d["cats1"].to(device)
            cats2 = d["cats2"].to(device)
            cats3 = d["cats3"].to(device)

            outputs, outputs2, outputs3 = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                pixel_values=pixel_values,
            )

            loss1 = loss_fn(outputs, cats1)
            loss2 = loss_fn(outputs2, cats2)
            loss3 = loss_fn(outputs3, cats3)
            loss = loss1 * 0.05 + loss2 * 0.1 + loss3 * 0.85

            batch_losses.append(loss.item())
            batch_sizes.append(cats3.size(0))
            # Collect per-batch results and concatenate once at the end
            # instead of re-copying a growing tensor on every batch.
            logits_chunks.append(outputs3.cpu())
            label_chunks.append(cats3.cpu())

    if not logits_chunks:
        raise ValueError("validate() received a data_loader with no batches")

    _, f1_acc = calc_tour_acc(torch.cat(logits_chunks, 0), torch.cat(label_chunks, 0))
    # Weight each batch by its size so a short final batch does not skew the
    # mean (train_epoch weights its running loss the same way).
    return f1_acc, np.average(batch_losses, weights=batch_sizes)


In [None]:
import torch
import numpy as np
from transformers import AutoTokenizer
import argparse
import random
from sklearn.model_selection import StratifiedKFold
import torch.optim as optim
from transformers.optimization import get_cosine_schedule_with_warmup
from tqdm import tqdm

def train_epoch(model,data_loader,loss_fn,optimizer,device,scheduler,n_examples,epoch):
    """Train `model` for one pass over `data_loader`.

    The loss is a weighted sum of the three heads' losses (0.05 / 0.1 / 0.85
    for cat1 / cat2 / cat3). Gradients are clipped to norm 1.0 and the
    scheduler is stepped after every optimizer step. A progress line is
    printed every 200 steps and on the last step.

    Returns:
        A tuple `(accuracy, mean_loss)`: the number of correct cat3
        predictions divided by `n_examples`, and the sample-weighted mean of
        the combined loss.
    """
    batch_time, data_time, losses = AverageMeter(), AverageMeter(), AverageMeter()
    accuracies, f1_accuracies, sent_count = AverageMeter(), AverageMeter(), AverageMeter()

    start = end = time.time()

    model = model.train()
    correct_predictions = 0
    tensor_keys = ("input_ids", "attention_mask", "pixel_values", "cats1", "cats2", "cats3")
    for step, batch in enumerate(data_loader):
        # Time spent waiting for the loader to deliver this batch.
        data_time.update(time.time() - end)
        batch_size = batch["input_ids"].size(0)

        input_ids, attention_mask, pixel_values, cats1, cats2, cats3 = (
            batch[key].to(device) for key in tensor_keys
        )

        logits1, logits2, logits3 = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            pixel_values=pixel_values,
        )
        preds = torch.max(logits3, dim=1)[1]

        loss = (
            loss_fn(logits1, cats1) * 0.05
            + loss_fn(logits2, cats2) * 0.1
            + loss_fn(logits3, cats3) * 0.85
        )

        correct_predictions += (preds == cats3).sum()
        losses.update(loss.item(), batch_size)

        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

        batch_time.update(time.time() - end)
        end = time.time()

        sent_count.update(batch_size)

        # Only report on every 200th step and on the final step.
        if step % 200 != 0 and step != (len(data_loader) - 1):
            continue

        acc, f1_acc = calc_tour_acc(logits3, cats3)
        accuracies.update(acc, batch_size)
        f1_accuracies.update(f1_acc, batch_size)

        print('Epoch: [{0}][{1}/{2}] '
              'Data {data_time.val:.3f} ({data_time.avg:.3f}) '
              'Elapsed {remain:s} '
              'Loss: {loss.val:.3f}({loss.avg:.3f}) '
              'Acc: {acc.val:.3f}({acc.avg:.3f}) '
              'f1_Acc: {f1_acc.val:.3f}({f1_acc.avg:.3f}) '
              'sent/s {sent_s:.0f} '
              .format(
                  epoch, step + 1, len(data_loader),
                  data_time=data_time,
                  loss=losses,
                  acc=accuracies,
                  f1_acc=f1_accuracies,
                  remain=timeSince(start, float(step + 1) / len(data_loader)),
                  sent_s=sent_count.avg / batch_time.avg,
              ))

    return correct_predictions.double() / n_examples, losses.avg

def validate(model,data_loader,loss_fn,optimizer,device,scheduler,n_examples):
    """Evaluate `model` on `data_loader` without updating any state.

    The loss is the same weighted sum used for training (0.05 / 0.1 / 0.85
    for cat1 / cat2 / cat3).

    Args:
        model: module returning three logit tensors.
        data_loader: iterable of batches with the keys `input_ids`,
            `attention_mask`, `pixel_values`, `cats1`, `cats2`, `cats3`.
        loss_fn: criterion applied to each head.
        optimizer, scheduler, n_examples: unused; kept so existing callers
            keep working.
        device: device the batch tensors are moved to.

    Returns:
        A tuple `(weighted_f1, mean_loss)`: the weighted F1 score of the cat3
        predictions over the whole loader, and the sample-weighted mean loss.

    Raises:
        ValueError: if `data_loader` yields no batches.
    """
    model = model.eval()
    batch_losses, batch_sizes = [], []
    logits_chunks, label_chunks = [], []

    with torch.no_grad():
        for d in tqdm(data_loader):
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            pixel_values = d['pixel_values'].to(device)
            cats1 = d["cats1"].to(device)
            cats2 = d["cats2"].to(device)
            cats3 = d["cats3"].to(device)

            outputs, outputs2, outputs3 = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                pixel_values=pixel_values,
            )

            loss1 = loss_fn(outputs, cats1)
            loss2 = loss_fn(outputs2, cats2)
            loss3 = loss_fn(outputs3, cats3)
            loss = loss1 * 0.05 + loss2 * 0.1 + loss3 * 0.85

            batch_losses.append(loss.item())
            batch_sizes.append(cats3.size(0))
            # Collect per-batch results and concatenate once at the end
            # instead of re-copying a growing tensor on every batch.
            logits_chunks.append(outputs3.cpu())
            label_chunks.append(cats3.cpu())

    if not logits_chunks:
        raise ValueError("validate() received a data_loader with no batches")

    _, f1_acc = calc_tour_acc(torch.cat(logits_chunks, 0), torch.cat(label_chunks, 0))
    # Weight each batch by its size so a short final batch does not skew the
    # mean (train_epoch weights its running loss the same way).
    return f1_acc, np.average(batch_losses, weights=batch_sizes)


In [None]:
# Fold 0 is held out for validation; every other fold is used for training.
is_valid_fold = df["kfold"] == 0
train = df[~is_valid_fold].reset_index(drop=True)
valid = df[is_valid_fold].reset_index(drop=True)

# Text tokenizer and image preprocessor matching the two pretrained backbones.
tokenizer = AutoTokenizer.from_pretrained("klue/roberta-small")
feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224')

# max_len 256, batch size 16; only the training loader is shuffled.
train_data_loader = create_data_loader(train, tokenizer, feature_extractor, 256, 16, shuffle_=True)
valid_data_loader = create_data_loader(valid, tokenizer, feature_extractor, 256, 16)

EPOCHS = 2
model = TourClassifier(
    n_classes1=6,
    n_classes2=18,
    n_classes3=128,
    text_model_name="klue/roberta-small",
    image_model_name="google/vit-base-patch16-224",
)
model = model.to(device)
optimizer = optim.AdamW(model.parameters(), lr=3e-5)

# Cosine schedule over every optimizer step, warming up for the first 10%.
total_steps = len(train_data_loader) * EPOCHS
scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(total_steps * 0.1),
    num_training_steps=total_steps,
)
loss_fn = nn.CrossEntropyLoss().to(device)

In [None]:
import matplotlib.pyplot as plt

# Draw an empty 3x2 grid of axes under a shared figure title.
fig, axes = plt.subplots(nrows=3, ncols=2)
fig.suptitle('Multiple plots')
fig.tight_layout(pad=2)
plt.show()

# Dataset & DataLoader 확인

In [None]:
from pprint import pprint
# Pull a single batch from the training loader and print each tensor's shape.
train_features = next(iter(train_data_loader))
for key in train_features:
    value = train_features[key]
    print(key, value.shape)

In [None]:
# Train for EPOCHS epochs, validating after each one and checkpointing the
# weights whenever the validation score (the weighted F1 returned by
# validate) improves on the best seen so far.
max_acc = 0
rule = '-' * 10
for epoch in range(EPOCHS):
    print(rule)
    print(f'Epoch {epoch}/{EPOCHS-1}')
    print(rule)

    train_acc, train_loss = train_epoch(
        model=model,
        data_loader=train_data_loader,
        loss_fn=loss_fn,
        optimizer=optimizer,
        device=device,
        scheduler=scheduler,
        n_examples=len(train),
        epoch=epoch,
    )
    validate_acc, validate_loss = validate(
        model=model,
        data_loader=valid_data_loader,
        loss_fn=loss_fn,
        optimizer=optimizer,
        device=device,
        scheduler=scheduler,
        n_examples=len(valid),
    )

    if validate_acc > max_acc:
        max_acc = validate_acc
        torch.save(model.state_dict(), 'tourbaseline_fold0.pt')

    print(f'Train loss {train_loss} accuracy {train_acc}')
    print(f'Validate loss {validate_loss} accuracy {validate_acc}')
    for _blank in range(2):
        print("")

# Inference