In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install seqeval

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16162 sha256=c6943b180164c39e64926985628b1ef48984822d48b75f55c94898594c4a4c6c
  Stored in directory: /root/.cache/pip/wheels/1a/67/4a/ad4082dd7dfc30f2abfe4d80a2ed5926a506eb8a972b4767fa
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [3]:
import torch
from torch.utils.data import Dataset, DataLoader, random_split, Subset
from transformers import BertTokenizer, BertForTokenClassification, BertConfig,BertForMaskedLM,BertConfig
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold,train_test_split
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from seqeval.metrics import classification_report
from sklearn.metrics import confusion_matrix, classification_report, f1_score, roc_curve, auc
import seaborn as sns
from sklearn.preprocessing import label_binarize
from itertools import cycle
import os
from functools import partial
from io import StringIO
import ast

In [4]:
if torch.cuda.is_available():
    device = torch.device('cuda')
    print ('Available devices :', torch.cuda.device_count())
    print ('Current cuda device :', torch.cuda.current_device())
    print(torch.cuda.get_device_name(device))
else:
    print("CUDA is not available. Model is running on CPU.")
    device = torch.device('cpu')

Available devices : 1
Current cuda device : 0
Tesla T4


In [5]:
curr_path = os.getcwd()
join_with_curr = partial(os.path.join, curr_path)

In [6]:
def path_check(dst_path ,base_name):
    check_path = os.path.join(dst_path,base_name)
    if not os.path.exists(check_path):
        os.mkdir(check_path)
        print(f"Create path : {check_path}")
        return check_path
    else:
        print(f"Path already exist : {check_path}")
        return check_path

In [7]:
class ArtDatasetForTokenClassification(Dataset):
    def __init__(self, dataframe, tokenizer, max_length=512):
        self.descriptions = dataframe['description'].values
        self.labels = [ast.literal_eval(StringIO(word).getvalue()) for word in dataframe['word_labels_tuples'].values]
        #labels 임시 출력
        print(self.labels)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __getitem__(self, idx):
        description = self.descriptions[idx]
        label = self.labels[idx]

        inputs = self.tokenizer.encode_plus(
            description,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors="pt"
        )

        # 레이블에 대한 패딩, 트렁크 처리
        label = label + ['O'] * (self.max_length - len(label))  # 레이블을 max_length에 맞게 패딩해주기
        label = label[:self.max_length]

        # 레이블을 텐서로 변환
        label_tensor = torch.tensor([label_encoder[l] for l in label], dtype=torch.long)  # label_encoder는 레이블을 숫자로 매핑하는 딕셔너리

        return inputs['input_ids'].squeeze(), inputs['attention_mask'].squeeze(), label_tensor

    def __len__(self):
        return len(self.descriptions)

In [11]:
# 예시 데이터 text = "The image depicts a chaotic and intense scene..."
# IOB 태그 할당 예시 labels = ["O", "O", "O", "B-theme", ...]
path_check_curr = partial(path_check, curr_path)
path_check_curr('IOB')
csv_file = join_with_curr('IOB','realism_SF_IOB.csv')
# 데이터 파일 로드
df = pd.read_csv('/content/IOB/realism_SF_IOB.csv', encoding='ISO-8859-1')

# 예시 데이터 형식: 'sentence', 'word_labels'
df.head()

Path already exist : /content/IOB


Unnamed: 0,NO,img,description,genre,Natural and Rural Scenes,Social and Cultural Dynamics,Artistic Elements and Objects,Conceptual and Emotional Expressions,Historical and Traditional Imagery,iob_tag_tuples,word_labels_tuples
0,1,A Man Reading in a Garden (recto); Preliminary...,The image depicts an artwork showcasing a tran...,realism,"outdoor, nature, natural","life, scene",Unknown,feeling,old,"[('the', 'O'), ('image', 'O'), ('depicts', 'O'...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-Na..."
1,2,A Man Reading in a Garden (recto); Preliminary...,This is an image of a person sitting peacefull...,realism,"sea, forest, nature, natural","scene, figure, individual","style, technique","thought, feeling","era, past, old, classic","[('this', 'O'), ('is', 'O'), ('an', 'O'), ('im...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
2,3,A Norman Milkmaid at Grì±ville.jpg,This is an image of a painting depicting a sol...,realism,"sky, nature, landscape","scene, figure",expression,"thought, emotion, feeling",era,"[('this', 'O'), ('is', 'O'), ('an', 'O'), ('im...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."
3,4,A Seated Shepherd with His Dog and Sheep - Jea...,This is a monochrome drawing that captures a s...,realism,"outdoor, sky, landscape, natural, sea","life, scene, event, figure",Unknown,"emotion, feeling","era, classic","[('this', 'O'), ('is', 'O'), ('a', 'O'), ('mon...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-So..."
4,5,A Shepherdess with Her Flock - Jean-Franì±ois...,This image portrays a pastoral scene sketched ...,realism,"forest, sky, nature, landscape","life, scene, figure",tradition,"idea, feeling","traditional, era, old, past","[('this', 'O'), ('image', 'O'), ('portrays', '...","['O', 'O', 'O', 'O', 'O', 'B-Social_and_Cultur..."


In [12]:
# 전체 데이터를 훈련 및 테스트 세트로 8:2로 분리
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# 설정
k_folds = 5
kf = KFold(n_splits=k_folds, shuffle=True, random_state=42)

# Cross-Validation을 위한 데이터 준비
descriptions = train_df['description'].tolist()
labels = train_df['word_labels_tuples'].tolist()

In [16]:
# 사전학습된 모델과 토크나이저 로드
# model_file = join_with_curr('models','best_model.pth')
tokenizer_path = join_with_curr('drive', 'MyDrive', 'tokenizer')
tokenizer = BertTokenizer.from_pretrained(tokenizer_path)
num_labels = 6
max_length = 512

In [17]:
# 라벨 인코더 만들기: 레이블을 숫자로 매핑하는 딕셔너리 생성 ## 추가한 부분!
unique_labels = set()  # 레이블 저장 집합 만들어 놓기

for labels in train_df['word_labels_tuples']:  # 모든 레이블 돌아가면서 순회
    unique_labels.update(ast.literal_eval(labels))

label_encoder = {label: idx for idx, label in enumerate(unique_labels)}

In [18]:
config = BertConfig.from_pretrained('oceanstar/pretrained_for_art_domain',token = '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', num_labels=num_labels)

model = BertForTokenClassification(config)
model.cuda()

config.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, el

In [19]:
epochs = 10

In [20]:
# 모든 레이블을 확인하는 코드 (확인용이니 굳이 실행 안하셔도 됩니다.)
all_labels = []
for labels in train_df['word_labels_tuples']:
    all_labels.extend(ast.literal_eval(labels))

label_set = set(all_labels)
print("Unique labels:", label_set)
print("Are all labels within range 0 to num_labels-1?", all(label in range(num_labels) for label in label_set))

Unique labels: {'B-Conceptual_and_Emotional_Expressions', 'B-Artistic_Elements_and_Objects', 'B-Social_and_Cultural_Dynamics', 'B-Historical_and_Traditional_Imagery', 'B-Natural_and_Rural_Scenes', 'O'}
Are all labels within range 0 to num_labels-1? False


In [21]:
batch_size = 16

# loss 기록을 위한 리스트
fold_perf = {}

for fold, (train_ids, val_ids) in enumerate(kf.split(descriptions)):
    print(f"FOLD {fold}")
    print("Train IDs range:", min(train_ids), "to", max(train_ids))
    print("Validation IDs range:", min(val_ids), "to", max(val_ids))
    print("--------------------------------")

    # train_df에서 서브셋을 추출하고 인덱스를 재조정합니다.
    train_subset_df = train_df.iloc[train_ids].reset_index(drop=True)
    val_subset_df = train_df.iloc[val_ids].reset_index(drop=True)

    # Subset을 사용하여 학습과 검증 데이터셋을 만듭니다.
    train_subset = Subset(ArtDatasetForTokenClassification(train_subset_df, tokenizer, max_length), range(len(train_ids)))
    val_subset = Subset(ArtDatasetForTokenClassification(val_subset_df, tokenizer, max_length), range(len(val_ids)))

    # DataLoader를 정의합니다.
    train_loader = DataLoader(train_subset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_subset, batch_size=batch_size)

    # 모델 및 최적화 알고리즘 초기화
    model = BertForTokenClassification(config)
    model.cuda()
    optimizer = torch.optim.Adam(params=model.parameters(), lr=3e-5)

    # 각 fold의 학습 진행
    for epoch in range(epochs):
        model.train()
        total_loss = 0

        for step, batch in enumerate(train_loader):
            b_input_ids, b_input_mask, b_labels = batch
            b_input_ids = b_input_ids.to(device)
            b_input_mask = b_input_mask.to(device)
            b_labels = b_labels.to(device)

            # Forward pass
            optimizer.zero_grad()
            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
            loss = outputs[0]
            total_loss += loss.item()

            # Backward pass
            loss.backward()
            optimizer.step()

        # 평균 손실 계산
        avg_train_loss = total_loss / len(train_loader)
        print(f"Average training loss: {avg_train_loss}")

        # 검증 데이터에 대한 평가
        model.eval()
        eval_loss = 0

        for batch in val_loader:
            b_input_ids, b_input_mask, b_labels = batch
            b_input_ids = b_input_ids.to(device)
            b_input_mask = b_input_mask.to(device)
            b_labels = b_labels.to(device)

            with torch.no_grad():
                outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)

            eval_loss += outputs[0].item()

        # 평균 손실 계산
        avg_val_loss = eval_loss / len(val_loader)
        print(f"Validation loss: {avg_val_loss}")
        fold_perf[fold] = {'train_loss': avg_train_loss, 'val_loss': avg_val_loss}

print(fold_perf)


Output hidden; open in https://colab.research.google.com to view.

### !밑에는 중간 중간 테스트 하기 위해 사용했던 코드라 무시하시면 됩니다.

In [None]:
train_subset_df = train_df.iloc[train_ids].reset_index(drop=True)
val_subset_df = train_df.iloc[val_ids].reset_index(drop=True)

print("Train subset size:", len(train_subset_df))
print("Validation subset size:", len(val_subset_df))

Train subset size: 320
Validation subset size: 80


In [None]:
# 데이터셋 크기 확인
print("Size of train_df:", len(train_df))

Size of train_df: 400


In [None]:
batch_size = 16
train_loader = DataLoader(ArtDatasetForTokenClassification(train_subset_df, tokenizer, max_length), batch_size=batch_size)
for i, batch in enumerate(train_loader):
    print(f"Batch {i} size:", len(batch[0])
    if i == 2:  # 처음 3개의 배치만 확인
        break

[['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-Conceptual_and_Philosophical_Ideas', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-Artistic_and_Cultural_Expressions', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-Conceptual_and_Philosophical_Ideas', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-Natural_and_Environme

In [None]:
# 데이터셋의 첫 번째 및 마지막 아이템 가져와보자
dataset = ArtDatasetForTokenClassification(train_subset_df, tokenizer, max_length=512)
first_item = dataset[0]
last_item = dataset[len(dataset) - 1]

print("First item:", first_item)
print("Last item:", last_item)

[['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-Conceptual_and_Philosophical_Ideas', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-Artistic_and_Cultural_Expressions', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-Conceptual_and_Philosophical_Ideas', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-Natural_and_Environme