# 데이터 형식 변환

In [None]:
# 드라이브 마운트
'''from google.colab import drive
drive.mount('/content/drive')'''

In [None]:
import json
import random
import pandas as pd
import os

# Convert the Visual7W "telling" annotation JSON into a flat 4-choice CSV.
#
# Every QA pair becomes one row: ID, img_path, Question, A, B, C, D, answer,
# where `answer` is the letter of the column that holds the correct choice.


def _collect_qa_pairs(data):
    """Return the list of QA dicts contained in *data*.

    Uses ``data["qa_pairs"]`` when that key exists. Otherwise gathers the
    ``"qa_pairs"`` lists nested inside the dict elements of every list-valued
    top-level entry. Anything that is not a dict yields an empty list.
    """
    if not isinstance(data, dict):
        return []
    if "qa_pairs" in data:
        return data["qa_pairs"]

    collected = []
    for value in data.values():
        if not isinstance(value, list):
            continue
        for item in value:
            # Only dicts can carry a "qa_pairs" key; on a str the `in` test
            # would be a substring match and on a number it would raise.
            if isinstance(item, dict) and "qa_pairs" in item:
                collected.extend(item["qa_pairs"])
    return collected


def _build_row(idx, q, rng):
    """Build the CSV row for QA dict *q*, or return None if it is not 4-way.

    The correct answer is added to the distractors when missing, the options
    are shuffled with *rng*, and the answer letter is derived from the
    shuffled position. *idx* is the 1-based position used for the row ID.
    """
    choices = list(q["multiple_choices"])
    if q["answer"] not in choices:
        choices.append(q["answer"])

    # The output format has exactly the columns A-D. Fewer options used to
    # crash on choices[3]; more options silently dropped one of them and could
    # leave the answer outside A-D.
    if len(choices) != 4:
        return None

    rng.shuffle(choices)
    answer_letter = chr(ord("A") + choices.index(q["answer"]))

    image_id = q["image_id"]
    return {
        "ID": f"TRAIN_{idx:05d}",                    # e.g. TRAIN_00001
        "img_path": f"./images/v7w_{image_id}.jpg",  # relative image path
        "Question": q["question"],
        **dict(zip("ABCD", choices)),
        "answer": answer_letter,
    }


# Input location, relative to the working directory.
data_dir = "./data"
json_path = os.path.join(data_dir, "dataset_v7w_telling.json")

# Load the annotation file.
with open(json_path, "r", encoding="utf-8") as f:
    data = json.load(f)

qa_pairs = _collect_qa_pairs(data)
print("총 QA pairs:", len(qa_pairs))

# A private, seeded generator keeps the A-D layout (and therefore the answer
# letters) identical across runs without touching the global `random` state.
rng = random.Random(42)

rows = []
skipped = 0
for idx, q in enumerate(qa_pairs, start=1):
    row = _build_row(idx, q, rng)
    if row is None:
        skipped += 1
        continue
    rows.append(row)

if skipped:
    print(f"⚠️ 보기가 4개가 아닌 QA pair {skipped}개를 건너뜀")

df = pd.DataFrame(rows)

# Show a sample of the result.
print(df.head())

# Save next to the input data: the training section of this notebook reads
# './data/visual7w_formatted.csv'.
output_path = os.path.join(data_dir, "visual7w_formatted.csv")
df.to_csv(output_path, index=False)
print(f"✅ CSV 저장 완료: {output_path}")

# BEiT-3

## 라이브러리 import/설치

In [None]:
!pip install -U requests==2.31.0

Collecting requests==2.31.0
  Downloading requests-2.31.0-py3-none-any.whl.metadata (4.6 kB)
Downloading requests-2.31.0-py3-none-any.whl (62 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/62.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.6/62.6 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: requests
  Attempting uninstall: requests
    Found existing installation: requests 2.32.3
    Uninstalling requests-2.32.3:
      Successfully uninstalled requests-2.32.3
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-colab 1.0.0 requires requests==2.32.3, but you have requests 2.31.0 which is incompatible.[0m[31m
[0mSuccessfully installed requests-2.31.0


In [None]:
# git 설치
!pip install git+https://github.com/microsoft/torchscale.git

Collecting git+https://github.com/microsoft/torchscale.git
  Cloning https://github.com/microsoft/torchscale.git to /tmp/pip-req-build-yhrwvh44
  Running command git clone --filter=blob:none --quiet https://github.com/microsoft/torchscale.git /tmp/pip-req-build-yhrwvh44
  Resolved https://github.com/microsoft/torchscale.git to commit 4d1e0e82e5adf86dd424f1463192635b73fc8efc
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting fairscale==0.4.0 (from torchscale==0.2.0)
  Downloading fairscale-0.4.0.tar.gz (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3/190.3 kB[0m [31m60.4 kB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting timm==0.6.13 (from torchscale==0.2.0)
  Downloading timm-0.6.13-py3-none-any.whl.metadata (38 kB)
Collectin

In [None]:
!pip install timm torchmetrics opencv-python

Collecting torchmetrics
  Downloading torchmetrics-1.8.0-py3-none-any.whl.metadata (21 kB)
Collecting lightning-utilities>=0.8.0 (from torchmetrics)
  Downloading lightning_utilities-0.14.3-py3-none-any.whl.metadata (5.6 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch->timm)
  Using cached nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch->timm)
  Using cached nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch->timm)
  Using cached nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch->timm)
  Using cached nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch->timm)
  Using cached nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.w

In [None]:
!nvidia-smi

Wed Jul 23 23:39:37 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off |   00000000:00:04.0 Off |                    0 |
| N/A   30C    P0             44W /  400W |       0MiB /  40960MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
                                                

In [None]:
# 데이터 unzip

!unzip -qq './data/visual7w_images.zip' -d '/content/'
!unzip -qq './data/open.zip' -d '/content/'

In [None]:
# git clone은 최초 한번만 필요

#!git clone https://github.com/hoon-bari/DACON_VQA.git

# github_folder가 없다면 먼저 생성

#!mkdir -p ./github_folder/DACON_VQA

# beit3 폴더를 드라이브로 복사 (최초 1회 실행)

#!cp -r ./DACON_VQA/BEiT-3 ./github_folder/DACON_VQA

In [None]:
'''
git을 clone한 뒤 코드를 수정했기 때문에, git을 다시 clone하지 마시고
링크로 보내드린 github_folder를 직접 로드해서 사용해주세요
'''

%cd ./github_folder/DACON_VQA/BEiT-3

/content/drive/MyDrive/멀티모달/github_folder/DACON_VQA/BEiT-3


In [None]:
!pip install -r requirements.txt

# tensorboardX는 선택사항

!pip install -U tensorboardX

In [None]:
import numpy as np
import pandas as pd
import operator
import os
import string
import re
import random
import sys
import platform
import json
import shutil

import torch
import torch.optim as optim
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from sklearn.model_selection import train_test_split

from PIL import Image

from transformers import XLMRobertaTokenizer

from tqdm.auto import tqdm

In [None]:
# Pick the compute device and report the runtime environment.

_cuda_ok = torch.cuda.is_available()
device = "cuda" if _cuda_ok else "cpu"

# Same report as before, assembled first and emitted in one call.
_report = [
    f"Python Platform: {platform.platform()}",
    f"PyTorch Version: {torch.__version__}",
    "",
    f"Python {sys.version}",
    f"Pandas {pd.__version__}",
    "GPU is " + ("available" if _cuda_ok else "NOT AVAILABLE"),
    f"Target device is {device}",
]
print("\n".join(_report))

Python Platform: Linux-6.1.123+-x86_64-with-glibc2.35
PyTorch Version: 2.6.0+cu124

Python 3.11.13 (main, Jun  4 2025, 08:57:29) [GCC 11.4.0]
Pandas 2.2.2
GPU is available
Target device is cuda


## CFG, data

In [None]:
# run_beit3_finetuning.py 파일 실행할 때 seed가 있긴 하지만, seed 고정

def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices),
    exports ``PYTHONHASHSEED`` and puts cuDNN into deterministic mode.

    Args:
        seed: Integer seed shared by all generators.
    """
    random.seed(seed)
    # Hash randomization is fixed at interpreter start-up, so this only takes
    # effect in Python processes launched after this point.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seeds every visible GPU, not only the current one; ignored without CUDA.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN auto-tuner may choose different kernels from run to run, so it
    # has to be off for reproducible results.
    torch.backends.cudnn.benchmark = False


seed_everything(42)  # fix the seed

In [None]:
# Image folders for the training and test sets.

train_img_path = './images'
test_img_path = './test_input_images'

# Load the training table, the test table and the submission template.

train_df = pd.read_csv('./data/visual7w_formatted.csv')
test_df = pd.read_csv('./test.csv')
sample_submission = pd.read_csv('./sample_submission.csv')

In [None]:
# Hold out 15% of the training rows as a validation set (fixed random_state).

train_df, val_df = train_test_split(
    train_df,
    test_size=0.15,
    random_state=42,
)

# Create the folder that will hold this session's JSON files.

json_dir = './json_file'
os.makedirs(json_dir, exist_ok=True)

## github_folder 수정 (실행 X)

In [None]:
# datasets.py에서 answer = i['answer'] -> answer = normalize_word(i['answer']) 로 교체
'''file_path = "./github_folder/DACON_VQA/BEiT-3/datasets.py"

# 파일 읽기
with open(file_path, "r") as f:
    lines = f.readlines()

# 내용 바꾸기
new_lines = []
for line in lines:
    if "answer = i['answer']" in line:
        # 작은따옴표에 맞춰서 교체
        new_lines.append("                    answer = normalize_word(i['answer'])\n")
    else:
        new_lines.append(line)

# 파일 덮어쓰기
with open(file_path, "w") as f:
    f.writelines(new_lines)'''

In [None]:
# 4지선다이니까, class를 4로 고정
# modeling_finetune.py를 수정
'''# 원하는 클래스 수
num_classes = 4

file_path = "./github_folder/DACON_VQA/BEiT-3/modeling_finetune.py"

with open(file_path, "r") as f:
    lines = f.readlines()

new_lines = []
for line in lines:
    if "nn.Linear(embed_dim, num_classes)" in line.replace(" ", ""):
        print("✅ 기존 라인:", line.strip())
        line = line.replace("num_classes", str(num_classes))
        print("🔄 수정된 라인:", line.strip())
    if "nn.Linear(embed_dim*2,num_classes)" in line.replace(" ", ""):
        print("✅ 기존 라인:", line.strip())
        line = line.replace("num_classes", str(num_classes))
        print("🔄 수정된 라인:", line.strip())
    new_lines.append(line)

with open(file_path, "w") as f:
    f.writelines(new_lines)

print("✅ 두 개의 Linear 레이어 출력 차원을", num_classes, "로 수정 완료.")

file_path = "./github_folder/DACON_VQA/BEiT-3/modeling_finetune.py"

with open(file_path, "r") as f:
    lines = f.readlines()

new_lines = []
for line in lines:
    # 117번 라인을 정확히 찾아서 바꾼다
    if "self.head = nn.Linear" in line and "num_classes" in line:
        print("✅ 기존 라인:", line.strip())
        line = "        self.head = nn.Linear(embed_dim, 4)\n"
        print("🔄 수정된 라인:", line.strip())
    new_lines.append(line)

with open(file_path, "w") as f:
    f.writelines(new_lines)

print("✅ 117번 라인을 고정 완료.")'''


In [None]:
'''file_path = "./github_folder/DACON_VQA/BEiT-3/utils.py"

# 파일 읽기
with open(file_path, "r", encoding="utf-8") as f:
    lines = f.readlines()

# 수정하기
new_lines = []
for line in lines:
    if "torch.load(ckpt_path, map_location='cpu')" in line:
        new_line = "    checkpoint = torch.load(ckpt_path, map_location='cpu', weights_only=False)\n"
        new_lines.append(new_line)
    else:
        new_lines.append(line)

# 덮어쓰기
with open(file_path, "w", encoding="utf-8") as f:
    f.writelines(new_lines)

# 변경 확인
print("✅ 수정 완료! 아래 라인이 잘 바뀌었는지 확인하세요:")
for i, line in enumerate(new_lines, 1):
    if "torch.load" in line:
        print(f"{i}: {line.strip()}")'''


In [None]:
'''file_path = "./github_folder/DACON_VQA/BEiT-3/utils.py"

with open(file_path, "r", encoding="utf-8") as f:
    lines = f.readlines()

new_lines = []
for i, line in enumerate(lines):
    # 잘못된 들여쓰기 라인 고치기
    if "torch.load(ckpt_path, map_location='cpu', weights_only=False)" in line:
        new_line = "        checkpoint = torch.load(ckpt_path, map_location='cpu', weights_only=False)\n"
        new_lines.append(new_line)
    else:
        new_lines.append(line)

with open(file_path, "w", encoding="utf-8") as f:
    f.writelines(new_lines)

print("✅ 들여쓰기 수정 완료!")'''

In [None]:
'''file_path = "./github_folder/DACON_VQA/BEiT-3/utils.py"

with open(file_path, "r", encoding="utf-8") as f:
    lines = f.readlines()

new_lines = []
for line in lines:
    if 'checkpoint = torch.load(args.resume' in line and 'weights_only' not in line:
        # weights_only=False를 추가
        fixed = line.replace("torch.load(args.resume", "torch.load(args.resume, weights_only=False")
        new_lines.append(fixed)
    else:
        new_lines.append(line)

with open(file_path, "w", encoding="utf-8") as f:
    f.writelines(new_lines)

print("✅ `weights_only=False` 수정 완료!")'''


In [None]:
'''file_path = "./github_folder/DACON_VQA/BEiT-3/utils.py"

with open(file_path, "r", encoding="utf-8") as f:
    lines = f.readlines()

new_lines = []
for line in lines:
    if "optimizer.load_state_dict" in line:
        # Optimizer 로드를 주석 처리
        new_lines.append("# " + line)
    else:
        new_lines.append(line)

with open(file_path, "w", encoding="utf-8") as f:
    f.writelines(new_lines)

print("✅ optimizer.load_state_dict 주석 처리 완료")'''

'file_path = "/content/drive/MyDrive/멀티모달/github_folder/DACON_VQA/BEiT-3/utils.py"\n\nwith open(file_path, "r", encoding="utf-8") as f:\n    lines = f.readlines()\n\nnew_lines = []\nfor line in lines:\n    if "optimizer.load_state_dict" in line:\n        # Optimizer 로드를 주석 처리\n        new_lines.append("# " + line)\n    else:\n        new_lines.append(line)\n\nwith open(file_path, "w", encoding="utf-8") as f:\n    f.writelines(new_lines)\n\nprint("✅ optimizer.load_state_dict 주석 처리 완료")'

## model

In [None]:
# BEIT-3 model을 fine-tuning 할때, json파일이 필요
# 데이터셋이 바뀔 때, 바뀌어야 되는 부분

def make_mcq_json(df, out_path, is_train=False):
    """Write *df* as the multiple-choice JSON list used for BEiT-3 fine-tuning.

    Each row becomes one entry with ``image_id`` (image file name without its
    extension), ``question`` (the question followed by the four lettered
    choices) and ``question_id`` (the first digit run of ``ID`` wrapped in
    ``"1"`` on both sides). This is the part to adapt when the dataset changes.

    Args:
        df: DataFrame with the columns ``ID``, ``img_path``, ``Question``,
            ``A``, ``B``, ``C``, ``D`` and, when *is_train* is true,
            ``answer``.
        out_path: Destination JSON file. Missing parent directories are
            created; a bare file name is written to the working directory.
        is_train: When true, copy each row's ``answer`` into its entry.

    Raises:
        IndexError: If an ``ID`` value contains no digits.
    """
    digits = re.compile(r'\d+')  # compiled once instead of once per row

    mcq_data = []
    for r in df.to_dict("records"):
        # ID -> question_id
        num_part = digits.findall(r['ID'])[0]
        qid = f"1{num_part}1"

        # Keep only the file name of img_path and drop its extension,
        # e.g. "TEST_000.jpg" -> "TEST_000".
        name, _ext = os.path.splitext(os.path.basename(r['img_path']))

        # Question and choices merged into a single prompt string.
        mcq = (
            r['Question']
            + " Choices:"
            + f" A. {r['A']}"
            + f" B. {r['B']}"
            + f" C. {r['C']}"
            + f" D. {r['D']}"
        )

        entry = {
            "image_id":    name,
            "question":    mcq,
            "question_id": qid,
        }
        if is_train:
            entry["answer"] = r["answer"]
        mcq_data.append(entry)

    # dirname() is "" for a bare file name, and os.makedirs("") raises, so
    # only create the parent when there is one.
    parent = os.path.dirname(out_path)
    if parent:
        os.makedirs(parent, exist_ok=True)
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(mcq_data, f, ensure_ascii=False, indent=2)


# Write one JSON file per split into the session's json_file folder.
# Answers are included for train and val only.

for _split_df, _json_path, _with_answer in (
    (train_df, "./json_file/train.json", True),
    (val_df, "./json_file/val.json", True),
    (test_df, "./json_file/test.json", False),
):
    make_mcq_json(_split_df, _json_path, is_train=_with_answer)

In [None]:
# Build the index files in the format the BEiT-3 training code expects.

import importlib.util
from transformers import XLMRobertaTokenizer

# Load the repository's datasets.py directly from its file path.
# NOTE(review): presumably done so it cannot be confused with an installed
# package named `datasets` - confirm.
spec = importlib.util.spec_from_file_location(
    name="datasets",
    location="./github_folder/DACON_VQA/BEiT-3/datasets.py",
)
datasets = importlib.util.module_from_spec(spec)
spec.loader.exec_module(datasets)
CustomDataset = datasets.CustomDataset

# SentencePiece-based tokenizer loaded from the beit3.spm model file.
tokenizer = XLMRobertaTokenizer("./github_folder/model_file/beit3.spm")

# Index the JSON files under ./json_file/, using "." as the data path.
CustomDataset.make_dataset_index(
    data_path=".", tokenizer=tokenizer, json_data_path="./json_file/",
)

Write /content/vqa.train.jsonl with 118887 items !
Write /content/vqa.val.jsonl with 20981 items !
Write /content/vqa.test.jsonl with 852 items !
Contains 14539 image and 20981 pairs for val set!
Write /content/vqa.trainable_val.jsonl with 19567 items !
Write /content/vqa.rest_val.jsonl with 1414 items !


In [None]:
!mkdir -p ./train
!cp ./images/*.jpg ./train/

In [None]:
# 경로 설정
model_dir = "."
model_file = f"{model_dir}/finetuned_model_file"

# 최초 pretrained checkpoint 경로 (처음부터 학습이면 이 경로 사용)
model_ckpt = f"{model_dir}/github_folder/model_file/beit3_large_indomain_patch16_480_vqa.pth"

# 학습 실행
# 총 5 epoch 학습
# colab 런타임 시간 제한 때문에, resume 사용
# checkpoint-0,1,2,3 이런식으로 한 epoch씩 끊어서 학습
!python ./github_folder/DACON_VQA/BEiT-3/run_beit3_finetuning.py \
    --model beit3_large_patch16_480_vqacustom \
    --input_size 480 \
    --task vqacustom \
    --batch_size 64 \
    --layer_decay 1.0 \
    --lr 2e-5 \
    --update_freq 1 \
    --randaug \
    --epochs 5 \
    --resume ./finetuned_model_file/checkpoint-3.pth \
    --warmup_epochs 0 \
    --drop_path 0.15 \
    --sentencepiece_model ./github_folder/model_file/beit3.spm \
    --finetune {model_ckpt} \
    --data_path . \
    --output_dir {model_file}/ \
    --log_dir {model_file}/log \
    --weight_decay 0.01 \
    --num_max_bpe_tokens 128 \
    --nb_classes 4 \
    --seed 42 \
    --save_ckpt_freq 1 \
    --task_head_lr_weight 20 \
    --opt_betas 0.9 0.98 \
    --checkpoint_activations


Not using distributed mode
Namespace(model='beit3_large_patch16_480_vqacustom', task='vqacustom', input_size=480, drop_path=0.15, checkpoint_activations=True, sentencepiece_model='/content/drive/MyDrive/멀티모달/github_folder/model_file/beit3.spm', vocab_size=64010, num_max_bpe_tokens=128, model_ema=False, model_ema_decay=0.9999, model_ema_force_cpu=False, opt='adamw', opt_eps=1e-08, opt_betas=[0.9, 0.98], clip_grad=None, momentum=0.9, weight_decay=0.01, lr=2e-05, layer_decay=1.0, task_head_lr_weight=20.0, warmup_lr=1e-06, min_lr=1e-06, warmup_epochs=0, warmup_steps=-1, batch_size=64, eval_batch_size=None, epochs=5, update_freq=1, save_ckpt_freq=1, randaug=True, train_interpolation='bicubic', finetune='/content/drive/MyDrive/멀티모달/github_folder/model_file/beit3_large_indomain_patch16_480_vqa.pth', model_key='model|module', model_prefix='', data_path='/content/', output_dir='/content/drive/MyDrive/멀티모달/finetuned_model_file/', log_dir='/content/drive/MyDrive/멀티모달/finetuned_model_file/log', de