In [None]:
# Work from /content so the archive is extracted there.
# NOTE(review): assumes Google Drive is already mounted at /content/drive — the mount step is not shown here.
%cd /content

# -qq: extract quietly. Later cells glob /content/train and /content/test,
# so the archive presumably unpacks into those two folders — confirm.
!unzip -qq "/content/drive/MyDrive/ATL/data/image.zip"

/content


In [None]:
from glob import glob

# Gather the extracted JPEG paths of each split; glob() already returns a list.
train_file = glob('/content/train/*.jpg')
test_file = glob('/content/test/*.jpg')

# Sanity check: number of training images followed by number of test images.
n_train, n_test = len(train_file), len(test_file)
print(n_train, n_test)

107231 11915


In [None]:
# Fetch the helper repository; later cells install its requirements.txt and
# import `utils` and `arguments` from it after changing into /content/dacon_vqa.
! git clone https://github.com/ddobokki/dacon_vqa.git

Cloning into 'dacon_vqa'...
remote: Enumerating objects: 22, done.[K
remote: Counting objects: 100% (22/22), done.[K
remote: Compressing objects: 100% (17/17), done.[K
remote: Total 22 (delta 1), reused 19 (delta 0), pack-reused 0[K
Receiving objects: 100% (22/22), 6.10 KiB | 6.10 MiB/s, done.
Resolving deltas: 100% (1/1), done.


In [None]:
# Install the cloned repo's dependencies. `%pip` (instead of `!pip`) guarantees the
# packages land in the environment of the running kernel rather than whatever
# `pip` the subshell resolves to.
%pip install -r /content/dacon_vqa/requirements.txt

In [None]:
# Pin Pillow after the requirements install, using `%pip` so the kernel's own
# environment is the one modified.
# NOTE(review): if PIL was already imported in this session the runtime probably
# needs a restart for the pinned version to take effect — confirm.
%pip install Pillow==7.2.0

In [None]:
import pandas as pd

# Drive folder that holds the competition CSV files.
data_path = '/content/drive/MyDrive/ATL/data/'

train_df = pd.read_csv(data_path + 'train.csv')
test_df = pd.read_csv(data_path + 'test.csv')

# Derive the on-disk location of every image from its image_id, one path per row.
train_df["img_path"] = [f"/content/train/{x}.jpg" for x in train_df["image_id"]]
test_df["img_path"] = [f"/content/test/{x}.jpg" for x in test_df["image_id"]]

In [None]:
import os

# Persist the path-augmented frames inside the cloned repo; the training and
# inference cells below read them back from these exact locations.
# git does not track empty directories, so a fresh clone may not contain data/;
# create it first because DataFrame.to_csv raises OSError for a missing directory.
os.makedirs("/content/dacon_vqa/data", exist_ok=True)
train_df.to_csv("/content/dacon_vqa/data/train_df.csv", index=False)
test_df.to_csv("/content/dacon_vqa/data/test_df.csv", index=False)

In [None]:
import os

# Display the current working directory before switching into the cloned repo.
os.getcwd()

'/content'

In [None]:
# Make the cloned repo the working directory; the next cell imports `utils` and
# `arguments`, which presumably live at the repo root — confirm.
new_directory = '/content/dacon_vqa'
os.chdir(new_directory)

In [None]:
import logging

from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoProcessor,
    Trainer,
    set_seed,
)
from transformers.trainer_utils import is_main_process

from utils import DataCollatorForGit, get_dataset

logger = logging.getLogger(__name__)

from arguments import DatasetsArguments, ModelArguments, MyTrainingArguments

def main(model_args: ModelArguments, data_args: DatasetsArguments, training_args: MyTrainingArguments):
    """Fine-tune a pretrained causal-LM checkpoint on the CSV dataset and save it.

    Args:
        model_args: supplies ``model_name_or_path``, the checkpoint used for the
            config, processor and model.
        data_args: supplies ``train_data_path``, the CSV handed to ``get_dataset``.
        training_args: forwarded to ``Trainer``; its ``seed``, ``local_rank`` and
            ``output_dir`` are also read directly here.

    Returns:
        None. On the main process the model, config and processor are written to
        ``training_args.output_dir`` once training finishes.
    """
    set_seed(training_args.seed)

    # Hold out 10% of the rows for validation, split with the same seed as training.
    splits = get_dataset(csv_path=data_args.train_data_path).train_test_split(
        test_size=0.1, seed=training_args.seed
    )

    checkpoint = model_args.model_name_or_path
    config = AutoConfig.from_pretrained(checkpoint)
    processor = AutoProcessor.from_pretrained(checkpoint)
    model = AutoModelForCausalLM.from_pretrained(checkpoint)

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=DataCollatorForGit(processor=processor),
        train_dataset=splits["train"],
        eval_dataset=splits["test"],
    )
    trainer.train()

    # Only the main process writes the final artifacts.
    if not is_main_process(training_args.local_rank):
        return
    for artifact in (model, config, processor):
        artifact.save_pretrained(training_args.output_dir)

# Base checkpoint to fine-tune and the training CSV written earlier in this notebook.
model_args = ModelArguments(model_name_or_path='microsoft/git-base-coco')
data_args = DatasetsArguments(train_data_path='/content/dacon_vqa/data/train_df.csv')

training_args = MyTrainingArguments(
    # --- run identity / outputs ---
    output_dir="output",
    seed=42,
    report_to=[],
    # --- optimisation ---
    num_train_epochs=5,
    learning_rate=5e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=1,
    fp16=True,
    # --- logging / evaluation / checkpoint cadence (all step-based) ---
    logging_strategy="steps",
    logging_steps=10,
    evaluation_strategy="steps",
    eval_steps=250,
    save_strategy="steps",
    save_steps=250,
    save_total_limit=2,
    load_best_model_at_end=True,
    # --- data handling ---
    dataloader_num_workers=2,
    label_names=["labels"],
    remove_unused_columns=False,
)

# INFO-level logs on the main process only; other ranks log at WARN.
_log_level = logging.INFO if is_main_process(training_args.local_rank) else logging.WARN
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    level=_log_level,
)

main(model_args, data_args, training_args)


Downloading (…)lve/main/config.json:   0%|          | 0.00/2.82k [00:00<?, ?B/s]

Downloading (…)rocessor_config.json:   0%|          | 0.00/503 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/453 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/707M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/141 [00:00<?, ?B/s]



Step,Training Loss,Validation Loss
250,1.2011,1.287029
500,1.3302,1.223786
750,1.257,1.209771
1000,1.2331,1.195824
1250,1.2162,1.173014
1500,1.1754,1.155957
1750,1.1848,1.152878
2000,1.1325,1.156712
2250,1.1022,1.132245
2500,1.114,1.133307


KeyboardInterrupt: ignored

In [None]:
# The processor comes from the base hub checkpoint, while the weights come from a
# local training checkpoint directory.
# NOTE(review): the training log captured above stops at step 2500 (interrupted run);
# confirm that checkpoint-6000 actually exists, e.g. from a different run.
processor = AutoProcessor.from_pretrained("microsoft/git-base-coco")
model = AutoModelForCausalLM.from_pretrained("/content/dacon_vqa/output/checkpoint-6000")

In [None]:
# Move the loaded model to the GPU; as the last expression of the cell, the
# returned module's repr is shown as the cell output.
model.to('cuda')

GitForCausalLM(
  (git): GitModel(
    (embeddings): GitEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(1024, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (image_encoder): GitVisionModel(
      (vision_model): GitVisionTransformer(
        (embeddings): GitVisionEmbeddings(
          (patch_embedding): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16), bias=False)
          (position_embedding): Embedding(197, 768)
        )
        (pre_layrnorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (encoder): GitVisionEncoder(
          (layers): ModuleList(
            (0-11): 12 x GitVisionEncoderLayer(
              (self_attn): GitVisionAttention(
                (k_proj): Linear(in_features=768, out_features=768, bias=True)
                (v_proj): Linear(in_features=768, out_features=768, bias=True)
             

In [None]:
# Build the test dataset from the CSV written earlier, using the repo's
# get_dataset helper; the inference loop reads each row's "img" and "question".
test_datasets = get_dataset('/content/dacon_vqa/data/test_df.csv')

In [None]:
from tqdm.notebook import tqdm

# One generated answer per test row, kept in dataset order because the
# submission is filled positionally from this list.
labels = []

for i in tqdm(range(len(test_datasets)), desc="Generating answers"):
    # Fetch the row once instead of indexing the dataset separately per field.
    sample = test_datasets[i]
    image = sample["img"]
    question = sample["question"].lower()
    pixel_values = processor(images=image, return_tensors="pt").pixel_values.to('cuda')
    input_ids = processor(text=question, return_tensors="pt").input_ids.to('cuda')

    # NOTE(review): 102 is presumably the tokenizer's [SEP] id, used here as the
    # end-of-sequence marker — confirm against processor.tokenizer.sep_token_id.
    generated_ids = model.generate(
        pixel_values=pixel_values,
        input_ids=input_ids,
        max_length=50,
        eos_token_id=102,
    )[0]

    # The generated sequence starts with the prompt tokens. Drop them by position
    # rather than by string replacement: the decoded text does not always reproduce
    # the raw question string exactly (tokenizer normalisation), in which case
    # str.replace silently leaves the whole question inside the answer.
    answer_ids = generated_ids[input_ids.shape[1]:]
    answer = processor.tokenizer.decode(answer_ids, skip_special_tokens=True).strip()
    labels.append(answer)

In [None]:
import pandas as pd

# Drive folder that holds the sample submission (and receives the final CSV).
data_path = '/content/drive/MyDrive/ATL/data/'

# Load the submission template, then set its `answer` column to the generated
# answers, matched to rows by position.
sub = pd.read_csv(data_path + 'sample_submission.csv')
sub = sub.assign(answer=labels)

In [None]:
# Write the submission to the Drive data folder, omitting the DataFrame index.
sub.to_csv(data_path + 'GiT_6000.csv',index=False)