In [1]:
!pip install peft openai torchviz



In [2]:
!unzip ./ScienceQA.zip

Archive:  ./ScienceQA.zip
replace ScienceQA/.git/config? [y]es, [n]o, [A]ll, [N]one, [r]ename: 
error:  invalid response [{ENTER}]
replace ScienceQA/.git/config? [y]es, [n]o, [A]ll, [N]one, [r]ename: N


IMPORT PACKAGES

In [3]:
import os
import json
from tqdm import tqdm
from pprint import pprint

In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [5]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

In [6]:
from torchviz import make_dot

In [7]:
from peft import PromptTuningConfig, PromptTuningInit, TaskType, get_peft_model

In [8]:
from huggingface_hub import login

In [9]:
from transformers import AutoTokenizer, AutoModelForCausalLM, get_linear_schedule_with_warmup

In [10]:
from ScienceQA.models.base_prompt import *
from ScienceQA.models.run_gpt3 import *

CONFIGURATION

In [11]:
LEARNING_RATE = 1e-4
BATCH_SIZE = 1
NUM_EPOCHS = 50
SEQ_LENGTH = 512
SOFT_PROMPT_LENGTH = 64
MODEL_NAME = 'meta-llama/Meta-Llama-3-8B-Instruct'

In [12]:
tqdm.pandas()

In [13]:
args_dict = {
    'data_root': './ScienceQA/data/scienceqa/',
    'output_root': './ScienceQA/results/',
    'caption_file': './ScienceQA/data/captions.json',
    'model': 'Llama-3',
    'options': ['A', 'B', 'C', 'D', 'E'],

    'label': 'PLAIN',
    'test_split': 'val',
    'test_number': 3,
    'use_caption': True,
    'save_every': 10,
    'debug': True,
    'prompt_format': 'CQM-A',
    'shot_number': 3,
    'shot_qids': None,
    'seed': 42,
}

In [14]:
class Dict2Class(object):
    def __init__(self, my_dict):
        for key in my_dict:
            setattr(self, key, my_dict[key])

In [15]:
args = Dict2Class(args_dict)

In [16]:
with open('./key.json', 'r') as f:
    data = json.load(f)

In [17]:
access_token = data['HF_API_KEY']

In [18]:
login(token = access_token, add_to_git_credential = True, new_session = True)

Token is valid (permission: read).
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [19]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [20]:
device

device(type='cuda')

IMPORT DATASET

In [21]:
problems, qids, shot_qids = load_data(args)

number of test problems: 3

training question ids for prompting:  ['4923', '19795', '4299'] 



In [22]:
result_file = get_result_file(args)

In [23]:
df = pd.DataFrame(problems).transpose()

In [24]:
train_df = df[df['split'] == 'train']
valid_df = df[df['split'] == 'val']
test_df = df[df['split'] == 'test']

In [25]:
pd.concat((df, df.loc[qids])).head(10)

Unnamed: 0,question,choices,answer,hint,image,task,grade,subject,topic,category,skill,lecture,solution,split,caption
1,Which of these states is farthest north?,"[West Virginia, Louisiana, Arizona, Oklahoma]",0,,image.png,closed choice,grade2,social science,geography,Geography,Read a map: cardinal directions,"Maps have four cardinal directions, or main di...","To find the answer, look at the compass rose. ...",train,An aerial view of a painting of a forest.
2,Identify the question that Tom and Justin's ex...,[Do ping pong balls stop rolling along the gro...,1,The passage below describes an experiment. Rea...,image.png,closed choice,grade8,natural science,science-and-engineering-practices,Designing experiments,Identify the experimental question,Experiments can be designed to answer specific...,,train,A wooden board with a wooden head on top of it.
3,Identify the question that Kathleen and Bryant...,[Does Kathleen's snowboard slide down a hill i...,0,The passage below describes an experiment. Rea...,image.png,closed choice,grade7,natural science,science-and-engineering-practices,Designing experiments,Identify the experimental question,Experiments can be designed to answer specific...,,train,A man riding skis down a snow covered slope.
4,Which figure of speech is used in this text?\n...,"[chiasmus, apostrophe]",1,,,closed choice,grade11,language science,figurative-language,Literary devices,"Classify the figure of speech: anaphora, antit...",Figures of speech are words or phrases that us...,"The text uses apostrophe, a direct address to ...",test,
5,Which of the following could Gordon's test show?,[if the spacecraft was damaged when using a pa...,1,People can use the engineering-design process ...,image.png,closed choice,grade8,natural science,science-and-engineering-practices,Engineering practices,Evaluate tests of engineering-design solutions,People can use the engineering-design process ...,,test,A large white kite is in the air.
6,What does the verbal irony in this text sugges...,"[The snoring is loud., The snoring occurs in b...",0,,,closed choice,grade8,language science,figurative-language,Literary devices,Interpret figures of speech,Figures of speech are words or phrases that us...,"The text uses verbal irony, which involves say...",val,
7,Which animal's mouth is also adapted for botto...,"[discus, armored catfish]",1,"Sturgeons eat invertebrates, plants, and small...",image.png,closed choice,grade3,natural science,biology,Adaptations,"Animal adaptations: beaks, mouths, and necks",An adaptation is an inherited trait that helps...,Look at the picture of the sturgeon.\nThe stur...,val,A fish is sticking out of the top of a fish tank.
8,Is this a sentence fragment?\nDuring the const...,"[no, yes]",1,,,yes or no,grade12,language science,writing-strategies,"Sentences, fragments, and run-ons",Identify sentence fragments,A sentence is a group of words that expresses ...,This is a sentence fragment. It does not expre...,val,
9,Which tense does the sentence use?\nMona will ...,"[present tense, future tense, past tense]",1,,,closed choice,grade2,language science,verbs,Verb tense,"Is the sentence in the past, present, or futur...",Present tense verbs tell you about something t...,The sentence is in future tense. You can tell ...,train,
10,Complete the sentence.\nSewing an apron is a ().,"[chemical change, physical change]",1,,,closed choice,grade4,natural science,chemistry,Physical and chemical change,Identify physical and chemical changes,Chemical changes and physical changes are two ...,Sewing an apron is a physical change. The fabr...,train,


In [26]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [27]:
class ScienceQADataset(Dataset):
    def __init__(self, df, shot_qids, args, shot_df, tokenizer):
        super().__init__()
        self.prompts = []

        shot_added_df = pd.concat((df, shot_df))
        shot_added_dict = shot_added_df.transpose().to_dict()

        for qid in tqdm([str(x) for x in df.index if x not in shot_qids]):
            prompt = build_prompt(shot_added_dict, shot_qids, qid, args)

            self.prompts.append(prompt)
        self.answers = df.progress_apply(lambda x : x['choices'][x['answer']], axis = 1).to_list()
        self.distractors = df.progress_apply(lambda x : ';'.join([choice for choice in x['choices'] if choice is not x['choices'][x['answer']]]), axis = 1).to_list()

    def __len__(self):
        return len(self.prompts)

    def __getitem__(self, index):
        return self.prompts[index], self.answers[index], self.distractors[index]

In [28]:
train_dataset = ScienceQADataset(train_df, shot_qids, args, df.loc[shot_qids], tokenizer)
valid_dataset = ScienceQADataset(valid_df, shot_qids, args, df.loc[shot_qids], tokenizer)
test_dataset = ScienceQADataset(test_df, shot_qids, args, df.loc[shot_qids], tokenizer)

  shot_added_dict = shot_added_df.transpose().to_dict()
100%|██████████| 12723/12723 [00:00<00:00, 47395.69it/s]
100%|██████████| 12726/12726 [00:00<00:00, 100788.56it/s]
100%|██████████| 12726/12726 [00:00<00:00, 52981.31it/s]
100%|██████████| 4241/4241 [00:00<00:00, 47537.34it/s]
100%|██████████| 4241/4241 [00:00<00:00, 113496.27it/s]
100%|██████████| 4241/4241 [00:00<00:00, 52777.72it/s]
100%|██████████| 4241/4241 [00:00<00:00, 42750.26it/s]
100%|██████████| 4241/4241 [00:00<00:00, 114605.56it/s]
100%|██████████| 4241/4241 [00:00<00:00, 53557.15it/s]


In [29]:
train_dataloader = DataLoader(train_dataset, shuffle = False, batch_size = BATCH_SIZE)
valid_dataloader = DataLoader(valid_dataset, shuffle = False, batch_size = BATCH_SIZE)
test_dataloader = DataLoader(test_dataset, shuffle = False, batch_size = BATCH_SIZE)

IMPORT MODEL

In [30]:
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, trust_remote_code = True)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [31]:
model = model.to(device)

PROMPT TESTING

In [32]:
# prompts = []
# for qid in qids:
#     prompt = build_prompt(problems, shot_qids, qid, args)
#     prompts.append(prompt)

In [33]:
# inputs = tokenizer(prompts, max_length = SEQ_LENGTH, truncation = True, padding = True, return_tensors = 'pt')
# # inputs['input_ids'] = inputs['input_ids'].to(device)
# # inputs['attention_mask'] = inputs['attention_mask'].to(device)

In [34]:
# inputs['input_ids'] = inputs['input_ids'].to(device)
# inputs['attention_mask'] = inputs['attention_mask'].to(device)

In [35]:
# outputs = model.generate(**inputs)

In [36]:
# prompts[0]

In [37]:
# pprint(tokenizer.decode(outputs[0]))

PEFT SETTING

In [38]:
peft_config = PromptTuningConfig(
    task_type = TaskType.CAUSAL_LM,
    prompt_tuning_init = PromptTuningInit.TEXT,
    num_virtual_tokens = SOFT_PROMPT_LENGTH,
    tokenizer_name_or_path = MODEL_NAME,
    prompt_tuning_init_text="Answer correctly to following question. Try your best to reduce hallucination.",
)

In [39]:
peft_model = get_peft_model(model, peft_config)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [40]:
print(peft_model.print_trainable_parameters())

trainable params: 262,144 || all params: 8,030,523,392 || trainable%: 0.0033
None


TRAINING

In [41]:
optimizer = torch.optim.AdamW(peft_model.parameters(), lr = LEARNING_RATE)

lr_scheduler = get_linear_schedule_with_warmup(
    optimizer = optimizer,
    num_warmup_steps = 0,
    num_training_steps = (len(train_dataloader) * NUM_EPOCHS),
)

In [42]:
train_losses = []
valid_losses = []

In [None]:
for epoch in range(NUM_EPOCHS):
    print(f'[{epoch}/{NUM_EPOCHS}] proceeded.')

    model.train()
    train_loss = 0.0

    for index, (prompts, answers, distractors) in enumerate(tqdm(train_dataloader)):
        optimizer.zero_grad()

        inputs = tokenizer(prompts, max_length = SEQ_LENGTH, truncation = True, padding = True, return_tensors = 'pt')

        inputs['input_ids'] = inputs['input_ids'].to(device)
        inputs['attention_mask'] = inputs['attention_mask'].to(device)

        outputs = peft_model(**inputs, labels = inputs['input_ids'])
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    model.eval()
    valid_loss = 0.0

    with torch.no_grad():
        for index, (prompts, answers, distractors) in enumerate(tqdm(valid_dataloader)):
            inputs = tokenizer(prompts, max_length = SEQ_LENGTH, truncation = True, padding = True, return_tensors = 'pt')

            inputs['input_ids'] = inputs['input_ids'].to(device)
            inputs['attention_mask'] = inputs['attention_mask'].to(device)

            outputs = peft_model(**inputs, labels = inputs['input_ids'])
            loss = outputs.loss

            valid_loss += loss.item()

    train_loss = train_loss / len(train_dataloader)
    valid_loss = valid_loss / len(valid_dataloader)

    print(f'train loss: {train_loss}, valid loss: {valid_loss}')
    print()

[0/50] proceeded.


100%|██████████| 12723/12723 [2:32:33<00:00,  1.39it/s]
100%|██████████| 4241/4241 [25:31<00:00,  2.77it/s]


train loss: 0.31372362315557545, valid loss: 0.25509189780335106

[1/50] proceeded.


 35%|███▌      | 4515/12723 [54:02<1:33:30,  1.46it/s]

In [None]:
import pickle

peft_model.save_pretrained('./model.pt')

with open('train_losses.pkl', 'wb') as f:
    pickle.dump(train_losses, f)

with open('train_losses.pkl', 'wb') as f:
    pickle.dump(valid_losses, f)