In [None]:
%pip install -q -U bitsandbytes transformers peft accelerate datasets scipy einops evaluate trl rouge_score scikit-learn

In [2]:
import torch
for i in range(torch.cuda.device_count()):
   print(torch.cuda.get_device_properties(i).name)

NVIDIA GeForce RTX 4090


In [None]:
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    GenerationConfig
)
from tqdm import tqdm
from trl import SFTTrainer
import torch
import time
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

: 

In [20]:
def random_split(df, train_frac, valid_frac):
    df = df.sample(frac = 1, random_state = 123).reset_index(drop=True)
    train_end = int(len(df) * train_frac)
    valid_end = train_end + int(len(df) * valid_frac)

    train_df = df[:train_end]
    valid_df = df[train_end:valid_end]

    test_df = df[valid_end:]

    return train_df,valid_df,test_df

In [21]:
dataframeQA = pd.read_json('./data/q&a_arbitrum.jsonl', lines=True)
dataframeQAStylus = pd.read_json('./data/q&a_v4.jsonl', lines=True)
dataset = pd.concat([dataframeQA, dataframeQAStylus], ignore_index=True)

In [22]:
train_df, valid_df, test_df = random_split(dataset, 0.7, 0.1)

In [25]:
valid_df

Unnamed: 0,question,answer
3250,What type of data does the createRetryableTick...,The createRetryableTicket method returns a Pro...
3251,What is the purpose of the MapL1SenderContract...,The MapL1SenderContractAddressToL2Alias functi...
3252,What type of Ethereum node earns $ETH for proc...,The type of Ethereum node that earns $ETH for ...
3253,What is the return type of the waitForRedeem f...,The return type of the waitForRedeem function ...
3254,What steps are required for a transition to na...,The steps include pausing deposits and withdra...
...,...,...
3709,What tool can be used to configure the Sequenc...,Foundry
3710,What is the method to obtain the fee collector...,You can obtain the fee collector address by ca...
3711,What is the return type of the `getWithdrawalR...,The return type of the `getWithdrawalRequest` ...
3712,What are the default development account crede...,The default development account has an address...


In [None]:
# Prepare features and target
#X = dataset.drop('answer', axis=1)
#y = dataset['answer']

# Split the data
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)"

In [3]:
compute_dtype = getattr(torch, "float16")
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=False,
)

In [6]:
model_name='microsoft/phi-2'
device_map = {"": 0}
original_model = AutoModelForCausalLM.from_pretrained(
    model_name, 
    device_map=device_map,
    quantization_config=bnb_config,
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [7]:
tokenizer = AutoTokenizer.from_pretrained(model_name,trust_remote_code=True,padding_side="left",add_eos_token=True,add_bos_token=True,use_fast=False)
tokenizer.pad_token = tokenizer.eos_token

tokenizer_config.json:   0%|          | 0.00/7.34k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

In [16]:
from transformers import set_seed
seed = 42
set_seed(seed)

In [72]:
def create_prompt_formats(sample):
    """
    Format various fields of the sample ('instruction','output')
    Then concatenate them using two newline characters 
    :param sample: Sample dictionnary
    """
    INTRO_BLURB = "Below is an instruction that describes a task. Write a response that appropriately completes the request."
    INSTRUCTION_KEY = "### Instruct: Summarize the below conversation."
    RESPONSE_KEY = "### Output:"
    END_KEY = "### End"
    
    blurb = f"\n{INTRO_BLURB}"
    instruction = f"{INSTRUCTION_KEY}"
    input_context = f"{sample['question']}" if sample["question"] else None
    response = f"{RESPONSE_KEY}\n{sample['answer']}"
    end = f"{END_KEY}"
    
    parts = [part for part in [blurb, instruction, input_context, response, end] if part]

    formatted_prompt = "\n\n".join(parts)
    sample["text"] = formatted_prompt

    return sample
  
from functools import partial

# SOURCE https://github.com/databrickslabs/dolly/blob/master/training/trainer.py
def get_max_length(model):
    conf = model.config
    max_length = None
    for length_setting in ["n_positions", "max_position_embeddings", "seq_length"]:
        max_length = getattr(model.config, length_setting, None)
        if max_length:
            print(f"Found max lenth: {max_length}")
            break
    if not max_length:
        max_length = 1024
        print(f"Using default max length: {max_length}")
    return max_length


def preprocess_batch(batch, tokenizer, max_length):
    """
    Tokenizing a batch
    """
    return tokenizer(
        batch["text"],
        max_length=max_length,
        truncation=True,
    )

In [80]:
# SOURCE https://github.com/databrickslabs/dolly/blob/master/training/trainer.py
def preprocess_dataset(tokenizer: AutoTokenizer, max_length: int,seed, dataset):
    """Format & tokenize it so it is ready for training
    :param tokenizer (AutoTokenizer): Model Tokenizer
    :param max_length (int): Maximum number of tokens to emit from tokenizer
    """
    
    # Add prompt to each sample
    print("Preprocessing dataset...")
    dataset = dataset.map(create_prompt_formats) #, batched=True)
    print(type(dataset))
    # Apply preprocessing to each batch of the dataset & and remove 'question', 'answer', fields
    _preprocessing_function = partial(preprocess_batch, max_length=max_length, tokenizer=tokenizer)
    dataset = dataset.map(
        _preprocessing_function,
        remove_columns=['question', 'answer'],
    )

    # Filter out samples that have input_ids exceeding max_length
    dataset = dataset.filter(lambda sample: len(sample["input_ids"]) < max_length)
    
    # Shuffle dataset
    dataset = dataset.shuffle(seed=seed)

    return dataset

In [81]:
## Pre-process dataset
max_length = get_max_length(original_model)
print(max_length)

train_dataset = preprocess_dataset(tokenizer, max_length,seed, train_df)
eval_dataset = preprocess_dataset(tokenizer, max_length,seed, valid_df)

Found max lenth: 2048
2048
Preprocessing dataset...


TypeError: string indices must be integers