<a id='1'></a>
#  <center><font size = 3><span style="color:#a8dadc"> <p style="background-color:#90e0ef;font-family:newtimeroman;color:#03045e;font-size:200%;text-align:center;border-radius:100px 10px;">1. Importing and Installing Necessary Libraries📂 </p>   </span></font></center>

In [1]:
# !pip install git+https://github.com/huggingface/transformers.git 
!pip install einops -q
!pip install -q -U trl transformers git+https://github.com/huggingface/peft.git 
!pip install bitsandbytes -q
!pip install accelerate -q

In [2]:
import numpy as np 
import pandas as pd 
import os
import torch
from pathlib import Path
from tqdm.notebook import tqdm
import string
import re
import random
from huggingface_hub import snapshot_download
from transformers import AutoModelForCausalLM, AutoTokenizer, DataCollatorForSeq2Seq
from transformers import TrainingArguments, Trainer, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model
from trl import SFTTrainer
from datasets import Dataset, DatasetDict
from pynvml import *
from numba import cuda
import warnings
warnings.filterwarnings('ignore')



In [3]:
device = cuda.get_current_device()
device.reset()

In [4]:
def set_seed(seed: int = 42) -> None:
    """ Set seed for reproducibility """
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # When running on the CuDNN backend, two further options must be set
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # Set a fixed value for the hash seed
    os.environ["PYTHONHASHSEED"] = str(seed)
    print(f"Random seed set as {seed}")
set_seed()
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb=14500'

Random seed set as 42


In [5]:
def print_gpu_utilization():
    nvmlInit()
    handle = nvmlDeviceGetHandleByIndex(0)
    info = nvmlDeviceGetMemoryInfo(handle)
    print(f"GPU memory occupied: {info.used//1024**2} MB.")


def print_summary(result):
    print(f"Time: {result.metrics['train_runtime']:.2f}")
    print(f"Samples/second: {result.metrics['train_samples_per_second']:.2f}")

In [6]:
print_gpu_utilization()

GPU memory occupied: 2 MB.


In [7]:
class Config:
    ## General
    ROOT_DIR = '/kaggle/input/oneapi-hackathon-the-llm-challenge'
#     MODEL_ID = "HuggingFaceH4/zephyr-7b-beta"
    MODEL_ID = 'microsoft/phi-1_5'
    LOCAL_DIR = 'base_model'
    CONTEXT_LENGTH=1024
    SEED=42
    DEVICE= torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [8]:
class DownloadHFModel:
    """ Download HF model """
    def __init__( self, model_id, cached_dir):
        self.model_id = model_id
        self.cached_dir = cached_dir
    def download_model( self):
        snapshot_download(self.model_id, cache_dir=self.cached_dir )
        print( f"{self.model_id} downloaded model successfully")

In [9]:
# dhf = DownloadHFModel( model_id=Config.MODEL_ID,  ) # cached_dir=Config.LOCAL_DIR
# dhf.download_model()

In [10]:
class DataframeLoader:
    """ Loads train, test, submission csv files and returns a dictonary """
    def __init__( self, root_dir , file_ext ):
        self.root_dir = root_dir
        self.file_ext = file_ext
        self.files_list = list(Path( self.root_dir ).glob( '*.'+self.file_ext ))
        self.data_dict = {}
    
    def _get_file_name( self , file_path:str ):
        """ returns name of the file w/o ext """
        file_name = str(file_path).split('/')[-1].split('.')[0]
        return file_name
    
    def _get_csv_df( self , path ):
        return pd.read_csv( path )
        
    def _get_excel_df( self , path ):
        return pd.read_excel( path )
    
    def load( self ):
        """ Loads train ,test, csv from data """
        for file in tqdm(self.files_list, desc='read csv files'):
            file_name = self._get_file_name( str(file) )
            if (self.file_ext == 'csv') & (file_name == 'train'):
                df = self._get_csv_df( str(file))
                self.data_dict['train'] = df
            elif (self.file_ext == 'csv') & (file_name == 'test'):
                df = self._get_csv_df( str(file))
                self.data_dict['test'] = df
            else :
                df = self._get_csv_df( str(file))
                self.data_dict['sub'] = df
        print( f'{len(self.data_dict)} files loaded {self.data_dict.keys()}')
        return self.data_dict

In [11]:
dl = DataframeLoader( Config.ROOT_DIR , 'csv')
data_dict = dl.load()
train_df = data_dict['train']
test_df = data_dict['test']
sub_df = data_dict['sub']

read csv files:   0%|          | 0/3 [00:00<?, ?it/s]

3 files loaded dict_keys(['sub', 'train', 'test'])


In [12]:
train_df.head()

Unnamed: 0,Story,Question,span_start,span_end,span_text,Answer
0,CHAPTER IV. \n\nNotwithstanding the earnest in...,What emotions did she show?,199.0,288.0,The countenance of this young lady exhibited a...,mirth and sadness
1,"(CNN) -- Polk County, Florida, detectives arre...",What do police believe the motive to be?,428.0,465.0,We believe that his motive is robbery,robbery
2,"Malawi (, or ; or [maláwi]), officially the Re...",Is it a large country?,653.0,702.0,Malawi is among the smallest countries in Africa.,No
3,CHAPTER XXXII. \n\nMR. GILMORE'S SUCCESS. \n\n...,What reminded him of Bullhampton?,1438.0,1448.0,Everything,Everything
4,"Computer security, also known as cybersecurity...",Besides breaking or staling a computer what ot...,,-1.0,unknown,unknown


### Clean text

In [13]:
def remove_punctuation(text):
    if isinstance(  text, str):
        no_punct=[words for words in text if words not in string.punctuation]
        words_wo_punct=''.join(no_punct)
        return words_wo_punct
    else:
        return str( text)
    
def apply_regex(text):
    if isinstance( text, str ):
    #     text = re.sub("\S*\d\S*"," ", text)         # removes numbers and words concatenated with numbers (IE h4ck3r)
        text = re.sub("\S*@\S*\s?"," ", text)        # removes emails and mentions (words with @)
        text = re.sub("\S*#\S*\s?"," ", text)        # removes hashtags (words with #)
        text = re.sub(r'http\S+', ' ', text)         # removes URLs
        text = re.sub(r'[^a-zA-Z0-9 ]', ' ',text)    # keeps numbers and letter
        text = text.replace(u'\ufffd', '8')          # replaces the ASCII '�' symbol with '8'
        text = re.sub(' +', ' ', text)               # removes multiple spaces
        return text.lower()
    else:
        return ""

In [14]:
col_to_clean =['Story']
for col in tqdm(col_to_clean, desc='cleaning text columns'):
    train_df[col]=train_df[col].apply(lambda x: remove_punctuation(x) )
    test_df[col]=test_df[col].apply(lambda x: remove_punctuation(x) )
    train_df[col]=train_df[col].apply(lambda x: apply_regex(x) )
    test_df[col]=test_df[col].apply(lambda x: apply_regex(x) )

cleaning text columns:   0%|          | 0/1 [00:00<?, ?it/s]

### Prepare Dataset

In [15]:
def format_text( example ):
    pre_text = f"""
    Answer the question based on the context:
    {example['Question']}
    {example['Story']}
    Answer: {example['Answer']}
    """
    return {"prompt": pre_text}

def format_text_test( example ):
    pre_text = f"""
    Answer the question based on the context:
   {example['Question']}
   {example['Story']}
    Answer: 
    
    """
    return {"prompt": pre_text}

In [16]:
train_ds = Dataset.from_pandas( train_df )
test_ds = Dataset.from_pandas( test_df )

In [17]:
train_ds = train_ds.map( format_text)
test_ds = test_ds.map( format_text_test)

  0%|          | 0/66611 [00:00<?, ?ex/s]

  0%|          | 0/28548 [00:00<?, ?ex/s]

In [18]:
ds = DatasetDict( 
    train= train_ds,
    val=test_ds
)

In [19]:
ds

DatasetDict({
    train: Dataset({
        features: ['Story', 'Question', 'span_start', 'span_end', 'span_text', 'Answer', 'prompt'],
        num_rows: 66611
    })
    val: Dataset({
        features: ['Story', 'Question', 'prompt'],
        num_rows: 28548
    })
})

<a id='1'></a>
#  <center><font size = 3><span style="color:#a8dadc"> <p style="background-color:#90e0ef;font-family:newtimeroman;color:#03045e;font-size:200%;text-align:center;border-radius:100px 10px;"> Train data set </p>   </span></font></center>

In [20]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16
)
n_gpus = torch.cuda.device_count()
max_memory = f'{40960}MB'

In [21]:
%%time
model  = AutoModelForCausalLM.from_pretrained( 
                                                Config.MODEL_ID,
                                                trust_remote_code=True,
                                                quantization_config=bnb_config,
                                                device_map='auto',
                                                max_memory = {i: max_memory for i in range(n_gpus)},
    

)
tokenizer = AutoTokenizer.from_pretrained(
                                                Config.MODEL_ID, 
                                                trust_remote_code=True,
                                                device_map='auto'
)
tokenizer.pad_token = tokenizer.eos_token

Downloading (…)lve/main/config.json:   0%|          | 0.00/707 [00:00<?, ?B/s]

Downloading (…)former_sequential.py:   0%|          | 0.00/1.86k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/phi-1_5:
- configuration_mixformer_sequential.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


Downloading (…)former_sequential.py:   0%|          | 0.00/28.8k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/phi-1_5:
- modeling_mixformer_sequential.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


Downloading pytorch_model.bin:   0%|          | 0.00/2.84G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/69.0 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/237 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

CPU times: user 5.93 s, sys: 7.67 s, total: 13.6 s
Wall time: 21.6 s


In [22]:
def find_all_linear_names(model):
    import bitsandbytes as bnb
    cls = bnb.nn.Linear4bit #if args.bits == 4 else (bnb.nn.Linear8bitLt if args.bits == 8 else torch.nn.Linear)
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, cls):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])

    if 'lm_head' in lora_module_names:  # needed for 16-bit
        lora_module_names.remove('lm_head')
    return list(lora_module_names)


In [23]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print( f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}" )


print_trainable_parameters(model)

trainable params: 209868800 || all params: 814290944 || trainable%: 25.77319587628866


In [24]:
# Verifying the datatypes.
dtypes = {}
for _, p in model.named_parameters():
    dtype = p.dtype
    if dtype not in dtypes:
        dtypes[dtype] = 0
    dtypes[dtype] += p.numel()
total = 0
for k, v in dtypes.items():
    total += v
for k, v in dtypes.items():
    print(k, v, v / total)

torch.float16 210311168 0.2582752142212207
torch.uint8 603979776 0.7417247857787793


In [25]:
find_all_linear_names(model)

['out_proj', 'fc1', 'fc2', 'Wqkv']

In [26]:
def tokenize_text( sample ):
    input_text = f"""
    Answer the question based on the context:
    question: {sample['Question']}
    context: {sample['Story']}"""
    response = f"""
    response: {sample['Answer']}"""
    
    return { 'input_ids':tokenizer( input_text, padding="max_length", 
                                   truncation=True, max_length=512,return_tensors="pt").input_ids[0],

             'labels': tokenizer(response, padding="max_length", truncation=True,
                                 max_length=512,return_tensors="pt").input_ids[0]
    
           }

In [27]:
# train_ds = train_ds.remove_columns('prompt').train_test_split(test_size=0.1)

In [28]:
train = ds['train']
val = ds['val']

In [29]:
BATCH_SIZE = 4
MICRO_BATCH_SIZE = 4
GRADIENT_ACCUMULATION_STEPS = BATCH_SIZE // MICRO_BATCH_SIZE
LEARNING_RATE = 3e-4
TRAIN_STEPS = 3
OUTPUT_DIR = "results"
LOGGING_STEPS = 1  

In [30]:
training_arguments = TrainingArguments(
    per_device_train_batch_size=MICRO_BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    warmup_steps=2,
    max_steps=TRAIN_STEPS,
    learning_rate=LEARNING_RATE,
    fp16=True,
    logging_steps=LOGGING_STEPS,
    optim="adamw_torch",
    evaluation_strategy="steps",
    save_strategy="steps",
    eval_steps=1,
    save_steps=1,
    output_dir=OUTPUT_DIR,
    save_total_limit=3,
    load_best_model_at_end=True,
    report_to="tensorboard"
)


# output_dir = "./results"
# per_device_train_batch_size = 2
# gradient_accumulation_steps = 2
# # optim = "paged_adamw_32bit"
# save_steps = 1                 # Check points at which we would like to store the results
# # logging_steps = 5              # After this steps loss will be logged
# # learning_rate = 2e-4
# max_grad_norm = 0.3
# # max_steps = 50                  # Number of Epochs for which we will train the model.
# warmup_ratio = 0.03
# lr_scheduler_type = "constant"

# training_arguments = TrainingArguments(
#     output_dir=OUTPUT_DIR,
#     per_device_train_batch_size=MICRO_BATCH_SIZE,
#     gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
#     optim='adamw_torch',
#     evaluation_strategy="steps",
#     save_strategy="steps",
#     eval_steps=1,
#     save_steps=1,
#     logging_steps=LOGGING_STEPS,
#     learning_rate=LEARNING_RATE,
#     fp16=True,
#     max_grad_norm=max_grad_norm,
#     max_steps=TRAIN_STEPS,
#     warmup_ratio=warmup_ratio,
#     group_by_length=True,
#     lr_scheduler_type=lr_scheduler_type,
#     report_to="tensorboard"
# )

In [31]:
train = ds['train'].select([i for i in range(2000)])
val = ds['train'].select([i for i in range(2000,2100)])

In [32]:
lora_alpha = 16
lora_dropout = 0.1
lora_r = 64

peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=[
        'Wqkv',
        'out_proj'
    ]
)

In [33]:
trainer = SFTTrainer(
    model=model,
    train_dataset=train,
    dataset_text_field='prompt',
    eval_dataset=val,
    args=training_arguments,
    max_seq_length=Config.CONTEXT_LENGTH,
    peft_config=peft_config,
)

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [34]:
from torch import cuda


def get_less_used_gpu(gpus=None, debug=False):
    """Inspect cached/reserved and allocated memory on specified gpus and return the id of the less used device"""
    if gpus is None:
        warn = 'Falling back to default: all gpus'
        gpus = range(cuda.device_count())
    elif isinstance(gpus, str):
        gpus = [int(el) for el in gpus.split(',')]

    # check gpus arg VS available gpus
    sys_gpus = list(range(cuda.device_count()))
    if len(gpus) > len(sys_gpus):
        gpus = sys_gpus
        warn = f'WARNING: Specified {len(gpus)} gpus, but only {cuda.device_count()} available. Falling back to default: all gpus.\nIDs:\t{list(gpus)}'
    elif set(gpus).difference(sys_gpus):
        # take correctly specified and add as much bad specifications as unused system gpus
        available_gpus = set(gpus).intersection(sys_gpus)
        unavailable_gpus = set(gpus).difference(sys_gpus)
        unused_gpus = set(sys_gpus).difference(gpus)
        gpus = list(available_gpus) + list(unused_gpus)[:len(unavailable_gpus)]
        warn = f'GPU ids {unavailable_gpus} not available. Falling back to {len(gpus)} device(s).\nIDs:\t{list(gpus)}'

    cur_allocated_mem = {}
    cur_cached_mem = {}
    max_allocated_mem = {}
    max_cached_mem = {}
    for i in gpus:
        cur_allocated_mem[i] = cuda.memory_allocated(i)
        cur_cached_mem[i] = cuda.memory_reserved(i)
        max_allocated_mem[i] = cuda.max_memory_allocated(i)
        max_cached_mem[i] = cuda.max_memory_reserved(i)
    min_allocated = min(cur_allocated_mem, key=cur_allocated_mem.get)
    if debug:
        print(warn)
        print('Current allocated memory:', {f'cuda:{k}': v for k, v in cur_allocated_mem.items()})
        print('Current reserved memory:', {f'cuda:{k}': v for k, v in cur_cached_mem.items()})
        print('Maximum allocated memory:', {f'cuda:{k}': v for k, v in max_allocated_mem.items()})
        print('Maximum reserved memory:', {f'cuda:{k}': v for k, v in max_cached_mem.items()})
        print('Suggested GPU:', min_allocated)
    return min_allocated


def free_memory(to_delete: list, debug=False):
    import gc
    import inspect
    calling_namespace = inspect.currentframe().f_back
    if debug:
        print('Before:')
        get_less_used_gpu(debug=True)

    for _var in to_delete:
        calling_namespace.f_locals.pop(_var, None)
        gc.collect()
        cuda.empty_cache()
    if debug:
        print('After:')
        get_less_used_gpu(debug=True)

In [35]:
free_memory( [0], True)

Before:
Falling back to default: all gpus
Current allocated memory: {'cuda:0': 1541575680}
Current reserved memory: {'cuda:0': 1935671296}
Maximum allocated memory: {'cuda:0': 1674118144}
Maximum reserved memory: {'cuda:0': 1935671296}
Suggested GPU: 0
After:
Falling back to default: all gpus
Current allocated memory: {'cuda:0': 1541575680}
Current reserved memory: {'cuda:0': 1725956096}
Maximum allocated memory: {'cuda:0': 1674118144}
Maximum reserved memory: {'cuda:0': 1935671296}
Suggested GPU: 0


In [36]:
print_gpu_utilization()

GPU memory occupied: 2475 MB.


In [37]:
trainer.train()

You're using a CodeGenTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss
1,3.6141,3.928518
2,3.8124,3.922828
3,3.5788,3.906992


TrainOutput(global_step=3, training_loss=3.668460210164388, metrics={'train_runtime': 69.8253, 'train_samples_per_second': 0.172, 'train_steps_per_second': 0.043, 'total_flos': 34309067390976.0, 'train_loss': 3.668460210164388, 'epoch': 0.01})

In [38]:
model_to_save = trainer.model.module if hasattr(trainer.model, 'module') else trainer.model
model_to_save.save_pretrained("final_finetuned_model")

In [39]:
lora_config = LoraConfig.from_pretrained('final_finetuned_model')
model = get_peft_model(model, lora_config, )
model = model.to(Config.DEVICE)

<a id='1'></a>
#  <center><font size = 3><span style="color:#a8dadc"> <p style="background-color:#90e0ef;font-family:newtimeroman;color:#03045e;font-size:200%;text-align:center;border-radius:100px 10px;">Inference</p>   </span></font></center>

In [40]:
%%time
test_str = test_ds['prompt'][1]
inputs = tokenizer( test_str, return_tensors='pt', return_attention_mask=False).to(Config.DEVICE)
output = model.generate(**inputs, max_length=512, ).to(Config.DEVICE)
response = tokenizer.decode( output[0] )
print( 'generate response: \n', response)

generate response: 
 
    Answer the question based on the context:
   By how many?
   the uralic languages sometimes called uralian languages constitute a language family of 38 languages spoken by approximately 25 million people predominantly in northern eurasia the uralic languages with the most native speakers are hungarian finnish and estonian which are official languages of hungary finland and estonia respectively and of the european union other uralic languages with significant numbers of speakers are erzya moksha mari udmurt and komi which are officially recognized languages in various regions of russia the name uralic derives from the fact that areas where the languages are spoken spread on both sides of the ural mountains also the original homeland urheimat is commonly hypothesized to lie in the vicinity of the urals finnougric is sometimes used as a synonym for uralic though finnougric is widely understood to exclude the samoyedic languages scholars who do not accept the trad