# Finetune => phi 1.5 v2 CUDA

In [1]:
!pip install einops trl peft evaluate rouge_score bitsandbytes accelerate -q

In [2]:
import os
os.environ["WANDB_DISABLED"] = "True"

In [3]:
from huggingface_hub import snapshot_download
from transformers import AutoModelForCausalLM,BitsAndBytesConfig, AutoTokenizer, DataCollatorWithPadding,TrainingArguments, Trainer, DataCollatorForLanguageModeling
from tqdm.notebook import tqdm
import torch




In [4]:
class  Config:
    MODEL_ID = 'microsoft/phi-1_5'
    LOCAL_DIR = 'base_model'
    TRAIN_DATA_FILE = '/kaggle/input/60k-stack-overflow-questions-with-quality-rate/train.csv'
    TEST_DATA_FILE = '/kaggle/input/60k-stack-overflow-questions-with-quality-rate/valid.csv'
    MODEL_CHECKPOINT = 'model_ckp'
    CONTEXT_LENGTH=512
    SEED=42
    DEVICE='cuda'


In [5]:
torch.manual_seed( Config.SEED )

<torch._C.Generator at 0x7ff178586550>

In [6]:
class DownloadModel:
    """ downloads model to local dir """
    def __init__( self, model_id, local_dir):
        self.model_id = model_id
        self.local_dir = local_dir
        
    def download_repo( self ):
        model_name = self.model_id.split('/')[-1]
        snapshot_download( repo_id=self.model_id,
                           cache_dir=self.local_dir
                         )
        print( f'{model_name} downloaded to {self.local_dir}')

In [7]:
dwnmodel = DownloadModel( Config.MODEL_ID, Config.LOCAL_DIR )
dwnmodel.download_repo()

Fetching 14 files:   0%|          | 0/14 [00:00<?, ?it/s]

Downloading (…)ab4071d5/config.json:   0%|          | 0.00/880 [00:00<?, ?B/s]

Downloading (…)071d5/.gitattributes:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/69.0 [00:00<?, ?B/s]

Downloading (…)e9ab4071d5/README.md:   0%|          | 0.00/7.62k [00:00<?, ?B/s]

Downloading (…)earch%20License.docx:   0%|          | 0.00/38.9k [00:00<?, ?B/s]

Downloading (…)d5/added_tokens.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

Downloading (…)former_sequential.py:   0%|          | 0.00/2.23k [00:00<?, ?B/s]

Downloading (…)9ab4071d5/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)former_sequential.py:   0%|          | 0.00/32.2k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

Downloading (…)071d5/tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/237 [00:00<?, ?B/s]

Downloading (…)9ab4071d5/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/2.84G [00:00<?, ?B/s]

phi-1_5 downloaded to base_model


In [8]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

In [9]:
class LLMModel:
    
    def __init__( self ):
        pass
    
    def load_model( self ):
        
        n_gpus = torch.cuda.device_count()
        max_memory = f'{40960}MB'
        model = AutoModelForCausalLM.from_pretrained( 
                                            Config.MODEL_ID,
                                            local_files_only=True,
                                            trust_remote_code=True,
                                            cache_dir= Config.LOCAL_DIR,
                                            quantization_config=bnb_config, 
                                            device_map={"":0},
                                             max_memory = {i: max_memory for i in range(n_gpus)},
                                            )
        print( 'model loaded...!')
        tokenizer = AutoTokenizer.from_pretrained(
                                            Config.MODEL_ID,
                                            local_files_only=True,
                                            trust_remote_code=True,
                                            cache_dir= Config.LOCAL_DIR,
                                            add_prefix_space=True,
                                            quantization_config=bnb_config, 
                                            device_map={"":0},
                                             max_memory = {i: max_memory for i in range(n_gpus)},

                                            
        )
        print( 'tokenizer loaded...!')
        return model, tokenizer

## Load Model

In [10]:
%%time
llmmodel = LLMModel()
model, tokenizer = llmmodel.load_model()

model loaded...!
tokenizer loaded...!
CPU times: user 3.51 s, sys: 4.86 s, total: 8.37 s
Wall time: 11.3 s


In [11]:
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

1

## Inference

In [12]:
%%time
inputs = tokenizer('''```sql for count number of employee from a company
''', return_tensors="pt", return_attention_mask=False)

outputs = model.generate(**inputs.to('cuda'), max_length=200)
text = tokenizer.batch_decode(outputs)[0]
print(text)

 ```sql for count number of employee from a company
```

```python
import mysql.connector

mydb = mysql.connector.connect(
  host="localhost",
  user="yourusername",
  password="yourpassword",
  database="mydatabase"
)

mycursor = mydb.cursor()

mycursor.execute("SELECT COUNT(*) FROM employees")

myresult = mycursor.fetchone()

print(myresult)
```

### Exercise 2

Write a Python program to insert multiple rows of data into a table using the `executemany()` method.

```python
import mysql.connector

mydb = mysql.connector.connect(
  host="localhost",
  user="yourusername",
  password="yourpassword",
  database="mydatabase"
)
CPU times: user 12.6 s, sys: 96.9 ms, total: 12.7 s
Wall time: 14.5 s


### Train on custom dataset

In [13]:
import pandas as pd
from datasets import Dataset, DatasetDict
import datasets 

In [14]:
train_df = pd.read_csv( Config.TRAIN_DATA_FILE )
train_df.head()


Unnamed: 0,Id,Title,Body,Tags,CreationDate,Y
0,34552656,Java: Repeat Task Every Random Seconds,<p>I'm already familiar with repeating tasks e...,<java><repeat>,2016-01-01 00:21:59,LQ_CLOSE
1,34553034,Why are Java Optionals immutable?,<p>I'd like to understand why Java 8 Optionals...,<java><optional>,2016-01-01 02:03:20,HQ
2,34553174,Text Overlay Image with Darkened Opacity React...,<p>I am attempting to overlay a title over an ...,<javascript><image><overlay><react-native><opa...,2016-01-01 02:48:24,HQ
3,34553318,Why ternary operator in swift is so picky?,"<p>The question is very simple, but I just cou...",<swift><operators><whitespace><ternary-operato...,2016-01-01 03:30:17,HQ
4,34553755,hide/show fab with scale animation,<p>I'm using custom floatingactionmenu. I need...,<android><material-design><floating-action-but...,2016-01-01 05:21:48,HQ


In [15]:
train_df.shape

(45000, 6)

In [16]:
test_df = pd.read_csv( Config.TEST_DATA_FILE )
test_df.head()

Unnamed: 0,Id,Title,Body,Tags,CreationDate,Y
0,34552974,How to get all the child records from differen...,I am having 4 different tables like \r\nselect...,<sql><sql-server>,2016-01-01 01:44:52,LQ_EDIT
1,34554721,Retrieve all except some data of the another t...,I have two table m_master and tbl_appointment\...,<php><mysql><sql><codeigniter><mysqli>,2016-01-01 08:43:50,LQ_EDIT
2,34555135,Pandas: read_html,<p>I'm trying to extract US states from wiki U...,<python><pandas>,2016-01-01 09:55:22,HQ
3,34555448,Reader Always gimme NULL,"I'm so new to C#, I wanna make an application ...",<sql-server><c#-4.0>,2016-01-01 10:43:45,LQ_EDIT
4,34555752,php rearrange array elements based on condition,basically i have this array:\r\n\r\n array(...,<php>,2016-01-01 11:34:09,LQ_EDIT


In [17]:
test_df.shape

(15000, 6)

In [18]:
import re
def remove_non_ascii( text ):
    return re.sub(r'[^\x00-\x7f]_?',' ', text)

def strip_str( text ):
    return text.strip('"<p>""</p>"').replace( '><',',')

def remove_extra_spaces( text ):
    return " ".join(text.split())

In [19]:
columns_to_clean = ['Title', 'Body', 'Tags']
def clean_df_text( df ):
    for col in columns_to_clean:
        df[col] = df[col].apply( lambda x: remove_non_ascii(x))
        df[col] = df[col].apply( lambda x: remove_extra_spaces(x))
        df[col] = df[col].apply( lambda x: strip_str(x))
    

In [20]:
clean_df_text( train_df )
clean_df_text( test_df )

In [21]:
train_df.head()

Unnamed: 0,Id,Title,Body,Tags,CreationDate,Y
0,34552656,Java: Repeat Task Every Random Seconds,I'm already familiar with repeating tasks ever...,"java,repeat",2016-01-01 00:21:59,LQ_CLOSE
1,34553034,Why are Java Optionals immutable?,I'd like to understand why Java 8 Optionals we...,"java,optional",2016-01-01 02:03:20,HQ
2,34553174,Text Overlay Image with Darkened Opacity React...,I am attempting to overlay a title over an ima...,"javascript,image,overlay,react-native,opacity",2016-01-01 02:48:24,HQ
3,34553318,Why ternary operator in swift is so picky?,"The question is very simple, but I just could ...","swift,operators,whitespace,ternary-operator,op...",2016-01-01 03:30:17,HQ
4,34553755,hide/show fab with scale animation,I'm using custom floatingactionmenu. I need to...,"android,material-design,floating-action-button",2016-01-01 05:21:48,HQ


In [22]:
def create_dataset( train_df, test_df ):
    ds = DatasetDict({
        'train': Dataset.from_pandas(train_df),
        'test': Dataset.from_pandas( test_df )
    })
    print(ds)
    return ds

def format_prompt( sample ):
    return f""" given the Title and Tags of the programming problem you need to generate the body of problem.
                Title: {sample['Title']}
                Tags: {sample['Tags']}
                Answer: {sample['Body']}
    """

# def dataset_tokenize( sample ):
#     tokenizer.truncation_side = "left"
#     text = sample['text']
#     tokenized_inputs = tokenizer(    
#                          text, 
#                          return_tensors="pt", 
#                          truncation=True, 
#                          padding='max_length',
#                          max_length=Config.CONTEXT_LENGTH)
#     return tokenized_inputs

def tokenize(sample):
    outputs = tokenizer(
        sample["text"],
        truncation=True,
        max_length=Config.CONTEXT_LENGTH,
        return_overflowing_tokens=True,
        return_length=True,
    )
    input_batch = []
    for length, input_ids in zip(outputs["length"], outputs["input_ids"]):
        if length == Config.CONTEXT_LENGTH:
            input_batch.append(input_ids)
    return {"input_ids": input_batch}


In [23]:
train_df['text'] = train_df.apply( lambda sample: format_prompt(sample), axis=1)
test_df['text'] = test_df.apply( lambda sample: format_prompt(sample), axis=1)

In [24]:

ds = create_dataset( train_df.head(1000), test_df.head(1000) )

DatasetDict({
    train: Dataset({
        features: ['Id', 'Title', 'Body', 'Tags', 'CreationDate', 'Y', 'text'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['Id', 'Title', 'Body', 'Tags', 'CreationDate', 'Y', 'text'],
        num_rows: 1000
    })
})


In [25]:
outputs = tokenizer(
    ds["train"][:2]["text"],
    truncation=True,
    padding='max_length',
    max_length=Config.CONTEXT_LENGTH,
    return_overflowing_tokens=True,
    return_length=True,
)

print(f"Input IDs length: {len(outputs['input_ids'])}")
print(f"Input chunk lengths: {(outputs['length'])}")
print(f"Chunk mapping: {outputs['overflow_to_sample_mapping']}")

Input IDs length: 2
Input chunk lengths: [512, 512]
Chunk mapping: [0, 1]


### Tokenize train and test data set

In [26]:
tokenized_datasets = ds.map(
    tokenize, batched=True, remove_columns=ds["train"].column_names,
)
tokenized_datasets

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids'],
        num_rows: 228
    })
    test: Dataset({
        features: ['input_ids'],
        num_rows: 171
    })
})

In [27]:
train_ds = tokenized_datasets['train']
test_ds = tokenized_datasets['test']

### SFT trainer

In [28]:
from trl import SFTTrainer
from peft import get_peft_config, get_peft_model, LoraConfig, TaskType, prepare_model_for_kbit_training
import evaluate

In [29]:
def compute_metrics(pred):
    labels_ids = pred.label_ids
    pred_ids = pred.predictions

    # all unnecessary tokens are removed
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

    rouge_output = rouge.compute(predictions=pred_str, references=label_str, rouge_types=["rouge2"])["rouge2"].mid

    return {
        "rouge2_precision": round(rouge_output.precision, 4),
        "rouge2_recall": round(rouge_output.recall, 4),
        "rouge2_fmeasure": round(rouge_output.fmeasure, 4),
    }

In [30]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [31]:
peft_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    inference_mode=False, 
    r=8, 
    lora_alpha=32, 
    lora_dropout=0.1,
    target_modules = ["Wqkv"],
)
model = get_peft_model(model, peft_config)
print_trainable_parameters(model)

trainable params: 1572864 || all params: 815863808 || trainable%: 0.1927851173905731


In [32]:
# hyperparameters
lr = 1e-3 # size of optimization step 
batch_size = 1 # number of examples processed per optimziation step
num_epochs = 2 # number of times model runs through training data
tokenizer.pad_token = tokenizer.eos_token
# define training arguments
training_args = TrainingArguments(
    output_dir= Config.MODEL_CHECKPOINT + "-ques-ans",
    learning_rate=lr,
    per_device_train_batch_size=batch_size, 
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    logging_steps=1,
    warmup_steps=2,
    lr_scheduler_type="cosine",
    fp16=True,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    optim="paged_adamw_8bit",
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [33]:
tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,mlm=False)

In [34]:
logging_steps=100## verify data collector samples
out = data_collator([tokenized_datasets["train"][i] for i in range(5)])
for key in out:
    print(f"{key} shape: {out[key].shape}")

You're using a CodeGenTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


input_ids shape: torch.Size([5, 512])
attention_mask shape: torch.Size([5, 512])
labels shape: torch.Size([5, 512])


In [35]:
# del trainer

In [36]:
trainer = Trainer(
    model=model.to(Config.DEVICE), # our peft model
    args=training_args, # hyperparameters
    train_dataset=train_ds, # training data
    eval_dataset=test_ds, # validation data
    tokenizer=tokenizer, # define tokenizer
    data_collator=data_collator, # this will dynamically pad examples in each batch to be equal length
    compute_metrics=compute_metrics, # evaluates model using compute_metrics() function from before
)

In [37]:
%%time
torch.cuda.empty_cache()
trainer.train()

Epoch,Training Loss,Validation Loss


OutOfMemoryError: CUDA out of memory. Tried to allocate 6.35 GiB (GPU 0; 15.90 GiB total capacity; 7.35 GiB already allocated; 6.36 GiB free; 8.65 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

### Inference

In [38]:
import torch
from transformers import pipeline

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
pipe = pipeline(
    "text-generation", model=Config.MODEL_ID, cached_dir=Config.MODEL_CHECKPOINT + "-ques-ans",trust_remote_code=True, device=device
)

# creater trainer object
trainer = Trainer(
    model=model, # our peft model
    args=training_args, # hyperparameters
    train_dataset=train_ds, # training datatxt = """\
# create some data
x = np.random.randn(100)
y = np.random.randn(100)

# create scatter plot with x, y
"""
print(pipe(txt, num_return_sequences=1)[0]["generated_text"])
    eval_dataset=test_ds, # validation data
    tokenizer=tokenizer, # define tokenizer
    data_collator=data_collator, # this will dynamically pad examples in each batch to be equal length
#     compute_metrics=compute_metrics, # evaluates model using compute_metrics() function from before
)



SyntaxError: unterminated triple-quoted string literal (detected at line 26) (826445565.py, line 19)