### Install Dependencies 

In [None]:
%pip install einops
%pip install peft
%pip install trl
%pip install tensorboard
%pip install https://github.com/jllllll/bitsandbytes-windows-webui/releases/download/wheels/bitsandbytes-0.41.1-py3-none-win_amd64.whl
%pip install tokenizers
%pip install torch==2.1.2+cu121 --index-url https://download.pytorch.org/whl/cu121
%pip install torchaudio==2.1.2+cu121 --index-url https://download.pytorch.org/whl/cu121
%pip install torchvision==0.16.2+cu121 --index-url https://download.pytorch.org/whl/cu121
%pip install ipywidgets
%pip install -q -U git+https://github.com/huggingface/transformers.git
%pip install -q -U git+https://github.com/huggingface/peft.git
%pip install -q -U git+https://github.com/huggingface/accelerate.git
%pip install -q -U datasets scipy ipywidgets

### CHECK TO MAKE SURE YOU GOT A GPU

In [1]:
import torch
torch.cuda.is_available()

True

Helper Functions to normalize your data

In [2]:
import re
def remove_special_characters_and_spaces(input_string):
    # Define a regular expression pattern to match special characters and spaces
    pattern = r'[^a-zA-Z0-9]+'  # This pattern will keep only letters and digits

    # Use the sub method to replace matches of the pattern with an empty string
    clean_string = re.sub(pattern, '', input_string)

    return clean_string

def remove_special_characters(input_string):
    # Define a regular expression pattern to match special characters and spaces
    pattern = r'[^a-zA-Z0-9\s]+'  # This pattern will keep only letters and digits

    # Use the sub method to replace matches of the pattern with an empty string
    clean_string = re.sub(pattern, '', input_string)

    return clean_string

### Proto Prompting 

In [4]:
import csv
import json

Converting Your CSV to JSON

In [5]:
def convert_csv_to_json(csv_file_path):
    # Read CSV file
    with open(csv_file_path, 'r') as file:
        reader = csv.DictReader(file)
        rows = list(reader)
    # Convert CSV data to JSON
    json_data = json.dumps(rows, indent=4)

    # Save JSON data to a file (optional)
    with open(f'{csv_file_path}.json', 'w') as json_file:
        json_file.write(json_data)

    return json_data

In [55]:

import random

# Specify the path to your CSV file
csv_file_path = '../data/All Playlists Combined.csv'

# Convert CSV to JSON
json_data = convert_csv_to_json(csv_file_path)

print("Conversion completed. JSON data:")
# print(json_data)
json_data = json.loads(json_data)
songs = {}
songs['train']={
    'target':[],
    'lyrics':[],
    'id':[]
}
songs['validation']={
    'target':[],
    'lyrics':[],
    'id':[]
}
songs['test']={
    'target':[],
    'lyrics':[],
    'id':[]
}
count = 1
for i in json_data:
    track_name = remove_special_characters(i["track_name"])
    lyrics = remove_special_characters(i["lyrics"])
    if random.randint(0, 125125124) < 125125124/100:
        songs['test']['id'].append(f"test-{count}")
        songs['test']['target'].append(track_name)
        songs['test']['lyrics'].append(lyrics)
        songs['validation']['id'].append(f"validation-{count}")
        songs['validation']['target'].append(track_name)
        songs['validation']['lyrics'].append(lyrics)
    else:
        songs['train']['id'].append(f"train-{count}")
        songs['train']['target'].append(track_name)
        songs['train']['lyrics'].append(lyrics)
    count+=1
with open(f'../data/songs.json', 'w', encoding='utf-8') as f:
    json.dump(songs, f, ensure_ascii=True, indent=4)
    f.close()          

Conversion completed. JSON data:


#### Fine-tuning a Mixtral-7b 

In [3]:
from accelerate import FullyShardedDataParallelPlugin, Accelerator
from torch.distributed.fsdp.fully_sharded_data_parallel import FullOptimStateDictConfig, FullStateDictConfig
fsdp_plugin = FullyShardedDataParallelPlugin(
    state_dict_config=FullStateDictConfig(offload_to_cpu=True, rank0_only=False),
    optim_state_dict_config=FullOptimStateDictConfig(offload_to_cpu=True, rank0_only=False),
)
accelerator = Accelerator(fsdp_plugin=fsdp_plugin)

In [3]:
from datasets import load_dataset, DatasetDict

train_dataset = load_dataset("json", data_files='../data/songs.json',field="train", split='all')
eval_dataset = load_dataset("json", data_files='../data/songs.json', field="validation", split='all') 
test_dataset = load_dataset("json", data_files='../data/songs.json', field="test", split='all')

dataset = DatasetDict(
    {
        "train":train_dataset,
        "validation":eval_dataset,
        "test":test_dataset
    }
)

In [2]:
dataset

DatasetDict({
    train: Dataset({
        features: ['target', 'lyrics', 'id'],
        num_rows: 1241
    })
    validation: Dataset({
        features: ['target', 'lyrics', 'id'],
        num_rows: 1241
    })
    test: Dataset({
        features: ['target', 'lyrics', 'id'],
        num_rows: 1241
    })
})

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig


base_model_id = "../models/Mistral-7B-Instruct-v0.2"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)
model = AutoModelForCausalLM.from_pretrained(base_model_id, quantization_config=bnb_config)

In [6]:
tokenizer = AutoTokenizer.from_pretrained(
    base_model_id,
    model_max_length=512,
    padding_side="left",
    add_eos_token=True)
tokenizer.pad_token = tokenizer.eos_token

In [7]:
def tokenize(prompt):
    result = tokenizer(
        prompt,
        truncation=True,
        max_length=512,
        padding="max_length",
    )
    result["labels"] = result["input_ids"].copy()
    return result

In [8]:
def generate_and_tokenize_prompt(data_point):
    full_prompt =f"""Given a target song title, write a christmas song that corresponds to the target song title.


### Target song title:
{data_point["target"]}


### Song Lyrics:
{data_point["lyrics"]}
"""
    return tokenize(full_prompt)

In [9]:
tokenized_train_dataset = train_dataset.map(generate_and_tokenize_prompt)
tokenized_val_dataset = eval_dataset.map(generate_and_tokenize_prompt)

Map:   0%|          | 0/1241 [00:00<?, ? examples/s]

In [10]:
print(tokenized_train_dataset[4]['input_ids'])

[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 12628, 264, 2718, 4034, 3941, 28725, 3324, 264, 22735, 4876, 4034, 369, 16535,

In [11]:
print("Target song title: " + test_dataset[1]['target'])
print("Song Lyrics: " + test_dataset[1]['lyrics'] + "\n")

Target song title: It's Beginning to Look a Lot Like Christmas
Song Lyrics: 

It's beginning to look a lot like Christmas
Everywhere you go
Take a look in the five-and-ten
Glistening once again
With candy canes and silver lanes aglow

It's beginning to look a lot like Christmas
Toys in every store
But the prettiest sight to see
Is the holly that will be
On your own front door

A pair of hop along boots and a pistol that shoots
Is the wish of Barney and Ben
Dolls that will talk and will go for a walk
Is the hope of Janice and Jen
And mom and dad can hardly wait for school to start again

It's beginning to look a lot like Christmas
Everywhere you go
Now there's a tree in the Grand Hotel
One in the park as well
The sturdy kind that doesn't mind the snow

It's beginning to look a lot like Christmas
Soon the bells will start
And the thing that will make them ring
Is the carol that you sing
Right within your heart

A pair of hop along boots and a pistol that shoots
Is the wish of Barney and 

In [25]:
eval_prompt = f"""Given a target song title, provide the lyrics of song that corresponds to the target song title.


### Target song title:
    {test_dataset[1]['target']}


### Song Lyrics:
"""

In [17]:
model_input = tokenizer(eval_prompt, return_tensors="pt").to("cuda")
model.eval()
with torch.no_grad():
    print(tokenizer.decode(model.generate(**model_input, max_new_tokens=256, padding_side='left',pad_token_id=2)[0], skip_special_tokens=True))

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Given a target song title, provide the lyrics of song that corresponds to the target song title.


### Target song title:
    It's Beginning to Look a Lot Like Christmas


### Song Lyrics:
 1.
(Verse 1)
What's that coming over the hill,
Is it a man with a gun,
Or a sleepy old town,
Or just a few more days 'til Christmas?

It's beginning to look a lot like Christmas,
Everywhere I go.
Stores are filled with bright and shiny things,
And families gather together in joy.

2.
(Verse 2)
Snow on the ground,
Gifts wrapped and placed under the tree for all to see,
Candy canes on shelves in every store,
And children listening,
Listening for sleigh bells in the snow.

It's beginning to look a lot like Christmas,
Everywhere I go.
There'll be much to do,
With just a week 'til Christmas day.

3.
(Verse 3)
There'll be parties for hosting,
Marshmallows for toasting,
And caroling out in the snow.
There'll be noises,
And bright lights,
Telling me,
Yes, it's beginning to look a lot like Christmas


In [12]:
from peft import prepare_model_for_kbit_training
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [13]:
from peft import LoraConfig, get_peft_model
config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
        "lm_head",
    ],
    bias="none",
    lora_dropout=0.05,  # Conventional
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, config)
print_trainable_parameters(model)
# Apply the accelerator. You can comment this out to remove the accelerator.
model = accelerator.prepare_model(model)

trainable params: 21260288 || all params: 3773331456 || trainable%: 0.5634354746703705


In [14]:
print(model)

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): MistralForCausalLM(
      (model): MistralModel(
        (embed_tokens): Embedding(32000, 4096)
        (layers): ModuleList(
          (0-31): 32 x MistralDecoderLayer(
            (self_attn): MistralSdpaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): lora.Linear4bit(
                (base_layer): L

In [None]:
if torch.cuda.device_count() > 1: # If more than 1 GPU
    model.is_parallelizable = True
    model.model_parallel = True

In [30]:
import transformers
from datetime import datetime


project = "xmas-finetune"
base_model_name = "mistral"
run_name = base_model_name + "-" + project
output_dir = "../models/" + run_name


tokenizer.pad_token = tokenizer.eos_token


trainer = transformers.Trainer(
    model=model,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    args=transformers.TrainingArguments(
        output_dir=output_dir,
        warmup_steps=5,
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        max_steps=5000,
        learning_rate=2.5e-5, # Want about 10x smaller than the Mistral learning rate
        logging_steps=1,
        bf16=True,
        optim="paged_adamw_8bit",
        logging_dir="./logs",        # Directory for storing logs
        save_strategy="steps",       # Save the model checkpoint every logging step
        save_steps=100,                # Save checkpoints every 50 steps
        evaluation_strategy="steps", # Evaluate the model every logging step
        eval_steps=500,
        save_total_limit=3,              # Evaluate and save checkpoints every 50 steps
        do_eval=True,                # Perform evaluation at the end of training          # Comment this out if you don't want to use weights & baises
        run_name=f"{run_name}-{datetime.now().strftime('%Y-%m-%d-%H-%M')}"          # Name of the W&B run (optional)
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!


In [None]:
trainer.train()

In [None]:
trainer.train(resume_from_checkpoint="../models/mistral-xmas-finetune/checkpoint-900")

In [21]:
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,  # Mistral, same as before
    quantization_config=bnb_config,  # Same quantization config as before
    device_map="auto",
    trust_remote_code=True,
    use_auth_token=True
)
tokenizer = AutoTokenizer.from_pretrained(base_model_id, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token



Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

You are calling `save_pretrained` to a 4-bit converted model, but your `bitsandbytes` version doesn't support it. If you want to save 4-bit models, make sure to have `bitsandbytes>=0.41.3` installed.


In [None]:
from peft import PeftModel
adapter_path = "../models/mistral-xmas-finetune/checkpoint-900"
ft_model = PeftModel.from_pretrained(base_model, adapter_path)

In [38]:
eval_prompt ="""Given a target song title, write a christmas song that corresponds to the target song title.


### Target song title:
Twas once a moonlit night


### Song Lyrics:
"""
model_input = tokenizer(eval_prompt, return_tensors="pt").to("cuda")


In [None]:
ft_model.eval()

In [39]:
with torch.no_grad():
    print(tokenizer.decode(ft_model.generate(**model_input, max_new_tokens=1024, pad_token_id=2)[0], skip_special_tokens=True))

Given a target song title, write a christmas song that corresponds to the target song title.


### Target song title:
Twas once a moonlit night


### Song Lyrics:


[Verse 1]
Oh by gosh by golly gee
It's the big fat Christmas family
I wanna sing about the greatest thing
That ever happened on the 25th thing
So let's all join together friends
And sing about the greatest thing that ever was
Born on a cold December night
The greatest thing was born
Born on a cold December night
The greatest thing was born

[Chorus]
Joy to the world it's a moonlit night
Joy to the world it's a moonlit night
Born in a stable so cold
Blessed is the world with the holy birth
Of the Savior of the world
Born on a cold December night

[Verse 2]
Lay your head on that old cabin door
Come and listen to the old men four
Sing about the greatest thing that
The Lord was born in Bethlehem
So you will know the day will come
When he'll come again and take you home
So by gosh by golly wow
He'll come again and take you home


In [None]:
save_to="../models/mixtral-xmas-finetune" 
ft_model.save_pretrained(save_to, safe_serialization=True, max_shard_size='4GB')
tokenizer.save_pretrained(save_to)