In [1]:
%%capture
!pip install accelerate peft bitsandbytes transformers trl wandb

In [4]:
import os
from datasets import Dataset, load_dataset
import pandas as pd
from peft import LoraConfig
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
    logging,
)
import torch
from trl import SFTTrainer

In [5]:
# Model from Hugging Face hub
base_model = "NousResearch/Meta-Llama-3-8B"

# Fine-tuned model
new_model = "Meta-Llama-3-8B-Crossword"

In [6]:
df = pd.read_csv("/kaggle/input/crossword/nytcrosswords.csv")

In [7]:
df.describe()

Unnamed: 0,Date,Word,Clue
count,781573,781539,781573
unique,10207,63313,493935
top,5/16/1999,ERA,Jai ___
freq,181,634,122


In [8]:
# Keeping only unique rows based on 'Clue' column
df_unique = df.drop_duplicates(subset=['Clue'])

In [9]:
df_unique.describe()

Unnamed: 0,Date,Word,Clue
count,493935,493917,493935
unique,10207,61391,493935
top,10/3/2021,ERA,King's superior
freq,138,356,1


In [10]:
# Shuffling the DataFrame
df_unique = df_unique.sample(frac=1).reset_index(drop=True)

# Calculating number of samples for each split
#total_samples = len(df_unique)
total_samples = 5000 # Currently keeping it to 5000 due to resource constraints
train_size = int(0.7 * total_samples)  # 70% for training
validation_size = int(0.15 * total_samples)   # 15% for validation
test_size = total_samples - train_size - validation_size  # Remaining for test

# Splitting into training, validation, and test sets
train_df = df_unique.iloc[:train_size]
validation_df = df_unique.iloc[train_size:train_size + validation_size]
test_df = df_unique.iloc[train_size + validation_size:]

In [11]:
# Converting pandas DataFrames to datasets
train_dataset = Dataset.from_pandas(train_df)
validation_dataset = Dataset.from_pandas(validation_df)
test_dataset = Dataset.from_pandas(test_df)

In [12]:
# Defining a function to merge intent and snippet into a single column
def merge_columns(example):
    clue = example["Clue"]
    word = example["Word"]
    merged_text = f"<s> [INST]{clue} [/INST] {word} </s>"
    return {"text": merged_text}  # Return a dictionary with the merged text

# Apply the merge function to each split of the dataset
train_dataset = train_dataset.map(merge_columns)
validation_dataset = validation_dataset.map(merge_columns)
test_dataset = test_dataset.map(merge_columns)

# Convert dictionaries to strings
train_dataset = [example["text"] for example in train_dataset]
validation_dataset = [example["text"] for example in validation_dataset]
test_dataset = [example["text"] for example in test_dataset]

# Show the number of samples in each split
print("Number of samples in train split:", len(train_dataset))
print("Number of samples in validation split:", len(validation_dataset))
print("Number of samples in test split:", len(test_dataset))

Map:   0%|          | 0/3500 [00:00<?, ? examples/s]

Map:   0%|          | 0/750 [00:00<?, ? examples/s]

Map:   0%|          | 0/489685 [00:00<?, ? examples/s]

Number of samples in train split: 3500
Number of samples in validation split: 750
Number of samples in test split: 489685


In [13]:
# Convert the list of strings to a dataset object
train_dataset = Dataset.from_dict({"text": train_dataset})
validation_dataset = Dataset.from_dict({"text": validation_dataset})
test_dataset = Dataset.from_dict({"text": test_dataset})

In [14]:
train_dataset[0:5]

{'text': ['<s> [INST]Heroine of Wagner\'s "The Flying Dutchman" [/INST] SENTA </s>',
  '<s> [INST]Distributed [/INST] SOWN </s>',
  '<s> [INST]"Super Hits" company [/INST] KTEL </s>',
  '<s> [INST]"Rotten School" series author [/INST] STINE </s>',
  '<s> [INST]Watson, Willard and Woodhouse [/INST] EMMAS </s>']}

In [15]:
compute_dtype = getattr(torch, "float16")

quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=True,
)

In [16]:
# Load base model
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=quant_config,
    device_map={"": 0}
)
model.config.use_cache = False
model.config.pretraining_tp = 1

config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/177 [00:00<?, ?B/s]

In [17]:
# Load LLaMA tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

In [18]:
# Load LoRA configuration
peft_args = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
)

In [19]:
training_params = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    optim="paged_adamw_32bit",
    save_steps=25,
    logging_steps=25,
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=True,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=False,
    lr_scheduler_type="constant",
    report_to="tensorboard"
)

In [20]:
# Set supervised fine-tuning parameters
trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    peft_config=peft_args,
    dataset_text_field="text",
    max_seq_length=None,
    tokenizer=tokenizer,
    args=training_params,
    packing=False,
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/3500 [00:00<?, ? examples/s]

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [22]:
# Train model
trainer.train()

Step,Training Loss
25,3.7255
50,2.3526
75,2.2271
100,2.0479
125,1.7033
150,1.6015
175,1.587
200,1.576
225,1.607
250,1.5309


TrainOutput(global_step=875, training_loss=1.6729201093401227, metrics={'train_runtime': 2412.1869, 'train_samples_per_second': 1.451, 'train_steps_per_second': 0.363, 'total_flos': 3121172853202944.0, 'train_loss': 1.6729201093401227, 'epoch': 1.0})

In [23]:
# Save trained model
trainer.model.save_pretrained(new_model)

In [24]:
trainer.tokenizer.save_pretrained(new_model)

('Meta-Llama-3-8B-Crossword/tokenizer_config.json',
 'Meta-Llama-3-8B-Crossword/special_tokens_map.json',
 'Meta-Llama-3-8B-Crossword/tokenizer.json')

In [None]:
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer

# Load the fine-tuned model and tokenizer from the local directory
new_model = AutoModelForCausalLM.from_pretrained("Meta-Llama-3-8B-Crossword")
tokenizer = AutoTokenizer.from_pretrained("NousResearch/Meta-Llama-3-8B", trust_remote_code=True)

# Ignore warnings
import logging
logging.getLogger("transformers").setLevel(logging.ERROR)

# Run text generation pipeline with our fine-tuned model
prompt = "<s> <INST> Capital of USA </INST>"
pipe = pipeline(task="text-generation", model=new_model, tokenizer=tokenizer, max_length=200)
result = pipe(prompt)
print(result[0]['generated_text'])

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]