## Training

In [2]:
# ! apt update && apt install -y zip unzip

In [4]:
# ! unzip ./phase_2_input_data.zip

In [1]:
# ! pip install -q datasets unsloth pandas

In [1]:
import pandas as pd
# Load the feature and label tables from their JSON-lines files and report their shapes.
train_features_df = pd.read_json("training_data/train.features", lines=True)
train_labels_df = pd.read_json("training_data/train.labels", lines=True)
print(train_features_df.shape, train_labels_df.shape)

# Join features with labels on the shared `indoml_id` key (default inner join).
train_df = train_features_df.merge(train_labels_df, on="indoml_id")

(561838, 4) (561838, 5)


In [2]:
# Combine the four taxonomy columns into a single composite label string,
# joined by " __", so later cells can count and stratify on the full path.
label_sep = " __"
train_df["label"] = (
    train_df['supergroup']
    + label_sep + train_df['group']
    + label_sep + train_df['module']
    + label_sep + train_df['brand']
)

In [3]:
# Count how many training rows carry each composite label.
# The counts are taken from the Series itself rather than from the column
# names produced by `value_counts().reset_index()`: pandas < 2.0 names those
# columns "index"/"label" instead of "label"/"count", which would make a
# `count_df["count"]` lookup fail with a KeyError.
label_counts = train_df["label"].value_counts()

# Two-column frame ("label", "count"), named explicitly so it looks the same
# on every pandas version.
count_df = label_counts.rename_axis("label").reset_index(name="count")

# Labels that occur exactly once in the training data.
single_label = label_counts[label_counts == 1].index.tolist()

In [4]:
# Rows whose composite label is unique in the data. They are set aside here
# and concatenated back into the training split in a later cell.
is_singleton = train_df["label"].isin(single_label)
df1 = train_df[is_singleton]

In [5]:
# Keep only the rows whose label occurs more than once, then show the new shape.
has_repeated_label = ~train_df["label"].isin(single_label)
train_df = train_df[has_repeated_label]
train_df.shape

(560118, 9)

In [6]:
# ! pip install -q scikit-learn

In [7]:
# Perform a stratified split on the composite `label` column
# (supergroup __group __module __brand), not on brand alone.
# NOTE: test_size=0.83 sends 83% of the rows to `val_df`; only the remaining
# 17% (`df2`) goes into the training set.
# `random_state` pins the shuffle so the split is reproducible after a kernel
# restart; 3407 matches the seed used for LoRA and the trainer in this notebook.
from sklearn.model_selection import train_test_split
df2, val_df = train_test_split(
    train_df,
    test_size=0.83,
    stratify=train_df['label'],
    random_state=3407,
)
df2.shape, val_df.shape

((95220, 9), (464898, 9))

In [8]:
# Training set = singleton-label rows (df1) stacked on top of the stratified
# training split (df2); rows are concatenated along the default axis 0.
train_df = pd.concat([df1, df2])
train_df.shape

(96940, 9)

In [9]:
import json
from tqdm import tqdm
# Turn every training row into a two-turn ShareGPT-style conversation:
# the human turn is a JSON object of the input features, the gpt turn is a
# JSON object of the four target fields.
max_length = 0
lines_list = []
for _, row in tqdm(train_df.iterrows()):
    human_val = json.dumps(
        {"description": row["description"], "retailer": row["retailer"], "price": row["price"]}
    )
    gpt_val = json.dumps(
        {"supergroup": row["supergroup"], "group": row["group"], "module": row["module"], "brand": row["brand"]}
    )
    lines_list.append(
        [
            {'from': 'human', 'value': human_val},
            {'from': 'gpt', 'value': gpt_val},
        ]
    )
    # Track the longest prompt+answer pair, measured in characters (not tokens).
    max_length = max(max_length, len(human_val) + len(gpt_val))

96940it [00:07, 12806.77it/s]


In [10]:
from datasets import Dataset

# Wrap the conversations in a single-column Hugging Face Dataset.
dataset = Dataset.from_dict({'conversations': lines_list})

# Print the dataset summary (column names and row count) as a sanity check.
print(dataset)

Dataset({
    features: ['conversations'],
    num_rows: 96940
})


In [12]:
# ! pip uninstall transformers -y && pip install --upgrade --no-cache-dir "git+https://github.com/huggingface/transformers.git"

In [13]:
# ! pip install -qU unsloth

In [11]:
import huggingface_hub
# Authenticate with the Hugging Face Hub; called without arguments, so the
# token is requested interactively (the captured output shows a login widget).
huggingface_hub.login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [12]:
# ! pip install -q huggingface-hub==0.24.7

In [13]:
# !pip uninstall unsloth -y && pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

In [14]:
# ! pip uninstall transformers -y && pip install --upgrade --no-cache-dir "git+https://github.com/huggingface/transformers.git"

In [15]:
from unsloth import FastLanguageModel
import torch

# Arguments for loading the base checkpoint: 4-bit loading, bfloat16 dtype,
# and a 512-token maximum sequence length.
load_kwargs = dict(
    model_name = "srinathmkce/indoml_100k_llama_model_16bit",
    max_seq_length = 512,
    dtype = torch.bfloat16,
    load_in_4bit = True,
    use_gradient_checkpointing = "unsloth",
    attn_implementation="flash_attention_2",
)

model, tokenizer = FastLanguageModel.from_pretrained(**load_kwargs)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
==((====))==  Unsloth 2024.10.3: Fast Llama patching. Transformers = 4.46.0.dev0.
   \\   /|    GPU: NVIDIA A40. Max memory: 44.352 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.1+cu121. CUDA = 8.6. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [16]:
# Projection layers that receive LoRA adapters: attention (q/k/v/o) and
# MLP (gate/up/down).
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

# Attach rank-16 LoRA adapters (alpha 16, no dropout, no bias terms) with a
# fixed random seed.
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,
    lora_alpha = 16,
    lora_dropout = 0,
    bias = "none",
    target_modules = lora_target_modules,
    use_gradient_checkpointing = "unsloth",
    random_state = 3407,
    use_rslora = False,
    loftq_config = None,
)

Unsloth 2024.10.3 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [17]:
from unsloth.chat_templates import get_chat_template

# Key/role names used by the ShareGPT-style conversations built earlier
# ("from"/"value", "human"/"gpt"), mapped onto the chat template's own names.
sharegpt_mapping = {"role" : "from", "content" : "value", "user" : "human", "assistant" : "gpt"}

tokenizer = get_chat_template(
    tokenizer,
    chat_template = "llama-3",
    mapping = sharegpt_mapping,
)

def formatting_prompts_func(examples):
    """Render each conversation of a batch into one chat-template string.

    Args:
        examples: Batch mapping with a "conversations" entry holding a list
            of conversations.

    Returns:
        Dict with a single "text" key: the rendered strings, one per
        conversation, untokenized and without a trailing generation prompt.
    """
    texts = []
    for convo in examples["conversations"]:
        rendered = tokenizer.apply_chat_template(convo, tokenize = False, add_generation_prompt = False)
        texts.append(rendered)
    return {"text": texts}

# from datasets import load_dataset
# dataset = load_dataset("srinathmkce/indoml", split="train")

# Add the rendered "text" column to the dataset, processing rows in batches.
dataset = dataset.map(formatting_prompts_func, batched = True)

Map:   0%|          | 0/96940 [00:00<?, ? examples/s]

In [18]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Use bf16 when the hardware supports it, otherwise fall back to fp16.
use_bf16 = is_bfloat16_supported()

# Hyperparameters for a single-epoch run. Batch size per optimizer step on
# one device is per_device_train_batch_size * gradient_accumulation_steps
# = 16 * 4 = 64.
training_args = TrainingArguments(
    # auto_find_batch_size = True,
    per_device_train_batch_size = 16,
    gradient_accumulation_steps = 4,
    warmup_steps = 5,
    num_train_epochs=1,
    # max_steps = 100,
    learning_rate = 2e-4,
    fp16 = not use_bf16,
    bf16 = use_bf16,
    logging_steps = 1,
    optim = "adamw_8bit",
    weight_decay = 0.01,
    lr_scheduler_type = "linear",
    seed = 3407,
    output_dir = "outputs",
)

# Supervised fine-tuning on the rendered "text" column, without sequence packing.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = 512,
    dataset_num_proc = 2,
    packing = False,
    args = training_args,
)

Map (num_proc=2):   0%|          | 0/96940 [00:00<?, ? examples/s]

  super().__init__(
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [19]:
#@title Show current memory stats
# Report the GPU name, its total memory and the peak memory reserved so far,
# converting byte counts to GB (1024**3 bytes) rounded to three decimals.
bytes_per_gb = 1024 ** 3
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / bytes_per_gb, 3)
max_memory = round(gpu_stats.total_memory / bytes_per_gb, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA A40. Max memory = 44.352 GB.
5.633 GB of memory reserved.


In [20]:
# Run the fine-tuning loop configured on `trainer`; the value returned by
# `train()` is kept in `trainer_stats`.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 96,940 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 16 | Gradient Accumulation steps = 4
\        /    Total batch size = 64 | Total steps = 1,514
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss
1,3.2461
2,3.294
3,3.2188
4,3.0413
5,2.6624
6,2.0569
7,1.7708
8,1.5572
9,1.4388
10,1.348


In [21]:
# Upload the model and the tokenizer to the same private Hub repository
# (the captured output shows the adapter weights being uploaded).
lora_repo_id = "srinathmkce/indoml_100k_llama_model_epoch2_lora"
model.push_to_hub(lora_repo_id, private=True)
tokenizer.push_to_hub(lora_repo_id, private=True)

README.md:   0%|          | 0.00/606 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/168M [00:00<?, ?B/s]

Saved model to https://huggingface.co/srinathmkce/indoml_100k_llama_model_epoch2


tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

In [22]:
# Push the merged model together with the tokenizer to the Hub using the
# "merged_16bit" save method (the log reports 4-bit and LoRA weights being
# merged to 16-bit).
model.push_to_hub_merged("srinathmkce/indoml_100k_llama_model_epoch2", tokenizer, save_method = "merged_16bit")

Unsloth: You are pushing to hub, but you passed your HF username = srinathmkce.
We shall truncate srinathmkce/indoml_100k_llama_model_epoch2 to indoml_100k_llama_model_epoch2


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 331.41 out of 503.53 RAM for saving.


100%|██████████| 32/32 [00:00<00:00, 37.64it/s]


Unsloth: Saving tokenizer...

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

 Done.
Unsloth: Saving model... This might take 5 minutes for Llama-7b...


Upload 4 LFS files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Done.
Saved merged model to https://huggingface.co/srinathmkce/indoml_100k_llama_model_epoch2


## Inference

In [23]:
! pip install -q vllm

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torchaudio 2.0.2+cu118 requires torch==2.0.1, but you have torch 2.4.0 which is incompatible.

[notice] A new release of pip is available: 23.3.1 -> 24.2
[notice] To update, run: python -m pip install --upgrade pip


In [2]:
from vllm import LLM, SamplingParams

In [3]:
# Start a vLLM engine on the merged model repository, with the maximum model
# length set to 1024 tokens.
llm = LLM(model="srinathmkce/indoml_100k_llama_model_epoch2", max_model_len=1024)

config.json:   0%|          | 0.00/818 [00:00<?, ?B/s]

INFO 10-21 08:36:21 llm_engine.py:237] Initializing an LLM engine (v0.6.3.post1) with config: model='srinathmkce/indoml_100k_llama_model_epoch2', speculative_config=None, tokenizer='srinathmkce/indoml_100k_llama_model_epoch2', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=1024, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=srinathmkce/indoml_100k_llama_model_epoch2, num_scheduler_steps=1, chunked_prefill

tokenizer_config.json:   0%|          | 0.00/51.8k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/459 [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/225 [00:00<?, ?B/s]

INFO 10-21 08:36:26 model_runner.py:1056] Starting to load model srinathmkce/indoml_100k_llama_model_epoch2...
INFO 10-21 08:36:26 weight_utils.py:243] Using model weights format ['*.safetensors']


model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]


INFO 10-21 08:39:22 model_runner.py:1067] Loading model weights took 14.9595 GB
INFO 10-21 08:39:23 gpu_executor.py:122] # GPU blocks: 12042, # CPU blocks: 2048
INFO 10-21 08:39:23 gpu_executor.py:126] Maximum concurrency for 1024 tokens per request: 188.16x
INFO 10-21 08:39:26 model_runner.py:1395] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 10-21 08:39:26 model_runner.py:1399] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 10-21 08:39:39 model_runner.py:1523] Graph capturing finished in 13 secs.


In [4]:
# Sampling settings shared by all generation calls below: temperature 0.1
# and at most 128 generated tokens per prompt.
sampling_params = SamplingParams(temperature=0.1, max_tokens=128)

In [5]:
import pandas as pd
import json
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

# Load the final test features (JSON-lines) and report their shape.
df = pd.read_json("final_test_data/final_test_data.features", lines=True)
print(df.shape)

# Draw 100 random rows (unseeded) for a quick smoke test of the inference path.
random_df = df.sample(n=100)
random_df.shape

(184664, 4)


(100, 4)

In [6]:
import json
# Smoke test: send only the first sampled row through the chat call.
# The `break` at the end of the body stops the loop after one iteration.
output_list = []
for index, row in tqdm(random_df.iterrows()):
    human_payload = {"description": row["description"], "retailer": row["retailer"], "price": row["price"]}
    conversation = [{"role": "human", "content": json.dumps(human_payload)}]
    outputs = llm.chat(conversation, sampling_params=sampling_params, use_tqdm=False)
    output_list.append(outputs)
    break

0it [00:01, ?it/s]


In [8]:
def print_outputs(outputs):
    """Print the first generated completion of every request output.

    Args:
        outputs: Iterable of objects exposing ``outputs[0].text``, such as
            the list returned by ``llm.chat``. Only the generated text is
            read, so the elements do not need a ``prompt`` attribute.

    Prints one completion per line, followed by an 80-character dashed rule.
    """
    for output in outputs:
        print(output.outputs[0].text)
    print("-" * 80)


In [9]:
# Print the generated text held in `outputs` (set by the smoke-test loop above).
print_outputs(outputs)

{"supergroup": "biscuits & confectionery & snacks", "group": "snacks", "module": "snacks chips crisps", "brand": "walkers crisps"}
--------------------------------------------------------------------------------


In [14]:
import json

def build_prompt(row):
    """Build the single-turn chat conversation for one product row.

    Args:
        row: Mapping-like object with a ``.get(key, default)`` method (for
            example a pandas Series or a dict). The keys "description",
            "retailer" and "price" are read; a missing key is replaced by
            the string "unknown".

    Returns:
        A one-element list holding a message dict with role "human" whose
        content is the JSON encoding of the three fields. If encoding fails
        for any reason, all three fields are set to "unknown" instead.
    """
    field_names = ("description", "retailer", "price")
    payload = {name: row.get(name, "unknown") for name in field_names}

    try:
        content = json.dumps(payload)
    except Exception:
        # Best-effort fallback: replace the whole payload with placeholders.
        content = json.dumps(dict.fromkeys(field_names, "unknown"))

    return [{"role": "human", "content": content}]


In [18]:
from tqdm import tqdm
# Build one single-turn conversation per test row, in DataFrame order.
# `total` lets tqdm show a percentage and an ETA for the row iterator.
conversation = [build_prompt(row) for _, row in tqdm(df.iterrows(), total=len(df))]

# Generate all predictions in a single batched call.
outputs = llm.chat(conversation, sampling_params=sampling_params, use_tqdm=True)

# Parse each generation as JSON. A malformed or truncated generation must not
# abort the loop (and throw away the whole batch run), so it is replaced by
# placeholder values; this keeps exactly one prediction per input row.
# Assumes `outputs` is in the same order as `conversation`, as the original
# positional indexing did.
expected_fields = ("supergroup", "group", "module", "brand")
batch_output = []
unparsed_positions = []
for position, output in enumerate(outputs):
    json_str = output.outputs[0].text
    try:
        prediction = json.loads(json_str)
    except json.JSONDecodeError:
        prediction = None
    if not isinstance(prediction, dict):
        unparsed_positions.append(position)
        prediction = dict.fromkeys(expected_fields, "unknown")
    batch_output.append(prediction)

if unparsed_positions:
    print(
        f"{len(unparsed_positions)} generations were not valid JSON objects "
        f"and were filled with 'unknown' (first positions: {unparsed_positions[:10]})"
    )


184664it [00:12, 14700.70it/s]
Processed prompts: 100%|██████████| 184664/184664 [1:03:27<00:00, 48.50it/s, est. speed input: 1930.48 toks/s, output: 1928.09 toks/s]


In [19]:
# Assemble the parsed predictions (a list of dicts) into a DataFrame, one row
# per prediction, and preview the first rows.
result_df = pd.DataFrame(batch_output)
result_df.head()

Unnamed: 0,supergroup,group,module,brand
0,garden & flora,garden & flora detail unknown total,garden & flora,receipt all
1,homecare,fresheners deodorizers,air fresheners continuous,airwick
2,beverages alcoholic,beer lager ales,beer lager ales,budweiser
3,beverages alcoholic,beer lager ales,beer lager ales,4 pure
4,automotive,automotive detail unknown total,automotive,receipt all


In [20]:
# Save the predictions to CSV without the index column. At this point
# `result_df` holds only the predicted fields; `indoml_id` is attached in the
# next cell.
result_df.to_csv("./phase_2_test_set4_predictions.csv", index=False)

In [21]:
# Put the test-set ids in front of the four predicted columns; the two pieces
# are concatenated side by side (axis=1).
prediction_columns = ['supergroup', 'group', 'module', 'brand']
result_df = pd.concat([df["indoml_id"], result_df[prediction_columns]], axis=1)

In [22]:
# Count missing values per column.
result_df.isna().sum()

indoml_id     0
supergroup    0
group         0
module        0
brand         0
dtype: int64

In [23]:
# Show the dtype of each column.
result_df.dtypes

indoml_id      int64
supergroup    object
group         object
module        object
brand         object
dtype: object

In [24]:
# Convert every prediction row into a plain dict with the id first, followed
# by the four category fields.
output_fields = ("indoml_id", "supergroup", "group", "module", "brand")
category_list = [
    {field: record[field] for field in output_fields}
    for _, record in tqdm(result_df.iterrows())
]

184664it [00:11, 15896.70it/s]


In [25]:
# Save predictions to JSON file
import json
# Write the predictions as JSON lines: one serialized record per line.
with open("test_gradientgurus5.predict", "w") as fp:
    fp.writelines(json.dumps(record) + "\n" for record in category_list)