### Загрузка данных

In [5]:
from datasets import load_dataset

dataset = load_dataset("camel-ai/chemistry")
print(dataset)

README.md:   0%|          | 0.00/2.14k [00:00<?, ?B/s]

chemistry.zip:   0%|          | 0.00/21.8M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/20000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['role_1', 'topic;', 'sub_topic', 'message_1', 'message_2'],
        num_rows: 20000
    })
})


In [1]:
import pandas as pd

In [7]:
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['role_1', 'topic;', 'sub_topic', 'message_1', 'message_2'],
        num_rows: 20000
    })
})


In [8]:
print(dataset['train'][0])

{'role_1': 'Chemist_RoleType.ASSISTANT', 'topic;': 'Organic chemistry', 'sub_topic': 'Naming organic compounds', 'message_1': 'What is the IUPAC name for the organic compound with the molecular formula C6H12O2?', 'message_2': 'There can be several isomers with the molecular formula C6H12O2, so it is not possible to provide a specific IUPAC name without more information about the structure of the compound. If you can provide the structure or any additional details, I would be happy to help you determine the IUPAC name.'}


### Загрузка предобученной модели и токенизатора

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

# Загружаем токенизатор
tokenizer = AutoTokenizer.from_pretrained(model_name)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    load_in_8bit=False,
    load_in_4bit=True
)

In [29]:
# Форматирование данных
def format_example(example):
    user_question = example['message_1']
    assistant_answer = example['message_2']

    prompt = f"<s>[INST] {user_question} [/INST] {assistant_answer}</s>"
    return {"text": prompt}

In [12]:
formatted_dataset = dataset['train'].map(format_example)

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

In [13]:
print(formatted_dataset[0]["text"])

<s>[INST] What is the IUPAC name for the organic compound with the molecular formula C6H12O2? [/INST] There can be several isomers with the molecular formula C6H12O2, so it is not possible to provide a specific IUPAC name without more information about the structure of the compound. If you can provide the structure or any additional details, I would be happy to help you determine the IUPAC name.</s>


In [23]:
# Токенизация
def tokenize_function(example):
    tokenized = tokenizer(
        example["text"],
        padding="max_length",
        truncation=True,
        max_length=384,
    )
    # Копируем input_ids как labels
    tokenized["labels"] = tokenized["input_ids"].copy()
    return tokenized

In [24]:
tokenized_dataset = formatted_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=formatted_dataset.column_names
)

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

In [41]:
# Путь для сохранения модели
model_path = "/kaggle/working/tinyllama-chemistry"

In [30]:
from peft import get_peft_model, LoraConfig, TaskType

# Настройка LoRa
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,  # мы обучаем генерацию текста
    inference_mode=False,
    r=8,             # размер low-rank матриц
    lora_alpha=16,   # масштаб
    lora_dropout=0.1 # дропаут во время обучения
)

# Обернём модель в LoRA
model = get_peft_model(model, peft_config)

# Посмотрим, какие параметры будут обучаться
model.print_trainable_parameters()

trainable params: 1,126,400 || all params: 1,101,174,784 || trainable%: 0.1023


In [42]:
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling

# Настройка обучения
training_args = TrainingArguments(
    output_dir= model_path,   
    per_device_train_batch_size=2,        
    gradient_accumulation_steps=8,        
    num_train_epochs=3,
    learning_rate=2e-4,
    logging_steps=20,
    save_steps=500,
    save_total_limit=2,
    fp16=True,                      
    report_to="none"
)

# Объединение примеров в батчи
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False 
)

In [43]:
# Запуск дообучения 
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
)

trainer.train()

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss
20,0.9626
40,0.9395
60,0.9499
80,0.9451
100,0.9485
120,0.9335
140,0.9312
160,0.9384
180,0.9301
200,0.9348


TrainOutput(global_step=3750, training_loss=0.8563698542277018, metrics={'train_runtime': 18247.5028, 'train_samples_per_second': 3.288, 'train_steps_per_second': 0.206, 'total_flos': 1.4316670550016e+17, 'train_loss': 0.8563698542277018, 'epoch': 3.0})

In [39]:
# Проверка модели
from transformers import pipeline

pipe = pipeline("text-generation", model="./tinyllama-mini", tokenizer=tokenizer)

prompt = "<s>[INST] What the name of C2H5Cl? [/INST]"
output = pipe(prompt, max_new_tokens=100)
print(output[0]["generated_text"])

Device set to use cuda:0


<s>[INST] What the name of C2H5Cl? [/INST] The name of C2H5Cl is chloroethane.


In [None]:
# Сохранение модели
model_path = "/kaggle/working/tinyllama-chemistry"

model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

In [50]:
import shutil

shutil.make_archive('/kaggle/working/tinyllama-chemistry', 'zip', '/kaggle/working')

'/kaggle/working/tinyllama-chemistry.zip'