In [None]:
%%capture
# Upgrade the libraries this notebook uses (one %pip line per package).
# The %%capture cell magic above hides pip's output for the whole cell.

%pip install -U transformers
%pip install -U datasets
%pip install -U accelerate
%pip install -U peft
%pip install -U trl
%pip install -U bitsandbytes
%pip install -U wandb
%pip install -U arxiv
%pip install -U PyPDF2

In [None]:
import glob
import io
import json
import os
import shutil
from dataclasses import dataclass

import arxiv
import PyPDF2
import requests
import torch
import wandb
from datasets import load_dataset, concatenate_datasets, Dataset
from peft import (
    LoraConfig,
    PeftModel,
    prepare_model_for_kbit_training,
    get_peft_model,
)
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    TrainerCallback,
    pipeline,
    logging,
)
from trl import SFTTrainer, setup_chat_format

### Preprocess elibrary dataset

In [None]:
directory = '/content/drive/MyDrive/mmp/'

!mkdir /content/embeds
for filename in os.listdir(directory):
    if filename.endswith('.zip'):
        file_path = os.path.join(directory, filename)
        !unzip "$file_path" -d /content/embeds

In [None]:
# Load every well-formed .jsonl shard from `datasets_dir`.
# Files listed in `error_files` are skipped here.
datasets = []
datasets_dir = "/content/embeds/"
error_files = ['18_embed.jsonl']

# sorted(): os.listdir order is arbitrary, and the concatenation order feeds a
# seeded shuffle later on, so a stable order is needed for reproducibility.
for filename in sorted(os.listdir(datasets_dir)):
    if not filename.endswith('.jsonl') or filename in error_files:
        continue
    file_path = os.path.join(datasets_dir, filename)
    print("File:", filename)
    try:
        datasets.append(load_dataset('json', data_files=file_path))
    except Exception as exc:
        # Best effort: keep going, but report *why* the shard was skipped
        # (a bare `except:` would also swallow KeyboardInterrupt).
        print("Error while processing:", file_path, "-", repr(exc))

In [None]:
# Tolerant loader for the shards listed in `error_files`: parse them line by
# line, report the lines that are not valid JSON and keep the rest.
for filename in error_files:
    list_of_dataset = []
    file_path = os.path.join(datasets_dir, filename)
    # JSON Lines files are UTF-8; do not depend on the platform default encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        for i, line in enumerate(file, start=1):
            if not line.strip():
                # Blank lines carry no record; skip them without reporting an error.
                continue
            try:
                list_of_dataset.append(json.loads(line))
            except json.JSONDecodeError as e:
                print(f"Error in line {i}: {e}")
    datasets.append(Dataset.from_list(list_of_dataset))

In [None]:
# `datasets` mixes two kinds of objects: load_dataset(...) results, which are
# indexed by split name, and the plain Dataset built with Dataset.from_list for
# the error files. Indexing a plain Dataset with 'train' would be a column
# lookup and fail, so only take the 'train' split from the former.
dataset = concatenate_datasets(
    [ds if isinstance(ds, Dataset) else ds['train'] for ds in datasets]
)

In [None]:
@dataclass
class Config:
    """Run-wide settings for the fine-tuning notebook.

    The attributes are annotated so that ``@dataclass`` actually treats them as
    fields: without annotations the decorator generates no fields, and values
    could not be overridden through the constructor
    (e.g. ``Config(model_name=...)``).
    """

    # Hugging Face Hub id of the base model to fine-tune.
    model_name: str = "Qwen/Qwen2.5-1.5B-Instruct"
    # Name used for the output directory and the W&B project/run.
    new_model: str = "SciMMP-1.5b-ift"
    # Floating-point dtype for the model weights.
    torch_dtype: torch.dtype = torch.bfloat16
    # Attention backend name passed to `from_pretrained`.
    attn_implementation: str = "eager"
cfg = Config()

In [None]:
# Authenticate against the Hugging Face Hub and Weights & Biases.
# Both tokens are read from the Kaggle secrets store, so nothing is hardcoded.
from huggingface_hub import login
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()

# Hugging Face token, stored under the secret name "huggingface_token".
hf_token = user_secrets.get_secret("huggingface_token")

login(token = hf_token)

# W&B API key, stored under the secret name "wandb_api_key".
wb_token = user_secrets.get_secret("wandb_api_key")

wandb.login(key=wb_token)
# Start the W&B run; the project is named after the fine-tuned model.
run = wandb.init(
    project=cfg.new_model, 
    job_type="training", 
    anonymous="allow"
)

### Load and configure the model

In [None]:
# Load the base model. `cfg.torch_dtype` was defined in Config but never used,
# so the weights were loaded in the checkpoint's default dtype; pass it
# explicitly so the load matches the bf16 training setup configured below.
model = AutoModelForCausalLM.from_pretrained(
    cfg.model_name,
    device_map="auto",
    torch_dtype=cfg.torch_dtype,
    attn_implementation=cfg.attn_implementation,
)

In [None]:
tokenizer = AutoTokenizer.from_pretrained(cfg.model_name)

# Only install TRL's ChatML format when the tokenizer has no chat template yet.
# NOTE(review): instruct checkpoints normally ship their own template, and
# recent TRL releases appear to refuse to overwrite an existing one - confirm
# against the installed TRL version.
if tokenizer.chat_template is None:
    model, tokenizer = setup_chat_format(model, tokenizer)

tokenizer.padding_side = 'right'

# The original assigned `tokenizer.padding_token`, which is not a tokenizer
# attribute and therefore had no effect (the real one is `pad_token`).
# Keep the pad token the tokenizer already defines and only fall back to EOS
# when none exists, so no token outside the vocabulary is introduced.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [None]:
# LoRA adapters are attached to the MLP projections and to the attention
# projections of the model.
mlp_projections = ['up_proj', 'down_proj', 'gate_proj']
attention_projections = ['k_proj', 'q_proj', 'v_proj', 'o_proj']

peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=4,                 # adapter rank
    lora_alpha=8,        # adapter scaling factor
    lora_dropout=0.05,
    bias="none",
    target_modules=mlp_projections + attention_projections,
)

# Wrap the base model so that it carries the LoRA adapters.
model = get_peft_model(model, peft_config)

### Load and process instruction datasets

In [None]:
# Short key -> Hugging Face Hub dataset id for every instruction dataset used.
datasets_names = {
    "vikhr": "Vikhrmodels/GrandMaster-PRO-MAX",
    "arxiv": "taesiri/arxiv_qa",
    "no_robots": "HuggingFaceH4/no_robots",
    "chemistry": "dim/camel_ai_chemistry",
    "physics": "lgaalves/camel-ai-physics",
    "biology": "dim/camel_ai_biology",
    "math": "lighteval/MATH"
}

# Load each dataset once, under the same short key and in the same order.
datasets_dict = {
    key: load_dataset(hub_id)
    for key, hub_id in datasets_names.items()
}


In [None]:
# Dataset preprocessing functions
def get_arxiv_paper_text(paper_id, timeout=60):
    """Download an arXiv paper's PDF and return its extracted plain text.

    Args:
        paper_id: arXiv identifier of the paper to look up.
        timeout: Seconds to wait for the PDF download before giving up.

    Returns:
        The concatenated text of all PDF pages, or an empty string when the
        arXiv search yields no result for ``paper_id`` (the original returned
        ``None`` in that case, which broke callers that slice the result).

    Raises:
        requests.HTTPError: If the PDF download answers with an error status.
    """
    search = arxiv.Search(id_list=[paper_id])

    # Only the first hit is used. Client.results replaces the deprecated
    # Search.results.
    result = next(iter(arxiv.Client().results(search)), None)
    if result is None:
        return ""

    response = requests.get(result.pdf_url, timeout=timeout)
    # Fail loudly on HTTP errors instead of parsing an error page as a PDF.
    response.raise_for_status()

    reader = PyPDF2.PdfReader(io.BytesIO(response.content))
    # `or ""` guards against pages whose text extraction yields None.
    return "".join(page.extract_text() or "" for page in reader.pages)

def preprocess_vikhr(row):
    """Format the first two turns of a Vikhr conversation as chat text.

    The first entry of ``row["conversation"]`` is used as the user message and
    the second as the assistant reply. All keys are then removed from ``row``
    in place (presumably so the source columns are dropped when this is used
    with ``Dataset.map`` - confirm) and only the rendered text is returned.
    """
    first_turn = row["conversation"][0]
    second_turn = row["conversation"][1]
    messages = [
        {"role": "user", "content": first_turn["content"]},
        {"role": "assistant", "content": second_turn["content"]},
    ]
    rendered = tokenizer.apply_chat_template(messages, tokenize=False)

    # Empty the incoming row in place.
    for column in tuple(row.keys()):
        row.pop(column)

    return {"text": rendered}

def preprocess_arxiv(row):
    """Build a chat sample that asks ``row["question"]`` about an arXiv paper.

    The paper text is fetched with ``get_arxiv_paper_text(row["paper_id"])``
    and truncated to its first 4000 characters. A missing paper (``None`` or
    empty text) no longer crashes the slice; the prompt is then built with an
    empty article body. All keys are removed from ``row`` in place and only
    the rendered chat text is returned.
    """
    # `or ""`: the fetch helper may return None when the paper is not found.
    arxiv_text = (get_arxiv_paper_text(row["paper_id"]) or "")[:4000]
    question = row["question"]

    # Same characters as the original triple-quoted f-string, with its line
    # breaks and 4-space indentation written out explicitly.
    # NOTE(review): the indentation inside the prompt looks accidental (it
    # mirrors the source-code indent) - confirm before changing it.
    arxiv_formatted = (
        "Here's the arxiv article you'll be working with:\n"
        "    \n"
        f"    {arxiv_text}\n"
        "    \n"
        f"    {question}"
    )

    template = [
        {"role": "user", "content": arxiv_formatted},
        {"role": "assistant", "content": row["answer"]}
    ]

    text = tokenizer.apply_chat_template(template, tokenize=False)

    # Empty the incoming row in place.
    for k in list(row.keys()):
        row.pop(k)

    return {"text": text}

def preprocess_robots(row):
    """Format the first two ``messages`` of a no_robots row as chat text.

    ``row["messages"][0]`` supplies the user content and ``row["messages"][1]``
    the assistant content. All keys are removed from ``row`` in place and only
    the rendered text is returned.
    """
    user_content = row["messages"][0]["content"]
    assistant_content = row["messages"][1]["content"]
    messages = [
        {"role": "user", "content": user_content},
        {"role": "assistant", "content": assistant_content},
    ]
    rendered = tokenizer.apply_chat_template(messages, tokenize=False)

    # Empty the incoming row in place.
    for column in tuple(row.keys()):
        row.pop(column)

    return {"text": rendered}

def preprocess_camel(row):
    """Format a CAMEL row (``message_1`` / ``message_2``) as chat text.

    ``message_1`` becomes the user turn and ``message_2`` the assistant turn.
    All keys are removed from ``row`` in place and only the rendered text is
    returned.
    """
    messages = [
        {"role": role, "content": row[column]}
        for role, column in (("user", "message_1"), ("assistant", "message_2"))
    ]
    rendered = tokenizer.apply_chat_template(messages, tokenize=False)

    # Empty the incoming row in place.
    for column in tuple(row.keys()):
        row.pop(column)

    return {"text": rendered}

def preprocess_math(row):
    """Format a MATH row (``problem`` / ``solution``) as chat text.

    The problem statement becomes the user turn and the solution the assistant
    turn. All keys are removed from ``row`` in place and only the rendered text
    is returned.
    """
    problem, solution = row["problem"], row["solution"]
    messages = [
        {"role": "user", "content": problem},
        {"role": "assistant", "content": solution},
    ]
    rendered = tokenizer.apply_chat_template(messages, tokenize=False)

    # Empty the incoming row in place.
    for column in tuple(row.keys()):
        row.pop(column)

    return {"text": rendered}

In [None]:
SEED = 2025

# One entry per instruction dataset:
# (key in datasets_dict, number of rows to keep, row formatter).
sampling_plan = [
    ("vikhr", 6000, preprocess_vikhr),
    ("arxiv", 100, preprocess_arxiv),
    ("no_robots", 1000, preprocess_robots),
    ("chemistry", 1000, preprocess_camel),
    ("physics", 1000, preprocess_camel),
    ("biology", 1000, preprocess_camel),
    ("math", 200, preprocess_math),
]

# For each dataset: shuffle its train split with SEED, keep the first
# `n_rows` rows and render them with the matching formatter.
datasets_train = [
    datasets_dict[key]["train"].shuffle(SEED).select(range(n_rows)).map(formatter)
    for key, n_rows, formatter in sampling_plan
]

In [None]:
SEED = 2025
TEST_SAMPLES = 100
ELIBRARY_SAMPLES = 80_000

# Sample the elibrary corpus; clamp the size so a smaller corpus does not make
# `select` fail with an out-of-range index.
elibrary_subset = dataset.shuffle(seed=SEED).select(
    range(min(ELIBRARY_SAMPLES, len(dataset)))
)
dataset_train = concatenate_datasets(datasets_train).shuffle(SEED)

# concatenate_datasets takes ONE list of datasets; the original passed the two
# datasets as separate positional arguments.
# NOTE(review): this assumes both parts share the same columns - confirm.
dataset_mixed = concatenate_datasets([elibrary_subset, dataset_train])

# An integer test_size is an absolute number of rows, which avoids the rounding
# of the former TEST_SAMPLES / len(...) fraction.
dataset_sh = dataset_mixed.train_test_split(test_size=TEST_SAMPLES, seed=SEED)
dataset_sh

In [None]:
def _sample_continuation(prompt_text, max_length):
    """Sample up to ``max_length`` tokens after ``prompt_text`` and decode them.

    Shared implementation of ``generate_text`` and ``generate_text_it``, which
    previously duplicated this loop. Each step feeds the whole sequence to the
    global ``model``, samples the next token from the softmax of the last
    position's logits and stops early when the EOS token is drawn.
    """
    generated = tokenizer.encode(prompt_text, return_tensors='pt').to(model.device)

    # Generation is inference only: without no_grad every forward pass would
    # record an autograd graph and waste memory.
    with torch.no_grad():
        for _ in range(max_length):
            next_token_logits = model(generated).logits[:, -1, :]
            probabilities = torch.softmax(next_token_logits, dim=-1)
            next_token = torch.multinomial(probabilities, num_samples=1)
            generated = torch.cat((generated, next_token), dim=1)

            if next_token.item() == tokenizer.eos_token_id:
                break

    return tokenizer.decode(generated[0], skip_special_tokens=True)


def generate_text(prompt, max_length=50):
    """Continue a raw text ``prompt`` by sampling from the global ``model``.

    Args:
        prompt: Text to continue; it is encoded as-is, without a chat template.
        max_length: Maximum number of new tokens to sample.

    Returns:
        The decoded prompt plus continuation, with special tokens skipped.
    """
    return _sample_continuation(prompt, max_length)


def generate_text_it(prompt, max_length=50):
    """Answer ``prompt`` as a single user turn formatted with the chat template.

    Args:
        prompt: Content of the user message.
        max_length: Maximum number of new tokens to sample.

    Returns:
        The decoded chat prompt plus sampled reply, with special tokens skipped.
    """
    messages = [
        {"role": "user", "content": prompt},
    ]
    chat_prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    return _sample_continuation(chat_prompt, max_length)

In [None]:
# Raw-text prompts for the free-form generation check run by the evaluation
# callback during training (one prompt per evaluation, cycled in order).
# They are unfinished scientific sentences/abstracts in English and Russian.
set_of_prompts = ["The results of the experiment demonstrated a significant increase in cell proliferation when exposed to ",
"According to the theory of general relativity, space-time is curved by ",
"The chemical reaction between sodium chloride and water results in the formation of",
"Quantum mechanics suggests that particles exist in multiple states until they are",
"The primary advantage of using CRISPR technology in gene editing is its ability to",
"In a double-blind clinical trial, the control group was administered a placebo while the experimental group received",
"The neural network was trained using a dataset of over one million images, and its performance was evaluated based on ",
"Photosynthesis in plants is driven by light energy, which is absorbed by chlorophyll molecules located in the",
"""The Role of Inflammatory Markers in Cardiovascular Disease
Cardiovascular disease is one of the leading causes of mortality worldwide. Recent studies have highlighted the significance of inflammatory markers such as C-reactive protein (CRP) and interleukin-6 (IL-6) in""",
"""Advances in Nanotechnology for Drug Delivery
Nanotechnology has revolutionized the field of drug delivery by enabling the targeted delivery of therapeutics with minimal side effects. In recent years, several novel nanocarriers such as liposomes, dendrimers, and""",
"""Machine Learning Approaches to Predict Protein-Protein Interactions
Protein-protein interactions (PPIs) play a crucial role in biological processes. However, experimental identification of PPIs is time-consuming and costly. Machine learning models have been developed to""",
"Recent advances in artificial intelligence have led to the development of deep learning models capable of surpassing traditional machine learning algorithms in various tasks. For example, Convolutional Neural Networks (CNNs) have shown remarkable performance in",
"Several studies have investigated the effect of atmospheric CO2 levels on global climate patterns. The seminal work by Smith et al. (2010) demonstrated a clear correlation between ",
"A study measured the blood pressure of 200 patients before and after administering a new antihypertensive drug. On average, blood pressure decreased by 15%. This suggests that the drug...",
"In a recent clinical trial, 60% of participants in the treatment group showed symptom improvement, while only 20% of participants in the control group reported similar results. This indicates that",
"Результаты эксперимента продемонстрировали значительное увеличение пролиферации клеток при воздействии",
"Согласно общей теории относительности, пространство-время искривляется на",]

# Instruction-style prompts (biology, chemistry, physics); the evaluation
# callback sends them through the chat template as a single user turn.
set_of_prompts_it = ["Explain the role of transcription factors in gene expression, and give an example of how they can regulate cellular differentiation.",
"Summarize the process of mitosis, focusing on the key stages and their significance for cellular division.",
"Describe the evolutionary significance of horizontal gene transfer in bacteria and its impact on antibiotic resistance.",
"How does the structure of the phospholipid bilayer contribute to the selective permeability of the cell membrane?",
"Compare and contrast the mechanisms of SN1 and SN2 nucleophilic substitution reactions, including the factors that favor each.",
"Explain the concept of Gibbs free energy and its relevance in predicting the spontaneity of chemical reactions.",
"Describe how infrared spectroscopy can be used to identify functional groups in an organic compound.",
"Write a step-by-step explanation of the process of acid-base titration, including how to determine the equivalence point.",
"Explain how the Heisenberg Uncertainty Principle limits our ability to measure both the position and momentum of a particle simultaneously.",
"Describe the fundamental differences between general relativity and quantum mechanics, and explain why they are difficult to reconcile.",
"In the context of thermodynamics, explain the second law and how it relates to entropy in isolated systems.",
"Provide a detailed explanation of how the photoelectric effect supports the quantum theory of light."]

In [None]:
class TextGenerationCallback(TrainerCallback):
    """Print sample generations after every evaluation and prune checkpoints.

    On each evaluation one raw prompt from ``set_of_prompts`` and one
    instruction prompt from ``set_of_prompts_it`` are completed with the global
    ``model``; both lists are cycled through in order. Afterwards every
    ``checkpoint-*`` directory under ``args.output_dir`` is deleted.
    """

    def __init__(self):
        super().__init__()
        # Index of the next prompt to use from each prompt list.
        self.cnt = 0
        self.cnt_it = 0

    def on_evaluate(self, args, state, control, **kwargs):
        """Generate one raw and one instruct sample, then remove checkpoints."""
        model.eval()
        sample_input = set_of_prompts[self.cnt]
        self.cnt = (self.cnt + 1) % len(set_of_prompts)
        generated_text = generate_text(sample_input)
        print(f"Generated text at step {state.global_step}: {generated_text}")

        sample_input = set_of_prompts_it[self.cnt_it]
        self.cnt_it = (self.cnt_it + 1) % len(set_of_prompts_it)
        generated_text = generate_text_it(sample_input)
        print(f"Generated instruct text at step {state.global_step}: {generated_text}")

        # The original shell command removed checkpoints from a hardcoded
        # ".../SciMMP-1.5b-it" directory, which does not match the configured
        # output directory ("SciMMP-1.5b-ift"). Derive the location from the
        # training arguments instead, and use Python rather than an IPython
        # `!rm` so the callback does not depend on the notebook shell.
        for checkpoint_dir in glob.glob(os.path.join(args.output_dir, "checkpoint-*")):
            shutil.rmtree(checkpoint_dir, ignore_errors=True)

In [None]:
training_arguments = TrainingArguments(
    # Output location and experiment tracking
    output_dir=cfg.new_model,
    run_name=cfg.new_model,
    report_to="wandb",
    # Batching
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=1,
    group_by_length=True,
    # Optimisation schedule
    num_train_epochs=1,
    optim="paged_adamw_32bit",
    learning_rate=2e-4,
    lr_scheduler_type='linear',
    warmup_steps=10,
    # Precision: bf16 on, fp16 off
    bf16=True,
    fp16=False,
    # Evaluation and logging cadence (in optimizer steps)
    eval_strategy="steps",
    eval_steps=1000,
    logging_strategy="steps",
    logging_steps=100,
)

In [None]:
trainer = SFTTrainer(
    # Model side
    model=model,
    tokenizer=tokenizer,
    # NOTE(review): `model` was already wrapped with get_peft_model earlier in
    # the notebook; passing peft_config again looks redundant - confirm.
    peft_config=peft_config,
    # Data side: samples are read from the "text" column, without packing
    train_dataset=dataset_sh["train"],
    eval_dataset=dataset_sh["test"],
    dataset_text_field="text",
    max_seq_length=4096,
    packing=False,
    # Training loop configuration and the sample-generation callback
    args=training_arguments,
    callbacks=[TextGenerationCallback()],
)

In [None]:
# Resume only when the output directory already holds a "checkpoint-<step>"
# folder. With resume_from_checkpoint=True and no checkpoint present (for
# example on the very first run) the trainer aborts instead of starting fresh.
checkpoint_root = trainer.args.output_dir
has_checkpoint = os.path.isdir(checkpoint_root) and any(
    entry.startswith("checkpoint-")
    and entry[len("checkpoint-"):].isdigit()
    and os.path.isdir(os.path.join(checkpoint_root, entry))
    for entry in os.listdir(checkpoint_root)
)

# True  -> continue from the latest checkpoint in the output directory;
# None  -> start training from scratch.
trainer.train(resume_from_checkpoint=has_checkpoint or None)