In [None]:
%%capture

# Upgrade trl, transformers and datasets to their latest available releases.
# The %%capture cell magic above suppresses pip's output for this cell.
%pip install -U trl
%pip install -U transformers
%pip install -U datasets

In [None]:
%%capture

# Install vllm and grpcio (>=1.60.0) into /kaggle/working instead of the
# default site-packages directory.
%pip install --target=/kaggle/working vllm
%pip install --target=/kaggle/working "grpcio>=1.60.0"
# Remove every ray* entry from /kaggle/working, then build wheels for
# ray (>=2.11) into /kaggle/working/packages/.
# NOTE(review): `pip wheel` only builds wheels, it does not install them —
# confirm that nothing else is needed for ray to be importable.
%rm -rf /kaggle/working/ray*
%pip wheel "ray>=2.11" -w /kaggle/working/packages/

In [None]:
# Hugging Face Hub id of the checkpoint; used below to load the tokenizer.
model_name = "elvispresniy/SciMMP-1.5-it-v2"

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load the tokenizer for the checkpoint named by `model_name`.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.padding_side = 'right'
# `pad_token` is the attribute the tokenizer reads for its padding token.
# Assigning to `padding_token` would only create an unrelated attribute and
# leave the padding token unset.
tokenizer.pad_token = '<|pad|>'

In [None]:
from vllm import LLM, SamplingParams

# Build the vLLM engine from the shared `model_name` constant so the engine
# and the tokenizer can never point at different checkpoints.
llm = LLM(model=model_name)

In [None]:
from datasets import load_dataset

# Hub repository id of each source dataset, keyed by a short local alias.
datasets_names = dict(
    no_robots="HuggingFaceH4/no_robots",
    allenai="allenai/sciq",
)

# Load every dataset listed above and store it under the same alias.
datasets_dict = {
    alias: load_dataset(repo_id)
    for alias, repo_id in datasets_names.items()
}

In [None]:
# Instruction text that preprocess_allenai places in front of each question.
system_prompt_1 = "Answer only on the subject. Don't be too much verbose. Provide scientific evidence. As soon as the answer is provided return [EOS] token. "
# Instruction text that preprocess_robots appends after each question
# (note the leading space, which separates it from the question).
system_prompt_0 = " As soon as you finished return [EOS] token."

def preprocess_robots(row):
    """Turn one no_robots row into a single-turn generation prompt.

    The content of the row's first message is used as the question,
    `system_prompt_0` is appended to it, and the resulting user turn is
    rendered to text with the tokenizer's chat template.

    Args:
        row: mapping with a 'messages' list whose first item has a
            "content" entry.

    Returns:
        dict with "text" (the rendered prompt) and "question" (the raw,
        un-templated question).
    """
    question = row['messages'][0]["content"]
    row_json = [
        {"role": "user", "content": question + system_prompt_0},
    ]
    # The rendered text is passed to the model as a prompt, so the template
    # has to end with the marker that opens the assistant turn. Without
    # add_generation_prompt=True the prompt stops right after the user turn.
    text = tokenizer.apply_chat_template(
        row_json,
        tokenize=False,
        add_generation_prompt=True,
    )

    # Remove every original key from the incoming row mapping.
    # NOTE(review): `Dataset.map(remove_columns=...)` is the documented way to
    # drop columns from the mapped dataset — confirm this loop has the
    # intended effect with the installed datasets version.
    for k in list(row.keys()):
        row.pop(k)

    return {"text": text, "question": question}

def preprocess_allenai(row):
    """Turn one sciq row into a single-turn generation prompt.

    `system_prompt_1` is placed in front of the row's question and the
    resulting user turn is rendered to text with the tokenizer's chat
    template.

    Args:
        row: mapping with a "question" entry.

    Returns:
        dict with "text" (the rendered prompt) and "question" (the raw,
        un-templated question).
    """
    question = row["question"]
    row_json = [
        {"role": "user", "content": system_prompt_1 + question},
    ]
    # The conversation holds only a user turn, i.e. it is a prompt awaiting a
    # model reply, so ask the template to append the marker that opens the
    # assistant turn.
    text = tokenizer.apply_chat_template(
        row_json,
        tokenize=False,
        add_generation_prompt=True,
    )

    return {"text": text, "question": question}

In [None]:
from datasets import concatenate_datasets

# Seed shared by both shuffles below.
SEED = 2025

# Index 0: shuffled and preprocessed sciq train split.
# Index 1: the first 9,000 rows of the shuffled no_robots train split,
#          preprocessed.
datasets_train = [
    (
        datasets_dict["allenai"]["train"]
        .shuffle(seed=SEED)
        .map(preprocess_allenai)
    ),
    (
        datasets_dict["no_robots"]["train"]
        .shuffle(seed=SEED)
        .select(range(9_000))
        .map(preprocess_robots)
    ),
]

In [None]:
import pandas as pd
from datasets import Dataset
from tqdm.auto import tqdm

# Accumulates one {"question", "answer"} record per generated prompt.
qa_pairs = []

BATCH_SIZE = 256
# Measured in batches. With this value the checkpoint condition in the loop
# is true only for i == 0, unless the dataset has at least
# BATCH_SIZE * CHECKPOINT_INTERVAL rows.
CHECKPOINT_INTERVAL = 1_000_000_000

sampling_params = SamplingParams(top_k=1, max_tokens=512, repetition_penalty=1.1)

# Prompts are taken from the second entry of datasets_train.
prompt_ds = datasets_train[1]

for i in tqdm(range(0, len(prompt_ds), BATCH_SIZE), desc="Generating answers"):
    # Slice the batch once and reuse it, instead of re-slicing the dataset
    # and re-fetching every row by index.
    prompts = prompt_ds[i:i + BATCH_SIZE]['text']
    answers = llm.generate(prompts, sampling_params)
    assert len(answers) == len(prompts), (
        "llm.generate returned a different number of outputs than prompts"
    )

    # "question" stores the rendered prompt text that was sent to the model.
    qa_pairs.extend(
        {"question": prompt_text, "answer": answer.outputs[0].text}
        for prompt_text, answer in zip(prompts, answers)
    )

    # Dump everything collected so far every CHECKPOINT_INTERVAL batches.
    if i % (BATCH_SIZE * CHECKPOINT_INTERVAL) == 0:
        checkpoint_df = pd.DataFrame(qa_pairs)
        checkpoint_df.to_csv(f"checkpoint_{i}.csv", index=False)

In [None]:
from datasets import Dataset

# Convert the collected list of {"question", "answer"} dicts into a Dataset.
ds = Dataset.from_list(qa_pairs)

In [None]:
from huggingface_hub import login
from kaggle_secrets import UserSecretsClient
# Fetch the secret stored under "huggingface_token" in Kaggle's secrets
# store and use it to authenticate with the Hugging Face Hub, so the token
# never appears in the notebook itself.
user_secrets = UserSecretsClient()
hf_token = user_secrets.get_secret("huggingface_token")
login(token=hf_token)

In [None]:
# Upload the generated question/answer dataset to this Hub repository.
ds.push_to_hub("elvispresniy/synthetic-no_robots")

In [None]:
# Render one hand-written user question as a chat-templated prompt string,
# ending with the generation prompt.
prompt = tokenizer.apply_chat_template(
    [
        {
            "role": "user",
            "content": "Can you come up with five popular colors for each season? I want to start decorating my bedroom and living room based on the change from winter, summertime, springtime, and autumn each year.",
        },
    ],
    tokenize=False,
    add_generation_prompt=True,
)