In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset, Dataset
from tqdm import tqdm
import json
import gc
import os
import torch.multiprocessing as mp
import time
import argparse
from config import storage_dir

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
# Run configuration read by the cells below.
model_name = "Qwen/Qwen3-30B-A3B"  # Hugging Face model id; its last path component also names the output directory
context_length = 4096  # used both as the prompt truncation length and as max_new_tokens for generation
start_idx = 0  # first dataset row to process
max_samples = 100  # number of rows taken starting at start_idx
batch_size = 2  # prompts sent to each generate() call

In [6]:
def format_conversation_for_qwen(user_prompt, tokenizer):
    """Render a conversation as prompt text using the tokenizer's chat template.

    Args:
        user_prompt: Conversation object handed unchanged to
            ``tokenizer.apply_chat_template``.
        tokenizer: Object exposing ``apply_chat_template``.

    Returns:
        The result of ``apply_chat_template`` called with ``tokenize=False``
        and ``add_generation_prompt=True``.
    """
    template_options = {"tokenize": False, "add_generation_prompt": True}
    return tokenizer.apply_chat_template(user_prompt, **template_options)

In [None]:
def extract_and_count_tokens(response, tokenizer, start_tag="<think>", end_tag="</think>"):
    """Extract the text between the first ``start_tag`` and the next ``end_tag``.

    The closing tag is searched for only *after* the opening tag, so a stray
    closing tag that appears earlier in the text does not hide a valid span.

    Args:
        response: Text to search.
        tokenizer: Object exposing ``encode(text, add_special_tokens=False)``.
        start_tag: Opening delimiter (defaults to ``"<think>"``).
        end_tag: Closing delimiter (defaults to ``"</think>"``).

    Returns:
        ``(extracted_content, token_count)`` where ``token_count`` is the length
        of ``tokenizer.encode(extracted_content, add_special_tokens=False)``.
        Returns ``(None, 0)`` when the opening tag is missing or no closing tag
        follows it.
    """
    open_at = response.find(start_tag)
    if open_at == -1:
        return None, 0

    content_start = open_at + len(start_tag)
    # Start the search past the opening tag so only a matching close is accepted.
    close_at = response.find(end_tag, content_start)
    if close_at == -1:
        return None, 0

    extracted_content = response[content_start:close_at]

    # Count tokens in the extracted content only (the tags themselves are excluded).
    token_count = len(tokenizer.encode(extracted_content, add_special_tokens=False))

    return extracted_content, token_count

Extracted Content: 
Okay, the user wants an introduction over 200 words for Shanghai BMG Chemical Co., Ltd. Let me start by gathering the necessary information. The company is located in Room 602, No. 291 Sikai Road, Shanghai, China. First, I need to structure the introduction properly.

I should mention the company's name, location, and maybe their main business areas. Since it's a chemical company, I should highlight their products or services. Maybe they specialize in certain chemicals? The user didn't specify, so I'll keep it general but professional.

I need to include the address as given. Also, think about the company's mission or values. Perhaps they focus on innovation, quality, or sustainability. Since there's no specific info, I can use common terms like "commitment to excellence" or "customer-centric approach."

Check for any industry-specific keywords. Terms like "research and development," "industrial applications," "sustainable solutions" might be relevant. Also, mention

# Main

In [None]:
# Output locations live under <storage_dir>/lm_sys/<last component of model_name>/.
model_storage_dir = os.path.join(storage_dir, "lm_sys", model_name.split("/")[-1])
# exist_ok avoids the check-then-create race of a separate exists() test.
os.makedirs(model_storage_dir, exist_ok=True)
# NOTE(review): machine-specific absolute path; consider sourcing it from `config`.
cache_dir = "/n/holylfs06/LABS/krajan_lab/Lab/cfang/hf_cache/"
# NOTE(review): response_dir is computed here but nothing in this cell writes to it.
response_dir = os.path.join(model_storage_dir, f'lm_sys_{start_idx}_{start_idx+max_samples}')

# Load the saved prompts and keep only the requested window of conversations.
dataset = Dataset.load_from_disk(os.path.join(storage_dir, 'lm_sys', 'lm_sys_prompts_maxlen=200'))
dataset = dataset['conversations']
dataset = dataset[start_idx:start_idx + max_samples]

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    cache_dir=cache_dir,
    device_map="auto",
)
# Use the same cache directory as the model so tokenizer files are stored alongside it.
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir)

# Actually run processing
processed_data = []
for i in range(0, len(dataset), batch_size):
    end_idx = min(i + batch_size, len(dataset))
    # Report the clamped upper bound so a short final batch is not overstated.
    print(f'processing index {i} to {end_idx}')
    batch = dataset[i:end_idx]
    user_prompts = []
    user_turns = []
    for user_prompt in batch:
        formatted_prompt = format_conversation_for_qwen(user_prompt, tokenizer)
        user_prompts.append(formatted_prompt)
        user_turns.append(user_prompt)
    inputs = tokenizer(
        user_prompts,
        return_tensors="pt",
        padding_side='left',
        padding=True,
        truncation=True,
        max_length=context_length
    )
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=context_length,
            do_sample=False,
        )
    # The padded batch is one rectangular tensor, so every row shares this prompt width.
    input_len = inputs['input_ids'].shape[1]
    for idx in range(len(user_prompts)):
        # Drop the prompt tokens; what remains is the newly generated continuation.
        generated_ids = outputs[idx][input_len:]
        response = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()
        # Only the first message of the source conversation is stored next to the reply.
        # NOTE(review): presumably the prompts are single-turn — confirm against the dataset.
        new_conversation = [user_turns[idx][0], {'role': 'assistant', 'content': response}]
        processed_data.append({'conversation': new_conversation})

    # NOTE(review): debugging leftover — stops after the first batch, so at most
    # `batch_size` samples are processed. Remove this `break` for a full run.
    break

Loading checkpoint shards: 100%|██████████| 16/16 [00:16<00:00,  1.05s/it]
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


processing index 0 to 2


In [None]:
# Example usage: measure the thinking span of the most recently generated response.
# The text is read from `processed_data` instead of the loop-leaked `response`
# variable, which a later cell rebinds; this keeps the result independent of the
# order in which cells are executed.
last_response = processed_data[-1]['conversation'][-1]['content']
extracted_content, token_count = extract_and_count_tokens(last_response, tokenizer)
print("Extracted Content:", extracted_content)
print("Token Count:", token_count)

In [None]:
# `generated_ids` left over from the generation loop is a 1-D tensor holding the token
# ids of a single sequence. batch_decode() would treat every id as its own sequence,
# so taking `[0]` would yield only the first token; decode() renders the whole sequence.
response = tokenizer.decode(generated_ids, skip_special_tokens=True)
print(response)

The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


<think>
Okay, the user wants an introduction over 200 words for Shanghai BMG Chemical Co., Ltd. Let me start by gathering the necessary information. The company is located in Room 602, No. 291 Sikai Road, Shanghai, China. First, I need to structure the introduction properly.

I should mention the company's name, location, and maybe their main business areas. Since it's a chemical company, I should highlight their products or services. Maybe they specialize in certain chemicals? The user didn't specify, so I'll keep it general but professional.

I need to include the address as given. Also, think about the company's mission or values. Perhaps they focus on innovation, quality, or sustainability. Since there's no specific info, I can use common terms like "commitment to excellence" or "customer-centric approach."

Check for any industry-specific keywords. Terms like "research and development," "industrial applications," "sustainable solutions" might be relevant. Also, mention their role 