In [23]:
import torch
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer
from tqdm import tqdm
import json
import gc
import os
from datasets import load_from_disk

from config import storage_dir

# Setting arguments and paths

In [2]:
# Model whose saved responses this notebook processes.
model_name = "Qwen/Qwen3-14B"

# Storage is organised per model, using the part of the model id after the last "/".
model_storage_dir = os.path.join(
    storage_dir, "lm_sys", model_name.rsplit("/", 1)[-1]
)
# Directory holding the saved responses dataset for this model.
response_path = os.path.join(model_storage_dir, 'lm_sys_responses')

# Loading and examining dataset

In [None]:
# Read the previously saved responses dataset back from disk and show its summary.
dataset = load_from_disk(dataset_path=response_path)
print(dataset)

Dataset({
    features: ['conversation'],
    num_rows: 10500
})


In [5]:
# The generation settings are stored as JSON inside the dataset directory.
with open(os.path.join(response_path, 'generation_params.json')) as f:
    generation_params = json.loads(f.read())
print(generation_params)

{'context_length': 512, 'start_idx': 0, 'max_samples': 10500, 'model_name': 'Qwen/Qwen3-14B', 'batch_size': 128, 'gpu_count': 4, 'processing_time': 34.9708014011383}


In [27]:
def count_tokens(text, tokenizer):
    """Return how many tokens `tokenizer` produces for `text`.

    Special tokens are excluded, so only the tokens of the text itself
    are counted.

    Args:
        text: String to tokenize.
        tokenizer: Object exposing `encode(text, add_special_tokens=...)`
            that returns a sized sequence of token ids.

    Returns:
        The length of the encoded token sequence.
    """
    token_ids = tokenizer.encode(text, add_special_tokens=False)
    return len(token_ids)

def assistant_content_token_counts(dataset, tokenizer):
    """Count the tokens in the last message of every conversation.

    Args:
        dataset: Iterable of records, each with a 'conversation' list of
            message dicts that carry a 'content' string. The last message
            is presumably the assistant reply (going by the function
            name) -- confirm against the code that produced the dataset.
        tokenizer: Tokenizer forwarded to `count_tokens`.

    Returns:
        A list with one token count per record, in dataset order.
    """
    return [
        count_tokens(record['conversation'][-1]['content'], tokenizer)
        for record in dataset
    ]

# Load the tokenizer for the configured model, then count tokens in the
# last message of every conversation.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=model_name
)
counts = assistant_content_token_counts(dataset=dataset, tokenizer=tokenizer)

In [None]:
import numpy as np
# Fraction of responses whose token count is exactly 512, i.e. that didn't reach EOS.
# NOTE(review): 512 presumably mirrors the generation cap ('context_length' in
# generation_params.json) -- confirm before reusing with other settings.
print((np.asarray(counts) == 512).sum() / len(counts))

0.33076190476190476


# Conversion to ROT13

In [42]:
import re
import codecs

def rot13_alpha(text):
    """Return `text` with every run of ASCII letters ROT13-encoded.

    Characters outside A-Z / a-z (digits, punctuation, whitespace,
    non-ASCII letters) are left exactly as they are.

    Args:
        text: Input string.

    Returns:
        A new string of the same length with the letter runs rotated.
    """
    return re.sub(
        r'[A-Za-z]+',
        lambda letters: codecs.encode(letters.group(0), 'rot_13'),
        text,
    )

def rot13_dataset(dataset):
    """Build a copy of `dataset` whose last conversation message is ROT13-encoded.

    Only the 'content' of the final message in each record's
    'conversation' is passed through `rot13_alpha`; earlier messages and
    any other record fields are carried over unchanged.

    The input records are never modified. A plain `dict.copy()` would be
    shallow and share the nested message dicts with the input, so
    rewriting 'content' on the copy would also overwrite the caller's
    data; the message dicts are therefore copied before being changed.

    Args:
        dataset: Iterable of dict records, each with a non-empty
            'conversation' list of message dicts that have a 'content'
            string.

    Returns:
        A list of new record dicts, in the same order as the input.

    Raises:
        IndexError: If a record's 'conversation' list is empty.
        KeyError: If 'conversation' or the last message's 'content' is missing.
    """
    new_dataset = []
    for item in dataset:
        # Copy each message dict so the write below cannot reach the input.
        conversation = [dict(message) for message in item['conversation']]
        last_message = conversation[-1]
        last_message['content'] = rot13_alpha(last_message['content'])

        new_item = item.copy()
        new_item['conversation'] = conversation
        new_dataset.append(new_item)
    return new_dataset

In [43]:
# Destination directory for the converted dataset, next to the original responses.
rot13_save_path = os.path.join(model_storage_dir, 'lm_sys_responses_rot13')

# ROT13-encode the last message of every conversation, wrap the records in a
# Dataset, and write it to disk.
new_dataset = rot13_dataset(dataset)
new_hf_dataset = Dataset.from_list(new_dataset)
new_hf_dataset.save_to_disk(rot13_save_path)

Saving the dataset (0/1 shards):   0%|          | 0/10500 [00:00<?, ? examples/s]