In [17]:
import gc
import json
import os

import numpy as np
import torch
from datasets import load_dataset, Dataset
from datasets import load_from_disk
from tqdm import tqdm
from transformers import AutoTokenizer

from config import storage_dir

# Setting arguments and paths

In [18]:
# Model whose saved responses are processed below (alternative: "Qwen/Qwen3-14B").
model_name = "meta-llama/Llama-3.3-70B-Instruct"
# Per-model directory, named after the last path component of the model id.
model_storage_dir = os.path.join(
    storage_dir, "lm_sys", model_name.rsplit("/", 1)[-1]
)
# Location of the saved response dataset for this model.
response_path = os.path.join(model_storage_dir, "lm_sys_responses")

# Loading and examining dataset

In [3]:
# Load the saved response dataset from disk and print its summary.
dataset = load_from_disk(response_path)
print(dataset)

Dataset({
    features: ['conversation'],
    num_rows: 20000
})


In [4]:
# Look at one stored example to see the structure of a 'conversation' entry.
dataset[1]

{'conversation': [{'content': "Beside OFAC's selective sanction that target the listed individiuals and entities, please elaborate on the other types of US's sanctions, for example, comprehensive and sectoral sanctions. Please be detailed as much as possible",
   'role': 'user'},
  {'content': "The United States employs a range of sanctions tools to achieve its foreign policy and national security objectives. Besides the selective or targeted sanctions imposed by the Office of Foreign Assets Control (OFAC), which focus on specific individuals and entities, the U.S. also implements comprehensive and sectoral sanctions. Each type of sanction has a distinct approach and impact on the targeted country, entity, or individual.\n\n### 1. Comprehensive Sanctions\n\nComprehensive sanctions, also known as country-based sanctions, are the most severe form of economic sanctions. They are imposed on entire countries and essentially cut off the targeted country from the global economy, restricting a

In [6]:
# Load the generation settings stored next to the responses, if any were saved.
generation_params_path = os.path.join(response_path, 'generation_params.json')
# Default to None so later cells never pick up an undefined or stale value
# when the parameter file is absent.
generation_params = None
if os.path.exists(generation_params_path):
    # JSON is UTF-8 by specification; do not rely on the platform default encoding.
    with open(generation_params_path, 'r', encoding='utf-8') as f:
        generation_params = json.load(f)
    print(generation_params)
else:
    # Make the missing-file case visible instead of silently skipping it.
    print(f"No generation parameters found at {generation_params_path}")

In [7]:
def count_tokens(text, tokenizer):
    """Return how many tokens ``tokenizer`` produces for ``text``.

    Special tokens are excluded (``add_special_tokens=False``), so only the
    tokens of the text itself are counted.

    Args:
        text: String to tokenize.
        tokenizer: Object exposing ``encode(text, add_special_tokens=...)``
            that returns a sized sequence of tokens.

    Returns:
        The length of the encoded sequence.
    """
    token_ids = tokenizer.encode(text, add_special_tokens=False)
    return len(token_ids)

def content_token_counts(dataset, tokenizer, role='assistant'):
    """Count tokens in one turn of every conversation in ``dataset``.

    Args:
        dataset: Iterable of items with a 'conversation' list whose entries
            have a 'content' string.
        tokenizer: Tokenizer passed through to ``count_tokens``.
        role: 'assistant' selects the last turn of each conversation; any
            other value selects the first turn.

    Returns:
        A list with one token count per item, in dataset order.
    """
    # The turn is picked by position, not by inspecting each turn's 'role' field.
    turn_index = 0 if role != 'assistant' else -1
    return [
        count_tokens(entry['conversation'][turn_index]['content'], tokenizer)
        for entry in dataset
    ]

# Tokenizer of the model configured in `model_name`.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Token count of the last turn of every conversation.
counts = content_token_counts(dataset, tokenizer, role='assistant')

In [8]:
# Fraction of responses whose token count is exactly the length limit, i.e.
# generations that stopped at the limit instead of at an EOS token.
# NOTE(review): 512 is presumably the max_new_tokens used at generation time —
# confirm against generation_params.
MAX_NEW_TOKENS = 512
print(np.sum(np.array(counts) == MAX_NEW_TOKENS) / len(counts))  # Didn't reach EOS

0.29405


In [9]:
# Show the first example before any ROT13 conversion is applied.
dataset[0]

{'conversation': [{'content': 'how can identity protection services help protect me against identity theft',
   'role': 'user'},
  {'content': "Identity protection services can help protect you against identity theft in several ways:\n\n1. **Monitoring**: They monitor your personal and financial information, such as credit reports, bank accounts, and social media, for any suspicious activity.\n2. **Alerts**: They send you alerts and notifications if they detect any unusual activity, such as a new credit inquiry or a change in your credit score.\n3. **Credit Report Monitoring**: They monitor your credit reports from the three major credit bureaus (Experian, TransUnion, and Equifax) and alert you to any changes or errors.\n4. **Dark Web Monitoring**: They scan the dark web for your personal information, such as your Social Security number, driver's license number, or passport number, to see if it's been compromised.\n5. **Identity Theft Insurance**: They offer insurance that can help cov

# Conversion to ROT13

In [10]:
import re
import codecs

def rot13_alpha(text):
    """ROT13-encode the ASCII letters of ``text``, leaving everything else as is.

    Args:
        text: Input string.

    Returns:
        A string of the same length in which each of A-Z / a-z is rotated by
        13 places; digits, punctuation, whitespace and non-ASCII characters
        are unchanged.
    """
    # The rot_13 codec translates only A-Z and a-z, so applying it to the
    # whole string matches encoding each alphabetic run separately.
    return codecs.encode(text, 'rot_13')

def rot13_dataset(dataset):
    """Return a copy of ``dataset`` with each conversation's last turn ROT13-encoded.

    Only the 'content' of the final turn is rewritten (via ``rot13_alpha``);
    all other turns and fields are carried over unchanged. The input items
    are not modified.

    Args:
        dataset: Iterable of dict items, each with a 'conversation' list of
            turn dicts that have a 'content' string.

    Returns:
        A list of new item dicts, one per input item, in the same order.

    Raises:
        IndexError: If an item's 'conversation' list is empty.
    """
    new_dataset = []
    for item in dataset:
        # item.copy() is shallow, so the turn dicts are copied as well;
        # otherwise writing the encoded content would also overwrite the
        # caller's original conversation.
        turns = [dict(turn) for turn in item['conversation']]
        turns[-1]['content'] = rot13_alpha(turns[-1]['content'])
        new_item = item.copy()
        new_item['conversation'] = turns
        new_dataset.append(new_item)
    return new_dataset

In [None]:
# Build the ROT13 version of the responses, wrap it in a `Dataset`, and save
# it in the same per-model directory as the original responses.
rot13_save_path = os.path.join(model_storage_dir, "lm_sys_responses_rot13")
new_dataset = rot13_dataset(dataset)
new_hf_dataset = Dataset.from_list(new_dataset)
new_hf_dataset.save_to_disk(rot13_save_path)

Saving the dataset (0/1 shards):   0%|          | 0/20000 [00:00<?, ? examples/s]

In [12]:
# Largest token count of the ROT13-encoded last turn across the dataset.
counts = np.array(content_token_counts(new_dataset, tokenizer))
counts.max()

np.int64(2048)

In [13]:
# Largest token count of the first turn (role='user') across the dataset.
counts = np.array(content_token_counts(new_dataset, tokenizer, role='user'))
counts.max()

np.int64(200)

In [15]:
# Spot-check the first converted example: rot13_dataset rewrites only the
# content of the last turn, so earlier turns should read as plain text.
new_hf_dataset[0]

{'conversation': [{'content': 'how can identity protection services help protect me against identity theft',
   'role': 'user'},
  {'content': "Vqragvgl cebgrpgvba freivprf pna uryc cebgrpg lbh ntnvafg vqragvgl gursg va frireny jnlf:\n\n1. **Zbavgbevat**: Gurl zbavgbe lbhe crefbany naq svanapvny vasbezngvba, fhpu nf perqvg ercbegf, onax nppbhagf, naq fbpvny zrqvn, sbe nal fhfcvpvbhf npgvivgl.\n2. **Nyregf**: Gurl fraq lbh nyregf naq abgvsvpngvbaf vs gurl qrgrpg nal hahfhny npgvivgl, fhpu nf n arj perqvg vadhvel be n punatr va lbhe perqvg fpber.\n3. **Perqvg Ercbeg Zbavgbevat**: Gurl zbavgbe lbhe perqvg ercbegf sebz gur guerr znwbe perqvg ohernhf (Rkcrevna, GenafHavba, naq Rdhvsnk) naq nyreg lbh gb nal punatrf be reebef.\n4. **Qnex Jro Zbavgbevat**: Gurl fpna gur qnex jro sbe lbhe crefbany vasbezngvba, fhpu nf lbhe Fbpvny Frphevgl ahzore, qevire'f yvprafr ahzore, be cnffcbeg ahzore, gb frr vs vg'f orra pbzcebzvfrq.\n5. **Vqragvgl Gursg Vafhenapr**: Gurl bssre vafhenapr gung pna uryc pbi

In [24]:
# Display the summary (features and row count) of the converted dataset.
new_hf_dataset

Dataset({
    features: ['conversation'],
    num_rows: 20000
})

# Upload to the Hugging Face Hub

In [None]:
# Publish the ROT13 dataset to the Hub under the repository id given below.
new_hf_dataset.push_to_hub(repo_id="chingfang17/test_dataset")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/20 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/chingfang17/test_dataset/commit/02c777810b430e7d2cb2c5c436a64c69b4718105', commit_message='Upload dataset', commit_description='', oid='02c777810b430e7d2cb2c5c436a64c69b4718105', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/chingfang17/test_dataset', endpoint='https://huggingface.co', repo_type='dataset', repo_id='chingfang17/test_dataset'), pr_revision=None, pr_num=None)