In [7]:
import spacy
import csv
from collections import Counter
from tqdm import tqdm, trange
import json

'''
    collect dataset collection and vocab
'''

base_data_path = 'datasets/sst-2/badnets/train.csv'
vocab_save_path = 'config/sst-2/badnets/vocab.json'
nlp = spacy.load("en_core_web_sm")


with open(base_data_path, 'r') as f:
    reader = csv.reader(f)
    text_list = [value[0] for value in list(reader)[1:]]

tokens = []
for text in tqdm(text_list):
    doc = nlp(text)
    tokens.append([token.text for token in doc])
flattened_list = [item for sublist in tokens for item in sublist]
counter = Counter(flattened_list)
counter_dict = dict(counter)

count = 0
print(f'All vocab size: {len(counter_dict)}')
for key in counter_dict:
    counter_dict[key] = count
    count += 1 

counter_json = json.dumps(counter_dict, indent=4)
with open(vocab_save_path, 'w') as file:
    file.write(counter_json)

100%|██████████| 6920/6920 [00:42<00:00, 160.95it/s]


All vocab size: 13936


In [3]:
import re

def match_strings(strings, pattern):
    # 编译正则表达式
    regex = re.compile(f'^{pattern}$')
    
    # 存储匹配的字符串下标
    matches = []
    
    # 遍历字符串数组并匹配
    for i, string in enumerate(strings):
        if regex.match(string):
            matches.append(i)
    
    # 输出结果
    if matches:
        print(','.join(map(str, matches)))
    else:
        print(-1)

# 示例输入
strings_input = input().strip()
pattern_input = input().strip()

# 将输入字符串转换为列表
strings = strings_input.split()

# 调用函数进行匹配
match_strings(strings, pattern_input)

1


In [10]:
import pandas as pd
import json
import spacy
import os 

'''
    construct mark data
'''

ori_data_path = 'datasets/sst-2/clean/train-clean.csv'
vocab_save_path = 'config/sst-2/badnets/vocab.json'
mask_save_dir = 'datasets/sst-2/clean/mask'
k = 5
os.makedirs(mask_save_dir, exist_ok=True)

train_df = pd.read_csv(ori_data_path)
with open(vocab_save_path, 'r') as f:
    vocab = json.load(f)
nlp = spacy.load('en_core_web_sm')

def process_text(text, vocab, k):
    doc = nlp(text)
    tokens = [token.text for token in doc]
    ids = [vocab.get(token) for token in tokens] 
    
    mask_list = []
    for i in range(k):
        masked_tokens = [token if ids[j] % k != i else '[MASK]' for j, token in enumerate(tokens)]
        mask_list.append(' '.join(masked_tokens))
    
    return mask_list

processed_data = []
for text in tqdm(train_df['sentence']):
    mask_list = process_text(text, vocab, k)
    processed_data.append(mask_list)

for i in range(k):
    mask_list = []
    for index, row in train_df.iterrows():  
        mask_list.append((processed_data[index][i], row['label']))
    with open(f'{mask_save_dir}/train_{i}.csv', 'w') as f:
        writer = csv.writer(f)
        writer.writerow(['sentence', 'label'])
        writer.writerows(mask_list)

100%|██████████| 6920/6920 [00:43<00:00, 157.79it/s]


In [15]:
import pandas as pd
import json
from tqdm import tqdm
'''
    construct batch data for API
'''
k=5

for i in range(k):
    dataset_path = f"datasets/sst-2/clean/mask/train_{i}.csv"
    batch_save_dir = "datasets/sst-2/clean/api_batch"
    os.makedirs(batch_save_dir, exist_ok=True)

    df = pd.read_csv(dataset_path)
    model = 'gpt-4o'
    prompt = 'Your goal is to only fill the [MASK] for the provided sequence and directly return the whole filled sequence.'

    tasks = []
    for index, row in df.iterrows():
        description = row.iloc[0]
        task = {
            "custom_id": f"task-{index}",
            "method": "POST",
            "url": "/v1/chat/completions",
            "body": {
                # This is what you would have in your Chat Completions API call
                "model": model,
                "temperature": 0.1,
                "response_format": { 
                    "type": "json_object"
                },
                "messages": [
                    {
                        "role": "system",
                        "content": prompt
                    },
                    {
                        "role": "user",
                        "content": description
                    }
                ],
            }
        }
        
        tasks.append(task)

    with open(f'{batch_save_dir}/train_{i}.jsonl', 'w') as file:
        for obj in tasks:
            file.write(json.dumps(obj) + '\n')

In [16]:
df = pd.read_csv('datasets/sst-2/clean/train-clean.csv')

# Convert the DataFrame to a dictionary
data_dict = df.to_dict(orient='list')

In [19]:
len(data_dict['sentence'])

6920

In [None]:
'''
    Upload file to API
'''
import os
from openai import OpenAI
os.environ['OPENAI_API_KEY'] = ''
batch_save_dir = "datasets/sst-2/clean/api_batch"
k = 5
client = OpenAI()
for i in range(k):
  batch_file = client.files.create(
    file=open(f'{batch_save_dir}/train_{i}.jsonl', "rb"),
    purpose="batch"
  )
  print(batch_file)
  batch_job = client.batches.create(
    input_file_id=batch_file.id,
    endpoint="/v1/chat/completions",
    completion_window="24h"
  )
  print(batch_job.id)

In [6]:
batch_job = client.batches.retrieve('batch_PpQXpZCJAtageB9fLvXMHyGA')
print(batch_job)

Batch(id='batch_PpQXpZCJAtageB9fLvXMHyGA', completion_window='24h', created_at=1719760911, endpoint='/v1/chat/completions', input_file_id='file-f1dn8zCYu4oiUrGz5OI6ccgj', object='batch', status='completed', cancelled_at=None, cancelling_at=None, completed_at=1719776416, error_file_id='file-1Rd3lNQlQeUBSK1MwP2bsyOi', errors=None, expired_at=None, expires_at=1719847311, failed_at=None, finalizing_at=1719776410, in_progress_at=1719760911, metadata=None, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=100, total=100))


In [None]:
import os
import json
import asyncio
from openai import OpenAI
# os.environ['OPENAI_API_KEY'] = ''
os.environ['OPENAI_API_KEY'] = ''
batch_save_dir = "datasets/sst-2/clean/api_batch"
k = 5
seed = 1234
client = OpenAI()

mask_list = []
with open(f'{batch_save_dir}/train_0.jsonl', 'r') as f:
    for line in f:
        mask_list.append(json.loads(line))

: 

In [12]:
async def get_response(message, seed=1234):
    chat_completion = await client.chat.completions.create(
        messages=message,
        model="gpt-4o",
        seed=seed
    )
    return chat_completion.choices[0].message.content

# responses = [get_response(value['body']['messages']) for value in mask_list[:10]]
async def main(mask_list, res_save_dir="datasets/sst-2/clean/api_filled"):
    responses = await asyncio.gather(*[get_response(value['body']['messages']) for value in mask_list])
    print(responses)
    # res_save_dir = "datasets/sst-2/clean/api_filled"
    # os.makedirs(res_save_dir, exist_ok=True)
    # # Print each response
    # with open(f'{res_save_dir}/train_0.txt', 'w') as f:
    #     f.writelines([value+'\n' for value in responses])

In [14]:
loop = asyncio.get_event_loop()
loop.run_until_complete(main(mask_list))

  return compile(source, filename, mode, flags,


RuntimeError: This event loop is already running