In [1]:
from datasets import load_dataset

ds = load_dataset("cvmistralparis/Q20LLM", "games")
words = load_dataset("cvmistralparis/Q20LLM", "words")

Downloading readme:   0%|          | 0.00/2.08k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/4.71M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/336k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/70950 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5294 [00:00<?, ? examples/s]

Downloading data:   0%|          | 0.00/31.9k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.96k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2282 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/233 [00:00<?, ? examples/s]

In [2]:
for data in ds['train']:
    print(data)
    break

{'eid': 5, 'model': 'llama3-70b-8192', 'keyword': '3D Printing', 'turn': 1, 'question': 'Is it a noun?', 'answer': 'Yes'}


In [3]:
from typing import Iterable
import itertools

class LlamaFormatter:
    _bos_token = '<|begin_of_text|>'
    _start_header_token = '<|start_header_id|>'
    _end_header_token = '<|end_header_id|>'
    _end_token = '<|eot_id|>'

    def __init__(self, system_prompt: str = None, few_shot_examples: Iterable = None):
        self._system_prompt = system_prompt
        self._few_shot_examples = few_shot_examples
        self._turn_system = f"{self._start_header_token}system{self._end_header_token}\n\n{{}}{self._end_token}"
        self._turn_user = f"{self._start_header_token}user{self._end_header_token}\n\n{{}}{self._end_token}"
        self._turn_model = f"{self._start_header_token}assistant{self._end_header_token}\n\n{{}}{self._end_token}"
        self.reset()

    def __repr__(self):
        return self._state
    
    def system(self, prompt):
        self._state += self._turn_system.format(prompt)
        return self
    
    def user(self, prompt):
        self._state += self._turn_user.format(prompt)
        return self

    def model(self, prompt):
        self._state += self._turn_model.format(prompt)
        return self

    def start_user_turn(self):
        self._state += f"{self._start_header_token}user{self._end_header_token}\n\n"
        return self

    def start_model_turn(self):
        self._state += f"{self._start_header_token}assistant{self._end_header_token}\n\n"
        return self

    def end_turn(self):
        self._state += f"{self._end_token}\n"
        return self

    def reset(self):
        self._state = ""
        self._state += self._bos_token
        if self._system_prompt is not None:
            self.system(self._system_prompt)
        if self._few_shot_examples is not None:
            self.apply_turns(self._few_shot_examples, start_agent='user')
        return self

    def apply_turns(self, turns: Iterable, start_agent: str):
        formatters = [self.model, self.user] if start_agent == 'model' else [self.user, self.model]
        formatters = itertools.cycle(formatters)
        for fmt, turn in zip(formatters, turns):
            fmt(turn)
        return self

In [19]:
import random
from tqdm import tqdm
import pandas as pd


def get_conversations(ds):
    curr_keyword = ''
    curr_eid = -1
    keyword_cnt = 1
    prompt = []
    data_points = []
    
    i = 0
    for data in tqdm(ds):
        if data['eid'] != curr_eid:
            if prompt:
                if len(prompt) <= 40:
                    prompt[-1] += " Now guess the keyword."
                    if (keyword_cnt % 3) != 0:    #3번에 한 번은 오답
                        prompt.append('**'+curr_keyword+'**')
                        prompt.append('Correct!')
                        i+=1
                    else:
                        prompt.append('**'+get_random_word(exclude_word=curr_keyword)+'**')
                        prompt.append('Wrong!')
                    data_points.append(prompt)
            prompt = []
            curr_eid = data['eid']
            if data['keyword'] == curr_keyword:
                keyword_cnt += 1
            else:
                curr_keyword = data['keyword']
                keyword_cnt = 1
                
        prompt.append(data['question'])
        prompt.append(data['answer'].lower())
    print(i)
    return data_points

def get_conversations(ds):
    curr_keyword = ''
    curr_eid = -1
    prompt = []
    data_points = {'prompt': [], 'label': []}
    
    i = 0
    for data in tqdm(ds):
        if data['eid'] != curr_eid:
            if prompt:
                if len(prompt) <= 40:
#                     prompt[-1] += " Now guess the keyword."
                    data_points['prompt'].append(prompt)
                    data_points['label'].append(curr_keyword)
            prompt = []
            curr_eid = data['eid']
            curr_keyword = data['keyword']
                
        prompt.append(data['question'])
        prompt.append(data['answer'].lower())
    print(i)
    return pd.DataFrame(data_points)

In [39]:
train_df = get_conversations(ds['train'])
val_df = get_conversations(ds['test'])

100%|██████████| 70950/70950 [00:04<00:00, 15732.81it/s]


0


100%|██████████| 5294/5294 [00:00<00:00, 15450.65it/s]

0





In [41]:
train_df.to_csv('train.csv')
val_df.to_csv('validation.csv')


In [36]:
def get_random_word(exclude_word=''):
    w = random.choice(words['train']['keyword'])
    while w == exclude_word:
        w = random.choice(words['train']['keyword'])
    return w

system_prompt = "You are an AI assistant designed to play the 20 Questions game. In this game, the Answerer thinks of a keyword and responds to yes-or-no questions by the Questioner. The keyword is a specific places or things."
prompts = []
rear_keyword = ''
keyword_cnt = 1
for row in tqdm(train_df.itertuples(index=False), total=len(train_df)):
    if rear_keyword == row.label:
        keyword_cnt += 1
    else:
        keyword_cnt = 1
        rear_keyword = row.label
    formatter = LlamaFormatter(system_prompt=system_prompt)
    formatter.user("Let's play 20 Questions. You are playing the role of the Questioner. The keyword is a specific places or things.")
    formatter.apply_turns(turns=row.prompt, start_agent='model')
    formatter.user('Now guess the keyword.')
    if keyword_cnt % 3 != 0:
        formatter.model(row.label)
        formatter.user('Correct!')
    else:
        formatter.model(get_random_word(row.label))
        formatter.user('Wrong!')
  
    prompts.append(formatter._state)
    del formatter
# ds['train'] = ds['train'].add_column("prompt", prompts)
print(prompts[0])

100%|██████████| 5356/5356 [00:02<00:00, 2150.76it/s]
