In [None]:
# https://huggingface.co/HuggingFaceH4


In [None]:
# https://github.com/yaodongC/awesome-instruction-dataset


In [1]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import torch
from easynmt import EasyNMT
from optimum.bettertransformer import BetterTransformer
from datasets import load_from_disk, load_dataset
import os
import pandas as pd

class Translator:
    def __init__(self, 
        model_name: str,
        device = 'cuda'
    ):
        self.model_name = model_name
        self.device = device
        self.model = None
        self.tokenizer = None
        self.init()
    
    def init(self):
        print("Init model.")
        if self.model_name == "facebook/nllb-200-3.3B":
            self.model = AutoModelForSeq2SeqLM.from_pretrained(
                self.model_name, 
                use_auth_token=True,
            )
            self.model = BetterTransformer.transform(self.model)
            self.model.eval()
            self.model = torch.compile(self.model)
            self.model = self.model.to(self.device)
            self.tokenizer = AutoTokenizer.from_pretrained(
                self.model_name,
                use_auth_token=True,
            )
        elif self.model_name == "facebook/wmt21-dense-24-wide-en-x":
            self.model = AutoModelForSeq2SeqLM.from_pretrained(
                self.model_name, 
                use_auth_token=True,
            )
            self.model = BetterTransformer.transform(self.model)
            self.model.eval()
            self.model = torch.compile(self.model)
            self.model = self.model.to(self.device)
            self.tokenizer = AutoTokenizer.from_pretrained(
                self.model_name,
                use_auth_token=True,
            )
        elif self.model_name == "opus-mt":
            self.model = EasyNMT(self.model_name)
            
        print("Model is initialized.")
    
    def translate(self, text: str):
        func_map = {
            "facebook/nllb-200-3.3B": self.nllb_translate,
            "opus-mt": self.opusmt_translate,
            "facebook/wmt21-dense-24-wide-en-x": self.wmt21_translate
        }
        
        with torch.no_grad():
            return func_map[self.model_name](text)
    
    def __call__(self, text: str):
        return self.translate(text=text)
    
    def nllb_translate(self, text: str):
        inputs = self.tokenizer(text, return_tensors="pt")
        inputs = self.to_device(inputs=inputs)
        translated_tokens = self.model.generate(
            **inputs, 
            forced_bos_token_id=self.tokenizer.lang_code_to_id["rus_Cyrl"],
        )
        return self.tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]
    
    def opusmt_translate(self, text: str):
        return self.model.translate(
            text,
            source_lang="en" ,
            target_lang='ru'
        )

    def wmt21_translate(self, text):
        inputs = self.tokenizer(text, return_tensors="pt")
        inputs = self.to_device(inputs=inputs)
        translated_tokens = self.model.generate(
            **inputs, 
            forced_bos_token_id=self.tokenizer.get_lang_id("ru"),
        )
        return self.tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]
    
    def to_device(self, inputs):
        for key in inputs.keys():
            inputs[key] = inputs[key].to(self.device)
        return inputs

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
EasyNMT("facebook/wmt21-dense-24-wide-en-x")

In [7]:
# 2 min 40 sec - 50
# model_name = "facebook/nllb-200-3.3B"
# 6 min 5 sec - 50
# model_name = "facebook/wmt21-dense-24-wide-en-x"
# 45 sec - 50
model_name = "opus-mt"
device = torch.device("cuda:0")
translator = Translator(model_name=model_name, device=device)

Init model.
Model is initialized.


In [8]:
translator("hello world")



'Приветствую мир'

### databricks/databricks-dolly-15k

In [9]:
data = load_dataset("databricks/databricks-dolly-15k")
data


100%|██████████| 1/1 [00:00<00:00, 642.41it/s]


DatasetDict({
    train: Dataset({
        features: ['instruction', 'context', 'response', 'category'],
        num_rows: 15014
    })
})

In [10]:
data["train"][0]


{'instruction': 'When did Virgin Australia start operating?',
 'context': "Virgin Australia, the trading name of Virgin Australia Airlines Pty Ltd, is an Australian-based airline. It is the largest airline by fleet size to use the Virgin brand. It commenced services on 31 August 2000 as Virgin Blue, with two aircraft on a single route.[3] It suddenly found itself as a major airline in Australia's domestic market after the collapse of Ansett Australia in September 2001. The airline has since grown to directly serve 32 cities in Australia, from hubs in Brisbane, Melbourne and Sydney.[4]",
 'response': 'Virgin Australia commenced services on 31 August 2000 as Virgin Blue, with two aircraft on a single route.',
 'category': 'closed_qa'}

In [None]:
base_folder = "/home/kosenko/deepspeed/DeepSpeedExamples/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/datasets/translations_examples/"
model_path = model_name.replace("/", "_")
save_path = f"{base_folder}{model_path}.csv"

assert not os.path.isfile(save_path), f'File {model_path} exists'

fields = ["context", "instruction", "response"]

dataset_map = {item: [] for item in fields}
for item in fields:
    dataset_map[f'{item}_translated'] = []

for i, example in enumerate(data["train"]):
    print("Progress ",i)
    for field in fields:
        print(f"Field name: {field}")
        print("Original: ", example[field])
        text = example[field]
        translated = translator(text=text)
        print("Translated: ", translated)
        dataset_map[field].append(example[field])
        dataset_map[f'{field}_translated'].append(translated)
        print()
    print("==" * 100)

    if i > 50:
        break
pd.DataFrame(data=dataset_map).to_csv(save_path, index=False)


### OpenAssistant Conversations Dataset (OASST1)

In [10]:
import pandas as pd
dataset = pd.read_json(path_or_buf="/home/kosenko/deepspeed/DeepSpeedExamples/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/datasets/pure_datasets/2023-04-12_oasst_all.messages.jsonl",lines=True)
dataset = dataset[dataset['lang'] == 'es']

message_tree_ids = dataset['message_tree_id'].unique()
# message_tree_ids = [list(dataset.query(f"message_tree_id == '{message_tree_id}'").sort_values("created_date")['text']) for message_tree_id in message_tree_ids]
len(message_tree_ids)

22308

In [7]:
# dataset['lang'].value_counts()

lang
es    43288
Name: count, dtype: int64

In [34]:
len(dataset)

72567

In [35]:
dataset.columns

Index(['message_id', 'user_id', 'created_date', 'text', 'role', 'lang',
       'review_count', 'review_result', 'deleted', 'synthetic', 'emojis',
       'labels', 'detoxify', 'message_tree_id', 'tree_state', 'parent_id',
       'rank', 'model_name'],
      dtype='object')

In [5]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("facebook/wmt21-dense-24-wide-en-x")



  from .autonotebook import tqdm as notebook_tqdm


346

In [11]:

lengths = []
max_len = 0
message_tree_ids_map = {}
# for i, message_tree_id in enumerate(message_tree_ids):
#     # print(message_tree_id)
#     texts = list(dataset.query(f"message_tree_id == '{message_tree_id}'").sort_values("created_date")['text'])
    
#     for message in texts:
#         print(message)
#         # lengths.append(len())
#         # input_ids = tokenizer(message, truncation=True, max_length=2048)['input_ids']
#         # max_len = max(len(input_ids), max_len)
#         print("="*100)
#     print("|"*100)
#     print("|"*100)
#     if i > 100:
#         break

for i in range(len(dataset)):
    ids = str(dataset.iloc[i]['message_tree_id'])
    if message_tree_ids_map.get(ids, None) is None:
        message_tree_ids_map[ids] = 1
    else:
        message_tree_ids_map[ids] += 1

In [12]:
message_tree_ids = []
for key in message_tree_ids_map.keys():
    if message_tree_ids_map.get(key) > 1:
        message_tree_ids.append(key)

In [25]:
# dataset = pd.read_json(path_or_buf="../datasets/pure_datasets/2023-04-12_oasst_ready.trees.jsonl",lines=True)
# dataset = dataset[dataset['lang'] == 'en']

In [6]:
import json
import pandas as pd
with open("../datasets/openass_translated_en2ru/openass_translated_en2ru.json") as f:
    data = json.load(f)
    data = json.loads(data)
en2ru_dataset = pd.DataFrame(data=data)

In [7]:
en2ru_dataset.columns

Index(['message_id', 'user_id', 'created_date', 'text', 'role', 'lang',
       'review_count', 'review_result', 'deleted', 'synthetic', 'emojis',
       'labels', 'detoxify', 'message_tree_id', 'tree_state', 'parent_id',
       'rank', 'model_name', 'text_translated'],
      dtype='object')

In [15]:
message_tree_ids = en2ru_dataset['message_tree_id'].unique()
for i in range(len(message_tree_ids)):
    message_tree_id = message_tree_ids[i]
    texts = list(en2ru_dataset.query(f"message_tree_id == '{message_tree_id}'").sort_values("created_date")['text_translated'])
    print(message_tree_id)
    for j in range(len(texts)):
        text = texts[j]
        print(text)
        print("-"*10)
    print("|||"*100)
    if i > 100:
        break

6ab24d72-0181-4594-a9cd-deaf170242fb
Можете ли вы написать краткое введение о значении термина "монопсония" в экономике? Пожалуйста, используйте примеры, связанные с потенциальными монопсониями на рынке труда, и приведите соответствующие исследования.
----------
Монопсония - это рыночная структура, при которой на рынке есть один покупатель. В контексте рынков труда монопсония возникает, когда в определенной отрасли или географической области есть только один работодатель, что дает этому работодателю значительную власть над заработной платой и условиями труда работников. Это приводит к ситуации, когда работодатель имеет возможность подавлять заработную плату и уменьшать переговорную силу работников. Исследования показали, что существование монопсонии на рынках труда может иметь негативные последствия для работников, такие как более низкая заработная плата, сокращение пособий и ограниченные возможности трудоустройства. Например, исследование Autor, Dorn и Hanson (2013) обнаружило доказат

In [51]:
import json
import pprint

with open('../datasets/pure_datasets/2023-04-12_oasst_ready.trees.jsonl', 'r') as json_file:
    json_list = list(json_file)

conversation_example = []
for i, json_str in enumerate(json_list):
    record = json.loads(json_str)
    # if len(result['prompt']['replies']) == 2 and result['prompt']['lang'] == 'en':
    if record['message_tree_id'] == '7cce4047-8f87-42c4-9d75-a590c02be5b1':
        # print(record['prompt']['text'])
        # pprint.pprint(record)
        conversation_example = record 
        break
    
    # if i > 20:
        # break

In [54]:
from collections.abc import Iterable

# https://codebeautify.org/python-formatter-beautifier
def get_all_threads(sample):
    initial_message = f"{sample['role'].upper()} {sample['text']} |"
    def recursive_helper(node, current_path):
        
        if len(node['replies']) == 0:
            if node['role'] == "assistant":
                current_path += f"\n|||||||||||||\n {node['role'].upper()} {node['text']}"
            
            return current_path + "<END>"
        
        current_path += f"\n|||||||||||||\n {node['role'].upper()} {node['text']}"
        paths = []
        for reply in node['replies']:
            if node['labels']['spam']['value'] < 0.5:
                path = recursive_helper(reply, current_path)
                paths.append(path) 
        return paths
    
    paths = []
    for reply in sample['replies']:
        path = recursive_helper(reply, initial_message) 
        paths.append(path)
        
    return paths
    
all_threads = get_all_threads(sample=conversation_example['prompt'])


def flatten(xs):
    for x in xs:
        if isinstance(x, Iterable) and not isinstance(x, (str, bytes)):
            yield from flatten(x)
        else:
            yield x

all_threads = flatten(all_threads)

for thread in all_threads:
    if "<END>" in thread:
        pprint.pprint(thread)

('PROMPTER I want to start doing astrophotography as a hobby, any suggestions '
 'what could i do? |\n'
 '|||||||||||||\n'
 ' ASSISTANT Getting started in astrophotography can seem daunting, but with '
 'some patience and practice, you can become a master of the craft. To begin, '
 "you'll need a good camera and lens, a tripod, and a dark sky location free "
 'of light pollution. You will also need to learn about the basics of '
 'astrophotography, such as what camera settings to use, how to capture star '
 'trails, and the best techniques for tracking celestial objects. You can also '
 'purchase or rent different types of telescopes, depending on what types of '
 "objects you want to capture. Additionally, it's important to keep up with "
 'the latest astrophotography news and trends. Once you have the necessary '
 'equipment and knowledge, you can start shooting and experimenting with '
 'different techniques to get the best results.\n'
 '|||||||||||||\n'
 ' PROMPTER Can you tell me 

In [47]:
len(list(iter(all_threads)))

0

In [53]:
conversation_example['prompt']

{'message_id': '7cce4047-8f87-42c4-9d75-a590c02be5b1',
 'user_id': '8974fb6d-49e1-4c99-aed3-e957ae5b3269',
 'created_date': '2023-02-05T13:18:52.133806+00:00',
 'text': 'I want to start doing astrophotography as a hobby, any suggestions what could i do?',
 'role': 'prompter',
 'lang': 'en',
 'review_count': 3,
 'review_result': True,
 'deleted': False,
 'synthetic': False,
 'emojis': {'+1': 7},
 'replies': [{'message_id': '19cb94ab-d6ab-4dbd-b408-70ffbd9b595c',
   'parent_id': '7cce4047-8f87-42c4-9d75-a590c02be5b1',
   'user_id': 'df411b83-5030-4708-bf81-0fe9fa1398b2',
   'created_date': '2023-02-12T05:35:47.692786+00:00',
   'text': "Getting started in astrophotography can seem daunting, but with some patience and practice, you can become a master of the craft. To begin, you'll need a good camera and lens, a tripod, and a dark sky location free of light pollution. You will also need to learn about the basics of astrophotography, such as what camera settings to use, how to capture star

In [27]:
conversation_example['prompt']

{'message_id': '6ab24d72-0181-4594-a9cd-deaf170242fb',
 'user_id': 'c3fe8c76-fc30-4fa7-b7f8-c492f5967d18',
 'created_date': '2023-02-05T14:23:50.983374+00:00',
 'text': 'Can you write a short introduction about the relevance of the term "monopsony" in economics? Please use examples related to potential monopsonies in the labour market and cite relevant research.',
 'role': 'prompter',
 'lang': 'en',
 'review_count': 3,
 'review_result': True,
 'deleted': False,
 'synthetic': False,
 'emojis': {'+1': 10, '_skip_reply': 1, '_skip_ranking': 4},
 'replies': [{'message_id': 'c8e83833-ecbc-44fe-b6db-735228c25a1c',
   'parent_id': '6ab24d72-0181-4594-a9cd-deaf170242fb',
   'user_id': '2c96e467-66f0-4be7-9693-bda51356a424',
   'created_date': '2023-02-06T13:50:44.657083+00:00',
   'text': '"Monopsony" refers to a market structure where there is only one buyer for a particular good or service. In economics, this term is particularly relevant in the labor market, where a monopsony employer has s

## chip2_instruct_alpha

In [53]:
from datasets import load_dataset
import pandas as pd
# load_dataset("json", "../datasets/pure_datasets/chip2_instruct_alpha/chip2_instruct_alpha_v6a_2.json")
data = pd.read_json(path_or_buf="../datasets/pure_datasets/chip2_instruct_alpha/chip2_instruct_alpha_v6a_4.json")

In [54]:
len(data)

48289

In [55]:
instruction_dataset = []
for i in range(len(data)):
    line = data.iloc[i][0]
    user, bot = line.split("Chip2: ")
    user = user.replace("User: ", "")
    user = user.replace("\n\n", "")
    bot = bot.replace("<|endoftext|>", "")
    instruction_dataset.append({
        "user": user,
        "bot": bot
    })

In [56]:
instruction_dataset[1000]

{'user': "I'm looking for a new job. What questions should I ask a potential employer during an interview?",
 'bot': '\n1. What is the job role and what responsibilities does it involve?\n2. What skills and experiences are you seeking in the most successful candidate? \n3. What challenges will I face in this role and what support systems are available to help me meet these challenges?\n4. How does this job fit in the overall organizational structure?\n5. What opportunities for professional development and growth are available?\n6. What are the expectations for performance in the role?\n7. How will success in this role be measured?\n8. How often will I receive feedback on my performance?\n9. How quickly is the company expecting me to transition into this role and hit the ground running?\n10. What advantages would I bring to the team that would make me the right fit for the job?'}

In [52]:
import json
with open("../datasets/pure_datasets/chip2_instruct_alpha/chip2_instruct_alpha_v6a_4_formatted.json", 'w', encoding='utf-8') as outfile:
	json.dump(instruction_dataset, outfile)

In [40]:
# translated_data = pd.read_json(path_or_buf="/home/kosenko/deepspeed/DeepSpeedExamples/applications/DeepSpeed-Chat/training/step1_supervised_finetuning/datasets/chip2_instruct_alpha/chip2_instruct_alpha_v6a_1_translated.json")
# translated_data.to_csv("./test.csv")

In [1]:
import json

with open("../datasets/chip2_instruct_alpha/chip2_instruct_alpha_v6a_2_translated.json") as f:
    data = json.load(f)
    # data = json.loads(data)

In [2]:
import pprint
for i, item in enumerate(data):
    pprint.pprint(item)
    if i > 100:
    	break
# len(data)
    

{'bot': 'A will is a legal document that specifies how your property should be '
        'distributed after you die. It can also specify who should care for '
        "any children or other dependents you may have. It's important to make "
        'sure that your will is valid and up-to-date, since the laws governing '
        'wills vary from state to state.',
 'bot_translated': 'Завещание - это юридический документ, который определяет, '
                   'как ваше имущество должно быть распределено после вашей '
                   'смерти. В нем также может быть указано, кто должен '
                   'заботиться о любых детях или других иждивенцах, которые у '
                   'вас могут быть. Важно убедиться, что ваша воля '
                   'действительна и актуальна, поскольку законы, регулирующие '
                   'завещания, варьируются от штата к штату.',
 'user': "I've heard that it's a good idea to have a will. What is a will?",
 'user_translated': 'Я слышал, что х