# Building a German MT Conversation Dataset

Datasets that we examined:

#### [UltraChat-200k / not used](https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k)
- Original [UltraChat](https://github.com/thunlp/UltraChat) is a huge conversational dataset entirely generated by ChatGPT, even the initial questions.
- This is a filtered version of UltraChat: truecasing, correction of grammatical errors, removal of unhelpful assistant answers.
- Over 500,000 rows
- Sadly only English - no examples in German, but can be translated.


#### [OpenAssistant / selected](https://huggingface.co/datasets/OpenAssistant/oasst1)
- Human-generated assistant-style conversation corpus crowd-sourced by over 13,500 volunteers.
- Over 10,000 conversation trees
- 3k messages in German
- Must be processed to rebuild the conversation trees.
- As this is entirely human-generated, we think its examples are of better quality than those in UltraChat - and since it is large enough, we will use only this dataset.

In [None]:
!pip install torch
!pip install datasets
!pip install transformers[sentencepiece]
!pip install more-itertools
!pip install matplotlib
!pip install huggingface-hub
!pip install tqdm

In [None]:
import datasets
import copy
import requests
import getpass
import asyncio
import pandas as pd
import time
from tqdm import tqdm
from transformers import AutoTokenizer
from aiohttp import ClientSession, ClientTimeout
from huggingface_hub import notebook_login

### Load OpenAssistant and prepare data

In [None]:
# Load the conversation-chain dataset and keep only its 'train' split.
open_assistant = datasets.load_dataset("A-Roucher/Open_Assistant_Conversation_Chains")
open_assistant = open_assistant["train"]

# One subset per language, selected on each row's 'lang' field.
open_assistant_de = open_assistant.filter(lambda row: row["lang"] == "de")
open_assistant_en = open_assistant.filter(lambda row: row["lang"] == "en")

In [None]:
# The 'messages' column of the English subset: one entry per conversation.
all_messages = open_assistant_en["messages"]

def transcribe_to_list(conversation):
    """Return the 'content' value of each message in `conversation`, in order."""
    contents = [turn['content'] for turn in conversation]
    return contents
    
# Flatten all conversations into one list of message texts (with a progress bar).
all_messages_list = []
for conversation in tqdm(all_messages):
    all_messages_list.extend(transcribe_to_list(conversation))

In [None]:
def detect_code(message):
    """Heuristically flag a message that may contain source code.

    Returns True when `message` contains any of a fixed set of code-looking
    substrings, or when it contains 'example' together with 'code' or
    'script'. Matching is case-sensitive; False otherwise.
    """
    code_markers = (
        '):\n', ';\n', '//', ' # ', 'def ', '{}', 'const ', 'var ',
        '.delete', '.add', '/>', '</', '==', '!=', 'if __',
    )
    for marker in code_markers:
        if marker in message:
            return True

    # No syntax marker found: fall back to the "example code/script" wording check.
    if 'example' not in message:
        return False
    return 'code' in message or 'script' in message

# Add a boolean 'could_be_code' column: True when at least one lower-cased
# message of the conversation is flagged by detect_code.
open_assistant_en = open_assistant_en.map(
    lambda example: {
        'could_be_code': any(
            detect_code(text['content'].lower()) for text in example['messages']
        )
    }
)

In [None]:
# Keep only the conversations that were not flagged as possibly containing code.
open_assistant_en = open_assistant_en.filter(lambda row: not row['could_be_code'])

# Translate to German

In [None]:
# URL of the inference endpoint that serves the translation model.
API_URL = 'https://ecfcd7jkenav3ri3.us-east-1.aws.endpoints.huggingface.cloud'

# SECURITY: never commit an access token to a notebook. The token is prompted
# for at run time (input is not echoed) instead of being hard-coded.
# NOTE(review): the token previously committed on this line should be revoked.
bearer_token = getpass.getpass('Hugging Face access token: ')

In [None]:
# HTTP headers sent with every endpoint request: bearer authentication and a JSON body.
HEADERS = {
    'Authorization': f'Bearer {bearer_token}',
    'Content-Type': 'application/json',
}

In [None]:
def query(payload, timeout=200):
    """POST `payload` as JSON to the endpoint and return the decoded JSON reply.

    Args:
        payload: JSON-serialisable request body.
        timeout: Seconds `requests` waits for the endpoint before raising a
            timeout error. Without it a stalled endpoint would block the
            notebook forever. Pass None to wait indefinitely.

    Returns:
        The response body as parsed by `response.json()`.
    """
    response = requests.post(API_URL, headers=HEADERS, json=payload, timeout=timeout)
    return response.json()


def translate(message):
    """Send `message` as the request's "inputs" field and return query()'s result."""
    payload = {"inputs": message}
    return query(payload)

### Single thread test

In [None]:
# Smoke test: one blocking request with a single short sentence.
translate(["Hello there."])

### With concurrent requests
Here we take care to keep each request under the maximum number of tokens accepted by the model. Otherwise, the request crashes the inference endpoint.

In [None]:
# Tokenizer used to measure the tokenized length of each message.
tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-de")

# Working copy of the English conversations; translations are attached to it later.
all_conversations = open_assistant_en["messages"].copy()

In [None]:
# Maximum tokenized length a message may have before it gets split into chunks.
MAX_LEN_TOKENIZED = 508


def split_long_message(text, max_len):
    """Split `text` on whitespace into chunks of at most `max_len` characters.

    Words are re-joined with single spaces, so the original whitespace
    (newlines, tabs, repeated spaces) is not preserved. A single word longer
    than `max_len` is kept whole in a chunk of its own.

    Args:
        text: The string to split.
        max_len: Maximum number of characters per chunk.

    Returns:
        A list of chunk strings; an empty list when `text` contains no words.
    """
    words = iter(text.split())
    current = next(words, None)
    if current is None:
        # Empty or whitespace-only input: nothing to split (a bare next()
        # would raise StopIteration here).
        return []

    lines = []
    for word in words:
        # +1 accounts for the space that would join `current` and `word`.
        if len(current) + 1 + len(word) > max_len:
            lines.append(current)
            current = word
        else:
            current += " " + word
    lines.append(current)
    return lines

def split_if_too_long(message, tokenizer, max_len_tokenized=MAX_LEN_TOKENIZED, max_len_text=500):
    """Return `message` as-is when it fits the token budget, else a list of chunks.

    The length check uses `len(tokenizer.encode(message))`. When that exceeds
    `max_len_tokenized`, the message is split with split_long_message into
    chunks of at most `max_len_text` characters, so the return value is
    either the original string or a list of strings.
    """
    token_count = len(tokenizer.encode(message))
    if token_count <= max_len_tokenized:
        return message
    return split_long_message(message, max_len_text)

In [None]:
async def request(document, semaphore):
    """Translate one message through the endpoint and store the result on it.

    Args:
        document: Message dict. Its 'content' is either a string or a list of
            string chunks. On success a 'translation' key is added in place.
        semaphore: Async context manager used to cap concurrent requests.

    Returns:
        The decoded JSON reply on success. None when the request fails, the
        reply is not JSON, or the reply does not have the expected shape; the
        failure is printed and `document` gets no 'translation' key.
    """
    # Local import: the file only imports ClientSession/ClientTimeout from aiohttp.
    from aiohttp import ClientError

    # Semaphore guard
    async with semaphore:
        payload = {
            "inputs": document['content'],
            "truncate": True,
            # NOTE(review): the two entries below look like endpoint
            # configuration rather than request fields; kept unchanged —
            # confirm whether the endpoint actually reads them.
            'CUDA_LAUNCH_BLOCKING':'1',
            "model": {'image': {'custom': {'env': {"MAX_CONCURRENT_REQUESTS": "512", 'CUDA_LAUNCH_BLOCKING':'1'}}}}
        }

        timeout = ClientTimeout(total=200)  # Overall time budget per request: 200 seconds

        try:
            async with ClientSession(timeout=timeout, headers=HEADERS) as session:
                async with session.post(API_URL, json=payload) as resp:
                    try:
                        result = await resp.json()
                    except (ClientError, ValueError):
                        # The body is not valid JSON: show what the endpoint
                        # actually returned (text() must be awaited).
                        print(await resp.text())
                        print("Error on", document)
                        return None
        except (ClientError, asyncio.TimeoutError) as error:
            # One failed or timed-out request must not abort the whole run.
            print("Error on", document, error)
            return None

        try:
            if isinstance(document['content'], list):
                # Chunks were split on whitespace, so re-join them with a space
                # to avoid fusing the words at chunk boundaries.
                translation = ' '.join(el['translation_text'] for el in result)
            else:
                translation = result[0]['translation_text']
        except (KeyError, IndexError, TypeError):
            # Reply has an unexpected shape (e.g. an error payload).
            print("Error on", document)
            return None

        document['translation'] = translation
        return result

async def call_all(conversations):
    """Translate every message of every conversation concurrently.

    Each message's 'content' is overwritten with the result of
    split_if_too_long, then one request() coroutine per message is awaited as
    it completes while tqdm displays progress. At most 16 requests hold the
    semaphore at any time.
    """
    # Cap on concurrent requests. Adjust the number as needed.
    limiter = asyncio.BoundedSemaphore(16)

    # One pending coroutine per message, across all conversations.
    pending = []
    for conversation in conversations:
        for message in conversation:
            message['content'] = split_if_too_long(message['content'], tokenizer)
            pending.append(request(message, limiter))

    # Await results in completion order so the progress bar advances steadily.
    progress = tqdm(asyncio.as_completed(pending), total=len(pending))
    for finished in progress:
        await finished

In [None]:
start = time.perf_counter()

await call_all(all_conversations)

# Print elapsed time
elapsed_time = time.perf_counter() - start
minutes, seconds = divmod(elapsed_time, 60)
print(f"{int(minutes)} min {seconds:.2f} sec")

### Export results

In [None]:
# Independent deep copy of the translated conversations, used for the export steps.
copy_conversations = copy.deepcopy(all_conversations)

Check missing translations:

In [None]:
# Count the messages whose 'translation' key is absent or holds an empty value.
counter = sum(
    1
    for el in copy_conversations
    for submessage in el
    if 'translation' not in submessage or len(submessage['translation']) == 0
)
print("Number of missing translations:", counter)

In [None]:
# Rebuild each conversation with the same roles but the translated text as content.
translated_messages = [
    [
        {'role': submessage['role'], 'content': submessage['translation']}
        for submessage in el
    ]
    for el in copy_conversations
]

In [None]:
# Attach the translated conversations as a new 'messages_german' column.
open_assistant_en = open_assistant_en.add_column("messages_german", translated_messages)

In [None]:
# Authenticate with the Hugging Face Hub from inside the notebook.
notebook_login()

In [None]:
# Upload the dataset, including the 'messages_german' column, to the Hub.
open_assistant_en.push_to_hub("A-Roucher/Open_Assistant_Chains_German_Translation")