# Building a German MT Conversation Dataset

Datasets that we examined:

#### [OpenAssistant](https://huggingface.co/datasets/OpenAssistant/oasst1)
- Human-generated assistant-style conversation corpus crowd-sourced by over 13,500 volunteers.
- Over 10,000 conversation trees
- 3k messages in German
- Must be processed to reconstruct the conversation trees

#### [UltraChat-200k / not used](https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k)
- Original [UltraChat](https://github.com/thunlp/UltraChat) is a huge conversational dataset entirely generated by ChatGPT, even the initial questions.
- This is a filtered version of UltraChat: truecasing, correction of grammatical errors, removal of unhelpful assistant answers.
- Over 500,000 rows
- Sadly only English - no examples in German, but can be translated.

In [None]:
!pip install datasets
!pip install fasttext-langdetect
!pip install transformers[sentencepiece]
!pip install torch
!pip install more-itertools
!pip install matplotlib

In [None]:
import datasets
from ftlangdetect import detect
import pandas as pd
from tqdm import tqdm
from transformers import AutoTokenizer
import requests
import getpass

### Load OpenAssistant and prepare data

In [None]:
# Load the conversation-chains dataset from the Hub and keep its 'train' split.
open_assistant = datasets.load_dataset("A-Roucher/Open_Assistant_Conversation_Chains")["train"]

# Split the conversations by their 'lang' tag.
open_assistant_de = open_assistant.filter(lambda row: row["lang"] == "de")
open_assistant_en = open_assistant.filter(lambda row: row["lang"] == "en")

In [None]:
# One entry per English conversation; each entry is that conversation's list of messages.
all_messages = open_assistant_en['messages']

def transcribe_to_list(conversation):
    """Return the ``'content'`` value of every message in ``conversation``, in order."""
    return [turn['content'] for turn in conversation]
    
# Flatten all conversations into a single list of message texts.
all_messages_list = []
for chain in tqdm(all_messages):
    all_messages_list.extend(transcribe_to_list(chain))

In [None]:
def detect_code(message):
    """Heuristically flag a message that probably contains source code.

    Returns True when any marker substring occurs in ``message``, or when the
    message contains 'example' together with 'code' or 'script'. Matching is
    case-sensitive; the caller visible in this notebook passes lowercased text.
    """
    markers = ('):\n', ';\n', '//', ' # ', 'def ', '{}', 'const ', 'var ', '.delete', '.add', '/>', '</', '==', '!=', 'if __')
    if any(marker in message for marker in markers):
        return True
    return 'example' in message and ('code' in message or 'script' in message)

# Tag each conversation: True if any of its (lowercased) messages looks like code.
open_assistant_en = open_assistant_en.map(
    lambda example: {
        'could_be_code': any(detect_code(turn['content'].lower()) for turn in example['messages'])
    }
)

In [None]:
# Keep only the conversations in which no message was flagged as possible code.
open_assistant_en = open_assistant_en.filter(lambda row: not row['could_be_code'])

# Translate to German

In [None]:
import os

# Inference Endpoint that the translation requests are posted to.
API_URL = 'https://ecfcd7jkenav3ri3.us-east-1.aws.endpoints.huggingface.cloud'
# Never commit access tokens. Read the token from the HF_TOKEN environment
# variable, or prompt for it without echoing. The token that used to be
# hard-coded here must be treated as leaked and revoked.
bearer_token = os.environ.get('HF_TOKEN') or getpass.getpass('Hugging Face token: ')

In [None]:
# HTTP headers sent with every endpoint request: bearer authentication and a JSON body.
HEADERS = {
    "Authorization": f"Bearer {bearer_token}",
    "Content-Type": "application/json"
}

### Single thread

In [None]:

# Tokenizer of the en->de opus-mt model; used below to count tokens per message.
tokenizer = AutoTokenizer.from_pretrained('Helsinki-NLP/opus-mt-en-de')
# Token count above which a message gets split into chunks.
# NOTE(review): presumably chosen to stay under the model's input limit — confirm.
MAX_LEN_TOKENIZED = 508


def split_long_message(text, max_len):
    """Split ``text`` on whitespace into chunks of at most ``max_len`` characters.

    Words are packed greedily and joined with single spaces, so runs of
    whitespace (including newlines) in ``text`` collapse to one space. A single
    word longer than ``max_len`` is kept whole in its own chunk.

    Args:
        text: the string to split.
        max_len: maximum chunk length, in characters.

    Returns:
        A list of chunk strings; an empty list when ``text`` has no words.
    """
    words = text.split()
    if not words:
        # Empty or whitespace-only input used to raise StopIteration.
        return []

    lines, current = [], words[0]
    for word in words[1:]:
        # +1 accounts for the joining space.
        if len(current) + 1 + len(word) > max_len:
            lines.append(current)
            current = word
        else:
            current += " " + word
    lines.append(current)
    return lines

def split_if_too_long(message, tokenizer, max_len_tokenized=MAX_LEN_TOKENIZED, max_len_text=500):
    """Return ``message`` as is, or split into chunks when it tokenizes too long.

    Args:
        message: the text to check.
        tokenizer: object whose ``encode(message)`` returns the token sequence.
        max_len_tokenized: token count above which the message is split.
        max_len_text: maximum chunk length in characters, used when splitting.

    Returns:
        The original string when it fits, otherwise a list of string chunks
        from ``split_long_message``. Callers must handle both types.
    """
    n_tokens = len(tokenizer.encode(message))
    if n_tokens <= max_len_tokenized:
        return message
    return split_long_message(message, max_len_text)

In [None]:
def query(payload):
    """POST ``payload`` as JSON to ``API_URL`` and return the decoded JSON reply."""
    # payload["model"] = {'image': {'custom': {'env': {"MAX_CONCURRENT_REQUESTS": MAX_WORKERS, 'CUDA_LAUNCH_BLOCKING':'1'}}}}
    return requests.post(API_URL, headers=HEADERS, json=payload).json()


def translate(message):
    """Send ``message`` to the endpoint as the ``"inputs"`` field and return the raw reply."""
    payload = {"inputs": message}
    return query(payload)

In [None]:
# Smoke test: send a one-sentence list through the endpoint.
translate(['Hello I eat.'])

In [None]:
# NOTE(review): translate_conversation is not defined anywhere in the visible
# part of this notebook; this cell raises NameError unless it is defined elsewhere.
translate_conversation([(False, 'Hello I eat.'), (True, ['Hello, I eat.', 'Hello, I eat.'])])

### With concurrent requests

In [None]:
# Tokenizer used by split_if_too_long to count tokens per message.
tokenizer = AutoTokenizer.from_pretrained('Helsinki-NLP/opus-mt-en-de')

# Working copy of the conversations; the message dicts inside get mutated
# (content possibly split, 'translation' added) by the async requests below.
all_conversations = open_assistant_en['messages'].copy()

In [None]:
import asyncio
from aiohttp import ClientSession, ClientTimeout
import time

async def request(document, semaphore):
    """Translate one message through the endpoint and store the result on it.

    Args:
        document: message dict. ``document['content']`` is either a string or
            a list of string chunks. On success the translated text is written
            to ``document['translation']``.
        semaphore: asyncio semaphore bounding the number of in-flight requests.

    Returns:
        The decoded JSON reply, or ``None`` when the reply could not be decoded
        or did not have the expected shape (a message is printed in that case).
    """
    from aiohttp import ClientError

    # Semaphore guard
    async with semaphore:
        payload = {
            "inputs": document['content'],
            "truncate": True,
            'CUDA_LAUNCH_BLOCKING':'1',
            "model": {'image': {'custom': {'env': {"MAX_CONCURRENT_REQUESTS": "512", 'CUDA_LAUNCH_BLOCKING':'1'}}}}
        }

        timeout = ClientTimeout(total=200)  # Overall budget per request, in seconds

        # Stays None when the body is not JSON, so the shape check below
        # reports the failure instead of hitting an unbound name.
        result = None
        async with ClientSession(timeout=timeout, headers=HEADERS) as session:
            async with session.post(API_URL, json=payload) as resp:
                # The HTTP status is not checked here; an error reply surfaces
                # below as a decoding or shape error.
                try:
                    result = await resp.json()
                except (ClientError, ValueError):
                    # resp.text() is a coroutine: it must be awaited to get the body.
                    print(await resp.text())
        try:
            if isinstance(document['content'], list):
                # Chunks were split on whitespace, so re-join them with a space.
                document['translation'] = ' '.join(el['translation_text'] for el in result)
            else:
                document['translation'] = result[0]['translation_text']
        except (KeyError, IndexError, TypeError):
            print("Error on", document)
            return None
        return result

async def call_all(conversations):
    """Translate every message of every conversation, at most 16 requests at a time.

    Each message's ``'content'`` is first replaced in place by the output of
    ``split_if_too_long``; ``request`` is then scheduled for it. Nothing is
    returned: results are written onto the message dicts by ``request``.
    """
    # Caps the number of concurrent requests; adjust the number as needed.
    limiter = asyncio.BoundedSemaphore(16)

    # One coroutine per message.
    pending = []
    for conversation in conversations:
        for message in conversation:
            message['content'] = split_if_too_long(message['content'], tokenizer)
            pending.append(request(message, limiter))

    # Await the requests as they finish, with a tqdm progress bar.
    for finished in tqdm(asyncio.as_completed(pending), total=len(pending)):
        await finished

In [None]:
# Time the full translation run.
start = time.perf_counter()

# Top-level await: this cell relies on an environment that allows it
# (e.g. a notebook); it is a syntax error in a plain Python script.
await call_all(all_conversations)

# Print elapsed time
elapsed_time = time.perf_counter() - start
minutes, seconds = divmod(elapsed_time, 60)
print(f"{int(minutes)} min {seconds:.2f} sec")

### Export results

In [None]:
import copy
import pickle

# Take an independent snapshot of the conversations and persist it with pickle.
cop = copy.deepcopy(all_conversations)

# NOTE: only unpickle this file from a trusted source.
with open('data.obj', 'wb') as file:
    pickle.dump(cop, file)

Check missing translations:

In [None]:
# Count the messages that have no 'translation' key or an empty translation.
counter = sum(
    1
    for conversation in cop
    for message in conversation
    if 'translation' not in message or len(message['translation']) == 0
)
print("Number of missing translations:", counter)

In [None]:
# Rebuild each conversation with the translation as the message content,
# keeping the original role. Raises KeyError if a message has no 'translation'.
translated_messages = [
    [{'role': message['role'], 'content': message['translation']} for message in conversation]
    for conversation in cop
]

In [None]:
# Attach the translated conversations as a new 'messages_german' column
# (translated_messages must line up row by row with the dataset).
open_assistant_en = open_assistant_en.add_column('messages_german', translated_messages)

In [None]:
# Mark the source-language columns as "original" next to the German ones.
open_assistant_en = open_assistant_en.rename_column("messages", "messages_original")
open_assistant_en = open_assistant_en.rename_column("lang", "lang_original")

In [None]:
from huggingface_hub import login

# Never hard-code access tokens in a notebook. Called without arguments,
# login() prompts for the token. The token previously written here must be
# treated as leaked and revoked.
login()

In [None]:
# Publish the dataset built above. The previous name `open_assistant_en_2`
# is not defined anywhere in this notebook.
open_assistant_en.push_to_hub('A-Roucher/Open_Assistant_Chains_German_Translation')

# Test Inference Endpoint performance

In [None]:
# Measure the average endpoint latency per message over the first n_messages messages.
import time
n_messages=1
t = time.time()
for i in range(n_messages):
    print(translate(all_messages_list[i]))
# Average seconds per message.
print((time.time() - t)/n_messages)

Batched version:

In [None]:
from datasets import Dataset
# Five-row sample of the dataset for quick experiments.
small = Dataset.from_dict(open_assistant_en[:5])

In [None]:
open_assistant_en

In [None]:
# Tokenize every message and discard the result; only the tqdm progress/throughput is observed.
for message in tqdm(all_messages_list):
    tokenizer.encode(message)

In [None]:
import matplotlib.pyplot as plt

# Character length of every message, as a pandas Series.
all_lengths = pd.Series([len(text) for text in all_messages_list])

In [None]:
all_lengths.min(), all_lengths.max()

In [None]:
# Print the very short messages (fewer than 10 characters) for inspection.
for short_message in (text for text in all_messages_list if len(text) < 10):
    print(short_message)

In [None]:
# NOTE(review): translate_conversation is not defined anywhere in the visible
# part of this notebook; this cell raises NameError unless it is defined elsewhere.
open_assistant_en = open_assistant_en.map(lambda example: {'translated_messages': translate_conversation(example['messages'])})

In [None]:
# Persist the dataset locally in the `datasets` on-disk format.
open_assistant_en.save_to_disk("translations.hf")

In [None]:
from more_itertools import chunked

t = time.time()

# Translate in batches of 8 and write one translation per line.
# UTF-8 is set explicitly so German characters do not depend on the platform default.
with open('translation2.txt', 'w', encoding='utf-8') as output_file:
    for batch in tqdm(chunked(all_messages_list, 8)):
        for el in translate(batch):
            output_file.write(f"{el['translation_text']}\n")

# Average seconds per message. The divisor used to be `n_examples`, which is
# only defined in a later cell and does not match the number of messages sent here.
print((time.time() - t) / len(all_messages_list))

Translation time scales more or less linearly with the number of messages, so batching messages does not help. At this pace it would take 10 hours to translate everything, so we have to switch to a faster model.

In [None]:
# open_assistant_en = open_assistant_en.map(lambda example: {'messages_de': translate_conversation(example['messages'])})

### Test local translation

In [None]:
from transformers import pipeline

# Local translation pipeline with the en->de opus-mt model.
pipe = pipeline(
    "translation",
    model= 'Helsinki-NLP/opus-mt-en-de',
)
# Number of messages used for the timing below.
n_examples = 12

# KeyDataset(dataset, "text")
t = time.time()
# NOTE(review): pipe(...) on a list presumably returns only once every item is
# translated, so the tqdm bar would not show translation progress — confirm.
for out in tqdm(pipe(all_messages_list[:n_examples], batch_size=4)):
    print(out)
# Average seconds per message.
print((time.time()-t)/n_examples)

In [None]:
# Write one entry of `lines` per line to output.txt.
# NOTE(review): `lines` is not defined anywhere in the visible part of this
# notebook; this cell raises NameError unless it is defined elsewhere.
with open('output.txt', 'w') as f:
    for line in lines:
        f.write(f"{line}\n")

### NLLB Distilled model

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# NLLB identifies languages by FLORES-200 codes ("eng_Latn", "deu_Latn"),
# not by two-letter ISO codes such as "en" / "de".
tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M", src_lang="eng_Latn", tgt_lang="deu_Latn")
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M")

In [None]:
def translate(sentences):
    """Translate a batch of sentences to German with the local NLLB model.

    Args:
        sentences: list of strings, tokenized together as one padded batch.

    Returns:
        The list of decoded translations. They are also printed, one per line.
    """
    # The whole list goes through generate() as a single padded batch.
    # (A sentence-by-sentence variant was tried as well; it differs only in
    # calling the tokenizer and generate() once per sentence.)
    inputs = tokenizer(sentences, return_tensors="pt", padding=True)
    output_sequences = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        do_sample=False,  # disable sampling to test if batching affects output
        # Force German as the first generated token. convert_tokens_to_ids
        # replaces the deprecated tokenizer.lang_code_to_id mapping.
        forced_bos_token_id=tokenizer.convert_tokens_to_ids('deu_Latn'),
    )
    outputTranslation = tokenizer.batch_decode(output_sequences, skip_special_tokens=True)

    print("### " + " Output Translation")
    for sentence in outputTranslation:
        print(sentence)

    print("#### translate() returning.")
    return outputTranslation

In [None]:
# Time the local NLLB translate() on the first n_examples messages.
t=time.time()
n_examples = 12
translate(all_messages_list[:n_examples])
# Average seconds per message.
print((time.time()-t)/n_examples)

10.23 s/example without batching, 4.66 s/example with it!

In [None]:
# NLLB is many-to-many, so the translation pipeline must be told the language
# pair, using FLORES-200 codes.
pipe = pipeline(
    "translation",
    model='facebook/nllb-200-distilled-600M',  # 'Helsinki-NLP/opus-mt-en-de',
    src_lang='eng_Latn',
    tgt_lang='deu_Latn',
)