# Setup

## Imports

In [110]:
from tqdm import tqdm
import os
import json

from datasets import (
    load_dataset,
    concatenate_datasets,
    load_from_disk,
    Features,
    Sequence,
    Value,
    logging,
)

## Logging

In [111]:
# `logging` here is the `datasets` logging module (see the imports above):
# only report errors from the library and switch off its progress bars.
logging.set_verbosity_error()
logging.disable_progress_bar()

# Data

## Load datasets

In [112]:
def read_annotations_from_file(path: str, file: str):
    """Load the "Annotations" field of one JSON file as a dataset.

    Each annotation role is declared as a variable-length sequence of strings.
    A "FileName" column holding ``file`` is appended to every row.

    Args:
        path: Directory containing the JSON file.
        file: Name of the JSON file inside ``path``.

    Returns:
        The loaded dataset with the additional "FileName" column.
    """
    role_names = (
        "PTC",
        "Evidence",
        "Medium",
        "Topic",
        "Cue",
        "Addr",
        "Message",
        "Source",
    )
    # All roles share the same schema, so build the feature mapping in one go.
    schema = Features(
        {
            role: Sequence(feature=Value(dtype="string", id=None), length=-1, id=None)
            for role in role_names
        }
    )
    annotations = load_dataset(
        "json",
        data_files=os.path.join(path, file),
        field="Annotations",
        split="train",
        features=schema,
    )
    file_names = [file] * len(annotations)
    return annotations.add_column("FileName", file_names)

In [113]:
def read_sentences_from_file(path: str, file: str):
    """Load the "Sentences" field of one JSON file as a dataset.

    Two columns are added: "FileName" (the value of ``file`` on every row) and
    "Sentence" (the row's "Tokens" joined with single spaces).

    Args:
        path: Directory containing the JSON file.
        file: Name of the JSON file inside ``path``.

    Returns:
        The loaded dataset with the two additional columns.
    """
    json_path = os.path.join(path, file)
    sentences = load_dataset(
        "json", data_files=json_path, field="Sentences", split="train"
    )
    sentences = sentences.add_column("FileName", [file] * len(sentences))
    joined_tokens = [" ".join(tokens) for tokens in sentences["Tokens"]]
    return sentences.add_column("Sentence", joined_tokens)


In [114]:
def read_annotations_from_path(path: str):
    """Load the annotations of every file in ``path`` into one dataset.

    Files are processed in sorted name order and their annotation datasets are
    concatenated in that order.

    Args:
        path: Directory whose entries are all read as annotation JSON files.

    Returns:
        The concatenated annotations dataset, or ``None`` when the directory
        contains no entries.
    """
    per_file = [
        read_annotations_from_file(path, file)
        for file in tqdm(sorted(os.listdir(path)))
    ]
    if not per_file:
        return None
    # Concatenate once at the end: growing the result file by file re-copies
    # the accumulated table each time, and testing the accumulator for
    # truthiness would mistake a zero-row dataset for "nothing loaded yet".
    return concatenate_datasets(per_file)


In [115]:
def read_sentences_from_path(path: str):
    """Load the sentences of every file in ``path`` into one dataset.

    Files are processed in sorted name order and their sentence datasets are
    concatenated in that order. A running "id" column (0 .. n-1 over the
    concatenated rows) is added afterwards.

    Args:
        path: Directory whose entries are all read as sentence JSON files.

    Returns:
        The concatenated sentences dataset with the additional "id" column.

    Raises:
        ValueError: if ``path`` contains no entries.
    """
    per_file = [
        read_sentences_from_file(path, file)
        for file in tqdm(sorted(os.listdir(path)))
    ]
    if not per_file:
        # Fail with a clear message instead of an AttributeError on None.
        raise ValueError("no sentence files found in " + repr(path))

    # Concatenate once at the end rather than re-copying the accumulated
    # dataset for every file.
    dataset = concatenate_datasets(per_file)
    return dataset.add_column("id", list(range(len(dataset))))

In [116]:
def read_sentences_dataset(ds_name: str):
    """Return the sentences dataset for the split ``ds_name``.

    A copy previously saved under ./transformed_datasets/<ds_name>/sentences is
    loaded when that directory exists. Otherwise the split is read from the
    SpkAtt-2023 data directory ("task1_test/" for "eval", "task1/" for every
    other name), saved to that location and returned.
    """
    cache_dir = f"./transformed_datasets/{ds_name}/sentences"
    if os.path.isdir(cache_dir):
        return load_from_disk(cache_dir)

    task_dir = "task1_test/" if ds_name == "eval" else "task1/"
    sentences = read_sentences_from_path(f"./SpkAtt-2023/data/{ds_name}/{task_dir}")
    os.makedirs(cache_dir, exist_ok=True)
    sentences.save_to_disk(cache_dir)
    return sentences

In [117]:
def read_annotations_dataset(ds_name: str):
    """Return the annotations dataset for the split ``ds_name``.

    A copy previously saved under ./transformed_datasets/<ds_name>/annotations
    is loaded when that directory exists. Otherwise the split is read from the
    SpkAtt-2023 data directory ("task1_test/" for "eval", "task1/" for every
    other name), saved to that location and returned.
    """
    cache_dir = f"./transformed_datasets/{ds_name}/annotations"

    if not os.path.isdir(cache_dir):
        suffix = "_test/" if ds_name == "eval" else "/"
        source_dir = f"./SpkAtt-2023/data/{ds_name}/task1{suffix}"
        annotations = read_annotations_from_path(source_dir)
        os.makedirs(cache_dir, exist_ok=True)
        annotations.save_to_disk(cache_dir)
        return annotations

    return load_from_disk(cache_dir)

In [118]:
# Sentence datasets for the three splits; each call loads the saved copy from
# ./transformed_datasets when present and otherwise builds and saves it.
train_sentences_dataset = read_sentences_dataset("train")
val_sentences_dataset = read_sentences_dataset("dev")
test_sentences_dataset = read_sentences_dataset("eval")


In [119]:
# Annotation datasets are loaded for "train" and "dev" only; no annotations
# dataset is loaded for the "eval" split here.
train_annotations_dataset = read_annotations_dataset("train")
val_annotations_dataset = read_annotations_dataset("dev")

## Format datasets for usage in langchain

In [120]:
def get_text_from_label(train_sentences_dataset, row, annotations):
    """Resolve "sentence_id:token_index" labels to the tokens they point at.

    Labels that refer to ``row`` itself are read from ``row["Tokens"]``. Labels
    that refer to another sentence are resolved by filtering
    ``train_sentences_dataset`` for the row with the same "FileName" and the
    referenced "SentenceId"; the first match is used.

    Args:
        train_sentences_dataset: Sentence dataset to search for other
            sentences. It is only accessed when a label points outside ``row``.
        row: Sentence row providing "SentenceId", "Tokens" and "FileName".
        annotations: Iterable of "sentence_id:token_index" strings.

    Returns:
        A list with one token per label, in label order.

    Raises:
        IndexError: if a referenced sentence is not found or a token index is
            out of range (raised by the failing index lookup).
    """
    file_name = row["FileName"]
    # Token lists resolved so far, keyed by sentence id. Seeding it with the
    # current row means labels inside the row never touch the dataset, and a
    # span of several tokens in another sentence costs one filter pass instead
    # of one per token.
    tokens_by_sentence = {row["SentenceId"]: row["Tokens"]}

    tokens = []
    for anno in annotations:
        parts = anno.split(":")
        sentence_id = int(parts[0])
        token_index = int(parts[1])
        if sentence_id not in tokens_by_sentence:
            matches = train_sentences_dataset.filter(
                lambda r: r["FileName"] == file_name
                and r["SentenceId"] == sentence_id
            )
            tokens_by_sentence[sentence_id] = matches[0]["Tokens"]
        tokens.append(tokens_by_sentence[sentence_id][token_index])
    return tokens


In [121]:
def build_complete_dataset(sentences_dataset, annotations_dataset, dataset_name):
    """Combine sentences, a look-ahead context window and annotations.

    A copy previously saved under ./transformed_datasets/<dataset_name>/complete
    is loaded when that directory exists. Otherwise the dataset is built, saved
    to that location and returned.

    Columns added to ``sentences_dataset``:

    * "sentence_extended", "tokens_extended", "sentence_extended_ids": the
      sentence followed by up to two following rows that have the same
      "FileName", as text, as tokens, and as the "SentenceId" of every token.
    * Only when ``annotations_dataset`` is not None, for each role (ptc,
      evidence, medium, topic, cue, addr, message, source): "<role>" holds one
      list of "sentence_id:token_index" labels per annotation of the sentence,
      and "<role>_mapped" holds the corresponding tokens. Sentences without
      annotations get an empty list in every role column.

    Annotations are consumed front to back: an annotation is attached to a
    sentence when the sentence id in its first "Cue" label equals the row's
    "SentenceId".
    NOTE(review): this relies on the annotations being ordered like the
    sentences and does not compare "FileName" — confirm against the data.

    Args:
        sentences_dataset: Dataset with "Sentence", "Tokens", "SentenceId" and
            "FileName" columns.
        annotations_dataset: Dataset with one column per role, or None to skip
            the annotation columns.
        dataset_name: Split name used in the save/load location.

    Returns:
        The extended dataset.
    """
    path_to_dataset = "./transformed_datasets/" + dataset_name + "/complete"
    if os.path.isdir(path_to_dataset):
        return load_from_disk(path_to_dataset)

    # Annotation column names, in the order the output columns are added.
    roles = ("PTC", "Evidence", "Medium", "Topic", "Cue", "Addr", "Message", "Source")
    labels = {role: [] for role in roles}
    mapped = {role: [] for role in roles}
    sentence_extended, tokens_extended, sentence_extended_ids = [], [], []

    num_sentences = len(sentences_dataset)
    num_annotations = 0 if annotations_dataset is None else len(annotations_dataset)
    index_in_anno_ds = 0

    for i, row in enumerate(tqdm(sentences_dataset)):
        context = row["Sentence"]
        # Copy so that growing the context window leaves the row's own token
        # list, which get_text_from_label indexes, untouched.
        tokens = list(row["Tokens"])
        ids = [row["SentenceId"]] * len(tokens)

        # Append up to two following rows; each is checked on its own and only
        # used when it belongs to the same file.
        for j in (i + 1, i + 2):
            if j >= num_sentences:
                break
            following = sentences_dataset[j]
            if following["FileName"] != row["FileName"]:
                continue
            context = context + " " + following["Sentence"]
            tokens.extend(following["Tokens"])
            ids.extend([following["SentenceId"]] * len(following["Tokens"]))

        sentence_extended.append(context)
        tokens_extended.append(tokens)
        sentence_extended_ids.append(ids)

        if annotations_dataset is None:
            continue

        # Collect every pending annotation whose first cue label points at
        # this sentence; the lists stay empty when there is none.
        row_labels = {role: [] for role in roles}
        row_mapped = {role: [] for role in roles}
        while index_in_anno_ds < num_annotations:
            annotation = annotations_dataset[index_in_anno_ds]
            cue_sentence_id = int(annotation["Cue"][0].split(":")[0])
            if cue_sentence_id != row["SentenceId"]:
                break
            for role in roles:
                row_labels[role].append(annotation[role])
                row_mapped[role].append(
                    get_text_from_label(sentences_dataset, row, annotation[role])
                )
            index_in_anno_ds += 1

        for role in roles:
            labels[role].append(row_labels[role])
            mapped[role].append(row_mapped[role])

    res = sentences_dataset.add_column("sentence_extended", sentence_extended)
    res = res.add_column("tokens_extended", tokens_extended)
    res = res.add_column("sentence_extended_ids", sentence_extended_ids)

    if annotations_dataset is not None:
        for role in roles:
            column = role.lower()
            res = res.add_column(column, labels[role])
            res = res.add_column(column + "_mapped", mapped[role])

    os.makedirs(path_to_dataset, exist_ok=True)
    res.save_to_disk(path_to_dataset)

    return res


In [122]:
# Build (or load the saved copy of) the combined dataset for each split.
train_ds = build_complete_dataset(
    train_sentences_dataset, train_annotations_dataset, "train"
)
val_ds = build_complete_dataset(val_sentences_dataset, val_annotations_dataset, "dev")
# The eval split is built without annotations, so it gets no role columns.
test_ds = build_complete_dataset(test_sentences_dataset, None, "eval")


In [123]:
# Test sentences with the "Sentence" column renamed to "Satz".
# NOTE(review): presumably the name a later prompt template expects — confirm.
inputs = test_sentences_dataset.rename_column("Sentence", "Satz")


## Dataset Showcase

In [124]:
# Example row: a sentence with a single annotation that has a cue but no roles.
train_ds[52]


{'Tokens': ['-',
  'Letzter',
  'Redner',
  'in',
  'der',
  'Debatte',
  ':',
  'Bernd',
  'Westphal',
  'für',
  'die',
  'SPD-Fraktion',
  '.'],
 'SentenceId': 52,
 'FileName': '19002_Zusatzpunkt_3_CDUCSU_Jung_ID19209800_21.11.2017.json',
 'Sentence': '- Letzter Redner in der Debatte : Bernd Westphal für die SPD-Fraktion .',
 'id': 52,
 'sentence_extended': '- Letzter Redner in der Debatte : Bernd Westphal für die SPD-Fraktion .',
 'tokens_extended': ['-',
  'Letzter',
  'Redner',
  'in',
  'der',
  'Debatte',
  ':',
  'Bernd',
  'Westphal',
  'für',
  'die',
  'SPD-Fraktion',
  '.'],
 'sentence_extended_ids': [52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52, 52],
 'ptc': [[]],
 'ptc_mapped': [[]],
 'evidence': [[]],
 'evidence_mapped': [[]],
 'medium': [[]],
 'medium_mapped': [[]],
 'topic': [[]],
 'topic_mapped': [[]],
 'cue': [['52:5']],
 'cue_mapped': [['Debatte']],
 'addr': [[]],
 'addr_mapped': [[]],
 'message': [[]],
 'message_mapped': [[]],
 'source': [[]],
 'source_mapped': 

In [125]:
# Example row: a long sentence (row 15 of the training split).
train_ds[15]


{'Tokens': ['Dazu',
  'muss',
  'man',
  'nur',
  'mit',
  'den',
  'Landwirten',
  'sprechen',
  ',',
  'die',
  'sagen',
  ':',
  'Ja',
  ',',
  'auch',
  'früher',
  'gab',
  'es',
  'extreme',
  'Ereignisse',
  ',',
  'auch',
  'früher',
  'gab',
  'es',
  'Naturkatastrophen',
  ',',
  'aber',
  'in',
  'einem',
  'Jahr',
  'den',
  'Hagel',
  ',',
  'im',
  'anderen',
  'Jahr',
  'eine',
  'Dürre',
  'und',
  'im',
  'dritten',
  'Jahr',
  ',',
  'wie',
  'in',
  'diesem',
  'Jahr',
  ',',
  'die',
  'Frostschäden',
  ',',
  'unter',
  'denen',
  'die',
  'Obstbauern',
  'zu',
  'leiden',
  'hatten',
  ',',
  'diese',
  'Häufung',
  'hatten',
  'wir',
  'früher',
  'so',
  'nicht',
  ',',
  'also',
  'tut',
  'etwas',
  'gegen',
  'den',
  'Klimawandel',
  '.'],
 'SentenceId': 15,
 'FileName': '19002_Zusatzpunkt_3_CDUCSU_Jung_ID19209800_21.11.2017.json',
 'Sentence': 'Dazu muss man nur mit den Landwirten sprechen , die sagen : Ja , auch früher gab es extreme Ereignisse , auch früh

## Build lmsys format json

In [126]:
def map_cues_to_string(mapped):
    """Render a list of cues as "[w1, w2], [w3]".

    Each cue is a list of words; the words are joined with ", " and wrapped in
    square brackets, and the bracketed cues are joined with ", ". An empty list
    yields the placeholder "#UNK#".
    """
    if mapped == []:
        return "#UNK#"
    bracketed = (f"[{', '.join(words)}]" for words in mapped)
    return ", ".join(bracketed)


In [127]:
def map_roles_to_string(mapped):
    """Join the words of one role with ", "; an empty list yields "#UNK#"."""
    return ", ".join(mapped) if mapped != [] else "#UNK#"


In [128]:
def build_lmsys_format(train_ds, val_ds, output_path="lmsys.json"):
    """Write the train and dev rows as lmsys-style conversations to a JSON file.

    Rows of ``train_ds`` followed by ``val_ds`` are converted as follows:

    * A row without cues produces one conversation with two turns: the cue
      extraction prompt and the answer "Cues: #UNK#".
    * A row with cues produces one conversation per cue with four turns: the
      same two cue turns (listing all cues of the sentence), then a request for
      the roles of that cue on the extended text, and the answer listing the
      cue and its seven roles.

    Conversations get the ids "identity_0", "identity_1", ... in output order.

    Args:
        train_ds: Dataset with "Sentence", "sentence_extended", "cue_mapped"
            and the "<role>_mapped" columns.
        val_ds: Dataset with the same columns, appended after ``train_ds``.
        output_path: File the JSON list is written to (UTF-8, indent 3).

    Returns:
        None.
    """
    # NOTE(review): the prompt text is kept exactly as it was (including the
    # "cuee" spelling) because prompts used at inference time may need to match
    # the training data character for character — confirm before correcting it.
    cue_prompt = (
        "A cue is the lexical items in a sentence that indicate that speech, writing, or thought is being reproduced.\n"
        "I want you to extract all cuee in the text below.\n"
        "If you find multiple words for one cue, you output them separated by commas.\n"
        "If no cue can be found in the given text, you output the string #UNK# as cue.\n"
        "Now extract all cues from the following sentence.\n"
        'Use the prefix "Cues: ".\n'
        "Sentence: "
    )
    # Roles listed in the final answer, in output order.
    role_names = ("ptc", "evidence", "medium", "topic", "addr", "message", "source")

    result = []
    for row in concatenate_datasets([train_ds, val_ds]):
        # The first two turns are the same for every conversation of a row.
        cue_turns = [
            {"from": "human", "value": cue_prompt + row["Sentence"]},
            {
                "from": "gpt",
                "value": "Cues: " + map_cues_to_string(row["cue_mapped"]),
            },
        ]

        if not row["cue_mapped"]:
            result.append(
                {"id": "identity_" + str(len(result)), "conversations": cue_turns}
            )
            continue

        for i, cue in enumerate(row["cue_mapped"]):
            cue_text = ", ".join(cue)
            role_lines = "".join(
                "\n" + name + ": " + map_roles_to_string(row[name + "_mapped"][i])
                for name in role_names
            )
            role_turns = [
                {
                    "from": "human",
                    "value": "Now I give you again the sentence only in addition with the two following sentences, because the roles can be partially contained in the following sentences.\nText: "
                    + row["sentence_extended"]
                    + "\n\nNow find all roles in the sentence associated with the cue '"
                    + cue_text
                    + "' you found in the beginning sentence.",
                },
                {"from": "gpt", "value": "cue: " + cue_text + role_lines},
            ]
            # Every conversation is appended exactly once, so the running id
            # equals the number of conversations created so far.
            result.append(
                {
                    "id": "identity_" + str(len(result)),
                    "conversations": cue_turns + role_turns,
                }
            )

    with open(output_path, "w", encoding="utf8") as outfile:
        json.dump(result, outfile, indent=3, ensure_ascii=False)

In [129]:
# Write the fine-tuning conversations for train + dev to lmsys.json.
build_lmsys_format(train_ds, val_ds)


# QLoRA Fine-Tuning