# Set Up Data


The parsing code below is adapted from [this mangoes notebook](https://gitlab.inria.fr/magnet/mangoes/-/blob/master/notebooks/BERT%20for%20Co-reference%20Resolution%20-%20Ontonotes.ipynb).

Use [this page](https://cemantix.org/conll/2012/data.html) to get the OntoNotes data in CoNLL format. **Open it in Chrome, not Firefox.**

In [1]:
import os
import glob
import time
from mangoes.modeling import BERTForCoreferenceResolution, MangoesCoreferenceDataset
from transformers import BertTokenizerFast

In [15]:
def normalize_word(word, language):
    """
    Normalizes one raw token read from a CoNLL file.

    For Arabic, only the text before the first "#" is kept; a token without any "#" is kept whole.
    The escaped punctuation tokens "/.", "/?" and "/-" lose their leading slash.

    args:
        word: the raw token string
        language: language name; only "arabic" triggers the "#" handling

    returns:
        the normalized token string
    """
    if language == "arabic":
        # partition() leaves the word untouched when "#" is absent; slicing with
        # word.find("#") == -1 would silently drop the last character instead.
        word = word.partition("#")[0]
    if word in ("/.", "/?", "/-"):
        return word[1:]
    return word
    

def _cluster_label(open_clusters):
    """Turns the currently open cluster ids into one label: an int for a single id, otherwise a tuple of ints."""
    if len(open_clusters) == 1:
        return int(open_clusters[0])
    return tuple(int(item) for item in open_clusters)


def parse_document(path, language="english"):
    """
    Parses a single data file
    Returns the data from whatever documents are in the file.

    args:
        path: path of the file to read; it is decoded as UTF-8
        language: passed to normalize_word for every token

    returns:
        words: Lists of Lists of Lists of strings. list of sentences. One sentence is a list of words.
        cluster_ids: Lists of Lists of Lists of ints or tuple(ints). Words that aren't mentions have -1 as id,
            words inside a single open mention have that cluster's int id, and words inside several
            open mentions have a tuple of the open cluster ids.
        speaker_ids: Lists of Lists of Lists of strings. The raw value of field 9 (0-based) of each token line.
        doc_keys: List of document keys, each suffixed with "_<position of the document in the file>".
    """
    doc_keys = []
    doc_sents = []
    doc_cluster_ids = []
    doc_speaker_ids = []
    sentences, sentence_cluster_ids, sentence_speaker_ids = [], [], []
    cur_sentence_words, cur_sentence_cluster_ids, cur_sentence_speaker_ids = [], [], []
    # ids (as strings) of the mentions that have been opened with "(" and not yet closed with ")"
    current_clusters = []
    # Explicit encoding so the result does not depend on the machine's locale.
    with open(path, "r", encoding="utf-8") as input_file:
        for line in input_file:
            if line.startswith("#begin document"):
                # The third field looks like "(<doc key>);": drop the ";" and then the parentheses.
                doc_key = line.split()[2][:-1]
                doc_keys.append(doc_key[1:-1])
                # A mention left unclosed in the previous document must not leak into this one.
                current_clusters = []
                continue

            if line.startswith("#end document"):
                assert len(sentences) == len(sentence_cluster_ids) == len(sentence_speaker_ids)
                # The last sentence must have been terminated by a blank line.
                assert cur_sentence_words == []
                doc_sents.append(sentences)
                doc_cluster_ids.append(sentence_cluster_ids)
                doc_speaker_ids.append(sentence_speaker_ids)
                sentences, sentence_cluster_ids, sentence_speaker_ids = [], [], []
                continue

            data = line.split()
            if not data:
                # A blank line closes the current sentence.
                sentences.append(cur_sentence_words)
                sentence_cluster_ids.append(cur_sentence_cluster_ids)
                sentence_speaker_ids.append(cur_sentence_speaker_ids)
                cur_sentence_words, cur_sentence_cluster_ids, cur_sentence_speaker_ids = [], [], []
                continue

            cur_sentence_words.append(normalize_word(data[3], language))
            cur_sentence_speaker_ids.append(data[9])
            raw_cluster_id = data[-1]
            if raw_cluster_id == "-":
                # No mention starts or ends on this word: it inherits whatever is open.
                cluster_id = _cluster_label(current_clusters) if current_clusters else -1
            else:
                parts = raw_cluster_id.split("|")
                # Open the new mentions first so this word is labelled with them ...
                for part in parts:
                    if "(" in part:
                        current_clusters.append(part[1:-1] if ")" in part else part[1:])
                cluster_id = _cluster_label(current_clusters)
                # ... and close the finished ones only after the word has been labelled.
                for part in parts:
                    if ")" in part:
                        current_clusters.remove(part[1:-1] if "(" in part else part[:-1])
            cur_sentence_cluster_ids.append(cluster_id)

    # Every "#begin document" must have been matched by an "#end document".
    assert len(doc_sents) == len(doc_keys)
    doc_keys = [f"{key}_{i}" for i, key in enumerate(doc_keys)]
    return doc_sents, doc_cluster_ids, doc_speaker_ids, doc_keys

    
def parse_dataset(path):
    """
    Parses a directory of Ontonotes data files (train, dev, or test)

    args:
        path: glob pattern matching the data files. The genre of a file is the fourth-from-last
            "/"-separated component of its path.

    returns:
        words: Lists of Lists of Lists of strings. list of sentences. One sentence is a list of words.
        cluster_ids: Lists of Lists of Lists of ints or tuple(ints). Words that aren't mentions have either -1 as id
        speaker_ids: Lists of Lists of Lists of ints. Ids are assigned per document by numbering
            that document's distinct speaker values in sorted order.
        genres: List of genre strings, one per document.
        doc_keys: List of document keys.
        gen_to_id: dict mapping every genre string to an int id (genres numbered in sorted order).
    """
    dataset_sents = []
    dataset_clusters = []
    dataset_speakers = []
    dataset_genres = []
    dataset_doc_keys = []
    # glob returns matches in arbitrary order; sorting makes the document order reproducible.
    # A separate loop variable avoids shadowing the `path` argument.
    for file_path in sorted(glob.iglob(path)):
        genre = file_path.split("/")[-4]
        doc_sents, doc_cluster_ids, doc_speaker_ids, doc_keys = parse_document(file_path)
        assert len(doc_sents) == len(doc_cluster_ids) == len(doc_speaker_ids) == len(doc_keys)
        dataset_sents += doc_sents
        dataset_clusters += doc_cluster_ids
        dataset_genres += [genre] * len(doc_sents)
        dataset_doc_keys += doc_keys
        for speakers in doc_speaker_ids:
            # Number the speakers in sorted order: iterating a set of strings directly gives an
            # order that can change from one interpreter run to the next.
            speaker_names = sorted({name for sentence in speakers for name in sentence})
            speakers_to_ids = {name: i for i, name in enumerate(speaker_names)}
            dataset_speakers.append([[speakers_to_ids[name] for name in sentence] for sentence in speakers])
    gen_to_id = {g: i for i, g in enumerate(sorted(set(dataset_genres)))}
    return dataset_sents, dataset_clusters, dataset_speakers, dataset_genres, dataset_doc_keys, gen_to_id

In [16]:
# Parse every gold CoNLL file of the English training split; the genre-to-id mapping is not needed here.
(
    train_sents,
    train_clusters,
    train_speakers,
    train_genres,
    train_doc_keys,
    _,
) = parse_dataset(
    "data/raw/ontonotes/conll-2012/v5/data/train/data/english/annotations/*/*/*/*gold_conll"
)

In [27]:
# Record a document index for every sentence whose words carry more than one distinct cluster id.
valid_docids = []
for docid, doc_clusters in enumerate(train_clusters):
    for sent_clusters in doc_clusters:
        if len(set(sent_clusters)) > 1:
            # One entry per matching sentence; duplicates are collapsed by set() when counting.
            valid_docids.append(docid)

# Total number of documents, then the number of distinct documents recorded above.
print(len(train_clusters), len(set(valid_docids)))

11401 2775


# Scratchpad

In [8]:
# Glob pattern for files whose names end in "gold_conll", three directory levels below
# annotations/. NOTE(review): this starts with "../", unlike the pattern passed to
# parse_dataset earlier in the notebook — confirm which working directory is intended.
path = "../data/raw/ontonotes/conll-2012/v5/data/train/data/english/annotations/*/*/*/*gold_conll"
# glob.iglob(path)


In [9]:
# Print every file matched by the glob pattern stored in `path`.
# The loop uses its own variable: reusing `path` would overwrite the pattern with the last
# matched file, so re-running this cell would no longer list the dataset.
for conll_path in glob.iglob(path):
    print(conll_path)

../data/raw/ontonotes/conll-2012/v5/data/train/data/english/annotations/wb/dev_09_c2e/01/dev_09_c2e_0116.gold_conll
../data/raw/ontonotes/conll-2012/v5/data/train/data/english/annotations/wb/dev_09_c2e/01/dev_09_c2e_0105.gold_conll
../data/raw/ontonotes/conll-2012/v5/data/train/data/english/annotations/wb/dev_09_c2e/01/dev_09_c2e_0127.gold_conll
../data/raw/ontonotes/conll-2012/v5/data/train/data/english/annotations/wb/dev_09_c2e/01/dev_09_c2e_0118.gold_conll
../data/raw/ontonotes/conll-2012/v5/data/train/data/english/annotations/wb/dev_09_c2e/01/dev_09_c2e_0117.gold_conll
../data/raw/ontonotes/conll-2012/v5/data/train/data/english/annotations/wb/dev_09_c2e/01/dev_09_c2e_0111.gold_conll
../data/raw/ontonotes/conll-2012/v5/data/train/data/english/annotations/wb/dev_09_c2e/01/dev_09_c2e_0107.gold_conll
../data/raw/ontonotes/conll-2012/v5/data/train/data/english/annotations/wb/dev_09_c2e/01/dev_09_c2e_0113.gold_conll
../data/raw/ontonotes/conll-2012/v5/data/train/data/english/annotations/

../data/raw/ontonotes/conll-2012/v5/data/train/data/english/annotations/wb/sel/95/sel_9503.gold_conll
../data/raw/ontonotes/conll-2012/v5/data/train/data/english/annotations/wb/sel/95/sel_9581.gold_conll
../data/raw/ontonotes/conll-2012/v5/data/train/data/english/annotations/wb/sel/95/sel_9501.gold_conll
../data/raw/ontonotes/conll-2012/v5/data/train/data/english/annotations/wb/sel/95/sel_9543.gold_conll
../data/raw/ontonotes/conll-2012/v5/data/train/data/english/annotations/wb/sel/95/sel_9512.gold_conll
../data/raw/ontonotes/conll-2012/v5/data/train/data/english/annotations/wb/sel/95/sel_9556.gold_conll
../data/raw/ontonotes/conll-2012/v5/data/train/data/english/annotations/wb/sel/95/sel_9514.gold_conll
../data/raw/ontonotes/conll-2012/v5/data/train/data/english/annotations/wb/sel/95/sel_9593.gold_conll
../data/raw/ontonotes/conll-2012/v5/data/train/data/english/annotations/wb/sel/95/sel_9504.gold_conll
../data/raw/ontonotes/conll-2012/v5/data/train/data/english/annotations/wb/sel/95/

../data/raw/ontonotes/conll-2012/v5/data/train/data/english/annotations/tc/ch/00/ch_0058.gold_conll
../data/raw/ontonotes/conll-2012/v5/data/train/data/english/annotations/tc/ch/00/ch_0047.gold_conll
../data/raw/ontonotes/conll-2012/v5/data/train/data/english/annotations/tc/ch/00/ch_0022.gold_conll
../data/raw/ontonotes/conll-2012/v5/data/train/data/english/annotations/tc/ch/00/ch_0007.gold_conll
../data/raw/ontonotes/conll-2012/v5/data/train/data/english/annotations/tc/ch/00/ch_0046.gold_conll
../data/raw/ontonotes/conll-2012/v5/data/train/data/english/annotations/tc/ch/00/ch_0018.gold_conll
../data/raw/ontonotes/conll-2012/v5/data/train/data/english/annotations/tc/ch/00/ch_0012.gold_conll
../data/raw/ontonotes/conll-2012/v5/data/train/data/english/annotations/tc/ch/00/ch_0044.gold_conll
../data/raw/ontonotes/conll-2012/v5/data/train/data/english/annotations/tc/ch/00/ch_0037.gold_conll
../data/raw/ontonotes/conll-2012/v5/data/train/data/english/annotations/tc/ch/00/ch_0024.gold_conll
