# Set Up Data


Using code from [this mangoes notebook](https://gitlab.inria.fr/magnet/mangoes/-/blob/master/notebooks/BERT%20for%20Co-reference%20Resolution%20-%20Ontonotes.ipynb)

Use [this page](https://cemantix.org/conll/2012/data.html) to get OntoNotes in CoNLL format. **Open it in Chrome, not Firefox.**

In [1]:
import os
import glob
import time
from mangoes.modeling import BERTForCoreferenceResolution, MangoesCoreferenceDataset
from transformers import BertTokenizerFast

In [15]:
def normalize_word(word, language):
    """Normalize a raw token read from the word column of a CoNLL file.

    Args:
        word: the token text.
        language: dataset language. For "arabic", everything from the first
            "#" onwards is dropped from the token.

    Returns:
        The normalized token. The escaped tokens "/.", "/?" and "/-" are
        returned without their leading slash; any other token is returned
        unchanged (after the Arabic trimming, if it applied).
    """
    if language == "arabic":
        # partition() leaves the token intact when it contains no "#".
        # Slicing with str.find() chopped the last character in that case,
        # because find() returns -1 when the separator is missing.
        word = word.partition("#")[0]
    if word in ("/.", "/?", "/-"):
        return word[1:]
    return word
    

def _active_cluster_id(open_clusters):
    """Return the label for a token given the currently open cluster ids.

    A single open cluster yields its id as an int; any other number of open
    clusters yields a tuple of ints (in opening order).
    """
    if len(open_clusters) == 1:
        return int(open_clusters[0])
    return tuple(int(item) for item in open_clusters)


def parse_document(path, language="english"):
    """
    Parses a single data file.
    Returns the data from whatever documents are in the file.

    Args:
        path: path of the CoNLL file to read (decoded as UTF-8).
        language: passed on to ``normalize_word`` for every token.

    returns:
        words: List (documents) of lists (sentences) of lists of strings (words).
        cluster_ids: Same nesting as ``words``. Each token gets -1 when no
            cluster is open, an int when exactly one cluster is open, and a
            tuple of ints otherwise.
        speaker_ids: Same nesting as ``words``. Each entry is the raw string
            found in column index 9 of the token's line.
        doc_keys: List of document keys, one per document, each suffixed with
            "_<index of the document inside this file>".

    Raises:
        AssertionError: if a document ends in the middle of a sentence, or the
            number of "#begin document" and "#end document" lines differ.
        ValueError: if a cluster is closed that is not currently open.
    """
    doc_keys = []
    doc_sents = []
    doc_cluster_ids = []
    doc_speaker_ids = []
    sentences = []
    sentence_cluster_ids = []
    sentence_speaker_ids = []
    cur_sentence_words = []
    cur_sentence_cluster_ids = []
    cur_sentence_speaker_ids = []
    # Ids (as strings) of the clusters whose span is open at the current token.
    current_clusters = []
    # Explicit encoding: the platform default is not UTF-8 everywhere.
    with open(path, "r", encoding="utf-8") as input_file:
        for line in input_file:
            if line.startswith("#begin document"):
                # Third field looks like "(<key>);" -> strip ";" then the parentheses.
                doc_key = line.split()[2][:-1]
                doc_keys.append(doc_key[1:-1])
            elif line.startswith("#end document"):
                assert len(sentences) == len(sentence_cluster_ids) == len(sentence_speaker_ids)
                assert cur_sentence_words == []
                doc_sents.append(sentences)
                doc_cluster_ids.append(sentence_cluster_ids)
                doc_speaker_ids.append(sentence_speaker_ids)
                sentences = []
                sentence_cluster_ids = []
                sentence_speaker_ids = []
            else:
                data = line.split()
                if not data:
                    # A blank line closes the current sentence.
                    sentences.append(cur_sentence_words)
                    sentence_cluster_ids.append(cur_sentence_cluster_ids)
                    sentence_speaker_ids.append(cur_sentence_speaker_ids)
                    cur_sentence_words = []
                    cur_sentence_cluster_ids = []
                    cur_sentence_speaker_ids = []
                    continue

                cur_sentence_words.append(normalize_word(data[3], language))
                cur_sentence_speaker_ids.append(data[9])
                raw_cluster_id = data[-1]
                if raw_cluster_id == "-":
                    # No bracket on this token: it inherits whatever is open.
                    cluster_id = _active_cluster_id(current_clusters) if current_clusters else -1
                else:
                    parts = raw_cluster_id.split("|")
                    # Open first, label the token, then close, so a token that
                    # both opens and closes a span still carries that id.
                    for part in parts:
                        if "(" in part:
                            current_clusters.append(part[1:-1] if ")" in part else part[1:])
                    cluster_id = _active_cluster_id(current_clusters)
                    for part in parts:
                        if ")" in part:
                            current_clusters.remove(part[1:-1] if "(" in part else part[:-1])
                cur_sentence_cluster_ids.append(cluster_id)

    assert len(doc_sents) == len(doc_keys)
    # Disambiguate the parts of one file by their position in that file.
    doc_keys = [f"{key}_{i}" for i, key in enumerate(doc_keys)]
    return doc_sents, doc_cluster_ids, doc_speaker_ids, doc_keys

    
def parse_dataset(path):
    """
    Parses a directory of Ontonotes data files (train, dev, or test)

    Args:
        path: glob pattern matching the data files to read. The genre of a
            file is taken from the fourth-from-last component of its path.

    returns:
        words: List (documents) of lists (sentences) of lists of strings (words).
        cluster_ids: Same nesting as ``words``; values as produced by ``parse_document``.
        speaker_ids: Same nesting as ``words``. Speakers are mapped to ints
            per document, numbered in sorted order of the speaker strings.
        genres: List with one genre string per document.
        doc_keys: List of document keys.
        gen_to_id: Dict mapping each genre string to an int, numbered in
            sorted order of the genre strings.
    """
    dataset_sents = []
    dataset_clusters = []
    dataset_speakers = []
    dataset_genres = []
    dataset_doc_keys = []
    # glob yields matches in arbitrary order; sort so document order is reproducible.
    for file_path in sorted(glob.iglob(path)):
        # Split on the OS separator rather than a hard-coded "/".
        genre = os.path.normpath(file_path).split(os.sep)[-4]
        doc_sents, doc_cluster_ids, doc_speaker_ids, doc_keys = parse_document(file_path)
        assert len(doc_sents) == len(doc_cluster_ids) == len(doc_speaker_ids) == len(doc_keys)
        dataset_sents += doc_sents
        dataset_clusters += doc_cluster_ids
        dataset_genres += [genre] * len(doc_sents)
        dataset_doc_keys += doc_keys
        for speakers in doc_speaker_ids:
            # Sorted so the speaker -> id mapping does not depend on set
            # iteration order (which varies between interpreter runs).
            names = sorted({name for sentence in speakers for name in sentence})
            speakers_to_ids = {name: i for i, name in enumerate(names)}
            dataset_speakers.append(
                [[speakers_to_ids[name] for name in sentence] for sentence in speakers]
            )
    gen_to_id = {g: i for i, g in enumerate(sorted(set(dataset_genres)))}
    return dataset_sents, dataset_clusters, dataset_speakers, dataset_genres, dataset_doc_keys, gen_to_id

In [16]:
# Parse every gold CoNLL file of the English training split.
# The genre-to-id mapping (last return value) is not needed here.
(
    train_sents,
    train_clusters,
    train_speakers,
    train_genres,
    train_doc_keys,
    _,
) = parse_dataset(
    "data/raw/ontonotes/conll-2012/v5/data/train/data/english/annotations/*/*/*/*gold_conll"
)

In [27]:
# Record a document index once for each of its sentences that carries more
# than one distinct cluster label; duplicates are collapsed when counting.
valid_docids = []
for docid, doc_clusters in enumerate(train_clusters):
    for sentence_clusters in doc_clusters:
        if len(set(sentence_clusters)) > 1:
            valid_docids.append(docid)

print(len(train_clusters), len(set(valid_docids)))

11401 2775


# Scratchpad

In [11]:
documents = [['They', 'also', 'pay', 'out', 'salary', 'specially', 'providing', 'for', 'a', 'large', 'gang', 'of', 'female', 'duty', 'officers', ',', 'who', 'watch', 'the', 'screens', 'all', 'day', 'monitoring', 'intersections', ',', 'and', 'not', 'a', 'few', 'minutes', 'go', 'by', 'without', 'a', 'negative', 'report', '.'], ['That', 'female', 'voice', 'is', 'pure', 'misery', '.'], ['I', 'am', 'No.', 'xx', 'duty', 'officer', ',', 'according', 'to', 'the', 'screen', 'at', 'the', 'moment', ',', 'such', 'and', 'such', 'an', 'intersection', 'is', 'badly', 'backed', 'up', 'with', 'traffic', '.'], ['Please', ',', 'drivers', 'out', 'there', 'who', 'are', 'able', 'to', 'do', 'so', ',', 'make', 'an', 'attempt', 'to', 'bypass', '.'], ['It', "'s", 'bad', 'enough', 'as', 'it', 'is', 'churning', 'out', 'all', 'these', 'negative', 'reports', ',', 'but', 'then', 'commanding', 'others', 'to', 'listen', 'to', 'your', 'orders', ',', 'are', "n't", 'you', 'being', 'even', 'more', 'reactionary', '?'], ['And', 'even', 'that', "'s", 'not', 'enough', ','], ['every', 'day', 'they', 'work', 'on', 'pedestrians', 'to', 'ring', 'in', 'and', 'report', ',', 'such', 'a', 'place', 'is', 'blocked', 'up', 'again', ';', 'and', 'then', 'they', 'interrogate', 'them', ',', 'what', "'s", 'the', 'reason', 'for', 'the', 'jam', ',', 'how', 'long', 'has', 'it', 'been', 'jammed', 'for', ',', 'has', 'there', 'been', 'an', 'accident', ',', 'are', 'there', 'any', 'casualties', ',', 'are', 'there', 'any', '......', '.'], ['All', 'in', 'all', ',', 'not', 'a', 'single', 'nice', 'thing', 'to', 'say', '.'], ['Just', 'in', 'the', 'end', 'insincerely', 'coming', 'up', 'with', 'a', 'couple', 'of', 'words', 'of', 'warning', ',', 'drive', 'well', '.'], ['Is', "n't", 'that', 'crap', '?'], ['A', 'perfectly', 'good', 'mood', 'early', 'in', 'the', 'morning', 'totally', 'ruined', 'by', 'you', '.'], ['Misled', 'by', 'you', 'into', 'this', 'state', ',', 'how', 'could', 'I', 'not', 'get', 'into', 'an', 'accident', '?'], ['Even', 
'more', 'impressive', ',', 'they', 'use', 'a', 'few', 'bad', 'boys', ',', 'one', 'called', 'Dong', 'Sheng', ',', 'one', 'called', 'Zhi', 'Yong', '.'], ['They', 'are', 'supposedly', 'teachers', '.'], ['Every', 'day', 'for', 'hours', 'on', 'air', ',', 'straight', 'out', 'insulting', 'and', 'abusing', '.'], ['If', 'you', 'are', 'abusing', 'ordinary', 'people', ',', 'then', 'that', "'s", 'one', 'thing', ',', 'but', 'it', "'s", 'always', 'abusing', 'the', 'authorities', ',', 'some', 'official', 'is', "n't", 'sympathetic', ',', 'does', "n't", 'accomplish', 'anything', ',', 'does', "n't", 'do', 'his', 'job', ',', 'basically', 'it', "'s", 'serious', 'defamation', '.']]
clusters = [[-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1], [-1, -1, -1, -1, -1, -1, -1], [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1], [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1], [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1], [-1, -1, -1, -1, -1, -1, -1], [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1], [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1], [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1], [-1, -1, -1, -1, -1], [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1], [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1], [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1], [-1, -1, -1, -1, -1], [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1], [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1]]

In [14]:
# Per-token cluster labels for one sentence: -1 = no cluster, int = one cluster,
# tuple = several open clusters.
# NOTE(review): the 9th entry originally read `17z`, which is not valid Python;
# the stray `z` was dropped as the minimal repair. Confirm the intended value.
cluster = [15, (15, 17), (15, 17), (15, 17), (15, 17), (15, 17), (15, 17, 17), (15, 17, 17), (15, 17, 17, 17), (15, 17, 17), (15, 17, 17), (15, 17), (15, 17, 17), (15, 17), (15, 17), (15, 17), (15, 17), (15, 17), (15, 17), (15, 17), (15, 17), (15, 17), (15, 17), (15, 17), (15, 17), (15, 17), (15, 17), -1, -1, 15, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, -1, -1, -1, 1, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, (14, 1), 14, 14, 14, 14, 14, 14, 14, (14, 1), 14, 14, 14, 14, 14, (14, 14), 14, 14, 14, 14, 14, 14, (14, 14), 14, 14, -1, -1, -1, -1, -1, -1, 14, 14, -1, -1]

In [17]:
# Two independent, initially empty work lists for the span experiment.
stack, open_spans = [], []


In [None]:
# NOTE(review): unfinished scratch cell, kept exactly as written.
# - It unpacks every element of `cluster` into a (span, c) pair; presumably
#   `cluster` is meant to hold (span, cluster_id) pairs here -- TODO confirm.
# - `op` is not defined anywhere in the visible part of the notebook.
# - The innermost `if` below has no body yet, so the cell does not parse as-is.
for span, c in cluster:
    
    # -1 entries are skipped entirely.
    if c == -1:
        continue
    
    if type(c) is int:
        # There is only one annotation
        if c in op.keys():
            # We've seen this cluster before
            # Check if this is an ongoing annotation or a new one
            # (open_spans holds (cluster, span) tuples, so x[0] is the cluster)
            if c in [x[0] for x in open_spans]:
                
        else:
            # Its a new cluster
            op[c] = [(span,)]
            open_spans.append((c, span))

In [24]:
# Two small id sets used to experiment with set differences.
cluster, expected = {15, 17}, {12, 17}

In [26]:
# Ids present in both sets, then the ids unique to each side.
ins = expected & cluster
(expected - ins, cluster - ins)

({12}, {15})

In [28]:
# Print each expected id that is not shared with `cluster`.
for val in expected.difference(ins):
    print(val)

12


In [46]:
from typing import List
from copy import deepcopy
def subtraction(a: List[int], b: List[int]) -> List[int]:
    """Return the elements of `a` that are left after cancelling against `b`.

    Each element of `b` can cancel at most one equal element of `a`, so
    duplicates are respected. Neither input is modified. When `b` is empty
    a deep copy of `a` is returned.
    """
    if not b:
        return deepcopy(a)

    # Work on a copy so the caller's `b` is left untouched.
    remaining = deepcopy(b)
    leftover = []
    for item in a:
        if item not in remaining:
            leftover.append(item)
        else:
            # Use up the first matching entry of `b`.
            remaining.remove(item)
        # Trace of the running state after every element of `a`.
        print(leftover, item, remaining)

    return leftover

In [51]:
# Example with duplicates on both sides and one entry of `b` that never matches.
a = [1, 2, 2, 3, 5, 5, 5]
b = [2, 3, 4, 5, 5, 5, "potato"]

subtraction(a, b)

[1] 1 [2, 3, 4, 5, 5, 5, 'potato']
[1] 2 [3, 4, 5, 5, 5, 'potato']
[1, 2] 2 [3, 4, 5, 5, 5, 'potato']
[1, 2] 3 [4, 5, 5, 5, 'potato']
[1, 2] 5 [4, 5, 5, 'potato']
[1, 2] 5 [4, 5, 'potato']
[1, 2] 5 [4, 'potato']


[1, 2]

In [54]:
import re
def find_all(pattern: str, data: str):
    """Return the start offsets of every non-overlapping match of the
    regular expression `pattern` in the string `data`, as a list."""
    return [hit.start() for hit in re.finditer(pattern, data)]

In [63]:
# Both patterns match "an opening paren followed by letters" or "a closing
# paren"; `ppattern` additionally wraps the opening alternative in a group.
ppattern = r'(\([a-zA-Z]*)|\)'
pattern = r"\([a-zA-Z]*|\)"
# One sample bracket fragment per line.
strings = (
    '''(POOP
*
(A(B(C)(D))
)
*)'''
).splitlines()

In [64]:
# Display the list of sample fragments.
strings

['(POOP', '*', '(A(B(C)(D))', ')', '*)']

In [65]:
# For each sample fragment, print it followed by every token the regex finds.
for string in strings:
    print(string, end="\n\n")
    found = re.findall(pattern, string)
    for match in found:
        print(match)
    print('---------')

(POOP

(POOP
---------
*

---------
(A(B(C)(D))

(A
(B
(C
)
(D
)
)
---------
)

)
---------
*)

)
---------


In [68]:
# Inspect the type of `match`, the last value bound by the findall loop above.
match.__class__

str