In [1]:
import glob
import os
import os.path
import spacy
import pandas as pd
from tqdm.auto import tqdm
tqdm.pandas(desc="progress")

# 00 - Setup
- Define Path and Install spaCy Model

In [2]:
path_gold_data = "data/gold/"

In [3]:
#!python -m spacy download de_core_news_sm

# previous code used within google colab, following code for usage in different environment

import sys
!{sys.executable} -m spacy download de_core_news_sm

[33mDEPRECATION: Configuring installation scheme with distutils config files is deprecated and will no longer work in the near future. If you are using a Homebrew or Linuxbrew Python, please see discussion at https://github.com/Homebrew/homebrew-core/issues/76621[0m[33m
[0mCollecting de-core-news-sm==3.2.0
  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.2.0/de_core_news_sm-3.2.0-py3-none-any.whl (19.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.1/19.1 MB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0mm
[33mDEPRECATION: Configuring installation scheme with distutils config files is deprecated and will no longer work in the near future. If you are using a Homebrew or Linuxbrew Python, please see discussion at https://github.com/Homebrew/homebrew-core/issues/76621[0m[33m
[0m[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('de_core_news_sm')


# 01 - Extract Minimal Entities from Gold Data
- Preexisting Code, minor extensions added to save data in files for fine-tuning the baseline model and to save minimal extracted entities for coverage check of generated lexicon.

In [4]:

"""
Read list of files that are ready for processing.
Returns list with file names.
"""
def read_file_list(fname):
    files = []
    with open(fname, "r") as inf:
        for line in inf:
            files.append(line.strip())
    return files


"""
Read tsv file and return as list of dictionaries.
"""
def read_file(infile):
    data = []; dic = { 'sent': [], 'annot': [], 'tid': [], 'offset': [], 'sid': 0 }
    with open(infile, "r") as inf:
        for line in inf:
            #print("LINE", line)
            if line.startswith('#'):
                continue
            if line.isspace():
                if len(dic['sent']) > 0:
                    data.append(dic)
                dic = { 'sent': [], 'annot': [], 'sid': 0, 'tid': [], 'offset': [] }
            else:
                toks = line.strip().split("\t")

                dic['sid'] = toks[0].split('-')[0]
                dic['tid'].append(toks[0].split('-')[1])
                dic['offset'].append(toks[1])
                dic['sent'].append(toks[2])
                dic['annot'].append(toks[3])

    return data


# get a set of ids and a spacy doc and find the highest node
# in the tree amongst the ids
def find_local_root(ids, l, doc):
    do_print = 0 # turn on/off debugging messages      
    if do_print:
        for t in doc:
            print(t.text, t.dep_, t.head)
    if len(ids) == 1:
        if do_print:
            print("RETURN", ids[0])
        return ids[0]
    elif len(ids) < 1:
        return None
    local_root = ids[0]
    if do_print:
        print("START", l, local_root)
    for i in ids:
        for j in ids:
            if do_print:
                print("\t=>", i, j, local_root, doc[i].text, doc[j].text)
            if i == j:
                continue
            if doc[local_root] in doc[j].children:
                if do_print:
                    print("\tnew local root", local_root, doc[j].text)
                local_root = j
        if do_print:
            print("RETURN", local_root)
        return local_root



"""
Take a dictionary and label information 
and reduce the span annotations to the
local root (head) of the span.
Return the dic with minimal span annotations.
"""
def get_minimal_annotation(dic, labeldic, doc):
    dic['min'] = ['_' for n in dic['new']]
    for l in labeldic:
        local_root = find_local_root(labeldic[l], l, doc)
        if local_root != None:
            dic['min'][local_root] = dic['new'][local_root]
    return dic


def check_for_redundant_spans(ldic, d, doc):
    if len(ldic) > 0 and doc[ldic[-1]].pos_ in ['PUNCT', 'PREP', 'CCONJ', 'ADP']:
        d['new'][ldic[-1]] = '_'
        ldic.remove(ldic[-1])

    if len(ldic) > 0 and doc[ldic[-1]].pos_ in ['PUNCT', 'PREP', 'CCONJ', 'ADP']:
        ldic, d = check_for_redundant_spans(ldic, d, doc)

    return ldic, d



"""
Check for common errors:
- missing annotations for ich/mir/mich
- missed determiners
Print annotation report with potential errors.
"""
def get_minimal_annots(data, nlp):  
    dfs = []
    dfs_excl_ents = []

    label_id = 1
    for d in data:
        labeldic = {}; idxdic = {}
        for i in range(len(d['tid'])):
            if d['annot'][i] != '_':
                if '|' in d['annot'][i]:
                    labels = d['annot'][i].split('|')
                else:
                    labels = [d['annot'][i]]
                for l in labels:
                    # some labels don't have a label id
                    # make sure that they are uniq 
                    if '[' not in l:
                        l += '[' + str(label_id) + ']'
                        label_id += 1

                    if l not in labeldic:
                        labeldic[l] = [i]
                    else:
                        labeldic[l].append(i)

                    if i not in idxdic:
                        idxdic[int(i)] = [l]
                    else:
                        idxdic[int(i)].append(l)

        # now compare the selected labels
        # start with the first label and check if the tokens
        # are included in another label set; if not -> assign labels
        # initialise a new annotation layer with default _ labels
        d['new'] = ['_' for annot in d['annot']]

        for i in range(len(d['sent'])):
            if i in idxdic:
                if len(idxdic[i]) == 1:
                    d['new'][i] = d['annot'][i]
                    continue
                else:
                    labels = idxdic[i]
                    # find the label with the shortest span => min_label
                    min_label = labels[0]
                    for idx in range(1, len(labels)):
                        if len(labeldic[labels[idx]]) < len(labeldic[min_label]):
                            min_label = labels[idx]
                    # we keep the label for the shortest span only
                    # and remove those ids from the other label dic
                    for l_id in labeldic[min_label]:
                        d['new'][l_id] = min_label
                        for l in labels:
                            if l != min_label and l_id in labeldic[l]:  
                                labeldic[l].remove(l_id)

        # finally, check the label_dic for spans that only have punctuation or conjunctions
        # and remove those from the label set
        doc = nlp(" ".join(d['sent']))
        for l in labeldic:
            go = 0
            for idx in labeldic[l]:
                # this set of words is a heuristic and probably not complete...
                if d['sent'][idx] not in [',', '.', ';', '!', '?', '-', 'und', 'oder', 'aber']:
                    go += 1
            if go == 0:
                d['new'] = ['_' if new_l == l else new_l for new_l in d['new'] ]
            # we also want to check the last element of the label list:
            # if it's a conjunction or punctuation, then we want to remove it
            labeldic[l], d = check_for_redundant_spans(labeldic[l], d, doc)

        # for the subtree, we first need to find the highest element in the tree
        # and then we can extract its subtree
        d = get_minimal_annotation(d, labeldic, doc)
        

        #########################################
        # implementation below is added or changed to preexisting implementation
        #########################################



        df = pd.DataFrame(d)[['sid', 'sent', 'min']]
        df.columns = ['sentence_id', 'token', 'label']
        df_tokens = []
        df_labels = []
        df_ids = []
        
        excl_ents = []
        excl_ents_l = []


        # structure gold data (with minimal extracted entities) as paragraphs
        test = df.groupby(['sentence_id'])
        for name,group in test:
            df_tokens.append(group.token.values.tolist())
            df_labels.append(group.label.values.tolist())
            df_ids.append(name)

            # collect all minimal extracted entities from gold data for coverage check of generated lexicon 
            for i in range(0, len(group.token.values.tolist())):
              if group.label.values.tolist()[i] != '_':
                excl_ents.append(group.token.values.tolist()[i])
                excl_ents_l.append(group.label.values.tolist()[i])



        df = pd.DataFrame({'sentence_id' : df_ids, 'token' : df_tokens, 'label' : df_labels})    
        dfs.append(df)
        df_excl_ents = pd.DataFrame({'entity' : excl_ents, 'label' : excl_ents_l })
        dfs_excl_ents.append(df_excl_ents)
    return dfs, dfs_excl_ents



all_gold_data_files = glob.glob(os.path.join(path_gold_data, '*.csv'))
nlp = spacy.load('de_core_news_sm')

count = 0
data = pd.DataFrame()
ents = pd.DataFrame()
for infile in all_gold_data_files:

    print("#FILENAME =", infile)
    f = read_file(infile)
    
    dfs, dfs_excl_ents = get_minimal_annots(f, nlp)

    # combine all gold data (with minimal extracted entities) paragraphs to one file
    for df in dfs:
        data = pd.concat([data, df])
    count += 1

    # combine all minimal extracted entities for coverage check of generated lexicon
    for df_excl in dfs_excl_ents:
      ents = pd.concat([ents, df_excl])


data = data.reset_index(drop=True)
ents = ents.reset_index(drop=True)
data['sentence_id'] = data.index

print('processed ' + str(count) + ' files')
data.to_csv(path_gold_data + 'processed/min_annotations_gold.csv', index=False)


# some problems in the extraction:
# Frau Staatssekretärin
# Herr Minister Müller
# uns als X
# wir als X
# ich als X
# ...

#FILENAME = data/gold/20003_Zusatzpunkt_2_CDU_CSU_Staffler_ID20306400_18.11.2021.csv
#FILENAME = data/gold/19192_Tagesordnungspunkt_13_CDU_CSU_Kramp-Karrenbauer_ID1919206400_19.11.2020.csv
#FILENAME = data/gold/19058_Tagesordnungspunkt_14_CDU_CSU_Steiniger_ID195810900_18.10.2018.csv
#FILENAME = data/gold/19232_Tagesordnungspunkt_9_LINKE_Pflueger_ID1923212000_09.06.2021.csv
#FILENAME = data/gold/19232_Tagesordnungspunkt_9_CDU_CSU_Lehmann_ID1923211700_09.06.2021.csv
#FILENAME = data/gold/19124_Tagesordnungspunkt_3_FDP_Martens_ID1912401000_07.11.2019.csv
#FILENAME = data/gold/19058_Tagesordnungspunkt_14_GRUENE_Brantner_ID195811200_18.10.2018.csv
#FILENAME = data/gold/19071_Tagesordnungspunkt_4_AfD_Chrupalla_ID197101300_13.12.2018.csv
#FILENAME = data/gold/19092_Tagesordnungspunkt_5_CDU_CSU_Otte_ID199204100_04.04.2019.csv
#FILENAME = data/gold/19124_Tagesordnungspunkt_3_AfD_Seitz_ID1912400200_07.11.2019.csv
#FILENAME = data/gold/19196_Tagesordnungspunkt_25_SPD_Brunner_ID1919603600_27.11.20

# 02 - Convert Labels - Minimal Entities for Coverage Check of Lexicon
- The original labels from the manually labeled gold data differ to those used for the lexicon generation. To align the labels, I convert the original labels to the respective labels used in the lexicon/generated training data.

In [None]:
import ast

class_mapping = {0: '_',
 1: 'epPol',
 3: 'epWirt',
 5: 'epFinanz',
 7: 'epMedia',
 9: 'epSci',
 11: 'epRel',
 13: 'epKult',
 15: 'epMil',
 17: 'epNgo',
 19: 'epMov',
 21: 'epOwn',
 23: 'eoPol',
 25: 'eoWirt',
 27: 'eoFinanz',
 29: 'eoMedia',
 31: 'eoSci',
 33: 'eoRel',
 35: 'B-EO_KULT',
 37: 'eoMil',
 39: 'eoNgo',
 41: 'eoMov',
 43: 'pNat',
 45: 'pEth',
 47: 'pFunk',
 49: 'pAge',
 51: 'pSoz',
 53: 'pGen',
 55: 'GPE',
}

def replace_labels(row):
    row = row.replace('[', '').replace(']', '')
    label = ''.join([i for i in row if not i.isdigit()])
    label = label.replace('Pfunk', 'pFunk').replace('pfunk', 'pFunk').replace('EGpol', 'eoPol').replace('EPown', 'epOwn').replace('page', 'pAge')
    label = list(class_mapping.keys())[list(class_mapping.values()).index(label)]
    return label


ents['label'] = ents['label'].apply(replace_labels)
ents = ents.drop_duplicates()
ents = ents.sort_values(['label', 'entity'], ascending = (True, True))
ents

Unnamed: 0,entity,label
3473,16,1
3278,265,1
3265,287,1
3266,314,1
3280,330,1
...,...,...
580,unser,55
2449,wir,55
4219,Äthiopien,55
18,Österreich,55


In [None]:
ents.to_csv(path_gold_data + 'processed/lex_min_annotations_gold.csv', index=False)

# 03 - Convert Labels - Gold-Training Data
- The original labels from the manually labeled gold data differ to those used for the automatically generated training data. To align the labels, I convert the original labels to the respective labels used in the lexicon/generated training data.

In [None]:
data_gold = pd.read_csv(path_gold_data + 'processed/min_annotations_gold.csv')
data_gold

Unnamed: 0,sentence_id,token,label
0,0,"['Sehr', 'geehrter', 'Herr', 'Präsident', '!',...","['_', '_', 'epPol[1]', '_', '_', '_', '_', '_'..."
1,1,"['Es', 'gibt', 'nach', 'der', 'derzeitigen', '...","['_', '_', '_', '_', '_', '_', '_', '_', '_', ..."
2,2,"['Obwohl', 'Schengen', 'und', 'Dublin', 'von',...","['_', '_', '_', '_', '_', '_', '_', '_', '_', ..."
3,3,"['Erstens', 'wissen', 'wir', 'alle', ',', 'das...","['_', '_', '_', '_', '_', '_', '_', '_', '_', ..."
4,4,"['Der', 'Sachverständige', 'Gerald', 'Knaus', ...","['_', 'epSci[9]', '_', '_', '_', '_', '_', '_'..."
...,...,...,...
4791,4791,"['Sie', 'arbeiten', 'hier', ',', 'sie', 'zahle...","['_', '_', '_', '_', '_', '_', '_', '_', '_', ..."
4792,4792,"['Sie', 'tragen', 'dazu', 'bei', ',', 'dass', ...","['_', '_', '_', '_', '_', '_', '_', '_', '_', ..."
4793,4793,"['Und', 'Sie', 'von', 'der', 'AfD', 'wollen', ...","['_', 'eoPol[37]', '_', '_', '_', '_', '_', 'p..."
4794,4794,"['Für', 'uns', 'gilt', ':', 'Wer', 'in', 'unse...","['_', '_', '_', '_', '_', '_', '_', '_', '_', ..."


In [None]:
import ast

class_mapping = {0: '_',
 1: 'epPol',
 3: 'epWirt',
 5: 'epFinanz',
 7: 'epMedia',
 9: 'epSci',
 11: 'epRel',
 13: 'epKult',
 15: 'epMil',
 17: 'epNgo',
 19: 'epMov',
 21: 'epOwn',
 23: 'eoPol',
 25: 'eoWirt',
 27: 'eoFinanz',
 29: 'eoMedia',
 31: 'eoSci',
 33: 'eoRel',
 35: 'B-EO_KULT',
 37: 'eoMil',
 39: 'eoNgo',
 41: 'eoMov',
 43: 'pNat',
 45: 'pEth',
 47: 'pFunk',
 49: 'pAge',
 51: 'pSoz',
 53: 'pGen',
 55: 'GPE',
}

def replace_labels(row):
  new_list = []
  label_list = ast.literal_eval(row)
    
  for label in label_list:
    
    label = label.replace('[', '').replace(']', '')
    label = ''.join([i for i in label if not i.isdigit()])
    label = label.replace('Pfunk', 'pFunk').replace('pfunk', 'pFunk').replace('EGpol', 'eoPol').replace('EPown', 'epOwn').replace('page', 'pAge')
    label = list(class_mapping.keys())[list(class_mapping.values()).index(label)]
    new_list.append(label)

  return new_list

data_gold['label'] = data_gold['label'].progress_apply(replace_labels)
print(data_gold)

progress:   0%|          | 0/4796 [00:00<?, ?it/s]

      sentence_id                                              token  \
0               0  ['Sehr', 'geehrter', 'Herr', 'Präsident', '!',...   
1               1  ['Es', 'gibt', 'nach', 'der', 'derzeitigen', '...   
2               2  ['Obwohl', 'Schengen', 'und', 'Dublin', 'von',...   
3               3  ['Erstens', 'wissen', 'wir', 'alle', ',', 'das...   
4               4  ['Der', 'Sachverständige', 'Gerald', 'Knaus', ...   
...           ...                                                ...   
4791         4791  ['Sie', 'arbeiten', 'hier', ',', 'sie', 'zahle...   
4792         4792  ['Sie', 'tragen', 'dazu', 'bei', ',', 'dass', ...   
4793         4793  ['Und', 'Sie', 'von', 'der', 'AfD', 'wollen', ...   
4794         4794  ['Für', 'uns', 'gilt', ':', 'Wer', 'in', 'unse...   
4795         4795  ['Sozialdumping', 'auf', 'dem', 'Rücken', 'von...   

                                                  label  
0     [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 23, 0,...  
1     [0, 0, 0, 0, 

In [None]:
data_gold.to_csv(path_gold_data + 'processed/min_annotations_gold_num.csv', index=False)

# 04 - Compute Class Distribution
- Count the number of minimal extracted entities per class in the gold data used to fine-tune the baseline model.

In [None]:
class_count = {0: 0,
 1: 0,
 3: 0,
 5: 0,
 7: 0,
 9: 0,
 11: 0,
 13: 0,
 15: 0,
 17: 0,
 19: 0,
 21: 0,
 23: 0,
 25: 0,
 27: 0,
 29: 0,
 31: 0,
 33: 0,
 35: 0,
 37: 0,
 39: 0,
 41: 0,
 43: 0,
 45: 0,
 47: 0,
 49: 0,
 51: 0,
 53: 0,
 55: 0,
}


list_labels = data_gold.label.to_list()
for l in list_labels:
  for lab in l:
    class_count[lab] +=1
print(class_count)
count_labels = pd.DataFrame([class_count]).T
count_labels = count_labels.rename(columns={0: 'count'}).sort_values(by=['count'])
count_labels['label'] = count_labels.index
count_labels


{0: 199172, 1: 1623, 3: 14, 5: 9, 7: 6, 9: 48, 11: 1, 13: 21, 15: 6, 17: 5, 19: 15, 21: 1866, 23: 2378, 25: 123, 27: 154, 29: 33, 31: 21, 33: 3, 35: 0, 37: 160, 39: 44, 41: 12, 43: 232, 45: 160, 47: 1422, 49: 465, 51: 262, 53: 224, 55: 1235}


Unnamed: 0,count,label
35,0,35
11,1,11
33,3,33
17,5,17
7,6,7
15,6,15
5,9,5
41,12,41
3,14,3
19,15,19


In [None]:
count_labels.to_csv(path_gold_data + 'processed/count_labels_gold_data.csv', index=False)