In [1]:
import pandas as pd
import spacy
import glob
import os
import re
from timeit import default_timer as timer
from datetime import timedelta
from tqdm.auto import tqdm
import itertools
import json

# 00 - Install spaCy Model


In [14]:
#!python -m spacy download de_core_news_sm

# The commented-out command above was used in Google Colab; the command below installs the model with the interpreter of the running kernel (for use in other environments).

import sys
!{sys.executable} -m spacy download de_core_news_sm

[33mDEPRECATION: Configuring installation scheme with distutils config files is deprecated and will no longer work in the near future. If you are using a Homebrew or Linuxbrew Python, please see discussion at https://github.com/Homebrew/homebrew-core/issues/76621[0m[33m
[0mCollecting de-core-news-sm==3.2.0
  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.2.0/de_core_news_sm-3.2.0-py3-none-any.whl (19.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.1/19.1 MB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[33mDEPRECATION: Configuring installation scheme with distutils config files is deprecated and will no longer work in the near future. If you are using a Homebrew or Linuxbrew Python, please see discussion at https://github.com/Homebrew/homebrew-core/issues/76621[0m[33m
[0m[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('de_core_news_sm')


# 00 - Define Paths

In [2]:
# All paths below end with '/' because the notebook builds file names by
# plain string concatenation (path + 'file.csv').

# directory containing the raw data files
# (the preprocessing cell collects the '*.csv' files found here)
path_data = 'data/training/unlabeled_data/raw/tsv/'

# directory for all processed files
# (processed_data.csv, data_tokenized.csv and the lemmatized/ sub-directory)
path_processed_data = 'data/training/unlabeled_data/processed/'

# directory containing gold data; raw files with the same file name as a
# gold file are excluded from the unlabeled data
path_gold_data = 'data/gold/'

# directory containing the seed entity lists from knowledge base
# (one '*.csv' file per class; the file name is used as the class label)
path_classes = 'data/training/class_lists/'

# directory for all extraction lists (original/lemmatized) (unlabeled/labeled)
path_found = 'data/training/unlabeled_data/processed/extract/'

# directory for annotated training data
path_training_data = 'data/training/training_data/'

# directory for lexicon data (seed_lex.csv, lex_data.csv)
path_lexicon = 'data/lexicon/'

# 00 - Tool Methods

In [3]:
def cleanup(row):
    """Remove brackets and quotation marks from ``row`` and expand abbreviations.

    The substitutions are applied one after another, in the order listed.
    Returns the cleaned string.
    """
    substitutions = (
        ('(', ''),
        (')', ''),
        ('\"', ''),
        ('“', ''),
        ('„', ''),
        ('usw.', 'und so weiter'),
        ('sog.', 'sogenanntes'),
        ('usf.', 'und so fort'),
        ('z.B.', 'zum Beispiel'),
    )
    cleaned = row
    for old, new in substitutions:
        cleaned = cleaned.replace(old, new)
    return cleaned
                                                                                                                                                        
def split(row):
    """Return the whitespace-separated tokens of ``row`` as a list."""
    tokens = row.split()
    return tokens

def split_and_explode(input):
    """Return one token per row from column ``'0'`` of ``input``.

    Every cell of the column is split on whitespace with ``split``; the
    resulting lists are exploded, so each token keeps the index label of the
    row it came from.
    """
    token_lists = input['0'].apply(split)
    return token_lists.explode()

def lemmatize(row):
    """Process ``row`` with the global ``nlp`` object and join the lemmas.

    Returns the ``lemma_`` of every token, separated by single spaces.
    """
    lemmas = [token.lemma_ for token in nlp(row)]
    return " ".join(lemmas)

def tokenize(row):
    """Process ``row`` with the global ``nlp`` object and join the token texts.

    Returns the ``text`` of every token, separated by single spaces.
    """
    pieces = []
    for token in nlp(row):
        pieces.append(token.text)
    return " ".join(pieces)

# labels entity according to IOB2 format
def labeler(row, key):
    """Build the numeric label sequence for one extracted entity.

    ``row`` is split on whitespace; the first token receives ``key`` and every
    following token receives ``key + 1``. An entity without tokens yields an
    empty list.
    """
    labels = []
    for position, _ in enumerate(row.split()):
        if position == 0:
            labels.append(key)
        else:
            labels.append(key + 1)
    return labels

# used to extract seed entities from unlabeled data
def extract(input, group):
    """Find every match of the pattern ``group`` in ``input['token']``.

    ``group`` is wrapped in a single capturing group and passed to
    ``Series.str.extractall``, whose result (one row per match) is returned.
    """
    pattern = '({})'.format(group)
    token_column = input['token']
    return token_column.str.extractall(pattern)

# used to split and label extracted seed entities
def label(input, key):
    """Split extracted entities into tokens and attach numeric labels.

    ``input`` must contain a column named ``'0'`` (a string name) holding the
    matched entity text, plus two further columns: after the split the three
    columns are renamed to ``sentence_id``, ``match`` and ``token``.
    ``key`` is the numeric label given to the first token of each entity;
    following tokens receive ``key + 1`` (see ``labeler``).

    Returns a frame with the columns ``sentence_id``, ``token`` and ``label``,
    one row per entity token.
    """
    # position of the entity column, so the other columns keep their place
    i = input.columns.get_loc('0')
    # list of tokens per extracted entity
    df2 = input['0'].apply(split)
    # rebuild the frame with the token lists in place of the entity text, then one row per token
    spl = pd.concat([input.iloc[:, :i], df2, input.iloc[:, i+1:]], axis=1).explode('0')
    spl.columns = ['sentence_id', 'match', 'token']
    spl = spl[['sentence_id', 'token']]
    # label list per entity, exploded the same way so it has the same index as spl
    input = input['0'].apply(labeler, key=key).explode()
    input = input.rename('label')
    # column-wise concat; both operands carry the same (repeated) index labels in the same order
    spl = pd.concat([spl, input], axis=1)
    spl = spl[['sentence_id', 'token', 'label']]
    return spl

# used to annotate unlabeled training data
def merge_text_label(text, label, on):
    """Left-join the labels of one class onto the token table.

    ``label`` is reduced to one row per (sentence_id, token) pair and merged
    onto ``text`` using the columns in ``on``. If the merge yields more than
    three columns (``text`` already had a ``label`` column, so pandas created
    ``label_x``/``label_y``), the existing label wins and gaps are filled from
    the new one; the result is then reduced to ``sentence_id``, ``token`` and
    ``label``.
    """
    unique_labels = label.drop_duplicates(subset=['sentence_id', 'token'])
    merged = text.merge(unique_labels, on=on, how='left')
    if len(merged.columns) <= 3:
        # first merge: nothing to reconcile yet
        return merged
    merged['label'] = merged['label_x'].fillna(merged['label_y'])
    merged = merged.rename(columns={'sentence_id_x': 'sentence_id'})
    return merged[['sentence_id', 'token', 'label']]


# heuristic approach to correct labels based on most frequently found tokens with incorrect labels, may be manually extended
def correct_labels(input, c):
    """Remove tokens that are known to be labeled incorrectly for class ``c``.

    Parameters
    ----------
    input : pandas.DataFrame
        Labeled entity tokens; must contain a ``token`` column.
    c : str
        Class name (e.g. ``'EO_POL'``). Classes without an entry in the
        table below are returned unchanged.

    Returns
    -------
    pandas.DataFrame
        ``input`` without the rows whose token is listed for ``c``.

    Rows are removed with a boolean mask rather than ``drop(index)``: frames
    produced by ``label`` repeat one index label for all tokens of a match, and
    dropping by index label would delete those unrelated rows as well.
    """
    # manually curated lists; may be extended
    ambiguous_tokens = {
        'EO_POL': ['Deutschland', 'Bundeswehr', 'Bayern'],
        'P_SOZ': ['Arbeit', 'Macht', 'Zivilgesellschaft', 'Leid', 'Vermieter', 'Eigentümer', 'Vermietern',
                  'Kolonialismus', 'Kapitalismus', 'Elite', 'Eliten', 'Dominanz', 'Legitimität'],
        'EO_MEDIA': ['Welt'],
        'P_FUNC': ['Bundeskanzlerin', 'Unternehmer', 'Wissenschaftler', 'Unternehmern', 'Wissenschaftlern'],
        'EO_MOV': ['Reichsbürger'],
        'P_ETH': ['Russen'],
        'P_NAT': ['Berliner', 'Pariser'],
    }
    len_old = len(input)
    blocked = ambiguous_tokens.get(c)
    if blocked is not None:
        input = input[~input['token'].isin(blocked)]
    print('Removed ' + str(len_old - len(input)) + ' ambiguous/incorrect tokens for class ' + c)
    return input

def count_found_entities(input):
    """Count how often each (token, label) pair with a non-zero label occurs.

    Returns a one-column DataFrame of counts whose index holds the
    (token, label) pairs.
    """
    labeled = input.loc[input['label'] != 0, ['token', 'label']]
    pair_counts = labeled.value_counts()
    return pd.DataFrame(pair_counts)

# used to remove incorrect I-labels without correct B-label
def remove_inner_labels(row):
    """Zero out inner labels that do not continue an open entity.

    ``row`` is a JSON-encoded list of numeric labels. Odd values open an
    entity; an even, non-zero value is kept only while an entity is open and
    the value equals the opening label plus one. Every other even value is
    replaced by 0 and closes the open entity.

    Returns the cleaned labels as a list of ints.
    """
    cleaned = []
    open_start = None  # opening label of the current entity, None when outside
    for value in json.loads(row):
        if value == 0:
            cleaned.append(0)
            open_start = None
            continue
        if value % 2 == 1:
            open_start = int(value)
            cleaned.append(open_start)
            continue
        if open_start is not None and int(value) == open_start + 1:
            cleaned.append(int(value))
            continue
        # inner label without matching opening label
        cleaned.append(0)
        open_start = None
    return cleaned



# 01 - Preprocessing Unlabeled Data
- Check for duplicates in raw data files
- Read in all raw data files that are not also contained in the gold data and that do not have duplicates
- Save processed data to file

In [5]:
# Collect raw and gold files, read every '#Text=' paragraph from raw files that
# have no gold file of the same name, and save the paragraphs to processed_data.csv.
print('++++++++++ Collecting files... ++++++++++')

all_data_files = glob.glob(os.path.join(path_data, '*.csv'))
all_gold_data_files = glob.glob(os.path.join(path_gold_data, '*.csv'))
# set copy for constant-time membership tests inside the read loop
gold_file_set = set(all_gold_data_files)


print(str(len(all_data_files)) + ' files found in data path.')
print(str(len(all_gold_data_files)) + ' files found in gold data path.')

print('++++++++++ Checking for duplicate file names in data files... ++++++++++')

# groupby only merges adjacent items, hence the sort
# NOTE(review): duplicates are only counted here; the read loop below does not skip them
count_dups = 0
all_data_files.sort()
for k, g in itertools.groupby(all_data_files, lambda f: os.path.splitext(f)[0]):
    duplicates = list(g)
    if len(duplicates) > 1:
        count_dups += 1
print(str(count_dups) + ' duplicates found in data based on file name.')


count_files = 0
count_sentences = 0
max_length = 0
input = []
read_files = []
seen_files = set()
count_coliding = 0

print('++++++++++ Reading in data files... ++++++++++')
start = timer()

for f in all_data_files:
    # path the file would have if it were part of the gold data
    mock_file_name_gold = path_gold_data + f.replace(path_data, '')

    # only process files not also contained in gold set
    if mock_file_name_gold in gold_file_set:
        count_coliding += 1
        continue
    if f in seen_files:
        continue

    seen_files.add(f)
    read_files.append(f)
    count_files += 1

    # the context manager closes the handle again
    # NOTE(review): no explicit encoding is passed, so the platform default applies — confirm
    with open(f, 'r') as file:
        for line in file:
            # extract the raw text from csv files
            if line.startswith('#Text='):
                line = line.replace('#Text=', '')
                input.append(line)
                count_sentences += 1
                max_length = max(max_length, len(line.split()))

    if count_files % 50 == 0:
        print('Processed ' + str(count_sentences) + ' paragraphs from ' + str(count_files) + ' data files.')


data = pd.DataFrame({'token' : input})
# make sure the target directory exists before writing
os.makedirs(path_processed_data, exist_ok=True)
data.to_csv(path_processed_data+'processed_data.csv', index=False)

end = timer()

print('Read in ' + str(count_sentences) + ' paragraphs from ' + str(count_files) + ' unlabeled data files, excluding ' + str(count_dups) + ' duplicate files and ' + str(count_coliding) + ' files also found in gold data.')
print('Duration: ' + str(timedelta(seconds=end - start)))
print('Saved paragraphs in directory: ' + str(path_processed_data))
print("Max Sentence length = " + str(max_length))

++++++++++ Collecting files... ++++++++++
25427 files found in data path.
268 files found in gold data path.
++++++++++ Checking for duplicate file names in data files... ++++++++++
116 duplicates found in data based on file name.
++++++++++ Reading in data files... ++++++++++
Processed 685 paragraphs from 50 data files.
Processed 1440 paragraphs from 100 data files.
Processed 2273 paragraphs from 150 data files.
Processed 2986 paragraphs from 200 data files.
Processed 3708 paragraphs from 250 data files.
Processed 4074 paragraphs from 300 data files.
Processed 4730 paragraphs from 350 data files.
Processed 5459 paragraphs from 400 data files.
Processed 5843 paragraphs from 450 data files.
Processed 6721 paragraphs from 500 data files.
Processed 7631 paragraphs from 550 data files.
Processed 8325 paragraphs from 600 data files.
Processed 8663 paragraphs from 650 data files.
Processed 9766 paragraphs from 700 data files.
Processed 10592 paragraphs from 750 data files.
Processed 11519 pa

# 02 A - Tokenizing/Lemmatizing Unlabeled Data
- Tokenize the unlabeled data
- Create a second version of the unlabeled data, lemmatize this second version
- Save both versions to files

In [None]:
# Tokenize the processed paragraphs, derive a lemmatized copy, and save both.
print('++++++++++ Reading in processed data ... ++++++++++')
data = pd.read_csv(path_processed_data + 'processed_data.csv')
print(data)

# Create the output directories up front, so the long-running steps below
# cannot fail at the final save because 'lemmatized/' does not exist.
os.makedirs(path_processed_data + 'lemmatized/', exist_ok=True)

print('++++++++++ Loading spaCy model ... ++++++++++')
# `nlp` is read as a global by tokenize() and lemmatize()
nlp = spacy.load('de_core_news_sm', disable=['tagger', 'parser', 'ner'])

print('++++++++++ Cleaning data ... ++++++++++')
data['token'] = data['token'].apply(cleanup)
# explicit copy: the column is reassigned below, which must not happen on a slice
data = data[['token']].copy()

print('++++++++++ Tokenizing data ... ++++++++++')
tqdm.pandas(desc='Tokenizing Data')
data['token'] = data['token'].progress_apply(tokenize)
print(data)
data.to_csv(path_processed_data + 'data_tokenized.csv', index=False)
print('Saved tokenized data set: ' + path_processed_data + 'data_tokenized.csv')

print('++++++++++ Creating lemmatized data set ... ++++++++++')
# lemmatization runs on the already tokenized text
data_lem = data.copy()
tqdm.pandas(desc='Lemmatizing data')
data_lem['token'] = data_lem['token'].progress_apply(lemmatize)
print(data_lem)
data_lem.to_csv(path_processed_data + 'lemmatized/data_lemmatized.csv', index=False)
print('Saved lemmatized data set: ' + path_processed_data + 'lemmatized/data_lemmatized.csv')

++++++++++ Reading in processed data ... ++++++++++
                                                    token
0       Guten Morgen, liebe Kolleginnen und Kollegen! ...
1       Meine sehr verehrten Damen und Herren! Liebe K...
2       § 1 Absatz 2 der Geschäftsordnung des Deutsche...
3       Die Fraktion der AfD widerspricht diesem Verfa...
4       Enthaltungen? – Der Antrag ist damit mit den S...
...                                                   ...
334271  Der letzte Punkt; Frau Deligöz hat es gerade a...
334272  mit der Begründung, die Bundesregierung sei ja...
334273  An die Kommunen und an die Familien sende ich ...
334274                                 Herzlichen Dank.\n
334275  Vielen Dank, Frau Kollegin. – Als letzter Redn...

[334276 rows x 1 columns]
++++++++++ Loading spaCy model ... ++++++++++
++++++++++ Cleaning data ... ++++++++++
++++++++++ Tokenizing data ... ++++++++++


Tokenizing data:   0%|          | 0/334276 [00:00<?, ?it/s]

                                                    token
0       Guten Morgen , liebe Kolleginnen und Kollegen ...
1       Meine sehr verehrten Damen und Herren ! Liebe ...
2       § 1 Absatz 2 der Geschäftsordnung des Deutsche...
3       Die Fraktion der AfD widerspricht diesem Verfa...
4       Enthaltungen ? – Der Antrag ist damit mit den ...
...                                                   ...
334271  Der letzte Punkt ; Frau Deligöz hat es gerade ...
334272  mit der Begründung , die Bundesregierung sei j...
334273  An die Kommunen und an die Familien sende ich ...
334274                               Herzlichen Dank . \n
334275  Vielen Dank , Frau Kollegin . – Als letzter Re...

[334276 rows x 1 columns]
Saved tokenized data set to directory: ./drive/MyDrive/model/data/training/unlabeled_data/processed/
++++++++++ Creating lemmatized data set ... ++++++++++


Lemmatizing data:   0%|          | 0/334276 [00:00<?, ?it/s]

                                                    token
0       Guten Morgen , lieb Kollegin und Kollege ! Neh...
1       Meine sehr verehren Dame und Herr ! lieben Kol...
2       § 1 Absatz 2 der Geschäftsordnung der Deutsche...
3       der Fraktion der AfD widersprechen dies Verfah...
4       Enthaltung ? – der Antrag sein damit mit der S...
...                                                   ...
334271  der letzte Punkt ; Frau Deligöz haben ich gera...
334272  mit der Begründung , der Bundesregierung sein ...
334273  an der Kommune und an der Familie senden ich a...
334274                                Herzliche Dank . \n
334275  viel Dank , Frau Kollegin . – als letzt Redner...

[334276 rows x 1 columns]
Saved lemmatized data set to directory: ./drive/MyDrive/model/data/training/unlabeled_data/processed/lemmatized/


# 02 B - Reading in Tokenized and Lemmatized Data




In [5]:
# Reload the tokenized and lemmatized paragraphs saved by step 02 A, so that
# the following steps can start from the files instead of recomputing them.
data = pd.read_csv(path_processed_data + 'data_tokenized.csv')
data_lem = pd.read_csv(path_processed_data + 'lemmatized/data_lemmatized.csv')
print(data)
print(data_lem)

                                                    token
0       Guten Morgen , liebe Kolleginnen und Kollegen ...
1       Meine sehr verehrten Damen und Herren ! Liebe ...
2       § 1 Absatz 2 der Geschäftsordnung des Deutsche...
3       Die Fraktion der AfD widerspricht diesem Verfa...
4       Enthaltungen ? – Der Antrag ist damit mit den ...
...                                                   ...
334271  Der letzte Punkt ; Frau Deligöz hat es gerade ...
334272  mit der Begründung , die Bundesregierung sei j...
334273  An die Kommunen und an die Familien sende ich ...
334274                               Herzlichen Dank . \n
334275  Vielen Dank , Frau Kollegin . – Als letzter Re...

[334276 rows x 1 columns]
                                                    token
0       Guten Morgen , lieb Kollegin und Kollege ! Neh...
1       Meine sehr verehren Dame und Herr ! lieben Kol...
2       § 1 Absatz 2 der Geschäftsordnung der Deutsche...
3       der Fraktion der AfD widersprechen di

# 03 - Processing Class Files
- Read in the seed entities extracted from Wikidata
- Create regex for each class
- Collect seed entities for seed lexicon

In [None]:
# Build one regular expression per class from the seed entity files and
# collect (entity, class) pairs for the seed lexicon.
print('++++++++++ Processing class files... ++++++++++')

class_list = {}
count_entities = 0
count_faults = 0
count_classes = 1

# lists for seed lexicon
lex_ents = []
lex_labs = []

all_class_files = glob.glob(os.path.join(path_classes, '*.csv'))

for cf in all_class_files:
    # class label = file name without directory and extension
    cLabel = cf.replace('.csv', '').replace(path_classes, '')

    # escaped entity names that form the alternatives of the class regex
    entity_patterns = []

    # the context manager closes the file again
    with open(cf, 'r') as cfile:
        for line in cfile:
            line = cleanup(line).replace('\n', '')
            if line.startswith('itemLabel'):
                # ignore header line
                continue
            elif re.match(r'Q\d+', line):
                # entry starting with an id (Q + digits): counted as missing label
                count_faults += 1
                continue
            elif not line.strip():
                # blank line: would add an empty alternative that matches everywhere
                continue
            else:
                # process seed entities; escape them so characters such as
                # '.', '+' or '?' in a name are matched literally
                entity_patterns.append(re.escape(line))
                lex_ents.append(line)
                lex_labs.append(cLabel)
                count_entities += 1

    # save regular expression for each class
    class_list[cLabel] = '|'.join(entity_patterns)
    print('Processed ' + str(count_entities) + ' final sample entities for class ' + str(count_classes) + ' : ' + cLabel)
    print('Dropped ' + str(count_faults) + ' entities with missing labels for class ' + str(count_classes) + ' : ' + cLabel)
    print('')

    # counters are per class
    count_entities = 0
    count_faults = 0
    count_classes += 1


print("++++++++++ Finished processing class files... ++++++++++")

++++++++++ Processing class files... ++++++++++
Processed 491 final sample entities for class 1 : EO_SCI
Dropped 9 entities with missing labels for class 1 : EO_SCI
Processed 500 final sample entities for class 2 : EP_WIRT
Dropped 0 entities with missing labels for class 2 : EP_WIRT
Processed 493 final sample entities for class 3 : EP_MIL
Dropped 7 entities with missing labels for class 3 : EP_MIL
Processed 170 final sample entities for class 4 : EP_MOV
Dropped 9 entities with missing labels for class 4 : EP_MOV
Processed 483 final sample entities for class 5 : EP_NGO
Dropped 17 entities with missing labels for class 5 : EP_NGO
Processed 490 final sample entities for class 6 : EP_KULT
Dropped 10 entities with missing labels for class 6 : EP_KULT
Processed 491 final sample entities for class 7 : EP_SCI
Dropped 8 entities with missing labels for class 7 : EP_SCI
Processed 489 final sample entities for class 8 : EP_FINANZ
Dropped 11 entities with missing labels for class 8 : EP_FINANZ
Pro

# 04 - Create Seed Lexicon 
- Create seed lexicon from processed class lists
- save seed lexicon to file

In [None]:
# Build the seed lexicon from the entity/label lists collected while
# processing the class files (lex_ents and lex_labs are filled in parallel).
seed_lex = pd.DataFrame({'entity' : lex_ents, 'label' : lex_labs})
# sort by class label so entities of one class are stored together
seed_lex = seed_lex.sort_values(['label'])
seed_lex.to_csv(path_lexicon + 'seed_lex.csv', index = False)
print('Saved seed lexicon to directory: ' + path_lexicon)

# count number of entities per class in seed lexicon
seed_lex.label.value_counts()

EP_WIRT      500
EP_POL       500
EP_MEDIA     500
EO_WIRT      499
P_ETH        498
P_FUNC       497
EO_KULT      495
EP_MIL       493
EP_SCI       491
EO_SCI       491
EP_KULT      490
EP_FINANZ    489
EP_NGO       483
EO_POL       479
EP_REL       475
EO_MIL       457
EO_NGO       429
EO_MOV       423
EO_FINANZ    300
GPE          262
EO_MEDIA     252
EO_REL       233
P_NAT        181
EP_MOV       170
P_SOZ        158
P_AGE         22
EP_OWN        12
P_GEN          5
Name: label, dtype: int64

# 05 A - Identifying and Extracting Instances of Seed Entities in Unlabeled Data
- Use previously defined regex to extract instances of seed entities from both versions of the unlabeled data
- Save extracted seed entities to files, one file per class and version (original/lemmatized)

In [None]:
# estimated duration: 40 min
# For every class, run the class regex over the original and over the
# lemmatized paragraphs and store the matches (one CSV per class and version).
print('++++++++++ Extracting entity samples in data ... ++++++++++')

# matches per class (original / lemmatized data)
extracted_lists = {}
extracted_lists_lem = {}
# number of matches per class (original / lemmatized data)
count_findings = {}
count_findings_lem = {}
# running class number, used only in the progress messages
count_class = 1

start = timer()

# c = class label, l = regular expression of that class (see class_list)
for c, l in class_list.items():
    print('Extracting class ' + str(count_class) + ': ' + str(c) +  ' from normal data...')

    # extract seed entities from original unlabeled data with regular expression
    res = extract(data, str(l))

    # save extracted entities
    # index=True keeps the index of the match table (paragraph index and match number) in the file
    res.to_csv(path_found + str(c) + '_extracted_normal.csv', index=True)

    extracted_lists[c] = res
    print('' + str(c) + ' -> ' + str(len(extracted_lists[c])) + ' entities found.')
    count_findings[c] = len(extracted_lists[c])
    count_class += 1

# second pass: same procedure on the lemmatized paragraphs
count_class = 1
for c, l in class_list.items():
    print('Extracting class: ' + str(count_class) + ' ' + str(c) +  ' from lemmatized data...')

    # extract seed entities from lemmatized unlabeled data with regular expression
    res = extract(data_lem, str(l))

    # save extracted entities
    res.to_csv(path_found + str(c) + '_extracted_lemma.csv', index=True)

    extracted_lists_lem[c] = res
    print('' + str(c) + ' -> ' + str(len(extracted_lists_lem[c])) + ' entities found.')
    count_findings_lem[c] = len(extracted_lists_lem[c])
    count_class += 1

end = timer()

print('Found ' + str(sum(v for v in count_findings.values())) + ' entities in normal data... ')
print('Found ' + str(sum(v for v in count_findings_lem.values())) + ' entities in lemmatized data... ')
print('Duration: ' + str(timedelta(seconds=end - start)))

print('Saved extracted entities to directory: ' + path_found)
print('++++++++++ Finished extracting found entity samples in data ... ++++++++++')

++++++++++ Extracting entity samples in data ... ++++++++++
Extracting class: 1 EO_SCI from normal data...
EO_SCI -> 284 entities found.
Extracting class: 2 EP_WIRT from normal data...
EP_WIRT -> 38 entities found.
Extracting class: 3 EP_MIL from normal data...
EP_MIL -> 548 entities found.
Extracting class: 4 EP_MOV from normal data...
EP_MOV -> 7 entities found.
Extracting class: 5 EP_NGO from normal data...
EP_NGO -> 1453 entities found.
Extracting class: 6 EP_KULT from normal data...
EP_KULT -> 196 entities found.
Extracting class: 7 EP_SCI from normal data...
EP_SCI -> 418 entities found.
Extracting class: 8 EP_FINANZ from normal data...
EP_FINANZ -> 204 entities found.
Extracting class: 9 EP_MEDIA from normal data...
EP_MEDIA -> 289 entities found.
Extracting class: 10 EP_REL from normal data...
EP_REL -> 17 entities found.
Extracting class: 11 EP_POL from normal data...
EP_POL -> 12596 entities found.
Extracting class: 12 EO_POL from normal data...
EO_POL -> 50336 entities found

# 05 B - Loading Instance Extractions

In [None]:
# Reload the per-class match tables written by step 05 A.
print("++++++++++ Reading in extracted entity samples ... ++++++++++")

all_found_entities_files = glob.glob(os.path.join(path_found, '*_extracted_normal.csv'))
all_found_entities_lemmas_files = glob.glob(os.path.join(path_found, '*_extracted_lemma.csv'))

# class label -> match table (original / lemmatized data)
extracted_lists = {}
extracted_lists_lem = {}

for f in all_found_entities_files:
  # presumably yields the string column '0' that label() looks up — verify against the saved files
  df = pd.read_csv(f)
  # class label = file name without directory and suffix
  c = f.replace('_extracted_normal.csv', '').replace(path_found, '')
  extracted_lists[c] = df

for f in all_found_entities_lemmas_files:
  df = pd.read_csv(f)
  c = f.replace('_extracted_lemma.csv', '').replace(path_found, '')
  extracted_lists_lem[c] = df

++++++++++ Reading in extracted entity samples ... ++++++++++


# 06 - Splitting Unlabeled Data
- split paragraphs into separate tokens

In [None]:
# Turn the paragraph tables into token tables: one row per token, with the
# index label of the source paragraph kept as 'sentence_id'.
# NOTE: this cell overwrites `data` and `data_lem`; reload them (02 B) before re-running it.
print("++++++++++ Splitting data into tokens ... ++++++++++")

data = pd.DataFrame(data['token'].apply(split).explode())
data_lem = pd.DataFrame(data_lem['token'].apply(split).explode())
# explode() repeats the paragraph's index label for each of its tokens
data["sentence_id"] = data.index
data_lem["sentence_id"] = data_lem.index
print(data)
print(data_lem)

++++++++++ Splitting data into tokens ... ++++++++++
                   token  sentence_id
0                  Guten            0
0                 Morgen            0
0                      ,            0
0                  liebe            0
0            Kolleginnen            0
...                  ...          ...
334275               die       334275
334275  CDU/CSU-Fraktion       334275
334275               das       334275
334275              Wort       334275
334275                 .       334275

[16459859 rows x 2 columns]
                   token  sentence_id
0                  Guten            0
0                 Morgen            0
0                      ,            0
0                   lieb            0
0               Kollegin            0
...                  ...          ...
334275               der       334275
334275  CDU/CSU-Fraktion       334275
334275               der       334275
334275              Wort       334275
334275                 .       334275

[1645

# (Compare tokenized and lemmatized paragraph length)
- Annotation method requires identical length of both data sets

In [None]:
# Compare the number of tokens per paragraph in the original and in the
# lemmatized token table.
test = data.sentence_id.value_counts()
test_lem = data_lem.sentence_id.value_counts()
both = pd.DataFrame(test)
# assignment aligns on the index, i.e. on sentence_id
both['lem'] = test_lem
both.columns = ['normal', 'lem']

print(both)
# paragraphs whose token counts differ; an empty frame means both versions line up
print(both[both.normal != both.lem])

        normal  lem
295194     385  385
286652     335  335
294668     327  327
76607      326  326
39603      325  325
...        ...  ...
9099         1    1
20860        1    1
141545       1    1
186419       1    1
154359       1    1

[334276 rows x 2 columns]
Empty DataFrame
Columns: [normal, lem]
Index: []
['Es', 'reicht', 'auch', 'nicht', ',', 'zu', 'sagen', ':', 'Das', 'war', 'der', 'Nationalsozialismus', ',', 'und', 'das', 'machen', 'wir', 'wieder', 'gut', '.', '–', 'Denn', 'es', 'ist', 'gefordert', ',', 'Selbstkritik', 'zu', 'üben', 'und', 'auf', 'uns', 'selbst', 'zu', 'blicken', '.', 'Einer', 'der', 'bekanntesten', 'Widerstandskämpfer', 'gegen', 'den', 'Nationalsozialismus', 'war', 'der', 'Sozialdemokrat', 'Wilhelm', 'Leuschner', '.', 'Zur', 'Wahrheit', 'gehört', 'aber', 'auch', ',', 'dass', 'in', 'seine', 'Zeit', 'als', 'hessischer', 'Innenminister', 'das', 'Gesetz', 'zur', 'Bekämpfung', '–', 'und', 'ich', 'verwende', 'bewusst', 'nicht', 'den', 'Begriff', '–', 'des', 'Z',

# 07 A - Split and Label Extracted Instances
- split extracted instances (see 05 A) and label them with numeric labels
- Save labeled/extracted entities to files, one file per class and version (original/lemmatized)
- for further usage: clean entities based on heuristics

In [None]:
# Numeric label id -> label name. 0 is the outside label 'O'; every class has
# an odd id for its B- label, directly followed by the even id of its I- label.
class_mapping = {0: 'O',
 1: 'B-EP_POL',
 2: 'I-EP_POL',
 3: 'B-EP_WIRT',
 4: 'I-EP_WIRT',
 5: 'B-EP_FINANZ',
 6: 'I-EP_FINANZ',
 7: 'B-EP_MEDIA',
 8: 'I-EP_MEDIA',
 9: 'B-EP_SCI',
 10: 'I-EP_SCI',
 11: 'B-EP_REL',
 12: 'I-EP_REL',
 13: 'B-EP_KULT',
 14: 'I-EP_KULT',
 15: 'B-EP_MIL',
 16: 'I-EP_MIL',
 17: 'B-EP_NGO',
 18: 'I-EP_NGO',
 19: 'B-EP_MOV',
 20: 'I-EP_MOV',
 21: 'B-EP_OWN',
 22: 'I-EP_OWN',
 23: 'B-EO_POL',
 24: 'I-EO_POL',
 25: 'B-EO_WIRT',
 26: 'I-EO_WIRT',
 27: 'B-EO_FINANZ',
 28: 'I-EO_FINANZ',
 29: 'B-EO_MEDIA',
 30: 'I-EO_MEDIA',
 31: 'B-EO_SCI',
 32: 'I-EO_SCI',
 33: 'B-EO_REL',
 34: 'I-EO_REL',
 35: 'B-EO_KULT',
 36: 'I-EO_KULT',
 37: 'B-EO_MIL',
 38: 'I-EO_MIL',
 39: 'B-EO_NGO',
 40: 'I-EO_NGO',
 41: 'B-EO_MOV',
 42: 'I-EO_MOV',
 43: 'B-P_NAT',
 44: 'I-P_NAT',
 45: 'B-P_ETH',
 46: 'I-P_ETH',
 47: 'B-P_FUNC',
 48: 'I-P_FUNC',
 49: 'B-P_AGE',
 50: 'I-P_AGE',
 51: 'B-P_SOZ',
 52: 'I-P_SOZ',
 53: 'B-P_GEN',
 54: 'I-P_GEN',
 55: 'B-GPE',
 56: 'I-GPE'}

# Split every extracted entity into tokens, attach the numeric B-/I- labels of
# its class, save the result and keep a heuristically corrected copy in memory.
print('++++++++++ Label Found Entities ... ++++++++++')

# Inverse of class_mapping (label name -> numeric id), built once instead of
# searching the value list again for every class.
label_ids = {name: idx for idx, name in class_mapping.items()}

# NOTE(review): label() looks up the string column '0'; presumably the frames
# must therefore come from the CSV reload in 05 B — confirm.
for c, df in list(extracted_lists.items()):
  key = label_ids["B-" + c]

  # label found entities from original data
  res = label(df, key)

  # save labeled/found entities
  res.to_csv(path_found + str(c) + '_extracted_normal_labels.csv', index=False)

  # for further usage -> remove incorrect labels based on heuristic
  extracted_lists[c] = correct_labels(res,c)

  print('Labeled found entities in normal data for class: ' + str(c) + ' and saved in directory: ' + path_found)



for c, df in list(extracted_lists_lem.items()):
  key = label_ids["B-" + c]

  # label found entities from lemmatized data
  res = label(df, key)

  # save labeled/found entities
  res.to_csv(path_found + str(c) + '_extracted_lemma_labels.csv', index=False)

  # for further usage -> remove incorrect labels based on heuristic
  extracted_lists_lem[c] = correct_labels(res,c)

  print('Labeled found entities in lemmatized data for class: ' + str(c) + ' and saved in directory: ' + path_found)
 


++++++++++ Create labels for found entities ... ++++++++++
Labeled found entities in normal data for class: EO_SCI
     sentence_id                     token label
0            980  Friedrich-Ebert-Stiftung    31
1           1440                Initiative    31
1           1440                      Neue    32
1           1440                   Soziale    32
1           1440           Marktwirtschaft    32
..           ...                       ...   ...
281       325332            Weltwirtschaft    32
282       325616  Friedrich-Ebert-Stiftung    31
283       331709          Potsdam-Institut    31
283       331709                       für    32
283       331709      Klimafolgenforschung    32

[682 rows x 3 columns]
Labeled found entities in normal data for class: EP_WIRT
    sentence_id      token label
0         82068     Roland     3
0         82068     Baader     4
1        100965  Christian     3
1        100965        von     4
1        100965    Stetten     4
..          ...   

# 07 B - Loading Found & Labeled Instances
- for further usage: clean entities based on heuristics

In [None]:
# Reload the labeled entity tokens written by step 07 A and apply the
# heuristic label correction to each class.
print('++++++++++ Loading Found Entity Samples with Labels ... ++++++++++')

all_found_entities_files = glob.glob(os.path.join(path_found, '*_extracted_normal_labels.csv'))
all_found_entities_lemmas_files = glob.glob(os.path.join(path_found, '*_extracted_lemma_labels.csv'))

# class label -> corrected label table (original / lemmatized data)
extracted_lists = {}
extracted_lists_lem = {}

for f in all_found_entities_files:
  df = pd.read_csv(f)
  # class label = file name without directory and suffix
  c = f.replace('_extracted_normal_labels.csv', '').replace(path_found, '')
  extracted_lists[c] = correct_labels(df, c)

print('Read in ' + str(len(extracted_lists)) + ' entity lists with labels from normal data')

for f in all_found_entities_lemmas_files:
  df = pd.read_csv(f)
  c = f.replace('_extracted_lemma_labels.csv', '').replace(path_found, '')
  extracted_lists_lem[c] = correct_labels(df, c)
print('Read in ' + str(len(extracted_lists_lem)) + ' entity lists with labels from lemma data')


++++++++++ Reading in found entity samples with labels ... ++++++++++
Removed 0 ambiguous/incorrect tokens for class EO_SCI
Removed 0 ambiguous/incorrect tokens for class EP_MEDIA
Removed 0 ambiguous/incorrect tokens for class EP_WIRT
Removed 0 ambiguous/incorrect tokens for class EP_REL
Removed 0 ambiguous/incorrect tokens for class EP_POL
Removed 0 ambiguous/incorrect tokens for class EP_FINANZ
Removed 0 ambiguous/incorrect tokens for class EP_SCI
Removed 0 ambiguous/incorrect tokens for class EP_NGO
Removed 0 ambiguous/incorrect tokens for class EP_KULT
Removed 0 ambiguous/incorrect tokens for class EP_MOV
Removed 0 ambiguous/incorrect tokens for class EP_MIL
Removed 34568 ambiguous/incorrect tokens for class EO_POL
Removed 0 ambiguous/incorrect tokens for class EO_MIL
Removed 0 ambiguous/incorrect tokens for class EO_REL
Removed 9553 ambiguous/incorrect tokens for class EO_MEDIA
Removed 0 ambiguous/incorrect tokens for class EO_KULT
Removed 4277 ambiguous/incorrect tokens for class

# 08 - Annotating Training Data + Separating Data for Lexicon Generation
- Use found & labeled entities to annotate data
- Separate the data for lexicon generation: one half of the annotated data is set aside for lexicon generation, the other half is kept for fine-tuning the language model
- Save training data and lexicon data to files

In [None]:
# estimated duration: 10 min
# Annotate the token tables with the labels of all classes, combine the labels
# found in the original and in the lemmatized data, and split the result into
# training data and data for lexicon generation.

print('++++++++++ Annotating training data ... ++++++++++')
# running class number, used only in the progress messages
i = 1

start = timer()

for c, normal in extracted_lists.items():
    print('Merging normal data with class ' + str(i) + ': ' + (str(c)))

    # subsequently merge original data with labeled/extracted entities based on sentence id and token to annotate data
    # (labels of earlier classes take precedence, see merge_text_label)
    data = merge_text_label(data, normal, ['sentence_id', 'token'])
    i+=1

i = 1
for c, lemma in extracted_lists_lem.items():
    print('Merging lemma data with class ' + str(i) + ': ' + (str(c)))
    # subsequently merge lemmatized data with labeled/extracted entities based on sentence id and token to annotate data
    data_lem = merge_text_label(data_lem, lemma, ['sentence_id', 'token'])
    i+=1

# merge original and lemmatized data
# fillna aligns on the index — assumes both tables hold the same tokens in the same row order; TODO confirm
data['label'] = data['label'].fillna(data_lem['label'])

# create O-labels
data['label'] = data['label'].fillna(0)
print('Annotated data based on extracted entities from normal and lemmatized data.')

# separate data used for lexicon generation
# second half of the rows -> lexicon data, first half -> training data
# NOTE(review): the cut is made at a row (token) position, so the paragraph at the midpoint may be divided between both sets — confirm this is acceptable
lex_data = data.iloc[int(len(data)/2):]
data = data.iloc[:int(len(data)/2)]

print(len(lex_data))
print(len(data))

print('Partitioned data for lexicon generation.')
lex_data.to_csv(path_lexicon + 'lex_data.csv', index=False)
print('Saved data for lexicon generation in directory: ' + path_lexicon)

data.to_csv(path_training_data + 'training_data.csv', index=False)
print('Saved training data in directory: ' + path_training_data)

end = timer()

print('Total Duration: ' + str(timedelta(seconds=end - start)))

++++++++++ Annotating training data ... ++++++++++
Merging normal data with class 1: EO_SCI
Merging normal data with class 2: EP_MEDIA
Merging normal data with class 3: EP_WIRT
Merging normal data with class 4: EP_REL
Merging normal data with class 5: EP_POL
Merging normal data with class 6: EP_FINANZ
Merging normal data with class 7: EP_SCI
Merging normal data with class 8: EP_NGO
Merging normal data with class 9: EP_KULT
Merging normal data with class 10: EP_MOV
Merging normal data with class 11: EP_MIL
Merging normal data with class 12: EO_POL
Merging normal data with class 13: EO_MIL
Merging normal data with class 14: EO_REL
Merging normal data with class 15: EO_MEDIA
Merging normal data with class 16: EO_KULT
Merging normal data with class 17: P_FUNC
Merging normal data with class 18: EO_WIRT
Merging normal data with class 19: EO_MOV
Merging normal data with class 20: EO_NGO
Merging normal data with class 21: P_NAT
Merging normal data with class 22: GPE
Merging normal data with cl

# 08 B - Load Annotated Training Data

In [None]:
# Read the token-level training data back from the CSV file
# 'training_data.csv' inside the training data directory.
data = pd.read_csv(filepath_or_buffer=path_training_data + 'training_data.csv')

# 09 - Count Labeled Entities in Training Data




In [None]:
# Count the labeled entities in the annotated training data.
label_count = count_found_entities(data)

# The counts shown in this cell's output are indexed by (token, label) and the
# count itself is the only column. Writing with index=False would therefore
# store bare numbers without the token/label they belong to. Keep the index in
# the file unless it is just a default running row number.
keep_index = not isinstance(label_count.index, pd.RangeIndex)
label_count.to_csv(path_training_data + 'label_count_training_data.csv', index=keep_index)
label_count

         sentence_id    token  label
9                  0      Sie   21.0
25                 1      Ich   21.0
27                 1      Sie   21.0
37                 1       Es   21.0
129                3      ich   21.0
...              ...      ...    ...
8229859       167658       es   21.0
8229865       167659       Er   21.0
8229893       167659  unserer   21.0
8229898       167659  unserer   21.0
8229925       167660      wir   21.0

[576124 rows x 3 columns]
token             label
wir               21.0     89380
Sie               21.0     69608
es                21.0     61603
Wir               21.0     39664
ich               21.0     37439
                           ...  
Tschechoslowakei  55.0         1
Müllner           16.0         1
Dichtern          47.0         1
NS-Staat          55.0         1
Grenadiers        43.0         1
Length: 1632, dtype: int64
                            0
token            label       
wir              21.0   89380
Sie              21.0   6

# 10 A - Merging Annotated Training Data Back to Sentences
- The unlabeled data was previously split into tokens. Now the (annotated) training data is restructured into paragraphs for the fine-tuning of the BERT model
- The restructured data is then saved to file

In [None]:
# Collapse the token-level frame into one row per sentence: each row holds the
# sentence id, the list of its tokens and the list of the matching labels.
tokens_per_sentence = []
labels_per_sentence = []
s_ids = []

# group paragraphs
# Group by the scalar column name rather than a one-element list: with a list
# key, pandas >= 2.0 hands back the group name as a 1-tuple, which would break
# the modulo check below and store tuples as sentence ids.
paragraphs = data.groupby('sentence_id')

for name, group in paragraphs:
    tokens_per_sentence.append(group['token'].tolist())
    labels_per_sentence.append(group['label'].tolist())
    s_ids.append(name)
    # progress message for every 20000th sentence id
    if name % 20000 == 0:
        print('Built ' + str(name) + ' sentences.')

data = pd.DataFrame({
    'sentence_id': s_ids,
    'token': tokens_per_sentence,
    'label': labels_per_sentence,
})

# save training data in sentences
data.to_csv(path_training_data + 'training_data_sentences.csv', index=False)
data


Built 0 sentences.
Built 20000 sentences.
Built 40000 sentences.
Built 60000 sentences.
Built 80000 sentences.
Built 100000 sentences.
Built 120000 sentences.
Built 140000 sentences.
Built 160000 sentences.


Unnamed: 0,sentence_id,token,label
0,0,"[Guten, Morgen, ,, liebe, Kolleginnen, und, Ko...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,1,"[Meine, sehr, verehrten, Damen, und, Herren, !...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,2,"[§, 1, Absatz, 2, der, Geschäftsordnung, des, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,3,"[Die, Fraktion, der, AfD, widerspricht, diesem...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,4,"[Enthaltungen, ?, –, Der, Antrag, ist, damit, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...,...
167656,167656,"[Aber, Ihre, Vorschläge, sind, weder, eine, Al...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
167657,167657,"[Bevor, ich, zu, den, Herausforderungen, komme...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
167658,167658,"[Auch, wenn, bei, diesem, Fall, viel, schiefge...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
167659,167659,"[Und, :, Er, ist, in, Haft, ,, liebe, Kollegin...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


# 10 B - Loading Training Data as Sentences

In [None]:
# Read the sentence-level training data back from 'training_data_sentences.csv'.
# NOTE(review): the token/label lists were written to CSV as text, so they
# presumably come back as strings rather than Python lists — confirm that the
# following steps expect that form.
data = pd.read_csv(filepath_or_buffer=path_training_data + 'training_data_sentences.csv')
data

Unnamed: 0,sentence_id,token,label
0,0,"['Guten', 'Morgen', ',', 'liebe', 'Kolleginnen...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,1,"['Meine', 'sehr', 'verehrten', 'Damen', 'und',...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,2,"['§', '1', 'Absatz', '2', 'der', 'Geschäftsord...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,3,"['Die', 'Fraktion', 'der', 'AfD', 'widersprich...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,4,"['Enthaltungen', '?', '–', 'Der', 'Antrag', 'i...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...,...
167656,167656,"['Aber', 'Ihre', 'Vorschläge', 'sind', 'weder'...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
167657,167657,"['Bevor', 'ich', 'zu', 'den', 'Herausforderung...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
167658,167658,"['Auch', 'wenn', 'bei', 'diesem', 'Fall', 'vie...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
167659,167659,"['Und', ':', 'Er', 'ist', 'in', 'Haft', ',', '...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


# 11 - Remove Incorrect Inner Labels
- The annotation caused incorrect inner labels within the training data. These are removed. 
- The corrected training data is saved to a file, replacing the previous version

In [None]:
# Enable progress_apply on pandas objects, with a progress bar labelled for this step.
tqdm.pandas(desc='Removing Faulty Inner Labels')

# Apply remove_inner_labels to each row's label entry and store the result back
# in the label column.
data['label'] = data.label.progress_apply(remove_inner_labels)

# Overwrite the sentence-level training file with the corrected labels.
data.to_csv(path_or_buf=path_training_data + 'training_data_sentences.csv', index=False)
data


progress:   0%|          | 0/167661 [00:00<?, ?it/s]

Unnamed: 0,sentence_id,token,label
0,0,"['Guten', 'Morgen', ',', 'liebe', 'Kolleginnen...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
1,1,"['Meine', 'sehr', 'verehrten', 'Damen', 'und',...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,2,"['§', '1', 'Absatz', '2', 'der', 'Geschäftsord...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,3,"['Die', 'Fraktion', 'der', 'AfD', 'widersprich...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,4,"['Enthaltungen', '?', '–', 'Der', 'Antrag', 'i...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...
167656,167656,"['Aber', 'Ihre', 'Vorschläge', 'sind', 'weder'...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
167657,167657,"['Bevor', 'ich', 'zu', 'den', 'Herausforderung...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
167658,167658,"['Auch', 'wenn', 'bei', 'diesem', 'Fall', 'vie...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
167659,167659,"['Und', ':', 'Er', 'ist', 'in', 'Haft', ',', '...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
