In [2]:
import spacy
import pandas as pd

In [3]:
en_nlp = spacy.load("en")

In [4]:
de_nlp = spacy.load("de")

In [5]:
it_nlp = spacy.load("it")

In [6]:
fr_nlp = spacy.load("fr")

## Testing NER

In [7]:
de_lines = [line.strip("\n") for line in open("spacy.de", mode="r", encoding="utf-8").readlines()]

In [8]:
en_lines = [line.strip("\n") for line in open("spacy.en", mode="r", encoding="utf-8").readlines()]

In [9]:
list(zip(de_lines, en_lines))

[('Ja, Herr Evans, ich denke, daß eine derartige Initiative durchaus angebracht ist.',
  'Yes, Mr Evans, I feel an initiative of the type you have just suggested would be entirely appropriate.'),
 ('Frau Schroedter, ich bin gerne bereit, die damit zusammenhängenden Fakten zu prüfen, wenn mir Ihr Brief vorliegt.',
  'Yes, Mrs Schroedter, I shall be pleased to look into the facts of this case when I have received your letter.'),
 ('Diese Abstimmung ist meiner Erinnerung nach so ausgegangen: 422 gegen 180 Stimmen bei einigen wenigen Enthaltungen.',
  'As I recall, the outcome of this vote was 422 votes to 180 with a few abstentions.'),
 ('(Das Parlament lehnt den Antrag mit 164 Ja-Stimmen, 166 Nein-Stimmen und 7 Enthaltungen ab.)',
  '(Parliament rejected the request, with 164 votes for, 166 votes against and 7 abstentions)'),
 ('Wir stimmen jetzt über den Antrag der PPE/DE-Fraktion ab, die mündliche Anfrage über die Kapitalsteuer von der Tagesordnung abzusetzen',
  "We shall proceed to v

In [10]:
en_docs = list(en_nlp.pipe(en_lines))
de_docs =list(de_nlp.pipe(de_lines))

In [11]:
print("NER for English:")
en_ents = []
en_toks = []
for doc in en_docs:
    en_ents.append(doc.ents)
    en_toks.append(doc)
en_ents = [[(ent.text, ent.label_) for ent in entities if ent.label_ in ["ORG", "PERSON"]] for entities in en_ents]
print(en_ents)
#print(en_toks)

NER for English:
[[('Mr Evans', 'PERSON')], [('Mrs Schroedter', 'PERSON')], [], [('Parliament', 'ORG')], [("the PPE-DE Group'", 'ORG')], [('Presidency', 'ORG')], [], [('Linkohr Report', 'ORG')], [], [('Council', 'ORG'), ('Mrs Neyts-Uyttebroeck', 'PERSON'), ('-Office of the Council', 'ORG')]]


In [12]:
print("NER for German:")
de_ents = []
de_toks = []
for doc in de_docs:
    de_ents.append(doc.ents)
    de_toks.append(doc)
de_ents = [[(ent.text, ent.label_) for ent in entities if ent.label_ in ["ORG", "PER"]] for entities in de_ents]
print(de_ents)

NER for German:
[[('Evans', 'PER')], [], [], [('Das Parlament', 'ORG')], [], [], [('Kommission', 'ORG'), ('Gerichtshof', 'ORG')], [], [], [('Neyts-Uyttebroeck', 'PER')]]


In [13]:
print("POS for English/German:")
en_pos = [[(tok.text, tok.pos_, tok.lemma_) for tok in tokens if not tok.is_punct] for tokens in en_toks]
de_pos = [[(tok.text, tok.pos_, tok.lemma_) for tok in tokens if not tok.is_punct] for tokens in de_toks]
ps_df = pd.DataFrame(dict({"en_pos": en_pos, "de_pos": de_pos}))
ps_df.head()

POS for English/German:


Unnamed: 0,en_pos,de_pos
0,"[(Yes, INTJ, yes), (Mr, PROPN, Mr), (Evans, PR...","[(Ja, PART, ja), (Herr, NOUN, Herr), (Evans, P..."
1,"[(Yes, INTJ, yes), (Mrs, PROPN, Mrs), (Schroed...","[(Frau, NOUN, Frau), (Schroedter, PROPN, Schro..."
2,"[(As, ADP, as), (I, PRON, -PRON-), (recall, VE...","[(Diese, DET, Diese), (Abstimmung, NOUN, Absti..."
3,"[(Parliament, PROPN, Parliament), (rejected, V...","[(Das, DET, der), (Parlament, NOUN, Parlament)..."
4,"[(We, PRON, -PRON-), (shall, VERB, shall), (pr...","[(Wir, PRON, ich), (stimmen, VERB, stimmen), (..."


In [14]:
test_str = "12 999 Euro verschwendet"
toks = [tok.text if not tok.is_digit else "NUM" for tok in de_nlp(test_str)]
toks

['NUM', 'NUM', 'Euro', 'verschwendet']

In [15]:
toks = [toks[i] for i in range(len(toks)) if (i==0) or toks[i] != toks[i-1]]

In [16]:
toks

['NUM', 'Euro', 'verschwendet']

In [17]:
import re
print(re.findall(r'[-+]?\d*,\d+|\d+', test_str))
print(re.findall(r'\d+(?:,\d+)?', test_str))

['12', '999']
['12', '999']


In [18]:
num_re = r'\d+(?:,\d+)?'
s = re.sub(num_re, "NUM", test_str).split(" ")

In [19]:
toks = [s[i] for i in range(len(s)) if (i==0) or s[i] != s[i-1]]

In [20]:
list(de_nlp(' '.join(toks)))

[NUM, Euro, verschwendet]

In [78]:
my_str = """
Dividendengutschrift für inländische Wertpapiere

Depotinhaber    : ME

Extag           :  18.04.2013          Bruttodividende
Zahlungstag     :  18.04.2013          pro Stück       :       0,9800 EUR
Valuta          :  18.04.2013

                                       Bruttodividende :        78,40 EUR
                                      *Einbeh. Steuer  :        20,67 EUR
                                       Nettodividende  :        78,40 EUR

                                       Endbetrag       :        57,73 EUR
"""

In [79]:
date_re = r"^([1-9] |1[0-9]| 2[0-9]|3[0-1])(.|-)([1-9] |1[0-2])(.|-|)20[0-9][0-9]$"

In [80]:
date = r'\d+[ -/.]\d+[ -/.]\d+'

In [81]:
match = re.findall(date, my_str)

In [82]:
match

['18.04.2013', '18.04.2013', '18.04.2013']

In [83]:
my_str = re.sub(date_re, "DATE", my_str)
my_str

'\nDividendengutschrift für inländische Wertpapiere\n\nDepotinhaber    : ME\n\nExtag           :  18.04.2013          Bruttodividende\nZahlungstag     :  18.04.2013          pro Stück       :       0,9800 EUR\nValuta          :  18.04.2013\n\n                                       Bruttodividende :        78,40 EUR\n                                      *Einbeh. Steuer  :        20,67 EUR\n                                       Nettodividende  :        78,40 EUR\n\n                                       Endbetrag       :        57,73 EUR\n'

In [85]:
search_dates(my_str.lower())

[('me', datetime.datetime(2019, 6, 19, 0, 0)),
 ('2013', datetime.datetime(2013, 6, 19, 0, 0)),
 ('2013', datetime.datetime(2013, 6, 19, 0, 0)),
 ('9800', datetime.datetime(9800, 6, 19, 0, 0)),
 ('2013', datetime.datetime(2013, 6, 19, 0, 0)),
 ('20,67', datetime.datetime(2067, 6, 20, 0, 0))]

In [27]:
my_str = re.sub(num_re, "NUM", my_str).split("\n")
my_str

['',
 'Dividendengutschrift für inländische Wertpapiere',
 '',
 'Depotinhaber    : ME',
 '',
 'Extag           :  DATE          Bruttodividende',
 'Zahlungstag     :  DATE          pro Stück       :       NUM EUR',
 'Valuta          :  DATE',
 '',
 '                                       Bruttodividende :        NUM EUR',
 '                                      *Einbeh. Steuer  :        NUM EUR',
 '                                       Nettodividende  :        NUM EUR',
 '',
 '                                       Endbetrag       :        NUM EUR',
 '']

In [28]:
lines = list(zip(en_lines, de_lines))

In [29]:
def replace_date(line):
    return re.sub(date, "DATE", line)

def replace_numbers(line):
    return re.sub(num_re, "NUM", line)

In [30]:
lines = [(replace_numbers(replace_date(en_line)), replace_numbers(replace_date(de_line))) for en_line, de_line in lines]

In [31]:
lines

[('Yes, Mr Evans, I feel an initiative of the type you have just suggested would be entirely appropriate.',
  'Ja, Herr Evans, ich denke, daß eine derartige Initiative durchaus angebracht ist.'),
 ('Yes, Mrs Schroedter, I shall be pleased to look into the facts of this case when I have received your letter.',
  'Frau Schroedter, ich bin gerne bereit, die damit zusammenhängenden Fakten zu prüfen, wenn mir Ihr Brief vorliegt.'),
 ('As I recall, the outcome of this vote was NUM votes to NUM with a few abstentions.',
  'Diese Abstimmung ist meiner Erinnerung nach so ausgegangen: NUM gegen NUM Stimmen bei einigen wenigen Enthaltungen.'),
 ('(Parliament rejected the request, with NUM votes for, NUM votes against and NUM abstentions)',
  '(Das Parlament lehnt den Antrag mit NUM Ja-Stimmen, NUM Nein-Stimmen und NUM Enthaltungen ab.)'),
 ("We shall proceed to vote on the PPE-DE Group' s request that the oral question regarding the capital tax be withdrawn from the agenda.",
  'Wir stimmen jetzt

### Use on corpus (EN)

In [32]:
EN_LINES = [line.strip("\n") for line in open("bitext.en", mode="r", encoding="utf-8").readlines()]
DE_LINES = [line.strip("\n") for line in open("bitext.de", mode="r", encoding="utf-8").readlines()]

In [33]:
EN_LINES[:10], DE_LINES[:10]

(['You have requested a debate on this subject in the course of the next few days, during this part-session.',
  "Please rise, then, for this minute' s silence.",
  'You will be aware from the press and television that there have been a number of bomb explosions and killings in Sri Lanka.',
  'Yes, Mr Evans, I feel an initiative of the type you have just suggested would be entirely appropriate.',
  'If the House agrees, I shall do as Mr Evans has suggested.',
  'I would like your advice about Rule 143 concerning inadmissibility.',
  'My question relates to something that will come up on Thursday and which I will then raise again.',
  'It says that this should be done despite the principle of relative stability.',
  'I believe that the principle of relative stability is a fundamental legal principle of the common fisheries policy and a proposal to subvert it would be legally inadmissible.',
  'That is precisely the time when you may, if you wish, raise this question, i.e. on Thursday pr

In [34]:
len(EN_LINES), len(DE_LINES)

(622376, 622376)

In [35]:
all_lines = list(zip(EN_LINES, DE_LINES))

In [36]:
def find_seq_num(lines):
    for src, trg in lines:
        if re.findall(num_re, src):
            yield src, trg

In [37]:
all_lines = list(find_seq_num(all_lines))

In [38]:
all_lines = all_lines[:100]

In [39]:
all_lines

[('I would like your advice about Rule 143 concerning inadmissibility.',
  'Könnten Sie mir eine Auskunft zu Artikel 143 im Zusammenhang mit der Unzulässigkeit geben?'),
 ('Why has there been no Health and Safety Committee meeting since 1998?',
  'Weshalb ist der Arbeitsschutzausschuß seit 1998 nicht ein einziges Mal zusammengetreten?'),
 ('To this end, I would like to remind you of the resolution of 15 September, which recommended that the proposal be presented as soon as possible.',
  'In diesem Sinne erinnere ich an die Entschließung vom 15. September, in der empfohlen wurde, den Vorschlag in der kürzestmöglichen Frist vorzulegen.'),
 ('As I recall, the outcome of this vote was 422 votes to 180 with a few abstentions.',
  'Diese Abstimmung ist meiner Erinnerung nach so ausgegangen: 422 gegen 180 Stimmen bei einigen wenigen Enthaltungen.'),
 ('The Commission will present its programme for the year 2000 in February.',
  'Die Kommission wird das Programm für das Jahr 2000 im Februar vo

In [40]:
sub_lines = [(replace_numbers(src), replace_numbers(trg)) for src, trg in all_lines]

In [41]:
sub_lines[:30]

[('I would like your advice about Rule NUM concerning inadmissibility.',
  'Könnten Sie mir eine Auskunft zu Artikel NUM im Zusammenhang mit der Unzulässigkeit geben?'),
 ('Why has there been no Health and Safety Committee meeting since NUM?',
  'Weshalb ist der Arbeitsschutzausschuß seit NUM nicht ein einziges Mal zusammengetreten?'),
 ('To this end, I would like to remind you of the resolution of NUM September, which recommended that the proposal be presented as soon as possible.',
  'In diesem Sinne erinnere ich an die Entschließung vom NUM. September, in der empfohlen wurde, den Vorschlag in der kürzestmöglichen Frist vorzulegen.'),
 ('As I recall, the outcome of this vote was NUM votes to NUM with a few abstentions.',
  'Diese Abstimmung ist meiner Erinnerung nach so ausgegangen: NUM gegen NUM Stimmen bei einigen wenigen Enthaltungen.'),
 ('The Commission will present its programme for the year NUM in February.',
  'Die Kommission wird das Programm für das Jahr NUM im Februar vorl

In [58]:
[(ent.text, ent.label_) for ent in de_nlp("Ja, Herr Busser, ich denke, daß eine derartige Initiative mit Ratsbeschluss durchaus angebracht ist.").ents]

[('Busser', 'PER')]

In [43]:
[(tok.text, tok.pos_, tok.tag_, tok.lemma_) for tok in de_nlp("Die Modalitäten für die Ausübung dieser der Kommission übertragenen Durchführungsbefugnisse wurden durch Ratsbeschluß vom Juni 2018 neu geregelt".lower())]

[('die', 'DET', 'ART', 'der'),
 ('modalitäten', 'ADJ', 'ADJA', 'modalitäten'),
 ('für', 'ADP', 'APPR', 'für'),
 ('die', 'DET', 'ART', 'der'),
 ('ausübung', 'NOUN', 'NN', 'ausübung'),
 ('dieser', 'PRON', 'PDS', 'dies'),
 ('der', 'DET', 'ART', 'der'),
 ('kommission', 'ADJ', 'ADJA', 'kommission'),
 ('übertragenen', 'ADJ', 'ADJA', 'übertragen'),
 ('durchführungsbefugnisse', 'VERB', 'VVPP', 'durchführungsbefugnisse'),
 ('wurden', 'AUX', 'VAFIN', 'werden'),
 ('durch', 'ADP', 'APPR', 'durch'),
 ('ratsbeschluß', 'ADJ', 'ADJA', 'ratsbeschluß'),
 ('vom', 'ADP', 'APPRART', 'vom'),
 ('juni', 'ADV', 'ADV', 'juni'),
 ('2018', 'NUM', 'CARD', '2018'),
 ('neu', 'ADJ', 'ADJD', 'neu'),
 ('geregelt', 'VERB', 'VVPP', 'regeln')]

In [None]:
#de_docs = list(de_nlp.pipe(DE_LINES, n_threads=5, batch_size=100))

In [66]:
import dateparser
from dateparser.search import search_dates

In [89]:
search_dates("9. Dezember letzten Jahres angenommene gemeinschaftliche Aktionsprogramm für den Zivilschutz ist am 1. Januar 2000 in Kraft getreten.", languages=["de"])

[('9. Dezember', datetime.datetime(2019, 12, 9, 0, 0)),
 ('am 1. Januar 2000 in', datetime.datetime(2000, 1, 1, 0, 0))]

In [92]:
for i, sent in enumerate(EN_LINES):
    dates = search_dates(sent, languages=["en"])
    if i == 100: break
    if dates:
        print(sent)
        print(dates)

Please rise, then, for this minute' s silence.
[("minute'", datetime.datetime(2019, 6, 22, 0, 0))]
I would like your advice about Rule 143 concerning inadmissibility.
[('143', datetime.datetime(1900, 1, 1, 1, 4, 3))]
My question relates to something that will come up on Thursday and which I will then raise again.
[('on Thursday and', datetime.datetime(2019, 6, 20, 0, 0))]
That is precisely the time when you may, if you wish, raise this question, i.e. on Thursday prior to the start of the presentation of the report.
[('may', datetime.datetime(2019, 5, 22, 0, 0)), ('on Thursday', datetime.datetime(2019, 5, 16, 0, 0))]
Now, however, he is to go before the courts once more because the public prosecutor is appealing.
[('Now', datetime.datetime(2019, 6, 22, 17, 54, 18, 818367))]
Mrs Plooij-van Gorsel, I can tell you that this matter is on the agenda for the Quaestors' meeting on Wednesday.
[('on Wednesday', datetime.datetime(2019, 6, 19, 0, 0))]
Why has there been no Health and Safety Commit

In [96]:
re.findall(r'(.*?\D{2,})', my_str)

['\nDividendengutschrift für inländische Wertpapiere\n\nDepotinhaber    : ME\n\nExtag           :  ',
 '18.04.2013          Bruttodividende\nZahlungstag     :  ',
 '18.04.2013          pro Stück       :       ',
 '0,9800 EUR\nValuta          :  ',
 '18.04.2013\n\n                                       Bruttodividende :        ',
 '78,40 EUR\n                                      *Einbeh. Steuer  :        ',
 '20,67 EUR\n                                       Nettodividende  :        ',
 '78,40 EUR\n\n                                       Endbetrag       :        ',
 '57,73 EUR\n']

In [99]:
d = r'\d{4}[-. ]\d?\d[-. ]\d?\d (?:2[0-3]|[01]?[0-9]):[0-5]?[0-9]:[0-5]?[0-9]'
print(re.findall(r'{0}.*?(?=\s*{0}|$)'.format(d), my_str, re.DOTALL))

[]


In [100]:
rex = r"\s+(?=\d{2}(?:\d{2})?-\d{1,2}-\d{1,2}\b)"
s = "2018-03-14 06:08:18, he went on 2018-03-15 06:08:18, lets play"
print(re.split(rex, s))

['2018-03-14 06:08:18, he went on', '2018-03-15 06:08:18, lets play']


In [113]:
nums = [n for n in re.split(r"\D+", "2018-03-14 06:08:18, he went on 2018-03-15 06:08:18, lets play") if n]

In [114]:
nums

['2018', '03', '14', '06', '08', '18', '2018', '03', '15', '06', '08', '18']

In [127]:
sss = "2018-03-14 06:08:18, he went on 2018-03-15 06:08:18, lets play"

In [204]:
def cleanup_numbers(line):
    """
    Ex: Turchi Report [A5-0303/2001] and Linkohr Report (A5-0297/2001) - am 20. Juni 2019
    """
    line = line.translate(str.maketrans('', '', string.punctuation))
    # Turchi Report A503032001 and Linkohr Report A502972001 am 20 Juni 2019
    line = line.strip()
    ### replace digits
    # Turchi Report A503032001 and Linkohr Report A502972001 am NUM Juni NUM
    nums = [n for n in re.split(r"\D+", line) if n]
    line = ' '.join([word if not word in nums else "NUM" for word in line.split(" ")])
    ### Clean up regulations
    ### A503032001 --> LAW
    line = re.sub(r'[a-zA-Z]+[0-9]+',"LAW", line)
    line = remove_adjacent_same_label(line)
    return line

In [209]:
def remove_adjacent_same_label(line):
    if isinstance(line, str):
        line = line.split(" ")
    # Remove adjacent duplicate labels
    toks = [line[i] for i in range(len(line)) if (i==0) or line[i] != line[i-1]]
    line = ' '.join(toks).strip()
    ### remove duplicate spaces
    line = re.sub(r"\s\s+", " ", line)
    return line.strip() # as string

In [205]:
[(ent.text, ent.label_) for ent in en_nlp(cleanup_numbers("European Council meeting (Ghent, 19 October 2001) - June 2919")).ents]

[('European Council', 'ORG'),
 ('Ghent NUM', 'ORG'),
 ('October NUM June', 'DATE')]

In [179]:
[(ent.text, ent.label_) for ent in en_nlp("European Council meeting (Ghent, 19 October 2001) - June 2919").ents]

[('European Council', 'ORG'),
 ('Ghent', 'ORG'),
 ('19 October 2001', 'DATE'),
 ('June 2919', 'DATE')]

In [183]:
[(ent.text, ent.label_) for ent in de_nlp(cleanup_numbers("Herr Luiz Nasario da Lima Ronaldo, Europaeischer Rat von Gent (19. Oktober 2001) - Volkswagen")).ents]


[('Luiz Nasario da', 'PER'),
 ('Lima Ronaldo', 'PER'),
 ('Gent NUM', 'ORG'),
 ('NUM Volkswagen', 'ORG')]

In [206]:
cleanup_numbers("Europaeischer Rat von Gent (19. Oktober 2001) - Volkswagen")

'Europaeischer Rat von Gent NUM Oktober NUM Volkswagen'

In [184]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [207]:
cleanup_numbers("Explanation of vote - Turchi Report [A5-0303/2001] and Linkohr Report (A5-0297/2001):")

'Explanation of vote Turchi Report LAW and Linkohr Report LAW'

In [199]:
cleanup_numbers("Turchi Report [A5-0303/2001] and Linkohr Report (A5-0297/2001) - am 20. Juni 2019")

'Turchi Report LAW and Linkohr Report LAW am NUM Juni NUM'