# Named Entity Recognition (NER)

## Connect to DB

First Task: Connect to Database and retrieve text data from there.

In [65]:
# connect to db
from sqlalchemy import create_engine
import pandas as pd
engine = create_engine("mysql+pymysql://root:root@localhost:3306/lgbtiq_kg")

df = pd.read_sql_table(
    'text_chronik', 
    con=engine)

df["text"] = df["title"] + " - " + df["text"]
#df = df[["id", "text"]]
df = pd.DataFrame(df)
df.head()

Unnamed: 0,created_at,updated_at,id,title,text,location,date,year
0,2022-03-22 10:03:40,NaT,1,Lilamunde Lesbenvocal,Lilamunde Lesbenvocal - Der Chor Lilamunde Les...,München,1993,1993
1,2022-03-22 10:03:40,NaT,2,„Abtreibungs­paragraf“ §218,„Abtreibungs­paragraf“ §218 - Im Zuge des Begi...,München,1971,1971
2,2022-03-22 10:03:40,NaT,3,„Come out“-­Lesbentheater,„Come out“-­Lesbentheater - Erster Auftritt de...,München,10. Februar 1979,1979
3,2022-03-22 10:03:40,NaT,4,„Die Verzauberten“,„Die Verzauberten“ - Die Ausstellung von KGL u...,München,9. – 30. Oktober 2009,2009
4,2022-03-22 10:03:40,NaT,5,"„Frauen­beziehungen, Frauenliebe“","„Frauen­beziehungen, Frauenliebe“ - Informatio...",München,15. April 1978,1978


## Spacy

In [20]:
from spacy.lang.de import German

# create nlp object
nlp = German()

In [21]:
# Prozentzahl finden
doc = nlp(
    "Im Jahr 1990 lebten über 60% der Menschen in Ostasien in äußerster Armut. "
    "Heute sind es nur noch 4%."
)
# Iteriere über die Tokens im Doc
for token in doc:
    # Teste ob der Token einer Zahl ähnelt
    if token.like_num:
        # Wähle den nächsten Token im Doc aus
        next_token = doc[token.i + 1]
        # Überprüfe ob der Text des nächsten Tokens "%"" ist
        if next_token.text == "%":
            print("Prozentzahl gefunden:", token.text)

Prozentzahl gefunden: 60
Prozentzahl gefunden: 4


In [22]:
import spacy
# small German modell
nlp = spacy.load("de_core_news_sm")

doc = nlp("Ich lese auf HackerNews anstatt zu arbeiten. In München.")

In [23]:
for token in doc:
    print(token.text, token.pos_) # attributes with _ at the end: gives text, without get id

Ich PRON
lese VERB
auf ADP
HackerNews PROPN
anstatt SCONJ
zu PART
arbeiten VERB
. PUNCT
In ADP
München PROPN
. PUNCT


In [24]:
for ent in doc.ents:
    print(ent.text, ent.label_)

Ich lese MISC
HackerNews MISC
München. LOC


In [25]:
spacy.explain("LOC")

'Non-GPE locations, mountain ranges, bodies of water'

In [26]:
spacy.explain("MISC")

'Miscellaneous entities, e.g. events, nationalities, products or works of art'

In [27]:
spacy.explain("PROPN")

'proper noun'

In [28]:
text = "Apple wurde 1976 von Steve Wozniak, Steve Jobs und Ron Wayne gegründet."

# Verarbeite den Text
doc = nlp(text)

for token in doc:
    # Greife auf den Text, die Wortart und die Dependenzrelation des Tokens zu
    token_text = token.text
    token_pos = token.pos_
    token_dep = token.dep_
    # Dies dient nur zur Formatierung
    print(f"{token_text:<12}{token_pos:<10}{token_dep:<10}")

Apple       PROPN     sb        
wurde       AUX       ROOT      
1976        NUM       mo        
von         ADP       sbp       
Steve       PROPN     pnc       
Wozniak     PROPN     nk        
,           PUNCT     punct     
Steve       PROPN     pnc       
Jobs        PROPN     cj        
und         CCONJ     cd        
Ron         PROPN     pnc       
Wayne       PROPN     cj        
gegründet   VERB      oc        
.           PUNCT     punct     


In [29]:
# Iteriere über die vorhergesagten Entitäten
for ent in doc.ents:
    # Drucke den Text und das Label der Entität
    print(ent.text, ent.label_)


Apple ORG
Steve Wozniak PER
Steve Jobs PER
Ron Wayne PER


### Eigene Entitäten hinzufügen


In [30]:
text = "Apple: Modell IPhone SE kommt im Sommer"

# Verarbeite den Text
doc = nlp(text)

# Iteriere über die Entitäten
for ent in doc.ents:
    # Drucke Text und Label der Entität
    print(ent.text, ent.label_)

# Erstelle eine Span für "IPhone SE"
iphone_se = doc[3:5]

# Drucke den Text der Span
print("Fehlende Entität:", iphone_se.text)

Apple: Modell IPhone LOC
Fehlende Entität: IPhone SE


### Regelbasiertes Matching
- nicht nur Regex
- matcher patterns
- include pos informations or entity info, not just plain strings

In [31]:
from spacy.matcher import Matcher
matcher = Matcher(nlp.vocab)

# iphone pattern
pattern = [{"TEXT": "iPhone"}, {"TEXT": "X"}]
matcher.add("IPHONE_PATTERN", [pattern])

doc = nlp("Das neue iPhone X erscheint in Deutschland.")

matches = matcher(doc)

for match_id, start, end in matches:
    matched_span = doc[start:end]
    print(matched_span.text)

# show result
print("Resultat:", [doc[start:end].text for match_id, start, end in matches])

# number of results
print("Anzahl:", len(matches))

iPhone X
Resultat: ['iPhone X']
Anzahl: 1


### add new entities

- done via `span`

In [32]:
from spacy.tokens import Doc, Span

words = ["Ich", "mag", "David", "Bowie"]
spaces = [True, True, True, False]

# Erstelle ein Doc mit den Wörtern und Leerzeichen
doc = Doc(nlp.vocab, words, spaces)
print(doc.text)

# Erstelle eine Span für "David Bowie" und weise ihr das Label "PER" zu
span = Span(doc, 2, 4, label="PER")
print(span.text, span.label_)

# Füge die Span zu den Entitäten des Docs hinzu
doc.ents = [span]

# Drucke den Text und Label der Entitäten
print([(ent.text, ent.label_) for ent in doc.ents])

Ich mag David Bowie
David Bowie PER
[('David Bowie', 'PER')]


### Phrase Matching
- effizienter und schneller als `Matcher`
- wenn lange Wortlisten gefunden werden sollten

In [33]:
from spacy.matcher import PhraseMatcher

matcher = PhraseMatcher(nlp.vocab)

pattern = nlp("Golden Retriever")
matcher.add("HUND", [pattern])
doc = nlp("Ich habe einen Golden Retriever.")

for match_id, start, end in matcher(doc):
    span = doc[start:end]
    print("Gefundene Spans:", span.text)

Gefundene Spans: Golden Retriever


### Auf DB Daten

In [34]:
# df column text as nlp object
doc = nlp(''.join(str(df.text.tolist())))

# Iteriere über die vorhergesagten Entitäten
for ent in doc.ents:
    # Drucke den Text und das Label der Entität
    print(ent.text, ent.label_)


Der Chor Lilamunde Lesbenvokal MISC
Im Zuge des Beginns MISC
Deutschland LOC
München LOC
Männern MISC
Come out“-Lesbentheaters MISC
Frauenkneipe.', '„Die Verzauberten“ - Die Ausstellung ORG
KGL ORG
Sub PER
Porträts LOC
Susie Knoll PER
Schwabingerbräu ORG
Münchner MISC
Lising Pagenstecher PER
Frauen- und Lesbenbewegung ORG
Den Freunden MISC
Stadt München. LOC
VSG ORG
VSG MISC
VSG ORG
KGL ORG
Barbara Stenzel PER
Stephanie Gerlach PER
Münchner Lesben. MISC
Königreich Bayern LOC
Paul Johann Anselm Feuerbach PER
Feuerbach LOC
deutschen MISC
Kunstverein München ORG
Bundesgebiet. LOC
Berlin LOC
München LOC
BRD LOC
Westberlin MISC
München LOC
Aktion 218 MISC
Adele Spitzeder PER
München LOC
Adelheid Lissmann PER
Adelheid Lissmann PER
Kommunistische Partei ORG
Partei ORG
Lebensgemeinschaft LOC
Der Maßnahmenkatalog von Staatssekretär MISC
Peter Gauweiler PER
Aids PER
Kraft: LOC
Bayern LOC
HIV-Tests MISC
August Lang PER
CSU ORG
Schwule. MISC
Horst Seehofer PER
Großdemos PER
Kraft. LOC
AIDS-Memoria

### Spacy NER and log to rubrix

In [70]:
import rubrix as rb
from tqdm.auto import tqdm

records = []

for record in df.index:
    # We only need the text of each instance
    text = df['text'][record]
    # get id for rubrix record metadata
    id = df['id'][record].tolist() # not allowed to be int64
    date = df['date'][record]
    year = df['year'][record].tolist() # not allowed to be int64
    
    
    # spaCy Doc creation
    doc = nlp(text)
    # Entity annotations
    entities = [
        (ent.label_, ent.start_char, ent.end_char)
        for ent in doc.ents
    ]

    # Pre-tokenized input text
    tokens = [token.text for token in doc]

    # Rubrix TokenClassificationRecord list
    records.append(
        rb.TokenClassificationRecord(
            text=text,
            tokens=tokens,
            metadata={'id': id, 'date': date, 'year': year}, # log the intents for exploration of specific intents
            prediction=entities,
            prediction_agent="de_core_news_sm",
        )
    )

rb.log(records=records, name="chronik_ner")


[A
100%|██████████| 233/233 [00:01<00:00, 159.26it/s]

233 records logged to http://localhost:6900/ws/rubrix/chronik_ner





BulkResponse(dataset='chronik_ner', processed=233, failed=0)

## Flair


### Tutorial

https://rubrix.readthedocs.io/en/stable/tutorials/07-zeroshot_ner.html

In [72]:
from datasets import load_dataset

# download data set
dataset = load_dataset("wnut_17", split="test")


  0%|          | 0/233 [14:48<?, ?it/s]
  0%|          | 0/233 [07:34<?, ?it/s]
Downloading builder script: 7.46kB [00:00, 3.74MB/s]                   
Downloading metadata: 4.28kB [00:00, 1.94MB/s]                   


Downloading and preparing dataset wnut_17/wnut_17 (download: 782.18 KiB, generated: 1.66 MiB, post-processed: Unknown size, total: 2.43 MiB) to /Users/kabr/.cache/huggingface/datasets/wnut_17/wnut_17/1.0.0/077c7f08b8dbc800692e8c9186cdf3606d5849ab0e7be662e6135bb10eba54f9...


Downloading data: 494kB [00:00, 7.35MB/s]                   ]
Downloading data: 115kB [00:00, 15.2MB/s]                    .89it/s]
Downloading data: 192kB [00:00, 8.77MB/s]                    .28it/s]
Downloading data files: 100%|██████████| 3/3 [00:01<00:00,  2.32it/s]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 1464.83it/s]
                                                                                     

Dataset wnut_17 downloaded and prepared to /Users/kabr/.cache/huggingface/datasets/wnut_17/wnut_17/1.0.0/077c7f08b8dbc800692e8c9186cdf3606d5849ab0e7be662e6135bb10eba54f9. Subsequent calls will reuse this data.




In [75]:
# define labels


labels = ['corporation', 'creative-work', 'group', 'location', 'person', 'product']

In [78]:
# sentencepiece install problems solved witj https://github.com/google/sentencepiece/issues/378#issuecomment-969896519
from flair.models import TARSTagger

# load zero-shot NER tagger
tars = TARSTagger.load('tars-ner')

# define labels for named entities using wnut labels
tars.add_and_switch_to_new_task('task 1', labels, label_type='ner')

2022-03-23 18:11:58,117 https://nlp.informatik.hu-berlin.de/resources/models/tars-ner/tars-ner.pt not found in cache, downloading to /var/folders/lw/44xzqfx12wl1q4vdpdpnc3rm0000gn/T/tmp3c1ckfes


100%|██████████| 1421680237/1421680237 [03:40<00:00, 6452163.97B/s] 

2022-03-23 18:15:38,657 copying /var/folders/lw/44xzqfx12wl1q4vdpdpnc3rm0000gn/T/tmp3c1ckfes to cache at /Users/kabr/.flair/models/tars-ner.pt





2022-03-23 18:15:38,943 removing temp file /var/folders/lw/44xzqfx12wl1q4vdpdpnc3rm0000gn/T/tmp3c1ckfes
2022-03-23 18:15:38,990 loading file /Users/kabr/.flair/models/tars-ner.pt


Downloading: 100%|██████████| 878k/878k [00:00<00:00, 1.33MB/s] 
Downloading: 100%|██████████| 446k/446k [00:00<00:00, 988kB/s] 
Downloading: 100%|██████████| 1.29M/1.29M [00:00<00:00, 2.08MB/s]
Downloading: 100%|██████████| 482/482 [00:00<00:00, 78.2kB/s]


In [80]:
from flair.data import Sentence
import rubrix as rb

# build records for the first 100 examples
records = []
for record in dataset.select(range(100)):
    input_text = " ".join(record["tokens"])

    sentence = Sentence(input_text)
    tars.predict(sentence)
    prediction = [
        (entity.get_labels()[0].value, entity.start_pos, entity.end_pos)
        for entity in sentence.get_spans("ner")
    ]

    # building TokenClassificationRecord
    records.append(
        rb.TokenClassificationRecord(
            text=input_text,
            tokens=[token.text for token in sentence],
            prediction=prediction,
            prediction_agent="tars-ner",
        )
    )

# log the records to Rubrix
rb.log(records, name='flair_tars_ner_wnut_17', metadata={"split": "test"})

100%|██████████| 100/100 [00:01<00:00, 53.12it/s]

100 records logged to http://localhost:6900/ws/rubrix/flair_tars_ner_wnut_17





BulkResponse(dataset='flair_tars_ner_wnut_17', processed=100, failed=0)

### Demo

In [93]:
from flair.data import Sentence
from flair.models import SequenceTagger

# load tagger
tagger = SequenceTagger.load("flair/ner-german-large")

# make example sentence
sentence = Sentence("George Washington ging nach Washington")

# predict NER tags
tagger.predict(sentence)

# print sentence
print(sentence)

# print predicted NER spans
print('The following NER tags are found:')
# iterate over entities and print
for entity in sentence.get_spans('ner'):
    print(entity)

2022-03-23 18:35:09,651 loading file /Users/kabr/.flair/models/ner-german-large/6b8de9edd73722050be2547acf64c037b2df833c6e8f0e88934de08385e26c1e.4b0797effcc6ebb1889d5d29784b97f0a099c1569b319d87d7c387e44e2bba48


KeyboardInterrupt: 

In [None]:
#extracting the tweet part#
text = df['text'][1].tolist()

In [94]:
import rubrix as rb
from tqdm.auto import tqdm

records = []

df_head = df.head(1)
for record in df_head.index:
    # We only need the text of each instance
    text = df['text'][record]
    # get id for rubrix record metadata
    id = df['id'][record].tolist() # not allowed to be int64
    date = df['date'][record]
    year = df['year'][record].tolist() # not allowed to be int64
    
    
    sentence = Sentence(text)
    # predict NER tags
    tagger.predict(sentence)

    # print sentence
    print(sentence)
    # iterate over entities and print
    for entity in sentence.get_spans('ner'):
        print(entity)

    print(sentence.to_dict(tag_type='ner'))

   

Sentence: "Lilamunde Lesbenvocal - Der Chor Lilamunde Lesbenvokal gründet sich ."   [− Tokens: 10  − Token-Labels: "Lilamunde <B-ORG> Lesbenvocal <E-ORG> - Der Chor Lilamunde <B-ORG> Lesbenvokal <E-ORG> gründet sich ."]
Span [1,2]: "Lilamunde Lesbenvocal"   [− Labels: ORG (1.0)]
Span [6,7]: "Lilamunde Lesbenvokal"   [− Labels: ORG (1.0)]
{'text': 'Lilamunde Lesbenvocal - Der Chor Lilamunde Lesbenvokal gründet sich.', 'labels': [], 'entities': [{'text': 'Lilamunde Lesbenvocal', 'start_pos': 0, 'end_pos': 21, 'labels': [ORG (1.0)]}, {'text': 'Lilamunde Lesbenvokal', 'start_pos': 33, 'end_pos': 54, 'labels': [ORG (1.0)]}]}


### Find multiple Patterns

* https://stackoverflow.com/a/63694154
* https://stackoverflow.com/a/67747719


* Fuzzy Matcher: https://pypi.org/project/spaczz/
* https://towardsdatascience.com/structured-natural-language-processing-with-pandas-and-spacy-7089e66d2b10