In [None]:
!python -m spacy download de_core_news_lg
!pip install spacy

Collecting de-core-news-lg==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_lg-3.7.0/de_core_news_lg-3.7.0-py3-none-any.whl (567.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m567.8/567.8 MB[0m [31m707.4 kB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('de_core_news_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
#identifies accounts that are mentioned with @
def get_accounts(comment: str) -> list:
  """Return the @-mentions contained in a comment.

  Double quotes are removed, the comment is split on single spaces, and every
  token that starts with '@' and is longer than a bare '@' is returned
  (leading '@' included), in order of appearance and with duplicates kept.

  Args:
    comment: Raw comment text. Non-string values (e.g. the NaN pandas yields
      for an empty CSV cell) are treated as a comment without mentions.

  Returns:
    List of mention tokens such as ['@Kira', '@franka']; empty if there are none.
  """
  # Guard against missing values so iterating over a pandas column cannot crash.
  if not isinstance(comment, str):
    return []
  tokens = comment.replace('"', '').split(' ')
  # A lone '@' is not a mention, hence the length check.
  return [token for token in tokens if token.startswith('@') and len(token) > 1]

In [None]:
import spacy  # version 3.0.6'
import hashlib
import re
import csv
import pandas as pd
import numpy as np

#load language model
nlp = spacy.load("de_core_news_lg")


#load cities and countries
cities_countries = pd.read_csv("cities_countries.csv", encoding='unicode_escape')
country_name = cities_countries['country_name'].unique()
cities_name = cities_countries['name'].unique()
combined = np.append(country_name, cities_name)

#every country and city name becomes an ABS_LOC pattern for the entity ruler
abstract_location_patterns = [{"label": "ABS_LOC", "pattern": name} for name in combined]

#open list of public personas (only the first CSV row is used)
#the context manager closes the file handle once the row has been parsed;
#newline="" is what the csv module expects from the file object it reads
with open("public_figures_5.csv", "r", newline="") as file:
  public_figures = list(csv.reader(file, delimiter=","))[0]

public_figures_pattern = [{"label": "PER_PUBLIC", "pattern": name} for name in public_figures]

#open list of comments (first column of a header-less CSV)
tiktok_comments = pd.read_csv("comments_example.csv", encoding='unicode_escape', header=None)[0]
#collect every @-mention found in the comments
account_list = []
for comment in tiktok_comments:
  account_list.extend(get_accounts(comment))

user_patterns = [{"label": "PER_USER", "pattern": name} for name in account_list]

#add patterns to spacy pipe; the ruler is inserted before the 'ner' component
ruler = nlp.add_pipe("entity_ruler", before='ner')
ruler.add_patterns(public_figures_pattern)
ruler.add_patterns(user_patterns)
ruler.add_patterns(abstract_location_patterns)

#set SALT that is combined with entity text before hashing
#NOTE(review): a hard-coded salt in a shared notebook is visible to every reader;
#consider loading it from the environment instead
SALT ='asdt@'


In [None]:
def _hash_entity(prefix: str, entity_text: str, salt: str) -> str:
  """Return prefix followed by the decimal MD5 digest of entity_text + salt."""
  digest = hashlib.md5((entity_text + salt).encode('utf-8')).hexdigest()
  return prefix + str(int(digest, 16))


def anno(text:str, salt: str, nlp) -> str:
  """Anonymise a text with regex placeholders and salted entity hashes.

  IBANs, e-mail addresses, phone numbers and street addresses are replaced by
  the fixed tokens 'ANNO@IBAN', 'ANNO@MAIL', 'ANNO@PHONE' and 'STREET'.
  The result is then passed to nlp; entities labelled 'PER' or 'PER_USER' are
  replaced by 'PerHash_<n>' and entities labelled 'LOC' by 'LocHash_<n>', where
  <n> is the MD5 digest of the entity text plus salt written as a decimal
  integer. Entities with any other label are left untouched.

  Args:
    text: Text to anonymise.
    salt: String appended to each entity text before hashing.
    nlp: Callable that takes the text and returns an object whose .ents
      items expose .text and .label_ (a spaCy pipeline in this notebook).

  Returns:
    The anonymised text.
  """
  # Raw string literals keep the regex escapes (\d, \+, \.) intact without
  # triggering Python's invalid-escape-sequence warnings.

  #filter IBAN
  iban_pattern = r"[A-Z]{2}\d{2} ?\d{4} ?\d{4} ?\d{4} ?\d{4} ?[\d]{0,2}"
  text = re.sub(iban_pattern, 'ANNO@IBAN', text)

  #filter Email
  text = re.sub(r'[\w.+-]+@[\w-]+\.[\w.-]+', 'ANNO@MAIL', text)

  #filter phone numbers
  phone_pattern = r'((?:\+\d{2}[-\.\s]??|\d{4}[-\.\s]??)?(?:\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4}|\(\d{3}\)\s*\d{3}[-\.\s]??\d{4}|\d{3}[-\.\s]??\d{4}))'
  text = re.sub(phone_pattern, 'ANNO@PHONE', text)

  #filter street names followed by a house number
  street_pattern = r'((Ober|Unter den|An |Im |Platz |Berg |Am |Alt\-).+|(?:([A-Z][a-zäüö-]+){1,2})).([Cc]haussee|[Aa]llee|[sS]tr(\.|(a(ss|ß)e))|[Rr]ing|berg|gasse|grund|hörn| Nord|graben|[mM]arkt|[Uu]fer|[Ss]tieg|[Ll]inden|[Dd]amm|[pP]latz|brücke|Steinbüchel|Burg|stiege|[Ww]eg|rain|park|[Ww]eide|[Hh][oö]f|pfad|garten|bogen|passage).+?(\d{1,4})([a-zäöüß]+)?(\-?\d{1,4}[a-zäöüß]?)?'
  text = re.sub(street_pattern, 'STREET', text)

  doc = nlp(text)
  for ent in doc.ents:
    # str.replace swaps every occurrence of the entity text, not only the
    # span that was recognised.
    if ent.label_ in ('PER', 'PER_USER'):
      #hash people and tiktok names
      text = text.replace(ent.text, _hash_entity('PerHash_', ent.text, salt))
    elif ent.label_ == 'LOC':
      #hash locations
      text = text.replace(ent.text, _hash_entity('LocHash_', ent.text, salt))
  return text


#Sample sentences for a manual check of anno()
test_location = "Bist du aus Deutschland? Kommst du nicht aus Mainz? Wohnst du nicht in der Kantstr. 77?"
test_names = "@Kira und Tino sind nicht bekannt aber gute Kumpels. Angela Merkel, Olaf Scholz und Trump kenne ich  auch nicht, alle leben in Mainz."
test_mails_phone = "Hallo meine email ist test@gmail.com und hi@tester.com. Du kannst mich auch unter +49 172 615 5172 erreichen "
test_IBAN = "Hallo meine IBAN ist DE89 3704 0044 0532 0130 00"

#only the location and the name sample are run: each is printed, then its anonymised form
for sample in (test_location, test_names):
  print(sample)
  res = anno(sample, SALT, nlp)
  print(res)

#same treatment for every loaded comment: original first, anonymised version second
for comment in tiktok_comments:
  print(comment)
  print(anno(comment, SALT, nlp))


Bist du aus Deutschland? Kommst du nicht aus Mainz? Wohnst du nicht in der Kantstr. 77?
Bist du aus Deutschland? Kommst du nicht aus Mainz? Wohnst du nicht in der STREET?
@Kira und Tino sind nicht bekannt aber gute Kumpels. Angela Merkel, Olaf Scholz und Trump kenne ich  auch nicht, alle leben in Mainz.
PerHash_37871226675315786569218850759815368814 und PerHash_64847616030378914486786183224515508550 sind nicht bekannt aber gute Kumpels. Angela Merkel, Olaf Scholz und Trump kenne ich  auch nicht, alle leben in Mainz.
"@Marabu @Idle miner ehrlich keine verbrecher"
"PerHash_61462611943297774319910676056363745630 PerHash_121330025448368954729412145924354487368 miner ehrlich keine verbrecher"
"@Kira @franka ich schaffs nicht mehr mit der AfD"
"PerHash_37871226675315786569218850759815368814 PerHash_93433298092815671817665368115717297163 ich schaffs nicht mehr mit der AfD"
"@currently in love digga gefühlt keine deutschen Freunde mehr . . . Bin einzigste haha"
"PerHash_16841571026874828783374