In [3]:
import PyPDF2
import spacy
import logging, sys
logging.basicConfig(filename='reiseberichte.log', encoding='utf-8', level=logging.DEBUG)

In [3]:
# Zugriff auf Terminal, Language models in Umgebung runterladen
%run -m spacy download en_core_web_sm
%run -m spacy download de_core_news_sm

Collecting en-core-web-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.0/en_core_web_sm-3.7.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m34.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.7.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
Collecting de-core-news-sm==3.7.0
  Using cached https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.7.0/de_core_news_sm-3.7.0-py3-none-any.whl (14.6 MB)
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('de_core_news_sm')


In [1]:
#test-pdfs
protokoll_file = (r"Sitzung Thalmann_26.6.2023_beispiel.pdf")
artikel_file = (r"fables.pdf")
philo_file = (r"scotus.pdf")
reise_file = (r"reise.pdf")

In [5]:
def read_pdf(file_path): 
    # PDF öffnen und den Text extrahieren
    with open(file_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)
        text = ''
        for page in pdf_reader.pages:
            text += page.extract_text()
    return text

In [6]:
nlp_en = spacy.load("en_core_web_sm")
doc = nlp_en("Apple is looking at buying U.K. startup for $1 billion")

for token in doc:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
            token.shape_, token.is_alpha, token.is_stop)

Apple Apple PROPN NNP nsubj Xxxxx True False
is be AUX VBZ aux xx True True
looking look VERB VBG ROOT xxxx True False
at at ADP IN prep xx True True
buying buy VERB VBG pcomp xxxx True False
U.K. U.K. PROPN NNP dobj X.X. False False
startup startup NOUN NN dep xxxx True False
for for ADP IN prep xxx True True
$ $ SYM $ quantmod $ False False
1 1 NUM CD compound d False False
billion billion NUM CD pobj xxxx True False


In [7]:
for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

Apple 0 5 ORG
U.K. 27 31 GPE
$1 billion 44 54 MONEY


In [8]:
philo_text = read_pdf(philo_file)

In [9]:
philo_doc = nlp_en(philo_text)

In [55]:
places = set()
for token in philo_doc:
        if token.ent_type_ in ["GPE"] :
            places.add(token.text)
print(places)

{'eadem', 'Context', 'NC', 'Metaphysics', 'concipitur', 'philosophica', 'York', '998b22–27', 'Solchen', 'malis', 'Igitur', 'Donati', 'Wolter', 'Primus', 'City', 'Marrone', 'Thomas', 'quae', 'Amsterdam', 'Metaphysik', 'New', 'Venice', 'Armand', 'Notabilia', 'Andersen', 'DV', 'fundamenta', 'Rome', 'Vatican', 'the', 'Dumont', 'IV', 'Scoto', 'Paris', 'Porphyrii', 'of', 'Metaphysicae', 'nominetur', 'St.', 'Las', 'Turin', 'Aertsen', 'Leibniz', 'Ashworth', 'pp', 'Les', 'SE', 'Gegenstand', 'Maurer', 'Spain', 'Librum', 'essendo', 'Bonaventure', 'Aristotle', '.', 'Cross', 'Berlin', 'Vegas', 'Cat', 'Forlivesi', 'Cajetan', 'Philosophica', ',', 'Metaphor', 'Scotus', 'Marietti', 'Duns', 'analogia', 'Smith', 'Hochschild'}


In [11]:
nlp_de = spacy.load("de_core_news_sm")

In [15]:
reise_text = read_pdf(reise_file)

In [35]:
reise_doc = nlp_de(reise_text)
test_doc = nlp_de("ipsum larum Löffelstiel, wer nicht fragt der ist nicht viel. Er geht auch nicht nach Rom, nach Paris oder auf den Mount Everest")

In [56]:
places = set()
for token in test_doc:
        if token.ent_type_ in ["GPE", "LOC"] :
            places.add(token.text)
print(places)

{'Mount', 'Rom', 'Paris', 'Everest'}


In [58]:
places = set()
for token in reise_doc:
        if token.ent_type_ in ["GPE", "WORK_OF_ART", "EVENT", "LOC"]: # "LOC", "ORG" findet zu viel. Tags unter https://spacy.io/api/token
            places.add(token.text)
print(places)

{'Windes', 'München', 'Herzogtum', 'hohen', ' ', 'Zusam', 'Quinaults', 'Czaar', 'Washington', 'van', 'en', 'King', '2', 'Bayern', 'nirgens', 'Kloveniersburgwal', 'Orchestergräben', 'Kurfürstentümern', 'ondé', 'Damaskus', 'hinleget', 'Withall60', 'Pons', '[', 'di', 'continu=', 'Hannover', 'Histoire', 'Bridge', 'England', 'Antwerpener', 'Sak', 'beÿ', 'nd', 'Ijsselmeeres', 'au', 'XVIIe', 'Whitehall', 'Onyeka', 'Athlone', 'Portugal', 'Sabbath', 'holländer', 'Schiffer', 'Fort', 'Agenten31', 'officiers', 'läger', 'betten', 'Soho', 'holland', 'auff', 'Allard', 'Niederlanden', 'Litauen', 'Stadt', 'Ukraine', 'l', 'Hausgebrauch', 'Monmouths', 'gantzen', 'Kanal', 'Russland', 'Vornem', 'degelin', 'Great', 'actum', '3', 'Harwich', 'Schrecken', 'Stadtmauer', 'Europa', 'Reiches', 'Carel', 'haüser', 'Bulgarien', 'Schmiden', 'gespielet', 'Interregnum', 'Frisur', 'guide', 'o=', 'Gürtel', 'Martone', 'ß', 'Cathedral', 'e', 'Southwark', 'Dictionnaire', 'Samson', 'Herrschersitz', 'Anglikanische', 'D.C.', '-

In [78]:
# Nächstes Beispiel, 19. Jh Buch von e-rara, mitgeliefertes OCR
# Erst einmal Textencodings testen
with open("e-rara-102727.txt") as file:
    rara_text_d = file.read().replace('\n',' ')
with open("e-rara-102727.txt", encoding = "latin-1") as file:
    rara_text_l = file.read().replace('\n',' ')
with open("e-rara-102727.txt", encoding = "utf-8") as file:
    rara_text_u = file.read().replace('\n',' ')


In [81]:
print(rara_text_d[100:300])
print(rara_text_l[100:300])
print(rara_text_u[100:300]) #Besser!

kn Â«0Ltt8cMi.k: 6L8eÂ«L!^L^'  !er 10. August 1782 mit besonderer RÃ¼cksicht auf die Haltung des Schweizer- Garderegiments von Dr. Ã„ugnst von Goiyenbach, Awcsmem Slaatsschrcibcr der tchwrizechcheÂ» E
kn Â«0Ltt8cMi.k: 6L8eÂ«L!^L^'  !er 10. August 1782 mit besonderer RÃ¼cksicht auf die Haltung des Schweizer- Garderegiments von Dr. Ãugnst von Goiyenbach, Awcsmem Slaatsschrcibcr der tchwrizechcheÂ» E
 «0Ltt8cMi.k: 6L8e«L!^L^'  !er 10. August 1782 mit besonderer Rücksicht auf die Haltung des Schweizer- Garderegiments von Dr. Äugnst von Goiyenbach, Awcsmem Slaatsschrcibcr der tchwrizechche» Eidgmoff


In [82]:
rara_doc = nlp_de(rara_text_u)

In [87]:
#print(rara_doc[20:100]) # OCR ist schlecht

print(rara_doc[259]) # "Tuilerien" korrekt erkannt
print(rara_doc[259].ent_type_) # Aber type ist trotzdem leer, die .ent scheint nicht gut zu funktionieren

Tuilerien



In [88]:
cities_and_people = set()
for token in rara_doc:
        if token.ent_type_ in ["GPE", "PERSON", "LOC"] :
            places.add(token.text)

In [89]:
print(len(cities_and_people))

0
